diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a9450588..1b6b588e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,6 +29,7 @@ jobs: uses: actions/checkout@v2 - name: Checkout submodules run: | + apt update&&apt install ninja-build ./third_party/prepare.sh ./third_party/install-mkl.sh - name: Build MegEngine @@ -57,6 +58,7 @@ jobs: uses: actions/checkout@v2 - name: Checkout submodules run: | + apt update&&apt install ninja-build ./third_party/prepare.sh ./third_party/install-mkl.sh - name: Build MegEngine diff --git a/CMakeLists.txt b/CMakeLists.txt index 2dbd6d28..e025b9b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,14 @@ cmake_minimum_required(VERSION 3.15.2) -message(STATUS "CMAKE_GENERATOR: ${CMAKE_GENERATOR}" ) -if (NOT ${CMAKE_GENERATOR} STREQUAL "Ninja") - message(WARNING "CMAKE_GENERATOR NOT EQUAL Ninja, which we do not recommend") +message(STATUS "CMAKE_GENERATOR: ${CMAKE_GENERATOR}") +if(NOT ${CMAKE_GENERATOR} STREQUAL "Ninja") + message(WARNING "CMAKE_GENERATOR NOT EQUAL Ninja, which we do not recommend") endif() -include (cmake/FetchMegBrainVersion.cmake) -project(MegEngine LANGUAGES C CXX VERSION ${MGB_VER_STRING}) +include(cmake/FetchMegBrainVersion.cmake) +project( + MegEngine + LANGUAGES C CXX + VERSION ${MGB_VER_STRING}) set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -15,43 +18,55 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) set(CMAKE_POLICY_DEFAULT_CMP0048 NEW) -if(NOT MSVC AND NOT APPLE AND NOT WIN32) - set(CMAKE_CXX_ARCHIVE_CREATE " Dqc ") - set(CMAKE_CXX_ARCHIVE_APPEND " Dq ") - set(CMAKE_CXX_ARCHIVE_FINISH " -D ") +if(NOT MSVC + AND NOT APPLE + AND NOT WIN32) + set(CMAKE_CXX_ARCHIVE_CREATE " Dqc ") + set(CMAKE_CXX_ARCHIVE_APPEND " Dq ") + set(CMAKE_CXX_ARCHIVE_FINISH " -D ") endif() include(GNUInstallDirs) include(CheckCXXCompilerFlag) include(CheckIPOSupported) -CHECK_CXX_COMPILER_FLAG(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) - -set(MGE_ARCH AUTO CACHE STRING "Architecture on which MegEngine to be built.") -set_property(CACHE MGE_ARCH PROPERTY STRINGS AUTO - x86_64 i386 - armv7 aarch64 - naive fallback -) -set (MGE_EXPORT_TARGETS MegEngine-targets) +check_cxx_compiler_flag(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) + +set(MGE_ARCH + AUTO + CACHE STRING "Architecture on which MegEngine to be built.") +set_property( + CACHE MGE_ARCH + PROPERTY STRINGS + AUTO + x86_64 + i386 + armv7 + aarch64 + naive + fallback) +set(MGE_EXPORT_TARGETS MegEngine-targets) if(NOT "$ENV{LD_LIBRARY_PATH}" STREQUAL "") - string(REPLACE ":" ";" ALTER_LD_LIBRARY_PATHS $ENV{LD_LIBRARY_PATH}) + string(REPLACE ":" ";" ALTER_LD_LIBRARY_PATHS $ENV{LD_LIBRARY_PATH}) else() - set(ALTER_LD_LIBRARY_PATHS "") + set(ALTER_LD_LIBRARY_PATHS "") endif() if(NOT "$ENV{LIBRARY_PATH}" STREQUAL "") - string(REPLACE ":" ";" ALTER_LIBRARY_PATHS $ENV{LIBRARY_PATH}) + string(REPLACE ":" ";" ALTER_LIBRARY_PATHS $ENV{LIBRARY_PATH}) else() - set(ALTER_LIBRARY_PATHS "") + set(ALTER_LIBRARY_PATHS "") endif() option(MGE_WITH_JIT "Build MegEngine with JIT." ON) option(MGE_WITH_JIT_MLIR "Build MegEngine with MLIR JIT." OFF) option(MGE_WITH_HALIDE "Build MegEngine with Halide JIT" OFF) option(MGE_WITH_MIDOUT_PROFILE "Build MegEngine with Midout profile." OFF) -option(MGE_WITH_MINIMUM_SIZE "Swith off MGE_ENABLE_RTTI、MGE_ENABLE_EXCEPTIONS、MGE_ENABLE_LOGGING and switch on MGE_INFERENCE_ONLY so that compile minimum load_and_run." 
OFF) +option( + MGE_WITH_MINIMUM_SIZE + "Swith off MGE_ENABLE_RTTI、MGE_ENABLE_EXCEPTIONS、MGE_ENABLE_LOGGING and switch on MGE_INFERENCE_ONLY so that compile minimum load_and_run." + OFF) option(MGE_ARMV8_2_FEATURE_FP16 "Enable armv8.2-a+fp16 support" OFF) option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF) option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON) @@ -81,781 +96,906 @@ option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF) option(MGE_BUILD_WITH_ASAN "Enable build with ASAN, need compiler support" OFF) option(MGE_WITH_CUSTOM_OP "Build with Custom op" OFF) if(MSVC OR WIN32) - # FIXME: static link Windows vc runtime with some version from Visual Studio have - # some runtime issue at some call PATH, for example: _imperative_rt.pyd --> megengine_shared.dll - # for example c api flush can not find the fd args, I have no idea about this issue - # as a Workround, dynamic link vc runtime, but at some case, we will static link vcrt - # when MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP/MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2, so please - # use lite_static_all_in_one(lite/CMakeLists.txt) in Windows XP env as possible - # How to install VC runtime if you env do not install, refer to: - # https://docs.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-160 - option(MGE_STATIC_LINK_WITH_VC_RUNTIME "Enable mge static link with Windows vc runtime" OFF) - - option(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP "Enable deploy inference on Windows xp" OFF) - # special MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2 for Windows XP sp2(32bit) - # internal behavior: - # 1: will force define MGB_HAVE_THREAD=0, which means only support single thread - # 2: some Feature will be disable, eg: MGB_ENABLE_JSON and var sanity check, do - # not too many care this!!, if you want to use this Feature to 'DEBUG', you can - # run same model at NON-XP-SP2 env, eg Win7 or XP-SP3(build without MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) - # 3: we only support MegEngine(load_and_run) and MegEngineLite API work on XP SP2 - # some debug utils, eg, megbrain_test/megdnn_test not support run, most caused by gtest src code - # sdk caller: - # 1: as we remove mutex, when you use MSVC self API eg CreateThread to start several MegEngine instances - # in the same progress, please call MegEngine API(init/run) as serial as possible, also please - # do not use std::thread std::mutex/std::this_thread_id at SDK caller side!!! 
- # check dll/exe can deploy on Windows XP sp2 or not: - # please checkout scripts/misc/check_windows_xp_sp2_deploy.py - option(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2 "Enable deploy inference on Windows xp sp2" OFF) - - # PE file linked by LLVM lld can not run at Windows XP env, so we force use link.exe - # which always locate in Microsoft Visual Studio/*/*/VC/Tools/MSVC/*/bin/*/*/link.exe - set(CMAKE_LINKER "link.exe") - if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP OR MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) - set(MGE_STATIC_LINK_WITH_VC_RUNTIME ON) - message(STATUS "Force set MGE_STATIC_LINK_WITH_VC_RUNTIME ON when build for Windows XP") - - if(NOT ${MGE_ARCH} STREQUAL "i386") - message(FATAL_ERROR "only support 32bit when build for Windows xp") - endif() + # FIXME: static link Windows vc runtime with some version from Visual Studio have some + # runtime issue at some call PATH, for example: _imperative_rt.pyd --> + # megengine_shared.dll for example c api flush can not find the fd args, I have no + # idea about this issue as a Workround, dynamic link vc runtime, but at some case, we + # will static link vcrt when + # MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP/MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2, so please + # use lite_static_all_in_one(lite/CMakeLists.txt) in Windows XP env as possible How to + # install VC runtime if you env do not install, refer to: + # https://docs.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-160 + option(MGE_STATIC_LINK_WITH_VC_RUNTIME + "Enable mge static link with Windows vc runtime" OFF) + + option(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP "Enable deploy inference on Windows xp" OFF) + # special MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2 for Windows XP sp2(32bit) internal + # behavior: 1: will force define MGB_HAVE_THREAD=0, which means only support single + # thread 2: some Feature will be disable, eg: MGB_ENABLE_JSON and var sanity check, do + # not too many care this!!, if you want to use this Feature to 'DEBUG', you can run + # same model at NON-XP-SP2 env, eg Win7 or XP-SP3(build without + # MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) 3: we only support MegEngine(load_and_run) + # and MegEngineLite API work on XP SP2 some debug utils, eg, megbrain_test/megdnn_test + # not support run, most caused by gtest src code sdk caller: 1: as we remove mutex, + # when you use MSVC self API eg CreateThread to start several MegEngine instances in + # the same progress, please call MegEngine API(init/run) as serial as possible, also + # please do not use std::thread std::mutex/std::this_thread_id at SDK caller side!!! 
+ # check dll/exe can deploy on Windows XP sp2 or not: please checkout + # scripts/misc/check_windows_xp_sp2_deploy.py + option(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2 + "Enable deploy inference on Windows xp sp2" OFF) + + # PE file linked by LLVM lld can not run at Windows XP env, so we force use link.exe + # which always locate in Microsoft Visual Studio/*/*/VC/Tools/MSVC/*/bin/*/*/link.exe + set(CMAKE_LINKER "link.exe") + if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP OR MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) + set(MGE_STATIC_LINK_WITH_VC_RUNTIME ON) + message( + STATUS "Force set MGE_STATIC_LINK_WITH_VC_RUNTIME ON when build for Windows XP") + + if(NOT ${MGE_ARCH} STREQUAL "i386") + message(FATAL_ERROR "only support 32bit when build for Windows xp") + endif() - if(NOT MGE_INFERENCE_ONLY) - message(FATAL_ERROR "only support inference when build for Windows xp") - endif() + if(NOT MGE_INFERENCE_ONLY) + message(FATAL_ERROR "only support inference when build for Windows xp") + endif() - if(MGE_WITH_CUDA) - message(FATAL_ERROR "do not support CUDA when build for Windows xp") - endif() + if(MGE_WITH_CUDA) + message(FATAL_ERROR "do not support CUDA when build for Windows xp") + endif() - # Windows XP sp3 have thread issue, Workround for it - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_WIN32_WINNT=0x0501 /Zc:threadSafeInit-") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /D_WIN32_WINNT=0x0501 /Zc:threadSafeInit-") - # for Windows XP type - add_link_options("/SUBSYSTEM:CONSOLE,5.01") - # some old lib(for example mkl for xp) use legacy stdio, so we force link legacy_stdio_definitions - add_link_options("/DEFAULTLIB:legacy_stdio_definitions.lib") - - if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__DEPLOY_ON_XP_SP2__=1") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__DEPLOY_ON_XP_SP2__=1") - endif() - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_WIN32_WINNT=0x0601") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /D_WIN32_WINNT=0x0601") + # Windows XP sp3 have thread issue, Workround for it + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_WIN32_WINNT=0x0501 /Zc:threadSafeInit-") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /D_WIN32_WINNT=0x0501 /Zc:threadSafeInit-") + # for Windows XP type + add_link_options("/SUBSYSTEM:CONSOLE,5.01") + # some old lib(for example mkl for xp) use legacy stdio, so we force link + # legacy_stdio_definitions + add_link_options("/DEFAULTLIB:legacy_stdio_definitions.lib") + + if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__DEPLOY_ON_XP_SP2__=1") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__DEPLOY_ON_XP_SP2__=1") endif() + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_WIN32_WINNT=0x0601") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /D_WIN32_WINNT=0x0601") + endif() endif() if(MSVC OR WIN32) - message(STATUS "windows force cudnn static link") - set(MGE_WITH_CUDNN_SHARED OFF) + message(STATUS "windows force cudnn static link") + set(MGE_WITH_CUDNN_SHARED OFF) endif() if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB) - set(MGE_WITH_ANY_CUDA_STUB ON) + set(MGE_WITH_ANY_CUDA_STUB ON) else() - set(MGE_WITH_ANY_CUDA_STUB OFF) + set(MGE_WITH_ANY_CUDA_STUB OFF) endif() if(MGE_WITH_MIDOUT_PROFILE) - message(STATUS "build with MIDOUT PROFILE and force set MGE_WITH_MINIMUM_SIZE off and force rtti ON") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMIDOUT_PROFILING") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMIDOUT_PROFILING") - set(MGE_WITH_MINIMUM_SIZE OFF) - set(MGE_ENABLE_RTTI ON) - if(WIN32) - message(FATAL_ERROR "do not support midout at WIN32") 
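# Annotative sketch (not part of the patch; generator and source path are assumptions for
# illustration): the Windows XP deployment options above only work together with a 32-bit,
# inference-only, CUDA-free configuration, otherwise the FATAL_ERROR checks in this block fire.
# An XP-SP2 configure would therefore look roughly like
#   cmake -G Ninja <src-dir> -DMGE_ARCH=i386 -DMGE_INFERENCE_ONLY=ON \
#         -DMGE_WITH_CUDA=OFF -DMGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2=ON
# MGE_STATIC_LINK_WITH_VC_RUNTIME is then forced ON by the block itself.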
- endif() + message( + STATUS + "build with MIDOUT PROFILE and force set MGE_WITH_MINIMUM_SIZE off and force rtti ON" + ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMIDOUT_PROFILING") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMIDOUT_PROFILING") + set(MGE_WITH_MINIMUM_SIZE OFF) + set(MGE_ENABLE_RTTI ON) + if(WIN32) + message(FATAL_ERROR "do not support midout at WIN32") + endif() endif() set(BIN_REDUCE ${PROJECT_SOURCE_DIR}/src/bin_reduce_cmake.h) if(MGE_WITH_MINIMUM_SIZE) - message(STATUS "build with MGE_WITH_MINIMUM_SIZE bin_reduce header is: ${BIN_REDUCE}") - set(MGE_ENABLE_RTTI OFF) - set(MGE_ENABLE_LOGGING OFF) - set(MGE_ENABLE_EXCEPTIONS OFF) - set(MGE_INFERENCE_ONLY ON) - # MGE_WITH_MINIMUM_SIZE will triger unused-parameter - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-parameter") + message(STATUS "build with MGE_WITH_MINIMUM_SIZE bin_reduce header is: ${BIN_REDUCE}") + set(MGE_ENABLE_RTTI OFF) + set(MGE_ENABLE_LOGGING OFF) + set(MGE_ENABLE_EXCEPTIONS OFF) + set(MGE_INFERENCE_ONLY ON) + # MGE_WITH_MINIMUM_SIZE will triger unused-parameter + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-parameter") endif() if(NOT MGE_WITH_MIDOUT_PROFILE AND NOT WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include ${BIN_REDUCE}") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -include ${BIN_REDUCE}") -endif() - -if (NOT APPLE) - # check CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT on APPLE will leak cmake crash - CHECK_CXX_COMPILER_FLAG("-ffunction-sections -fdata-sections -Wl,--gc-sections" CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT) - if(CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffunction-sections -fdata-sections") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections -fdata-sections") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections") - endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include ${BIN_REDUCE}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -include ${BIN_REDUCE}") +endif() + +if(NOT APPLE) + # check CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT on APPLE will leak cmake crash + check_cxx_compiler_flag("-ffunction-sections -fdata-sections -Wl,--gc-sections" + CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT) + if(CXX_FUNCTION_DATA_GC_SECTIONS_SUPPORT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffunction-sections -fdata-sections") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections -fdata-sections") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections") + endif() endif() check_ipo_supported(RESULT IS_LTO_SUPPORT OUTPUT output_info) # LLVM on Windows report support LTO, but do not support -flto=full at link stage if(IS_LTO_SUPPORT AND NOT WIN32) - message(STATUS "lto is supported in this compiler") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -flto=full") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -flto=full") + message(STATUS "lto is supported in this compiler") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -flto=full") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -flto=full") else() - message(STATUS "lto is not supported in this compiler") + message(STATUS "lto is not supported in this compiler") endif() -if (APPLE) - set (BUILD_SHARED_LIBS OFF) - message(STATUS "build static for 
xcode framework require") +if(APPLE) + set(BUILD_SHARED_LIBS OFF) + message(STATUS "build static for xcode framework require") endif() -if (MGE_USE_SYSTEM_LIB) - set (MGE_CUDA_USE_STATIC OFF) +if(MGE_USE_SYSTEM_LIB) + set(MGE_CUDA_USE_STATIC OFF) endif() -if (MGB_WITH_FLATBUFFERS) - set(MGB_ENABLE_FBS_SERIALIZATION ON) +if(MGB_WITH_FLATBUFFERS) + set(MGB_ENABLE_FBS_SERIALIZATION ON) endif() if(CMAKE_TOOLCHAIN_FILE) - message(STATUS "We are cross compiling.") - message(STATUS "config FLATBUFFERS_FLATC_EXECUTABLE to: ${PROJECT_SOURCE_DIR}/build_dir/host_flatc/install/bin/flatc") - set(FLATBUFFERS_FLATC_EXECUTABLE "${PROJECT_SOURCE_DIR}/build_dir/host_flatc/install/bin/flatc") - if(ANDROID_TOOLCHAIN_ROOT) - if(NOT "${ANDROID_ARCH_NAME}" STREQUAL "") - set(ANDROID_ARCH ${ANDROID_ARCH_NAME}) - endif() - if(${ANDROID_ARCH} STREQUAL "arm") - set(MGE_ARCH "armv7") - elseif(${ANDROID_ARCH} STREQUAL "arm64") - set(MGE_ARCH "aarch64") - else() - message(FATAL_ERROR "DO NOT SUPPORT ANDROID ARCH NOW") - endif() - elseif(IOS_TOOLCHAIN_ROOT) - if(${IOS_ARCH} STREQUAL "armv7") - set(MGE_ARCH "armv7") - elseif(${IOS_ARCH} STREQUAL "arm64") - set(MGE_ARCH "aarch64") - elseif(${IOS_ARCH} STREQUAL "armv7k") - set(MGE_ARCH "armv7") - elseif(${IOS_ARCH} STREQUAL "arm64e") - set(MGE_ARCH "aarch64") - elseif(${IOS_ARCH} STREQUAL "armv7s") - set(MGE_ARCH "armv7") - else() - message(FATAL_ERROR "Unsupported IOS_ARCH.") - endif() - elseif(RISCV_TOOLCHAIN_ROOT) - set(MGE_ARCH "riscv64") - elseif(NOT "${ARM_CROSS_BUILD_ARCH}" STREQUAL "") - set(MGE_ARCH ${ARM_CROSS_BUILD_ARCH}) + message(STATUS "We are cross compiling.") + message( + STATUS + "config FLATBUFFERS_FLATC_EXECUTABLE to: ${PROJECT_SOURCE_DIR}/build_dir/host_flatc/install/bin/flatc" + ) + set(FLATBUFFERS_FLATC_EXECUTABLE + "${PROJECT_SOURCE_DIR}/build_dir/host_flatc/install/bin/flatc") + if(ANDROID_TOOLCHAIN_ROOT) + if(NOT "${ANDROID_ARCH_NAME}" STREQUAL "") + set(ANDROID_ARCH ${ANDROID_ARCH_NAME}) + endif() + if(${ANDROID_ARCH} STREQUAL "arm") + set(MGE_ARCH "armv7") + elseif(${ANDROID_ARCH} STREQUAL "arm64") + set(MGE_ARCH "aarch64") + else() + message(FATAL_ERROR "DO NOT SUPPORT ANDROID ARCH NOW") + endif() + elseif(IOS_TOOLCHAIN_ROOT) + if(${IOS_ARCH} STREQUAL "armv7") + set(MGE_ARCH "armv7") + elseif(${IOS_ARCH} STREQUAL "arm64") + set(MGE_ARCH "aarch64") + elseif(${IOS_ARCH} STREQUAL "armv7k") + set(MGE_ARCH "armv7") + elseif(${IOS_ARCH} STREQUAL "arm64e") + set(MGE_ARCH "aarch64") + elseif(${IOS_ARCH} STREQUAL "armv7s") + set(MGE_ARCH "armv7") else() - message(FATAL_ERROR "Unknown cross-compiling settings.") + message(FATAL_ERROR "Unsupported IOS_ARCH.") endif() - message(STATUS "CONFIG MGE_ARCH TO ${MGE_ARCH}") + elseif(RISCV_TOOLCHAIN_ROOT) + set(MGE_ARCH "riscv64") + elseif(NOT "${ARM_CROSS_BUILD_ARCH}" STREQUAL "") + set(MGE_ARCH ${ARM_CROSS_BUILD_ARCH}) + else() + message(FATAL_ERROR "Unknown cross-compiling settings.") + endif() + message(STATUS "CONFIG MGE_ARCH TO ${MGE_ARCH}") endif() if(${MGE_ARCH} STREQUAL "AUTO") - if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64") - set(MGE_ARCH "x86_64") - elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i686") - set(MGE_ARCH "i386") - elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm64") - set(MGE_ARCH "aarch64") - elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm") - set(MGE_ARCH "armv7") - else() - message(FATAL_ERROR "Unknown machine architecture for MegEngine.") - endif() + 
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL + "AMD64") + set(MGE_ARCH "x86_64") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386" OR ${CMAKE_SYSTEM_PROCESSOR} + STREQUAL "i686") + set(MGE_ARCH "i386") + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} + STREQUAL "arm64") + set(MGE_ARCH "aarch64") + elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm") + set(MGE_ARCH "armv7") + else() + message(FATAL_ERROR "Unknown machine architecture for MegEngine.") + endif() endif() if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) - message(STATUS "Setting build type to 'RelWithDebInfo' as none was specified.") - set(CMAKE_BUILD_TYPE RelWithDebInfo) + message(STATUS "Setting build type to 'RelWithDebInfo' as none was specified.") + set(CMAKE_BUILD_TYPE RelWithDebInfo) endif() -if(${CMAKE_BUILD_TYPE} STREQUAL "Release" AND NOT MGE_WITH_TEST AND NOT ${MGE_ARCH} STREQUAL "x86_64" AND NOT MGE_WITH_MIDOUT_PROFILE) - set(MGE_ENABLE_RTTI OFF) - message(STATUS "disable MGE_ENABLE_RTTI when Release/NON-x86_64/NON-MGE_WITH_MIDOUT_PROFILE mode!!") +if(${CMAKE_BUILD_TYPE} STREQUAL "Release" + AND NOT MGE_WITH_TEST + AND NOT ${MGE_ARCH} STREQUAL "x86_64" + AND NOT MGE_WITH_MIDOUT_PROFILE) + set(MGE_ENABLE_RTTI OFF) + message( + STATUS + "disable MGE_ENABLE_RTTI when Release/NON-x86_64/NON-MGE_WITH_MIDOUT_PROFILE mode!!" + ) endif() if(MSVC OR WIN32) - # for cmake after 3.15.2 - cmake_policy(SET CMP0091 NEW) - set(CMAKE_OBJECT_PATH_MAX 300) - if(MGE_BUILD_WITH_ASAN) - set(MGE_STATIC_LINK_WITH_VC_RUNTIME ON) - message(STATUS "Force set MGE_STATIC_LINK_WITH_VC_RUNTIME ON when build for Windows MGE_BUILD_WITH_ASAN") - endif() - if(MGE_STATIC_LINK_WITH_VC_RUNTIME) - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebug") - else() - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded") - endif() + # for cmake after 3.15.2 + cmake_policy(SET CMP0091 NEW) + set(CMAKE_OBJECT_PATH_MAX 300) + if(MGE_BUILD_WITH_ASAN) + set(MGE_STATIC_LINK_WITH_VC_RUNTIME ON) + message( + STATUS + "Force set MGE_STATIC_LINK_WITH_VC_RUNTIME ON when build for Windows MGE_BUILD_WITH_ASAN" + ) + endif() + if(MGE_STATIC_LINK_WITH_VC_RUNTIME) + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebug") else() - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL") - else() - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL") - endif() + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded") endif() - - add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) - message(STATUS "into windows build CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") - if (NOT ${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "Clang-cl") - message(FATAL_ERROR "only support clang-cl for windows build, pls check detail: scripts/cmake-build/BUILD_README.md") - endif() - # on windows need append VS_PATH/VC/Tools/Llvm/x64/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows - # and VS_PATH/VC/Tools/Llvm/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows to PATH env - if (MGE_BUILD_WITH_ASAN) - message(WARNING "please do (set)export ASAN_OPTIONS=windows_hook_rtl_allocators=true when run test after build finish, caused by we link asan dll!!") - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - message(WARNING "Windows AddressSanitizer doesn't support linking with debug runtime libraries yet, which means do not support CMAKE_BUILD_TYPE=Debug") - message(FATAL_ERROR "Please 
build with RelWithDebInfo or Release by : EXTRA_CMAKE_ARGS=\"-DMGE_BUILD_WITH_ASAN=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo ...\"") - endif() - if("$ENV{VS_PATH}" STREQUAL "") - message(FATAL_ERROR "can not find VS_PATH, please export Visual Studio root dir to VS_PATH env") - endif() - if(${MGE_ARCH} STREQUAL "x86_64") - set(WINDOWS_ASAN_DLL_NAME "clang_rt.asan_dynamic-x86_64.lib") - set(WINDOWS_ASAN_RUNTIME_THUNK_NAME "clang_rt.asan_dynamic_runtime_thunk-x86_64") - set(WINDOWS_ASAN_PATH_SUFFIXES "VC/Tools/Llvm/x64/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows") - elseif(${MGE_ARCH} STREQUAL "i386") - set(WINDOWS_ASAN_DLL_NAME "clang_rt.asan_dynamic-i386.lib") - set(WINDOWS_ASAN_RUNTIME_THUNK_NAME "clang_rt.asan_dynamic_runtime_thunk-i386.lib") - set(WINDOWS_ASAN_PATH_SUFFIXES "VC/Tools/Llvm/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows") - else() - message(FATAL_ERROR "unsupport asan ARCH: ${MGE_ARCH} on Windows") - endif() - find_path(ASAN_DLL_PATH - NAMES ${WINDOWS_ASAN_DLL_NAME} - HINTS $ENV{VS_PATH} - PATH_SUFFIXES ${WINDOWS_ASAN_PATH_SUFFIXES} - DOC "Windows asan library path" ) - if(ASAN_DLL_PATH STREQUAL "ASAN_DLL_PATH-NOTFOUND") - message(FATAL_ERROR "can not find asan dll, please upgrade you LLVM") - endif() - - message(STATUS "Windows asan dll path: ${ASAN_DLL_PATH}") - link_directories(${ASAN_DLL_PATH}) - link_libraries(${WINDOWS_ASAN_DLL_NAME}) - link_libraries(${WINDOWS_ASAN_RUNTIME_THUNK_NAME}) - set(WIN_FLAGS "/Od -DNDEBUG -fsanitize=address") - # windows Llvm asan do not take effect when /O2 - # RELWITHDEBINFO default value is /O2, so override it - set(CMAKE_C_FLAGS_RELWITHDEBINFO "/Zi /Od /Ob1 /DNDEBUG") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/Zi /Od /Ob1 /DNDEBUG") - set(CMAKE_C_FLAGS_RELEASE "/Zi /Od /Ob1 /DNDEBUG") - set(CMAKE_CXX_FLAGS_RELEASE "/Zi /Od /Ob1 /DNDEBUG") + else() + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL") else() - set(WIN_FLAGS "/O2") + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL") endif() - # add flags for enable sse instruction optimize for X86, enable avx header to compile avx code - set(WIN_FLAGS "${WIN_FLAGS} -msse4.2 -D_AVX_ -D_AVX2_ -D__AVX__ -D__AVX2__ -D__FMA__") - # if u CPU is cascadelake series, u can enable for performance - # set(WIN_FLAGS "{WIN_FLAGS} -march=cascadelake -mtune=cascadelake") - # set(WIN_FLAGS "{WIN_FLAGS} -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vnni") - - # for windows build - set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=implicit-int-conversion -Wno-error=double-promotion") - set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=zero-as-null-pointer-constant -Wno-error=implicit-int-conversion") - set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=float-conversion -Wno-error=shadow-field -Wno-error=covered-switch-default") - set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=deprecated -Wno-error=documentation -Wno-error=unreachable-code-break") - set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /wd4819") - set(WIN_FLAGS "${WIN_FLAGS} /D_CRT_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS /DNOGDI /D_USE_MATH_DEFINES /bigobj") - set(WIN_FLAGS "${WIN_FLAGS} /Zm500 /EHs /wd4351 /wd4291 /wd4250 /wd4996 /wd4819 -Wno-inconsistent-dllimport") - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WIN_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WIN_FLAGS}") - - #FIXME: fix halide JIT on windows - message(STATUS "disable jit, halide and mlir on windows host build...") - set(MGE_WITH_HALIDE OFF) - set(MGE_WITH_JIT OFF) - set(MGE_WITH_JIT_MLIR OFF) - #FIXME: fix MegRay on windows - 
message(STATUS "Disable distributed build on windows host build...") - set(MGE_WITH_DISTRIBUTED OFF) -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") - - # NONE windows DEBUG general flags - if(MGE_BUILD_WITH_ASAN) - set(CMAKE_C_FLAGS_DEBUG "-O0 -g -fsanitize=address -fno-omit-frame-pointer") - set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fsanitize=address -fno-omit-frame-pointer") - else() - set(CMAKE_C_FLAGS_DEBUG "-O0 -g") - set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") + endif() + + add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) + message(STATUS "into windows build CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") + if(NOT ${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND NOT ${CMAKE_C_COMPILER_ID} + STREQUAL "Clang-cl") + message( + FATAL_ERROR + "only support clang-cl for windows build, pls check detail: scripts/cmake-build/BUILD_README.md" + ) + endif() + # on windows need append + # VS_PATH/VC/Tools/Llvm/x64/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows and + # VS_PATH/VC/Tools/Llvm/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows to PATH + # env + if(MGE_BUILD_WITH_ASAN) + message( + WARNING + "please do (set)export ASAN_OPTIONS=windows_hook_rtl_allocators=true when run test after build finish, caused by we link asan dll!!" + ) + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + message( + WARNING + "Windows AddressSanitizer doesn't support linking with debug runtime libraries yet, which means do not support CMAKE_BUILD_TYPE=Debug" + ) + message( + FATAL_ERROR + "Please build with RelWithDebInfo or Release by : EXTRA_CMAKE_ARGS=\"-DMGE_BUILD_WITH_ASAN=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo ...\"" + ) endif() - - # NONE windows opt general flags - if (MGE_BUILD_WITH_ASAN) - set(OPTIMIZE_LEVEL "-g -O0 -DNDEBUG -fsanitize=address -fno-omit-frame-pointer") - elseif(ANDROID) - set(OPTIMIZE_LEVEL "-g -Ofast -DNDEBUG") - else() - set(OPTIMIZE_LEVEL "-g -O3 -DNDEBUG") + if("$ENV{VS_PATH}" STREQUAL "") + message( + FATAL_ERROR + "can not find VS_PATH, please export Visual Studio root dir to VS_PATH env") endif() - #remove finite-math-only opt from Ofast, caused by clang have a different - #runtime finite math logic, this issue do not find at g++, but as a unity - #build flags, we force add -fno-finite-math-only when compiler support - CHECK_CXX_COMPILER_FLAG("-fno-finite-math-only" CXX_NO_FINITE_MATH_ONLY_SUPPORT) - if(CXX_NO_FINITE_MATH_ONLY_SUPPORT) - message(STATUS "force add -fno-finite-math-only for this compiler") - set(OPTIMIZE_LEVEL "${OPTIMIZE_LEVEL} -fno-finite-math-only") + if(${MGE_ARCH} STREQUAL "x86_64") + set(WINDOWS_ASAN_DLL_NAME "clang_rt.asan_dynamic-x86_64.lib") + set(WINDOWS_ASAN_RUNTIME_THUNK_NAME "clang_rt.asan_dynamic_runtime_thunk-x86_64") + set(WINDOWS_ASAN_PATH_SUFFIXES + "VC/Tools/Llvm/x64/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows") + elseif(${MGE_ARCH} STREQUAL "i386") + set(WINDOWS_ASAN_DLL_NAME "clang_rt.asan_dynamic-i386.lib") + set(WINDOWS_ASAN_RUNTIME_THUNK_NAME + "clang_rt.asan_dynamic_runtime_thunk-i386.lib") + set(WINDOWS_ASAN_PATH_SUFFIXES + "VC/Tools/Llvm/lib/clang/${CMAKE_CXX_COMPILER_VERSION}/lib/windows") + else() + message(FATAL_ERROR "unsupport asan ARCH: ${MGE_ARCH} on Windows") endif() - set(CMAKE_C_FLAGS_RELEASE "${OPTIMIZE_LEVEL}") - set(CMAKE_CXX_FLAGS_RELEASE "${OPTIMIZE_LEVEL}") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "${OPTIMIZE_LEVEL}") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${OPTIMIZE_LEVEL}") - #some gnu(gcc) compiler use -static -libasan have runtime issue - #also, when target is big, clang ld will take a long long long - #time 
when use -static-libsan, so we use dynamic asan by default - #ANDROID asan.so depends on log, so broadcast log link_libraries - #for megengine depends target, for example flatc target - if (MGE_BUILD_WITH_ASAN AND ANDROID) - link_libraries(log) + find_path( + ASAN_DLL_PATH + NAMES ${WINDOWS_ASAN_DLL_NAME} + HINTS $ENV{VS_PATH} + PATH_SUFFIXES ${WINDOWS_ASAN_PATH_SUFFIXES} + DOC "Windows asan library path") + if(ASAN_DLL_PATH STREQUAL "ASAN_DLL_PATH-NOTFOUND") + message(FATAL_ERROR "can not find asan dll, please upgrade you LLVM") endif() + + message(STATUS "Windows asan dll path: ${ASAN_DLL_PATH}") + link_directories(${ASAN_DLL_PATH}) + link_libraries(${WINDOWS_ASAN_DLL_NAME}) + link_libraries(${WINDOWS_ASAN_RUNTIME_THUNK_NAME}) + set(WIN_FLAGS "/Od -DNDEBUG -fsanitize=address") + # windows Llvm asan do not take effect when /O2 RELWITHDEBINFO default value is /O2, + # so override it + set(CMAKE_C_FLAGS_RELWITHDEBINFO "/Zi /Od /Ob1 /DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/Zi /Od /Ob1 /DNDEBUG") + set(CMAKE_C_FLAGS_RELEASE "/Zi /Od /Ob1 /DNDEBUG") + set(CMAKE_CXX_FLAGS_RELEASE "/Zi /Od /Ob1 /DNDEBUG") + else() + set(WIN_FLAGS "/O2") + endif() + # add flags for enable sse instruction optimize for X86, enable avx header to compile + # avx code + set(WIN_FLAGS "${WIN_FLAGS} -msse4.2 -D_AVX_ -D_AVX2_ -D__AVX__ -D__AVX2__ -D__FMA__") + # if u CPU is cascadelake series, u can enable for performance set(WIN_FLAGS + # "{WIN_FLAGS} -march=cascadelake -mtune=cascadelake") set(WIN_FLAGS "{WIN_FLAGS} + # -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vnni") + + # for windows build + set(WIN_FLAGS + "${WIN_FLAGS} -Wno-error=implicit-int-conversion -Wno-error=double-promotion") + set(WIN_FLAGS + "${WIN_FLAGS} -Wno-error=zero-as-null-pointer-constant -Wno-error=implicit-int-conversion" + ) + set(WIN_FLAGS + "${WIN_FLAGS} -Wno-error=float-conversion -Wno-error=shadow-field -Wno-error=covered-switch-default" + ) + set(WIN_FLAGS + "${WIN_FLAGS} -Wno-error=deprecated -Wno-error=documentation -Wno-error=unreachable-code-break" + ) + set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /wd4819") + set(WIN_FLAGS + "${WIN_FLAGS} /D_CRT_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS /DNOGDI /D_USE_MATH_DEFINES /bigobj" + ) + set(WIN_FLAGS + "${WIN_FLAGS} /Zm500 /EHs /wd4351 /wd4291 /wd4250 /wd4996 /wd4819 -Wno-inconsistent-dllimport" + ) + + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WIN_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WIN_FLAGS}") + + # FIXME: fix halide JIT on windows + message(STATUS "disable jit, halide and mlir on windows host build...") + set(MGE_WITH_HALIDE OFF) + set(MGE_WITH_JIT OFF) + set(MGE_WITH_JIT_MLIR OFF) + # FIXME: fix MegRay on windows + message(STATUS "Disable distributed build on windows host build...") + set(MGE_WITH_DISTRIBUTED OFF) +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") + + # NONE windows DEBUG general flags + if(MGE_BUILD_WITH_ASAN) + set(CMAKE_C_FLAGS_DEBUG "-O0 -g -fsanitize=address -fno-omit-frame-pointer") + set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fsanitize=address -fno-omit-frame-pointer") + else() + set(CMAKE_C_FLAGS_DEBUG "-O0 -g") + set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") + endif() + + # NONE windows opt general flags + if(MGE_BUILD_WITH_ASAN) + set(OPTIMIZE_LEVEL "-g -O0 -DNDEBUG -fsanitize=address -fno-omit-frame-pointer") + elseif(ANDROID) + set(OPTIMIZE_LEVEL "-g -Ofast -DNDEBUG") + else() + set(OPTIMIZE_LEVEL "-g -O3 -DNDEBUG") + endif() + # remove finite-math-only opt from Ofast, caused by clang have a different 
runtime + # finite math logic, this issue do not find at g++, but as a unity build flags, we + # force add -fno-finite-math-only when compiler support + check_cxx_compiler_flag("-fno-finite-math-only" CXX_NO_FINITE_MATH_ONLY_SUPPORT) + if(CXX_NO_FINITE_MATH_ONLY_SUPPORT) + message(STATUS "force add -fno-finite-math-only for this compiler") + set(OPTIMIZE_LEVEL "${OPTIMIZE_LEVEL} -fno-finite-math-only") + endif() + set(CMAKE_C_FLAGS_RELEASE "${OPTIMIZE_LEVEL}") + set(CMAKE_CXX_FLAGS_RELEASE "${OPTIMIZE_LEVEL}") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "${OPTIMIZE_LEVEL}") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${OPTIMIZE_LEVEL}") + # some gnu(gcc) compiler use -static -libasan have runtime issue also, when target is + # big, clang ld will take a long long long time when use -static-libsan, so we use + # dynamic asan by default ANDROID asan.so depends on log, so broadcast log + # link_libraries for megengine depends target, for example flatc target + if(MGE_BUILD_WITH_ASAN AND ANDROID) + link_libraries(log) + endif() endif() if(MGE_WITH_CUDA) -include(cmake/cudnn.cmake) - if(MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) - message(WARNING "Static link CUDNN8 will auto enable MGE_WITH_LARGE_ARCHIVE=ON") - set(MGE_WITH_LARGE_ARCHIVE ON) - endif() -endif() -CHECK_CXX_COMPILER_FLAG(-fuse-ld=gold CXX_SUPPORT_GOLD) + include(cmake/cudnn.cmake) + if(MGE_CUDA_USE_STATIC + AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL + "8.0.0") + AND (NOT MGE_WITH_CUDNN_SHARED)) + message(WARNING "Static link CUDNN8 will auto enable MGE_WITH_LARGE_ARCHIVE=ON") + set(MGE_WITH_LARGE_ARCHIVE ON) + endif() +endif() +check_cxx_compiler_flag(-fuse-ld=gold CXX_SUPPORT_GOLD) if(MGE_WITH_LARGE_ARCHIVE) - message(STATUS "Set -mcmodel=large and disable -fuse-ld=gold") - set(MGE_COMMON_LINKER_FLAGS "-mcmodel=large") -elseif(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32 AND NOT MGE_WITH_LARGE_ARCHIVE) - message(STATUS "Using GNU gold linker.") - set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold") + message(STATUS "Set -mcmodel=large and disable -fuse-ld=gold") + set(MGE_COMMON_LINKER_FLAGS "-mcmodel=large") +elseif( + CXX_SUPPORT_GOLD + AND NOT ANDROID + AND NOT APPLE + AND NOT MSVC + AND NOT WIN32 + AND NOT MGE_WITH_LARGE_ARCHIVE) + message(STATUS "Using GNU gold linker.") + set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold") endif() set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") if(NOT MGE_WITH_JIT) - if(MGE_WITH_HALIDE) - message(WARNING "MGE_WITH_HALIDE is set to OFF with MGE_WITH_JIT disabled") - set(MGE_WITH_HALIDE OFF) - endif() - if(MGE_WITH_JIT_MLIR) - message(WARNING "MGE_WITH_JIT_MLIR is set to OFF with MGE_WITH_JIT disabled") - set(MGE_WITH_JIT_MLIR OFF) - endif() + if(MGE_WITH_HALIDE) + message(WARNING "MGE_WITH_HALIDE is set to OFF with MGE_WITH_JIT disabled") + set(MGE_WITH_HALIDE OFF) + endif() + if(MGE_WITH_JIT_MLIR) + message(WARNING "MGE_WITH_JIT_MLIR is set to OFF with MGE_WITH_JIT disabled") + set(MGE_WITH_JIT_MLIR OFF) + endif() endif() -# FIXME At present, there are some conflicts between the LLVM that halide -# depends on and the LLVM that MLIR depends on. Should be fixed in subsequent -# versions. 
+# FIXME At present, there are some conflicts between the LLVM that halide depends on and +# the LLVM that MLIR depends on. Should be fixed in subsequent versions. if(MGE_BUILD_IMPERATIVE_RT AND MGE_WITH_HALIDE) - message(FATAL_ERROR "cannot use HALIDE when building IMPERATIVE_RT") + message(FATAL_ERROR "cannot use HALIDE when building IMPERATIVE_RT") endif() if(MGE_WITH_JIT_MLIR AND MGE_WITH_HALIDE) - message(FATAL_ERROR "cannot use HALIDE with MGE_WITH_JIT_MLIR enabled") + message(FATAL_ERROR "cannot use HALIDE with MGE_WITH_JIT_MLIR enabled") endif() if(MGE_WITH_CUDA) - # FIXME: check_language(CUDA) failed when sbsa mode! - # detail: https://gitlab.kitware.com/cmake/cmake/-/issues/20676 - if(CMAKE_TOOLCHAIN_FILE) - set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) - message(WARNING "force set CMAKE_CUDA_HOST_COMPILER to CMAKE_CXX_COMPILER when nvcc sbsa mode!!") - endif() + # FIXME: check_language(CUDA) failed when sbsa mode! detail: + # https://gitlab.kitware.com/cmake/cmake/-/issues/20676 + if(CMAKE_TOOLCHAIN_FILE) + set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + message( + WARNING + "force set CMAKE_CUDA_HOST_COMPILER to CMAKE_CXX_COMPILER when nvcc sbsa mode!!" + ) + endif() - include(CheckLanguage) - check_language(CUDA) - if(NOT CMAKE_CUDA_COMPILER AND NOT CMAKE_TOOLCHAIN_FILE) - message(FATAL_ERROR "CUDA compiler not found in PATH") - endif() + include(CheckLanguage) + check_language(CUDA) + if(NOT CMAKE_CUDA_COMPILER AND NOT CMAKE_TOOLCHAIN_FILE) + message(FATAL_ERROR "CUDA compiler not found in PATH") + endif() - # remove this after CMAKE fix nvcc sbsa - if(NOT CMAKE_CUDA_COMPILER AND CMAKE_TOOLCHAIN_FILE) - set(CMAKE_CUDA_COMPILER "nvcc") - message(WARNING "force set CMAKE_CUDA_COMPILER to nvcc when nvcc sbsa mode!!") - endif() + # remove this after CMAKE fix nvcc sbsa + if(NOT CMAKE_CUDA_COMPILER AND CMAKE_TOOLCHAIN_FILE) + set(CMAKE_CUDA_COMPILER "nvcc") + message(WARNING "force set CMAKE_CUDA_COMPILER to nvcc when nvcc sbsa mode!!") + endif() - enable_language(CUDA) - set(CMAKE_CUDA_STANDARD 14) - set(CMAKE_CUDA_STANDARD_REQUIRED ON) + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) endif() if(NOT MGE_WITH_CUDA) - if(NOT MGE_ARCH STREQUAL "x86_64" AND NOT MGE_ARCH STREQUAL "i386") - message(STATUS "Disable JIT support, as the MGE_ARCH is not X86 and CUDA is not enabled.") - set(MGE_WITH_JIT OFF) - set(MGE_WITH_JIT_MLIR OFF) - endif() - set(MGE_WITH_HALIDE OFF) - message(STATUS "Disable TensorRT support, as CUDA is not enabled.") - set(MGE_WITH_TRT OFF) + if(NOT MGE_ARCH STREQUAL "x86_64" AND NOT MGE_ARCH STREQUAL "i386") + message( + STATUS "Disable JIT support, as the MGE_ARCH is not X86 and CUDA is not enabled.") + set(MGE_WITH_JIT OFF) + set(MGE_WITH_JIT_MLIR OFF) + endif() + set(MGE_WITH_HALIDE OFF) + message(STATUS "Disable TensorRT support, as CUDA is not enabled.") + set(MGE_WITH_TRT OFF) endif() find_package(PythonInterp 3 REQUIRED) -# NOTICE: just use for target, which do not depend on python api -# PURPOSE: reuse target obj when switch python3 version -# will fallback to PYTHON_EXECUTABLE if can not find in PATH env +# NOTICE: just use for target, which do not depend on python api PURPOSE: reuse target +# obj when switch python3 version will fallback to PYTHON_EXECUTABLE if can not find in +# PATH env set(PYTHON3_IN_ENV "python3") find_program(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON3_IN_ENV}) -if (PYTHON3_EXECUTABLE_WITHOUT_VERSION) - message(STATUS "use ${PYTHON3_IN_ENV} as 
PYTHON3_EXECUTABLE_WITHOUT_VERSION") - set(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON3_IN_ENV}) +if(PYTHON3_EXECUTABLE_WITHOUT_VERSION) + message(STATUS "use ${PYTHON3_IN_ENV} as PYTHON3_EXECUTABLE_WITHOUT_VERSION") + set(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON3_IN_ENV}) else() - message(STATUS "fallback ${PYTHON_EXECUTABLE} as PYTHON3_EXECUTABLE_WITHOUT_VERSION,\ - target which depend on PYTHON3_EXECUTABLE_WITHOUT_VERSION will be rebuild when switch python3") - set(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON_EXECUTABLE}) + message( + STATUS + "fallback ${PYTHON_EXECUTABLE} as PYTHON3_EXECUTABLE_WITHOUT_VERSION,\ + target which depend on PYTHON3_EXECUTABLE_WITHOUT_VERSION will be rebuild when switch python3" + ) + set(PYTHON3_EXECUTABLE_WITHOUT_VERSION ${PYTHON_EXECUTABLE}) endif() set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads) if(NOT "${CMAKE_THREAD_LIBS_INIT}" STREQUAL "") - if(${CMAKE_THREAD_LIBS_INIT} STREQUAL "-pthread" AND MGE_WITH_CUDA) - set_property(TARGET Threads::Threads - PROPERTY INTERFACE_COMPILE_OPTIONS "$<$:-Xcompiler=-pthread>" - "$<$>:-pthread>") - endif() -endif() - -set(MGE_BLAS MKL CACHE STRING "BLAS implementaion used by MegEngine.") + if(${CMAKE_THREAD_LIBS_INIT} STREQUAL "-pthread" AND MGE_WITH_CUDA) + set_property( + TARGET Threads::Threads + PROPERTY INTERFACE_COMPILE_OPTIONS + "$<$:-Xcompiler=-pthread>" + "$<$>:-pthread>") + endif() +endif() + +set(MGE_BLAS + MKL + CACHE STRING "BLAS implementaion used by MegEngine.") set_property(CACHE MGE_BLAS PROPERTY STRINGS MKL OpenBLAS) -set(MGE_CUDA_GENCODE "" CACHE STRING "Overwrite -gencode specifications for CUDA") +set(MGE_CUDA_GENCODE + "" + CACHE STRING "Overwrite -gencode specifications for CUDA") if(NOT CMAKE_CUDA_HOST_COMPILER) - set(CMAKE_CUDA_HOST_COMPILER $(CMAKE_CXX_COMPILER)) + set(CMAKE_CUDA_HOST_COMPILER $(CMAKE_CXX_COMPILER)) endif() if(NOT MGE_ENABLE_RTTI) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") endif() if(NOT MGE_ENABLE_EXCEPTIONS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") endif() if(MGE_WITH_TEST) - include(cmake/gtest.cmake) + include(cmake/gtest.cmake) endif() include(cmake/gflags.cmake) if(MGE_BUILD_IMPERATIVE_RT) - set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD 17) endif() if(NOT ${MGE_WITH_CUDA} AND NOT ${MGE_WITH_ROCM}) - message(STATUS "Disable distributed support, as both CUDA and ROCm are disabled.") - set(MGE_WITH_DISTRIBUTED OFF) + message(STATUS "Disable distributed support, as both CUDA and ROCm are disabled.") + set(MGE_WITH_DISTRIBUTED OFF) endif() if(MGE_INFERENCE_ONLY) - message(STATUS "Disable distributed support for inference only build.") - set(MGE_WITH_DISTRIBUTED OFF) - message(STATUS "Disable imperative_rt python module for inference only build.") - set(MGE_BUILD_IMPERATIVE_RT OFF) + message(STATUS "Disable distributed support for inference only build.") + set(MGE_WITH_DISTRIBUTED OFF) + message(STATUS "Disable imperative_rt python module for inference only build.") + set(MGE_BUILD_IMPERATIVE_RT OFF) endif() if(MGE_WITH_JIT_MLIR OR MGE_BUILD_IMPERATIVE_RT) - include(cmake/llvm-project.cmake) + include(cmake/llvm-project.cmake) endif() if(MGE_WITH_DISTRIBUTED) - include(cmake/protobuf.cmake) - include(cmake/zmq.cmake) + include(cmake/protobuf.cmake) + include(cmake/zmq.cmake) endif() if(MGB_WITH_FLATBUFFERS) - include(cmake/flatbuffers.cmake) + include(cmake/flatbuffers.cmake) endif() if(MGE_WITH_CUDA) - 
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) - get_filename_component(_NAME ${path} NAME) - if(NOT ${_NAME} STREQUAL "stubs") - list(APPEND CUDA_LINK_DIRECTORIES ${path}) - endif() - endforeach() - link_directories(${CUDA_LINK_DIRECTORIES}) - - set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g") - set(CMAKE_CUDA_FLAGS_RELEASE "-O3") - set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g") - set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os") - if(MSVC OR WIN32) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin -compress-all") - set(CCBIN_FLAG "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068 /std:c++14 /bigobj") - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(CCBIN_FLAG "${CCBIN_FLAG} -D_ITERATOR_DEBUG_LEVEL=2 -MTd") - endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options \" ${CCBIN_FLAG} \" ") - else() - set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") - endif() - - if(NOT MGE_ENABLE_RTTI) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti") + include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) + get_filename_component(_NAME ${path} NAME) + if(NOT ${_NAME} STREQUAL "stubs") + list(APPEND CUDA_LINK_DIRECTORIES ${path}) endif() - if(NOT MGE_ENABLE_EXCEPTIONS) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions") + endforeach() + link_directories(${CUDA_LINK_DIRECTORIES}) + + set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os") + if(MSVC OR WIN32) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin -compress-all") + set(CCBIN_FLAG + "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068 /std:c++14 /bigobj" + ) + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(CCBIN_FLAG "${CCBIN_FLAG} -D_ITERATOR_DEBUG_LEVEL=2 -MTd") endif() - if(NOT MGE_CUDA_GENCODE) - if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} STREQUAL "aarch64") - set(MEGDNN_THREADS_512 0) - if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) - message(WARNING "Static link CUDNN8 with many sm is unworkable, we only enable sm61 sm70 sm75 by default, and enable MGE_WITH_LARGE_ARCHIVE=ON") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") - elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.1.0") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=sm_86") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=compute_86") - elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.0.0") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE 
"${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=compute_80") - elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=compute_75") - elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "9.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "9.0.0") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=compute_70") - else() - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_35,code=sm_35") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") - set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=compute_61") - endif() - else() - message(FATAL_ERROR "Unsupported CUDA host arch.") - endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options \" ${CCBIN_FLAG} \" ") + else() + set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") + endif() + + if(NOT MGE_ENABLE_RTTI) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti") + endif() + if(NOT MGE_ENABLE_EXCEPTIONS) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions") + endif() + if(NOT MGE_CUDA_GENCODE) + if(${MGE_ARCH} STREQUAL "x86_64" + OR ${MGE_ARCH} STREQUAL "i386" + OR ${MGE_ARCH} STREQUAL "aarch64") + set(MEGDNN_THREADS_512 0) + if(MGE_WITH_CUDA + AND MGE_CUDA_USE_STATIC + AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" + VERSION_EQUAL "8.0.0") + AND (NOT MGE_WITH_CUDNN_SHARED)) + message( + WARNING + "Static link CUDNN8 with many sm is unworkable, we only enable sm61 sm70 sm75 by default, and enable MGE_WITH_LARGE_ARCHIVE=ON" + ) + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") + elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.1.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.1.0") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode 
arch=compute_80,code=sm_80") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=sm_86") + set(MGE_CUDA_GENCODE + "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=compute_86") + elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.0.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.0.0") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") + set(MGE_CUDA_GENCODE + "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=compute_80") + elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") + set(MGE_CUDA_GENCODE + "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=compute_75") + elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "9.0.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "9.0.0") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") + set(MGE_CUDA_GENCODE + "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=compute_70") + else() + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_35,code=sm_35") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") + set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") + set(MGE_CUDA_GENCODE + "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=compute_61") + endif() else() - set(MEGDNN_THREADS_512 1) + message(FATAL_ERROR "Unsupported CUDA host arch.") endif() - - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}") + else() + set(MEGDNN_THREADS_512 1) + endif() + + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}") + if(MGE_WITH_TRT) + include(cmake/tensorrt.cmake) + endif() + if(MGE_CUDA_USE_STATIC) if(MGE_WITH_TRT) - include(cmake/tensorrt.cmake) + if(MSVC OR WIN32) + message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}") + list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${TRT_PLUGIN_LIBRARY}) + else() + list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libnvinfer_plugin + -Wl,--no-whole-archive) + endif() + if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) + message(STATUS "handle trt myelin lib after trt7") + list(APPEND MGE_CUDA_LIBS libmyelin_compiler libmyelin_executor + libmyelin_pattern_runtime libmyelin_pattern_library) + endif() endif() - if(MGE_CUDA_USE_STATIC) - if(MGE_WITH_TRT) - if(MSVC OR WIN32) - message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}") - list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${TRT_PLUGIN_LIBRARY}) - else() - list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libnvinfer_plugin -Wl,--no-whole-archive) 
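# Annotative sketch (not part of the patch): MGE_CUDA_GENCODE, declared above as a cache STRING,
# lets the caller bypass the CUDA-version-based -gencode defaults chosen in the
# if(NOT MGE_CUDA_GENCODE) branch. For example, a build that only targets sm_75 hardware
# (an assumption for illustration) could configure with
#   -DMGE_CUDA_GENCODE="-gencode arch=compute_75,code=sm_75"
# and the value is appended to CMAKE_CUDA_FLAGS as-is.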
- endif() - if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) - message(STATUS "handle trt myelin lib after trt7") - list(APPEND MGE_CUDA_LIBS libmyelin_compiler libmyelin_executor libmyelin_pattern_runtime libmyelin_pattern_library) - endif() - endif() - - if("${CUDNN_VERSION}" STREQUAL "7.5.0") - if(MSVC OR WIN32) - message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") - list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) - else() - message(STATUS "cudnn 7.5.0 has bug in cudnnConvolutionBiasActivationForward, need --whole-archive to workaround, ref https://docs.nvidia.com/deeplearning/cudnn/release-notes/rel_7xx.html") - list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) - endif() - else() - if(MSVC OR WIN32) - message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") - list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) - else() - list(APPEND MGE_CUDA_LIBS libcudnn) - endif() - endif() - if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib) - else() - list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static) - endif() - if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS cublas.lib) - else() - if(MGE_WITH_CUBLAS_SHARED) - list(APPEND MGE_CUDA_LIBS cublas) - else() - list(APPEND MGE_CUDA_LIBS cublas_static) - endif() - endif() - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") - if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS cublasLt.lib) - else() - if(MGE_WITH_CUBLAS_SHARED) - list(APPEND MGE_CUDA_LIBS cublasLt) - else() - list(APPEND MGE_CUDA_LIBS cublasLt_static culibos) - endif() - endif() - endif() - if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32) - # mark all symbols from liblapack_static.a as weak to avoid - # duplicated definition with mkl - find_library( - LAPACK_STATIC_PATH lapack_static - HINTS ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) - if(NOT LAPACK_STATIC_PATH) - message(FATAL_ERROR "liblapack_static.a not found") - endif() - set(LAPACK_STATIC_COPY_PATH ${CMAKE_CURRENT_BINARY_DIR}/liblapack_static_copy.a) - - # add a target that run objcopy - add_custom_command( - OUTPUT ${LAPACK_STATIC_COPY_PATH} - COMMAND ${CMAKE_OBJCOPY} -w -W* ${LAPACK_STATIC_PATH} ${LAPACK_STATIC_COPY_PATH} - VERBATIM) - add_custom_target(lapack_static_weak_target DEPENDS ${LAPACK_STATIC_COPY_PATH}) - - # create a library named "lapack_static_weak" - add_library(lapack_static_weak STATIC IMPORTED GLOBAL) - add_dependencies(lapack_static_weak lapack_static_weak_target) - set_target_properties( - lapack_static_weak PROPERTIES - IMPORTED_LOCATION ${LAPACK_STATIC_COPY_PATH}) - list(APPEND MGE_CUDA_LIBS lapack_static_weak ${LAPACK_STATIC_COPY_PATH}) - endif() + + if("${CUDNN_VERSION}" STREQUAL "7.5.0") + if(MSVC OR WIN32) + message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") + list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) + else() + message( + STATUS + "cudnn 7.5.0 has bug in cudnnConvolutionBiasActivationForward, need --whole-archive to workaround, ref https://docs.nvidia.com/deeplearning/cudnn/release-notes/rel_7xx.html" + ) + list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) + endif() else() - if(MGE_WITH_TRT) - list(APPEND MGE_CUDA_LIBS libnvinfer libnvinfer_plugin) - if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) - message(STATUS "handle trt myelin lib after trt7") - list(APPEND MGE_CUDA_LIBS libmyelin) - endif() - 
endif() + if(MSVC OR WIN32) + message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") + list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) + else() list(APPEND MGE_CUDA_LIBS libcudnn) - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") - list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand) - endif() - list(APPEND MGE_CUDA_LIBS cudart) + endif() endif() - - if(NOT MGE_WITH_CUDA_STUB) - if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS cuda.lib) - else() - list(APPEND MGE_CUDA_LIBS cuda) - endif() + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib) + else() + list( + APPEND + MGE_CUDA_LIBS + cusolver_static + curand_static + culibos + cudart_static + cusparse_static) endif() - - if(NOT MGE_WITH_NVRTC_STUB) - if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS nvrtc.lib) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS cublas.lib) + else() + if(MGE_WITH_CUBLAS_SHARED) + list(APPEND MGE_CUDA_LIBS cublas) + else() + list(APPEND MGE_CUDA_LIBS cublas_static) + endif() + endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS cublasLt.lib) + else() + if(MGE_WITH_CUBLAS_SHARED) + list(APPEND MGE_CUDA_LIBS cublasLt) else() - list(APPEND MGE_CUDA_LIBS nvrtc) + list(APPEND MGE_CUDA_LIBS cublasLt_static culibos) endif() + endif() endif() - - if(MGE_WITH_ANY_CUDA_STUB) - add_subdirectory(dnn/cuda-stub) - list(APPEND MGE_CUDA_LIBS cuda-stub) + if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") + AND NOT MSVC + AND NOT WIN32) + # mark all symbols from liblapack_static.a as weak to avoid duplicated definition + # with mkl + find_library(LAPACK_STATIC_PATH lapack_static + HINTS ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) + if(NOT LAPACK_STATIC_PATH) + message(FATAL_ERROR "liblapack_static.a not found") + endif() + set(LAPACK_STATIC_COPY_PATH ${CMAKE_CURRENT_BINARY_DIR}/liblapack_static_copy.a) + + # add a target that run objcopy + add_custom_command( + OUTPUT ${LAPACK_STATIC_COPY_PATH} + COMMAND ${CMAKE_OBJCOPY} -w -W* ${LAPACK_STATIC_PATH} ${LAPACK_STATIC_COPY_PATH} + VERBATIM) + add_custom_target(lapack_static_weak_target DEPENDS ${LAPACK_STATIC_COPY_PATH}) + + # create a library named "lapack_static_weak" + add_library(lapack_static_weak STATIC IMPORTED GLOBAL) + add_dependencies(lapack_static_weak lapack_static_weak_target) + set_target_properties(lapack_static_weak PROPERTIES IMPORTED_LOCATION + ${LAPACK_STATIC_COPY_PATH}) + list(APPEND MGE_CUDA_LIBS lapack_static_weak ${LAPACK_STATIC_COPY_PATH}) endif() + else() + if(MGE_WITH_TRT) + list(APPEND MGE_CUDA_LIBS libnvinfer libnvinfer_plugin) + if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) + message(STATUS "handle trt myelin lib after trt7") + list(APPEND MGE_CUDA_LIBS libmyelin) + endif() + endif() + list(APPEND MGE_CUDA_LIBS libcudnn) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" + OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") + list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand) + endif() + list(APPEND MGE_CUDA_LIBS cudart) + endif() + if(NOT MGE_WITH_CUDA_STUB) if(MSVC OR WIN32) - list(APPEND MGE_CUDA_LIBS nvrtc.lib) + list(APPEND MGE_CUDA_LIBS cuda.lib) else() - list(APPEND MGE_CUDA_LIBS nvToolsExt) - endif() - - set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -lrt") - if(UNIX) - set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -ldl") + list(APPEND 
MGE_CUDA_LIBS cuda) endif() + endif() -endif() - -###########please add_subdirectory from here############### -if((${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} STREQUAL "armv7" OR ${MGE_ARCH} STREQUAL "aarch64") AND NOT APPLE AND NOT MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) - option(MGE_ENABLE_CPUINFO "Build cpuinfo library for check runtime." ON) - if(MGE_ENABLE_CPUINFO) - message(STATUS "Enable cpuinfo runtime check and little kernel optimize.") - add_definitions(-DMGB_ENABLE_CPUINFO_CHECK) - include(cmake/cpuinfo.cmake) + if(NOT MGE_WITH_NVRTC_STUB) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS nvrtc.lib) + else() + list(APPEND MGE_CUDA_LIBS nvrtc) endif() + endif() + + if(MGE_WITH_ANY_CUDA_STUB) + add_subdirectory(dnn/cuda-stub) + list(APPEND MGE_CUDA_LIBS cuda-stub) + endif() + + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS nvrtc.lib) + else() + list(APPEND MGE_CUDA_LIBS nvToolsExt) + endif() + + set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -lrt") + if(UNIX) + set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -ldl") + endif() + +endif() + +# ##########please add_subdirectory from here############### +if((${MGE_ARCH} STREQUAL "x86_64" + OR ${MGE_ARCH} STREQUAL "i386" + OR ${MGE_ARCH} STREQUAL "armv7" + OR ${MGE_ARCH} STREQUAL "aarch64" + ) + AND NOT APPLE + AND NOT MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) + option(MGE_ENABLE_CPUINFO "Build cpuinfo library for check runtime." ON) + if(MGE_ENABLE_CPUINFO) + message(STATUS "Enable cpuinfo runtime check and little kernel optimize.") + add_definitions(-DMGB_ENABLE_CPUINFO_CHECK) + include(cmake/cpuinfo.cmake) + endif() endif() if(MGE_WITH_CAMBRICON) - include_directories("$ENV{NEUWARE_HOME}/include") - link_directories("$ENV{NEUWARE_HOME}/lib64") - list(APPEND MGE_CAMBRICON_LIBS libcnrt libcndev) - if (CNRT_VERSION_STRING VERSION_GREATER "5.0.0") - include(cmake/cnnl.cmake) - include(cmake/cnlight.cmake) - include(cmake/magicmind.cmake) - list(APPEND MGE_CAMBRICON_LIBS libcnnl libcnnl_extra libcnlight libmagicmind libmagicmind_runtime) - else() - include(cmake/cnml.cmake) - list(APPEND MGE_CAMBRICON_LIBS libcnml) - endif() - set(MGE_CAMBRICON_LIBS "${MGE_CAMBRICON_LIBS}") + include_directories("$ENV{NEUWARE_HOME}/include") + link_directories("$ENV{NEUWARE_HOME}/lib64") + list(APPEND MGE_CAMBRICON_LIBS libcnrt libcndev) + if(CNRT_VERSION_STRING VERSION_GREATER "5.0.0") + include(cmake/cnnl.cmake) + include(cmake/cnlight.cmake) + include(cmake/magicmind.cmake) + list( + APPEND + MGE_CAMBRICON_LIBS + libcnnl + libcnnl_extra + libcnlight + libmagicmind + libmagicmind_runtime) + else() + include(cmake/cnml.cmake) + list(APPEND MGE_CAMBRICON_LIBS libcnml) + endif() + set(MGE_CAMBRICON_LIBS "${MGE_CAMBRICON_LIBS}") +endif() + +if(MGE_WITH_ROCM) + include(cmake/rocm.cmake) endif() -if (MGE_WITH_ROCM) - include(cmake/rocm.cmake) -endif () - if(MGE_WITH_ATLAS) - add_subdirectory(dnn/atlas-stub) - list(APPEND MGE_ATLAS_LIBS atlas-stub) - set(MGE_ATLAS_LIBS "${MGE_ATLAS_LIBS}") - set(MGB_ATLAS ${MGE_WITH_ATLAS}) + add_subdirectory(dnn/atlas-stub) + list(APPEND MGE_ATLAS_LIBS atlas-stub) + set(MGE_ATLAS_LIBS "${MGE_ATLAS_LIBS}") + set(MGB_ATLAS ${MGE_WITH_ATLAS}) endif() find_program(CCACHE_BIN ccache) if(CCACHE_BIN) - set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_BIN}) - if(MGE_WITH_CUDA AND NOT ${CMAKE_VERSION} VERSION_LESS "3.10.0") - message(STATUS "Using ccache as CMAKE_CUDA_COMPILER_LAUNCHER") - set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_BIN}) - endif() + set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_BIN}) + if(MGE_WITH_CUDA AND NOT ${CMAKE_VERSION} 
VERSION_LESS "3.10.0") + message(STATUS "Using ccache as CMAKE_CUDA_COMPILER_LAUNCHER") + set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_BIN}) + endif() endif() if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") - if(${MGE_BLAS} STREQUAL "MKL") - include(cmake/mkl.cmake) - set(MGE_BLAS_LIBS libmkl) - elseif(${MGE_BLAS} STREQUAL "OpenBLAS") - include(cmake/OpenBLAS.cmake) - set(MGE_BLAS_LIBS libopenblas) - else() - message(FATAL_ERROR "Unknown BLAS implementation ${MGE_BLAS}") - endif() + if(${MGE_BLAS} STREQUAL "MKL") + include(cmake/mkl.cmake) + set(MGE_BLAS_LIBS libmkl) + elseif(${MGE_BLAS} STREQUAL "OpenBLAS") + include(cmake/OpenBLAS.cmake) + set(MGE_BLAS_LIBS libopenblas) + else() + message(FATAL_ERROR "Unknown BLAS implementation ${MGE_BLAS}") + endif() endif() # MKLDNN build if(MGE_WITH_MKLDNN AND ${MGE_ARCH} STREQUAL "x86_64") - include(cmake/MKL_DNN.cmake) - set(MEGDNN_X86_WITH_MKL_DNN 1) + include(cmake/MKL_DNN.cmake) + set(MEGDNN_X86_WITH_MKL_DNN 1) endif() # RTTI if(MGE_ENABLE_RTTI) - set(MEGDNN_ENABLE_MANGLING 0) - set(MEGDNN_ENABLE_RTTI 1) + set(MEGDNN_ENABLE_MANGLING 0) + set(MEGDNN_ENABLE_RTTI 1) else() - set(MEGDNN_ENABLE_MANGLING 1) - set(MEGDNN_ENABLE_RTTI 0) + set(MEGDNN_ENABLE_MANGLING 1) + set(MEGDNN_ENABLE_RTTI 0) endif() set(MGB_VERBOSE_TYPEINFO_NAME ${MGE_ENABLE_RTTI}) @@ -866,72 +1006,79 @@ set(MGB_ENABLE_JSON ${MGE_ENABLE_LOGGING}) # Exception if(NOT MGE_ENABLE_EXCEPTIONS) - message(STATUS "Exceptions disabled; MegEngine would kill itself when it is supposed to throw an exception.") + message( + STATUS + "Exceptions disabled; MegEngine would kill itself when it is supposed to throw an exception." + ) endif() set(MGB_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS}) set(MEGDNN_ENABLE_EXCEPTIONS ${MGE_ENABLE_EXCEPTIONS}) # JIT if(MGE_WITH_JIT AND MGE_WITH_HALIDE) - set(HALIDE_SHARED_LIBRARY OFF CACHE BOOL "Build as a shared library") - include(cmake/Halide.cmake) + set(HALIDE_SHARED_LIBRARY + OFF + CACHE BOOL "Build as a shared library") + include(cmake/Halide.cmake) endif() include(cmake/cpp_redis.cmake) # Thread -IF(APPLE) - set(CMAKE_THREAD_LIBS_INIT "-lpthread") - set(CMAKE_HAVE_THREADS_LIBRARY 1) - set(CMAKE_USE_WIN32_THREADS_INIT 0) - set(CMAKE_USE_PTHREADS_INIT 1) - set(THREADS_PREFER_PTHREAD_FLAG ON) - message(STATUS "disable jit, halide and mlir on macos host build...") - set(MGE_WITH_HALIDE OFF) - set(MGE_WITH_JIT OFF) - set(MGE_WITH_JIT_MLIR OFF) -ENDIF() +if(APPLE) + set(CMAKE_THREAD_LIBS_INIT "-lpthread") + set(CMAKE_HAVE_THREADS_LIBRARY 1) + set(CMAKE_USE_WIN32_THREADS_INIT 0) + set(CMAKE_USE_PTHREADS_INIT 1) + set(THREADS_PREFER_PTHREAD_FLAG ON) + message(STATUS "disable jit, halide and mlir on macos host build...") + set(MGE_WITH_HALIDE OFF) + set(MGE_WITH_JIT OFF) + set(MGE_WITH_JIT_MLIR OFF) +endif() set(MGB_JIT ${MGE_WITH_JIT}) set(MGB_JIT_MLIR ${MGE_WITH_JIT_MLIR}) set(MGB_JIT_HALIDE ${MGE_WITH_HALIDE}) # for consumer override MGB_C_OPR_INIT_FUNC symbol interface if(NOT "${CUSTOM_C_OPR_INIT_FUNC}" STREQUAL "") - add_compile_definitions(MGB_C_OPR_INIT_FUNC=${CUSTOM_C_OPR_INIT_FUNC}) - message(STATUS "override MGB_C_OPR_INIT_FUNC to ${CUSTOM_C_OPR_INIT_FUNC}") + add_compile_definitions(MGB_C_OPR_INIT_FUNC=${CUSTOM_C_OPR_INIT_FUNC}) + message(STATUS "override MGB_C_OPR_INIT_FUNC to ${CUSTOM_C_OPR_INIT_FUNC}") endif() set(MGB_CUSTOM_OP ${MGE_WITH_CUSTOM_OP}) if(MSVC OR WIN32) - set(CMAKE_HAVE_THREADS_LIBRARY 1) - set(CMAKE_USE_WIN32_THREADS_INIT 1) - set(CMAKE_USE_PTHREADS_INIT 1) - set(THREADS_PREFER_PTHREAD_FLAG ON) + 
set(CMAKE_HAVE_THREADS_LIBRARY 1) + set(CMAKE_USE_WIN32_THREADS_INIT 1) + set(CMAKE_USE_PTHREADS_INIT 1) + set(THREADS_PREFER_PTHREAD_FLAG ON) endif() -if(CMAKE_THREAD_LIBS_INIT OR CMAKE_USE_WIN32_THREADS_INIT OR ANDROID) - set(MGB_HAVE_THREAD 1) +if(CMAKE_THREAD_LIBS_INIT + OR CMAKE_USE_WIN32_THREADS_INIT + OR ANDROID) + set(MGB_HAVE_THREAD 1) endif() if(MSVC OR WIN32) - if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) - message(STATUS "disable MGB_HAVE_THREAD/MGB_ENABLE_JSON when DEPLOY ON XP SP2") - set(MGB_HAVE_THREAD 0) - set(MGB_ENABLE_JSON 0) - endif() + if(MGE_DEPLOY_INFERENCE_ON_WINDOWS_XP_SP2) + message(STATUS "disable MGB_HAVE_THREAD/MGB_ENABLE_JSON when DEPLOY ON XP SP2") + set(MGB_HAVE_THREAD 0) + set(MGB_ENABLE_JSON 0) + endif() endif() if(MGE_WITH_TEST) - # use intra-op multi threads - set(MEGDNN_ENABLE_MULTI_THREADS 1) + # use intra-op multi threads + set(MEGDNN_ENABLE_MULTI_THREADS 1) endif() # CUDA set(MGB_CUDA ${MGE_WITH_CUDA}) set(MEGDNN_WITH_CUDA ${MGE_WITH_CUDA}) -#ROCM +# ROCM set(MGB_ROCM ${MGE_WITH_ROCM}) set(MEGDNN_WITH_ROCM ${MGE_WITH_ROCM}) @@ -943,19 +1090,20 @@ set(MGB_ENFLAME ${MGE_WITH_ENFLAME}) set(MEGDNN_WITH_ENFLAME ${MGE_WITH_ENFLAME}) # Debug info -if(${CMAKE_BUILD_TYPE} STREQUAL "Debug" OR ${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo") - set(MGB_ASSERT_LOC 1) - set(MGB_ENABLE_DEBUG_UTIL 1) +if(${CMAKE_BUILD_TYPE} STREQUAL "Debug" OR ${CMAKE_BUILD_TYPE} STREQUAL + "RelWithDebInfo") + set(MGB_ASSERT_LOC 1) + set(MGB_ENABLE_DEBUG_UTIL 1) else() - set(MGB_ASSERT_LOC 0) - set(MGB_ENABLE_DEBUG_UTIL 0) + set(MGB_ASSERT_LOC 0) + set(MGB_ENABLE_DEBUG_UTIL 0) endif() if(MSVC OR WIN32) - if(${MGE_ARCH} STREQUAL "i386") - set(MGB_ENABLE_DEBUG_UTIL 0) - message(STATUS "disable MGB_ENABLE_DEBUG_UTIL at Windows i386 build") - endif() + if(${MGE_ARCH} STREQUAL "i386") + set(MGB_ENABLE_DEBUG_UTIL 0) + message(STATUS "disable MGB_ENABLE_DEBUG_UTIL at Windows i386 build") + endif() endif() # TensorRT @@ -963,11 +1111,11 @@ set(MGB_ENABLE_TENSOR_RT ${MGE_WITH_TRT}) # Inference only if(MGE_INFERENCE_ONLY AND NOT MGE_WITH_TEST) - set(MGB_ENABLE_GRAD 0) - set(MGB_BUILD_SLIM_SERVING 1) + set(MGB_ENABLE_GRAD 0) + set(MGB_BUILD_SLIM_SERVING 1) else() - set(MGB_ENABLE_GRAD 1) - set(MGB_BUILD_SLIM_SERVING 0) + set(MGB_ENABLE_GRAD 1) + set(MGB_BUILD_SLIM_SERVING 0) endif() # Distributed communication @@ -975,227 +1123,264 @@ set(MGB_ENABLE_OPR_MM ${MGE_WITH_DISTRIBUTED}) # MGE_ARCH related flags if(MGE_ARCH STREQUAL "x86_64" OR MGE_ARCH STREQUAL "i386") - if(MGE_BLAS STREQUAL "MKL") - set(MEGDNN_X86_WITH_MKL 1) - elseif(MGE_BLAS STREQUAL "OpenBLAS") - set(MEGDNN_X86_WITH_OPENBLAS 1) - endif() + if(MGE_BLAS STREQUAL "MKL") + set(MEGDNN_X86_WITH_MKL 1) + elseif(MGE_BLAS STREQUAL "OpenBLAS") + set(MEGDNN_X86_WITH_OPENBLAS 1) + endif() endif() # Enable Naive if(MGE_ARCH STREQUAL "naive") - set(MEGDNN_NAIVE 1) - message(STATUS "MEGDNN_NAIVE is enabled; MegDNN performance is degraded.") + set(MEGDNN_NAIVE 1) + message(STATUS "MEGDNN_NAIVE is enabled; MegDNN performance is degraded.") endif() if(MGE_ARCH STREQUAL "x86_64" OR MGE_ARCH STREQUAL "i386") - set(MEGDNN_X86 1) - if(MGE_ARCH STREQUAL "x86_64") - set(MEGDNN_X86_64 1) - set(MEGDNN_64_BIT 1) - if(NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") - endif() - else() - set(MEGDNN_X86_32 1) - if(NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") - endif() + set(MEGDNN_X86 1) + if(MGE_ARCH STREQUAL "x86_64") + set(MEGDNN_X86_64 1) + set(MEGDNN_64_BIT 1) + if(NOT MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif() + 
else() + set(MEGDNN_X86_32 1) if(NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mfpmath=sse") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") endif() + endif() + if(NOT MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mfpmath=sse") + endif() endif() # dotprod is not enable by default on APPLE, cpuinfo has some problem on APPLE if(NOT APPLE AND ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+dotprod" CXX_COMPILER_SUPPORT_DOT) - if(CXX_COMPILER_SUPPORT_DOT) - message(STATUS "Enable dotprod feature in armv8.2-a using MGB_ENABLE_DOT") - set(MGB_ENABLE_DOT 1) - endif() + check_cxx_compiler_flag("-march=armv8.2-a+dotprod" CXX_COMPILER_SUPPORT_DOT) + if(CXX_COMPILER_SUPPORT_DOT) + message(STATUS "Enable dotprod feature in armv8.2-a using MGB_ENABLE_DOT") + set(MGB_ENABLE_DOT 1) + endif() endif() if(MGE_ARCH STREQUAL "armv7") - # -funsafe-math-optimizations to enable neon auto-vectorization (since neon is not fully IEEE 754 compatible, GCC does not turn on neon auto-vectorization by default. - if(ANDROID) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon") - endif() - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funsafe-math-optimizations") - set (MARCH "-march=armv7-a") - set (MEGDNN_ARMV7 1) + # -funsafe-math-optimizations to enable neon auto-vectorization (since neon is not + # fully IEEE 754 compatible, GCC does not turn on neon auto-vectorization by default. + if(ANDROID) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funsafe-math-optimizations") + set(MARCH "-march=armv7-a") + set(MEGDNN_ARMV7 1) endif() if(MGE_ARCH STREQUAL "aarch64") - set(MEGDNN_AARCH64 1) - set(MEGDNN_64_BIT 1) - set(MARCH "-march=armv8-a") - set(MGB_AARCH64 1) - if(MGE_ARMV8_2_FEATURE_FP16) - message(STATUS "Enable fp16 feature support in armv8.2") - if(NOT ${MGE_DISABLE_FLOAT16}) - set(MEGDNN_ENABLE_FP16_NEON 1) - endif() - set(MARCH "-march=armv8.2-a+fp16") + set(MEGDNN_AARCH64 1) + set(MEGDNN_64_BIT 1) + set(MARCH "-march=armv8-a") + set(MGB_AARCH64 1) + if(MGE_ARMV8_2_FEATURE_FP16) + message(STATUS "Enable fp16 feature support in armv8.2") + if(NOT ${MGE_DISABLE_FLOAT16}) + set(MEGDNN_ENABLE_FP16_NEON 1) endif() + set(MARCH "-march=armv8.2-a+fp16") + endif() - if(MGE_WITH_CUDA) - message(WARNING "aarch64 ld will add -mfix-cortex-a53-843419 and -mfix-cortex-a53-835769,\ + if(MGE_WITH_CUDA) + message( + WARNING + "aarch64 ld will add -mfix-cortex-a53-843419 and -mfix-cortex-a53-835769,\ when cuda enable and CMAKE with DEBUG build type,ld will take about 14min+,\ for save link time(14min->1min), you may open below flags if not deploy on\ arm a53 platform, or just build release type!") - #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mno-fix-cortex-a53-843419 -mno-fix-cortex-a53-835769") - endif() + # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mno-fix-cortex-a53-843419 + # -mno-fix-cortex-a53-835769") + endif() endif() if(MGE_ARCH STREQUAL "riscv64") - set(MEGDNN_RISCV64 1) - set(MEGDNN_64_BIT 1) + set(MEGDNN_RISCV64 1) + set(MEGDNN_64_BIT 1) endif() -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MARCH}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MARCH}") -set(MGE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/src/version.ld CACHE INTERNAL "Path to linker version script") +set(MGE_VERSION_SCRIPT + ${PROJECT_SOURCE_DIR}/src/version.ld + CACHE INTERNAL "Path to linker version script") -# Write out megbrain_build_config.h -# It defines macros needed by both megbrain and dnn 
-configure_file(src/megbrain_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+# Write out megbrain_build_config.h It defines macros needed by both megbrain and dnn
+configure_file(src/megbrain_build_config.h.in
+               ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 add_subdirectory(dnn)
-list(APPEND MGB_OPR_PARAM_DEFS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py)
+list(APPEND MGB_OPR_PARAM_DEFS_SRCS
+     ${CMAKE_CURRENT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py)
 set(MGB_OPR_PARAM_DEFS_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/dnn/scripts/gen_param_defs.py)
 set(MGB_OPR_PARAM_DEFS_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/opr/include/)
 file(MAKE_DIRECTORY ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr)
 add_custom_command(
-    OUTPUT ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
-    COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${MGB_OPR_PARAM_DEFS_SCRIPT} ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
-    DEPENDS ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_SCRIPT}
-    VERBATIM
-)
+  OUTPUT ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
+  COMMAND
+    ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${MGB_OPR_PARAM_DEFS_SCRIPT}
+    ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
+  DEPENDS ${MGB_OPR_PARAM_DEFS_SRCS} ${MGB_OPR_PARAM_DEFS_SCRIPT}
+  VERBATIM)
 list(APPEND MGB_OPR_PARAM_DEFS_OUTS
-    ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h
-)
+     ${MGB_OPR_PARAM_DEFS_OUT_DIR}/megbrain/opr/param_defs.h)
-install(FILES ${MGB_OPR_PARAM_DEFS_OUTS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/megbrain/opr/)
+install(FILES ${MGB_OPR_PARAM_DEFS_OUTS}
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/megbrain/opr/)
 list(APPEND MGB_OPR_PARAM_DEFS_INC ${MGB_OPR_PARAM_DEFS_OUT_DIR})
 add_custom_target(_mgb_opr_param_defs DEPENDS ${MGB_OPR_PARAM_DEFS_OUTS})
 add_library(mgb_opr_param_defs INTERFACE)
-target_include_directories(mgb_opr_param_defs
-    INTERFACE
-    $<BUILD_INTERFACE:${MGB_OPR_PARAM_DEFS_INC}>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
-)
+target_include_directories(
+  mgb_opr_param_defs INTERFACE $<BUILD_INTERFACE:${MGB_OPR_PARAM_DEFS_INC}>
+                               $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 add_dependencies(mgb_opr_param_defs _mgb_opr_param_defs)
 install(TARGETS mgb_opr_param_defs EXPORT ${MGE_EXPORT_TARGETS})
 if(MGE_WITH_JIT_MLIR OR MGE_BUILD_IMPERATIVE_RT)
-    # generate param_defs.td
-    set(MGE_GENFILE_DIR ${PROJECT_BINARY_DIR}/src/genfiles)
-    set(MGE_GEN_IR_DIR ${PROJECT_BINARY_DIR}/src/core/include/megbrain/ir)
-    set(OPR_PARAM_DEFS_SRCS ${MGE_GENFILE_DIR}/opr_param_defs.py)
-    set(OPR_PARAM_DEFS_SCRIPT ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_tablegen.py)
-    set(OPR_PARAM_DEFS_OUT ${MGE_GEN_IR_DIR}/param_defs.td)
-    file(COPY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py DESTINATION ${MGE_GENFILE_DIR})
-    file(READ ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py CONTENTS)
-    file(APPEND ${OPR_PARAM_DEFS_SRCS} ${CONTENTS})
-    file(MAKE_DIRECTORY ${MGE_GEN_IR_DIR})
-    add_custom_command(
-        OUTPUT ${OPR_PARAM_DEFS_OUT}
-        COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_OUT}
-        DEPENDS ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py ${OPR_PARAM_DEFS_SCRIPT}
-        VERBATIM
-    )
-    # mlir tblgen sources
-    set(MGE_IR_DIR ${PROJECT_SOURCE_DIR}/src/core/include/megbrain/ir)
-    set(MGE_IR_INCLUDE_DIRS ${MLIR_LLVM_INCLUDE_DIR} ${MGE_IR_DIR} ${MGE_GEN_IR_DIR})
-    list(TRANSFORM MGE_IR_INCLUDE_DIRS PREPEND "-I")
-    file(GLOB_RECURSE MGE_IR_TDS ${MGE_IR_DIR}/*.td)
-    add_custom_target(param_defs_tblgen DEPENDS ${OPR_PARAM_DEFS_OUT})
+  # generate param_defs.td
+  set(MGE_GENFILE_DIR ${PROJECT_BINARY_DIR}/src/genfiles)
+  set(MGE_GEN_IR_DIR ${PROJECT_BINARY_DIR}/src/core/include/megbrain/ir)
+  set(OPR_PARAM_DEFS_SRCS ${MGE_GENFILE_DIR}/opr_param_defs.py)
+  set(OPR_PARAM_DEFS_SCRIPT ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_tablegen.py)
+  set(OPR_PARAM_DEFS_OUT ${MGE_GEN_IR_DIR}/param_defs.td)
+  file(COPY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py
+       DESTINATION ${MGE_GENFILE_DIR})
+  file(READ ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py CONTENTS)
+  file(APPEND ${OPR_PARAM_DEFS_SRCS} ${CONTENTS})
+  file(MAKE_DIRECTORY ${MGE_GEN_IR_DIR})
+  add_custom_command(
+    OUTPUT ${OPR_PARAM_DEFS_OUT}
+    COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT}
+            ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_OUT}
+    DEPENDS ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py
+            ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py
+            ${OPR_PARAM_DEFS_SCRIPT}
+    VERBATIM)
+  # mlir tblgen sources
+  set(MGE_IR_DIR ${PROJECT_SOURCE_DIR}/src/core/include/megbrain/ir)
+  set(MGE_IR_INCLUDE_DIRS ${MLIR_LLVM_INCLUDE_DIR} ${MGE_IR_DIR} ${MGE_GEN_IR_DIR})
+  list(TRANSFORM MGE_IR_INCLUDE_DIRS PREPEND "-I")
+  file(GLOB_RECURSE MGE_IR_TDS ${MGE_IR_DIR}/*.td)
+  add_custom_target(param_defs_tblgen DEPENDS ${OPR_PARAM_DEFS_OUT})
 endif()
 if(MGE_WITH_DISTRIBUTED)
-    set(MEGRAY_WITH_NCCL ${MGE_WITH_CUDA} CACHE BOOL "Override MegRay option" FORCE)
-    set(MEGRAY_WITH_SHM ${MGE_WITH_CUDA} CACHE BOOL "Override MegRay option" FORCE)
-    set(MEGRAY_WITH_RCCL ${MGE_WITH_ROCM} CACHE BOOL "Override MegRay option" FORCE)
-    set(MEGRAY_CUDA_GENCODE ${MGE_CUDA_GENCODE} CACHE STRING "Overwrite MegRay CUDA -gencode specifications" FORCE)
-    add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/MegRay)
+  set(MEGRAY_WITH_NCCL
+      ${MGE_WITH_CUDA}
+      CACHE BOOL "Override MegRay option" FORCE)
+  set(MEGRAY_WITH_SHM
+      ${MGE_WITH_CUDA}
+      CACHE BOOL "Override MegRay option" FORCE)
+  set(MEGRAY_WITH_RCCL
+      ${MGE_WITH_ROCM}
+      CACHE BOOL "Override MegRay option" FORCE)
+  set(MEGRAY_CUDA_GENCODE
+      ${MGE_CUDA_GENCODE}
+      CACHE STRING "Overwrite MegRay CUDA -gencode specifications" FORCE)
+  add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/MegRay)
 endif()
 add_subdirectory(src)
 if(MGE_BUILD_IMPERATIVE_RT)
-    add_subdirectory(imperative)
-    message(STATUS "Enable imperative python wrapper runtime")
+  add_subdirectory(imperative)
+  message(STATUS "Enable imperative python wrapper runtime")
 endif()
 if(MGE_WITH_TEST AND MGE_ENABLE_RTTI)
-    add_subdirectory(test)
+  add_subdirectory(test)
 endif()
 if(TARGET _imperative_rt)
-    add_custom_target(
-        develop
-        COMMAND ${CMAKE_COMMAND} -E create_symlink
-            ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:_imperative_rt>
-            ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:_imperative_rt>
-        COMMAND ${CMAKE_COMMAND} -E create_symlink
-            ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py
-            ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py
-        DEPENDS _imperative_rt
-        VERBATIM
-    )
-endif()
-
-# Configure and install pkg-config.
-# Note that unlike the Config.cmake modules, this is not relocatable (and not
-# really portable) because we have two dependencies without pkg-config
-# descriptions: FlatBuffers and MKL-DNN
-if (MGE_USE_SYSTEM_MKLDNN)
-    set (MGE_PKGCONFIG_LIBS_PRIVATE "-ldnnl")
-endif()
-if (MGE_USE_SYSTEM_OPENBLAS)
-    set (MGE_PKGCONFIG_LIBS_PRIVATE "${MGE_PKGCONFIG_LIBS_PRIVATE} -lopenblas")
-endif()
-configure_file(cmake/megengine.pc.in
-    ${CMAKE_CURRENT_BINARY_DIR}/megengine.pc
-    @ONLY)
+  add_custom_target(
+    develop
+    COMMAND
+      ${CMAKE_COMMAND} -E create_symlink
+      ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:_imperative_rt>
+      ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:_imperative_rt>
+    COMMAND
+      ${CMAKE_COMMAND} -E create_symlink
+      ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py
+      ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py
+    COMMAND
+      ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/src/custom/include
+      ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/include
+    COMMAND ${CMAKE_COMMAND} -E make_directory
+            ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/lib
+    COMMAND
+      ${CMAKE_COMMAND} -E create_symlink
+      ${CMAKE_CURRENT_BINARY_DIR}/src/$<TARGET_FILE_NAME:megengine_shared>
+      ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/lib/$<TARGET_FILE_NAME:megengine_shared>
+    DEPENDS _imperative_rt
+    VERBATIM)
+endif()
+
+# Configure and install pkg-config. Note that unlike the Config.cmake modules, this is
+# not relocatable (and not really portable) because we have two dependencies without
+# pkg-config descriptions: FlatBuffers and MKL-DNN
+if(MGE_USE_SYSTEM_MKLDNN)
+  set(MGE_PKGCONFIG_LIBS_PRIVATE "-ldnnl")
+endif()
+if(MGE_USE_SYSTEM_OPENBLAS)
+  set(MGE_PKGCONFIG_LIBS_PRIVATE "${MGE_PKGCONFIG_LIBS_PRIVATE} -lopenblas")
+endif()
+configure_file(cmake/megengine.pc.in ${CMAKE_CURRENT_BINARY_DIR}/megengine.pc @ONLY)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/megengine.pc
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
 # Do not export targets if MGE_WITH_DISTRIBUTED is on. MegRay is not ready.
-if (NOT MGE_WITH_DISTRIBUTED) - include(CMakePackageConfigHelpers) - set (MGE_INSTALL_CMAKEDIR ${CMAKE_INSTALL_LIBDIR}/cmake/MegEngine) - configure_package_config_file(cmake/MegEngineConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfig.cmake - INSTALL_DESTINATION ${MGE_INSTALL_CMAKEDIR} - ) - write_basic_package_version_file( - ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake - VERSION ${MGB_VER_STRING} - COMPATIBILITY SameMajorVersion) - - install(EXPORT ${MGE_EXPORT_TARGETS} DESTINATION ${MGE_INSTALL_CMAKEDIR}) - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfig.cmake +if(NOT MGE_WITH_DISTRIBUTED) + include(CMakePackageConfigHelpers) + set(MGE_INSTALL_CMAKEDIR ${CMAKE_INSTALL_LIBDIR}/cmake/MegEngine) + configure_package_config_file( + cmake/MegEngineConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfig.cmake + INSTALL_DESTINATION ${MGE_INSTALL_CMAKEDIR}) + write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake + VERSION ${MGB_VER_STRING} + COMPATIBILITY SameMajorVersion) + + install(EXPORT ${MGE_EXPORT_TARGETS} DESTINATION ${MGE_INSTALL_CMAKEDIR}) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake - DESTINATION ${MGE_INSTALL_CMAKEDIR}) + DESTINATION ${MGE_INSTALL_CMAKEDIR}) endif() if(MGE_WITH_JIT_MLIR) - add_subdirectory(tools/mlir/mgb-opt) - add_subdirectory(tools/mlir/mgb-file-check) -endif() - -if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) - message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ") - message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ") - message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ") + add_subdirectory(tools/mlir/mgb-opt) + add_subdirectory(tools/mlir/mgb-file-check) +endif() + +if(MGE_WITH_CUDA + AND MGE_CUDA_USE_STATIC + AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL + "8.0.0") + AND (NOT MGE_WITH_CUDNN_SHARED)) + message( + WARNING + "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" " + ) + message( + WARNING + "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" " + ) + message( + WARNING + "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" " + ) endif() if(MGE_WITH_LITE) - add_subdirectory(lite) + add_subdirectory(lite) endif() diff --git a/ci/cmake.sh b/ci/cmake.sh index 8d8c55bf..4808e63e 100755 --- a/ci/cmake.sh +++ b/ci/cmake.sh @@ -27,7 +27,8 @@ function build() { -DMGE_WITH_DISTRIBUTED=${DMGE_WITH_DISTRIBUTED} \ 
-DMGE_WITH_CUDA=${DMGE_WITH_CUDA} \ -DMGE_WITH_TEST=ON \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DMGE_WITH_CUSTOM_OP=ON make -j$(($(nproc) * 2)) -I ${build_dir} make develop popd >/dev/null diff --git a/cmake/BuildFlatBuffers.cmake b/cmake/BuildFlatBuffers.cmake index 91cf5f97..ea85ae99 100644 --- a/cmake/BuildFlatBuffers.cmake +++ b/cmake/BuildFlatBuffers.cmake @@ -1,59 +1,56 @@ # Copyright 2015 Google Inc. All rights reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this +# file except in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. -# General function to create FlatBuffer build rules for the given list of -# schemas. +# General function to create FlatBuffer build rules for the given list of schemas. # # flatbuffers_schemas: A list of flatbuffer schema files to process. # -# schema_include_dirs: A list of schema file include directories, which will be -# passed to flatc via the -I parameter. +# schema_include_dirs: A list of schema file include directories, which will be passed +# to flatc via the -I parameter. # -# custom_target_name: The generated files will be added as dependencies for a -# new custom target with this name. You should add that target as a dependency -# for your main target to ensure these files are built. You can also retrieve -# various properties from this target, such as GENERATED_INCLUDES_DIR, -# BINARY_SCHEMAS_DIR, and COPY_TEXT_SCHEMAS_DIR. +# custom_target_name: The generated files will be added as dependencies for a new custom +# target with this name. You should add that target as a dependency for your main target +# to ensure these files are built. You can also retrieve various properties from this +# target, such as GENERATED_INCLUDES_DIR, BINARY_SCHEMAS_DIR, and COPY_TEXT_SCHEMAS_DIR. # -# additional_dependencies: A list of additional dependencies that you'd like -# all generated files to depend on. Pass in a blank string if you have none. +# additional_dependencies: A list of additional dependencies that you'd like all +# generated files to depend on. Pass in a blank string if you have none. # -# generated_includes_dir: Where to generate the C++ header files for these -# schemas. The generated includes directory will automatically be added to -# CMake's include directories, and will be where generated header files are -# placed. This parameter is optional; pass in empty string if you don't want to -# generate include files for these schemas. +# generated_includes_dir: Where to generate the C++ header files for these schemas. 
The +# generated includes directory will automatically be added to CMake's include +# directories, and will be where generated header files are placed. This parameter is +# optional; pass in empty string if you don't want to generate include files for these +# schemas. # -# binary_schemas_dir: If you specify an optional binary schema directory, binary -# schemas will be generated for these schemas as well, and placed into the given -# directory. +# binary_schemas_dir: If you specify an optional binary schema directory, binary schemas +# will be generated for these schemas as well, and placed into the given directory. # -# copy_text_schemas_dir: If you want all text schemas (including schemas from -# all schema include directories) copied into a directory (for example, if you -# need them within your project to build JSON files), you can specify that -# folder here. All text schemas will be copied to that folder. +# copy_text_schemas_dir: If you want all text schemas (including schemas from all schema +# include directories) copied into a directory (for example, if you need them within +# your project to build JSON files), you can specify that folder here. All text schemas +# will be copied to that folder. # -# IMPORTANT: Make sure you quote all list arguments you pass to this function! -# Otherwise CMake will only pass in the first element. -# Example: build_flatbuffers("${fb_files}" "${include_dirs}" target_name ...) -function(build_flatbuffers flatbuffers_schemas - schema_include_dirs - custom_target_name - additional_dependencies - generated_includes_dir - binary_schemas_dir - copy_text_schemas_dir) +# IMPORTANT: Make sure you quote all list arguments you pass to this function! Otherwise +# CMake will only pass in the first element. Example: build_flatbuffers("${fb_files}" +# "${include_dirs}" target_name ...) +function( + build_flatbuffers + flatbuffers_schemas + schema_include_dirs + custom_target_name + additional_dependencies + generated_includes_dir + binary_schemas_dir + copy_text_schemas_dir) # Test if including from FindFlatBuffers if(FLATBUFFERS_FLATC_EXECUTABLE) @@ -65,10 +62,7 @@ function(build_flatbuffers flatbuffers_schemas endif() set(FLATC_SCHEMA_ARGS --gen-mutable) if(FLATBUFFERS_FLATC_SCHEMA_EXTRA_ARGS) - set(FLATC_SCHEMA_ARGS - ${FLATBUFFERS_FLATC_SCHEMA_EXTRA_ARGS} - ${FLATC_SCHEMA_ARGS} - ) + set(FLATC_SCHEMA_ARGS ${FLATBUFFERS_FLATC_SCHEMA_EXTRA_ARGS} ${FLATC_SCHEMA_ARGS}) endif() set(working_dir "${CMAKE_CURRENT_SOURCE_DIR}") @@ -77,12 +71,12 @@ function(build_flatbuffers flatbuffers_schemas # Generate the include files parameters. set(include_params "") set(all_generated_files "") - foreach (include_dir ${schema_include_dirs}) + foreach(include_dir ${schema_include_dirs}) set(include_params -I ${include_dir} ${include_params}) - if (NOT ${copy_text_schemas_dir} STREQUAL "") + if(NOT ${copy_text_schemas_dir} STREQUAL "") # Copy text schemas from dependent folders. file(GLOB_RECURSE dependent_schemas ${include_dir}/${schema_glob}) - foreach (dependent_schema ${dependent_schemas}) + foreach(dependent_schema ${dependent_schemas}) file(COPY ${dependent_schema} DESTINATION ${copy_text_schemas_dir}) endforeach() endif() @@ -91,62 +85,54 @@ function(build_flatbuffers flatbuffers_schemas foreach(schema ${flatbuffers_schemas}) get_filename_component(filename ${schema} NAME_WE) # For each schema, do the things we requested. 
- if (NOT ${generated_includes_dir} STREQUAL "") + if(NOT ${generated_includes_dir} STREQUAL "") set(generated_include ${generated_includes_dir}/${filename}_generated.h) add_custom_command( OUTPUT ${generated_include} - COMMAND ${FLATC} ${FLATC_SCHEMA_ARGS} - -o ${generated_includes_dir} - ${include_params} - -c ${schema} + COMMAND ${FLATC} ${FLATC_SCHEMA_ARGS} -o ${generated_includes_dir} + ${include_params} -c ${schema} DEPENDS ${FLATC_TARGET} ${schema} ${additional_dependencies} WORKING_DIRECTORY "${working_dir}") list(APPEND all_generated_files ${generated_include}) endif() - if (NOT ${binary_schemas_dir} STREQUAL "") + if(NOT ${binary_schemas_dir} STREQUAL "") set(binary_schema ${binary_schemas_dir}/${filename}.bfbs) add_custom_command( OUTPUT ${binary_schema} - COMMAND ${FLATC} -b --schema - -o ${binary_schemas_dir} - ${include_params} - ${schema} + COMMAND ${FLATC} -b --schema -o ${binary_schemas_dir} ${include_params} + ${schema} DEPENDS ${FLATC_TARGET} ${schema} ${additional_dependencies} WORKING_DIRECTORY "${working_dir}") list(APPEND all_generated_files ${binary_schema}) endif() - if (NOT ${copy_text_schemas_dir} STREQUAL "") + if(NOT ${copy_text_schemas_dir} STREQUAL "") file(COPY ${schema} DESTINATION ${copy_text_schemas_dir}) endif() endforeach() - # Create a custom target that depends on all the generated files. - # This is the target that you can depend on to trigger all these - # to be built. - add_custom_target(${custom_target_name} - DEPENDS ${all_generated_files} ${additional_dependencies}) + # Create a custom target that depends on all the generated files. This is the target + # that you can depend on to trigger all these to be built. + add_custom_target(${custom_target_name} DEPENDS ${all_generated_files} + ${additional_dependencies}) # Register the include directory we are using. - if (NOT ${generated_includes_dir} STREQUAL "") + if(NOT ${generated_includes_dir} STREQUAL "") include_directories(${generated_includes_dir}) - set_property(TARGET ${custom_target_name} - PROPERTY GENERATED_INCLUDES_DIR - ${generated_includes_dir}) + set_property(TARGET ${custom_target_name} PROPERTY GENERATED_INCLUDES_DIR + ${generated_includes_dir}) endif() # Register the binary schemas dir we are using. - if (NOT ${binary_schemas_dir} STREQUAL "") - set_property(TARGET ${custom_target_name} - PROPERTY BINARY_SCHEMAS_DIR - ${binary_schemas_dir}) + if(NOT ${binary_schemas_dir} STREQUAL "") + set_property(TARGET ${custom_target_name} PROPERTY BINARY_SCHEMAS_DIR + ${binary_schemas_dir}) endif() # Register the text schema copy dir we are using. 
- if (NOT ${copy_text_schemas_dir} STREQUAL "") - set_property(TARGET ${custom_target_name} - PROPERTY COPY_TEXT_SCHEMAS_DIR - ${copy_text_schemas_dir}) + if(NOT ${copy_text_schemas_dir} STREQUAL "") + set_property(TARGET ${custom_target_name} PROPERTY COPY_TEXT_SCHEMAS_DIR + ${copy_text_schemas_dir}) endif() endfunction() diff --git a/cmake/FetchMegBrainVersion.cmake b/cmake/FetchMegBrainVersion.cmake index 0de834ce..80d3f27a 100644 --- a/cmake/FetchMegBrainVersion.cmake +++ b/cmake/FetchMegBrainVersion.cmake @@ -1,49 +1,45 @@ -# Parses the version set in src/core/include/megbrain/version.h -# Exports the following variables: -# MGB_VER_MAJOR: Major version -# MGB_VER_MINOR: Minor version -# MGB_VER_PATCH: Patch version -# MGB_IS_DEV: Is development version -# MGB_VER_STRING: Version string +# Parses the version set in src/core/include/megbrain/version.h Exports the following +# variables: MGB_VER_MAJOR: Major version MGB_VER_MINOR: Minor version MGB_VER_PATCH: +# Patch version MGB_IS_DEV: Is development version MGB_VER_STRING: Version string option(MGB_FORCE_DEV_VERSION "Force -dev tag in version stamp" OFF) -file (READ "${CMAKE_CURRENT_SOURCE_DIR}/src/core/include/megbrain/version.h" content) +file(READ "${CMAKE_CURRENT_SOURCE_DIR}/src/core/include/megbrain/version.h" content) -string (REGEX MATCH "MGB_MAJOR +([0-9]+)" _ ${content}) -set (MGB_VER_MAJOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGB_MAJOR +([0-9]+)" _ ${content}) +set(MGB_VER_MAJOR ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGB_MINOR +([0-9]+)" _ ${content}) -set (MGB_VER_MINOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGB_MINOR +([0-9]+)" _ ${content}) +set(MGB_VER_MINOR ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGB_PATCH *([0-9]+)" _ ${content}) -set (MGB_VER_PATCH ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGB_PATCH *([0-9]+)" _ ${content}) +set(MGB_VER_PATCH ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGE_MAJOR +([0-9]+)" _ ${content}) -set (MGE_VER_MAJOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGE_MAJOR +([0-9]+)" _ ${content}) +set(MGE_VER_MAJOR ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGE_MINOR +([0-9]+)" _ ${content}) -set (MGE_VER_MINOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGE_MINOR +([0-9]+)" _ ${content}) +set(MGE_VER_MINOR ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGE_PATCH *([0-9]+)" _ ${content}) -set (MGE_VER_PATCH ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGE_PATCH *([0-9]+)" _ ${content}) +set(MGE_VER_PATCH ${CMAKE_MATCH_1}) -string (REGEX MATCH "MGE_EXTRA_NAME *\"(.*)\"" _ ${content}) -set (MGE_EXTRA_NAME ${CMAKE_MATCH_1}) +string(REGEX MATCH "MGE_EXTRA_NAME *\"(.*)\"" _ ${content}) +set(MGE_EXTRA_NAME ${CMAKE_MATCH_1}) -if (MGB_FORCE_DEV_VERSION) - set (MGB_IS_DEV 1) +if(MGB_FORCE_DEV_VERSION) + set(MGB_IS_DEV 1) else() - string (REGEX MATCH "MGB_IS_DEV +([01])" _ ${content}) - set (MGB_IS_DEV ${CMAKE_MATCH_1}) + string(REGEX MATCH "MGB_IS_DEV +([01])" _ ${content}) + set(MGB_IS_DEV ${CMAKE_MATCH_1}) endif() -if (DEFINED MGB_VER_MAJOR) - set (MGB_VER_STRING "${MGB_VER_MAJOR}.${MGB_VER_MINOR}.${MGB_VER_PATCH}") +if(DEFINED MGB_VER_MAJOR) + set(MGB_VER_STRING "${MGB_VER_MAJOR}.${MGB_VER_MINOR}.${MGB_VER_PATCH}") else() - set (MGB_VER_STRING "${MGE_VER_MAJOR}.${MGE_VER_MINOR}.${MGE_VER_PATCH}") + set(MGB_VER_STRING "${MGE_VER_MAJOR}.${MGE_VER_MINOR}.${MGE_VER_PATCH}") endif(DEFINED MGB_VER_MAJOR) -if (MGB_IS_DEV) - set (MGB_VER_STRING "${MGB_VER_STRING}-dev") +if(MGB_IS_DEV) + set(MGB_VER_STRING "${MGB_VER_STRING}-dev") endif() message(STATUS "Building MegBrain ${MGB_VER_STRING}") diff --git a/cmake/Halide.cmake 
b/cmake/Halide.cmake index 2dc8ecab..760f2384 100644 --- a/cmake/Halide.cmake +++ b/cmake/Halide.cmake @@ -2,31 +2,40 @@ include(ExternalProject) find_package(LLVM 6.0 REQUIRED CONFIG) -STRING(REPLACE "." ";" LLVM_VERSION_LIST ${LLVM_PACKAGE_VERSION}) +string(REPLACE "." ";" LLVM_VERSION_LIST ${LLVM_PACKAGE_VERSION}) list(GET LLVM_VERSION_LIST 0 LLVM_VERSION_MAJOR) list(GET LLVM_VERSION_LIST 1 LLVM_VERSION_MINOR) -set(HALIDE_DIR "${PROJECT_SOURCE_DIR}/third_party/Halide" CACHE STRING "halide directory") +set(HALIDE_DIR + "${PROJECT_SOURCE_DIR}/third_party/Halide" + CACHE STRING "halide directory") set(HALIDE_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/Halide) set(HALIDE_LIB ${HALIDE_BUILD_DIR}/lib/libHalide.a) -ExternalProject_add( - halide - SOURCE_DIR ${HALIDE_DIR} - PREFIX ${HALIDE_BUILD_DIR} - CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} -DCMAKE_INSTALL_PREFIX=${HALIDE_BUILD_DIR} -DWITH_APPS=OFF -DWITH_TESTS=OFF -DWITH_TUTORIALS=OFF -DHALIDE_SHARED_LIBRARY=OFF -DHALIDE_REQUIRE_LLVM_VERSION=${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DTARGET_MIPS=OFF -DTARGET_POWERPC=OFF - BUILD_BYPRODUCTS ${HALIDE_LIB} -) +ExternalProject_Add( + halide + SOURCE_DIR ${HALIDE_DIR} + PREFIX ${HALIDE_BUILD_DIR} + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} + -DCMAKE_INSTALL_PREFIX=${HALIDE_BUILD_DIR} + -DWITH_APPS=OFF + -DWITH_TESTS=OFF + -DWITH_TUTORIALS=OFF + -DHALIDE_SHARED_LIBRARY=OFF + -DHALIDE_REQUIRE_LLVM_VERSION=${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DTARGET_MIPS=OFF + -DTARGET_POWERPC=OFF + BUILD_BYPRODUCTS ${HALIDE_LIB}) set(HALIDE_INC ${HALIDE_BUILD_DIR}/include) file(MAKE_DIRECTORY ${HALIDE_INC}) add_library(libhalide STATIC IMPORTED GLOBAL) add_dependencies(libhalide halide) -set_target_properties( - libhalide PROPERTIES - IMPORTED_LOCATION ${HALIDE_LIB} - INTERFACE_INCLUDE_DIRECTORIES ${HALIDE_INC} -) +set_target_properties(libhalide PROPERTIES IMPORTED_LOCATION ${HALIDE_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${HALIDE_INC}) set(LLVM_COMPONENTS mcjit;bitwriter;linker;passes;X86;ARM;AArch64;Hexagon;NVPTX;AMDGPU) llvm_map_components_to_libnames(HALIDE_LLVM_LIBS ${LLVM_COMPONENTS}) - diff --git a/cmake/MKL_DNN.cmake b/cmake/MKL_DNN.cmake index bb85dc3a..ea2407a2 100644 --- a/cmake/MKL_DNN.cmake +++ b/cmake/MKL_DNN.cmake @@ -1,25 +1,31 @@ -if (MGE_USE_SYSTEM_LIB) - find_package(dnnl) - if (dnnl_FOUND) - message(STATUS "Using system provided MKL-DNN.") - set (MGE_USE_SYSTEM_MKLDNN ON) - return() - endif() +if(MGE_USE_SYSTEM_LIB) + find_package(dnnl) + if(dnnl_FOUND) + message(STATUS "Using system provided MKL-DNN.") + set(MGE_USE_SYSTEM_MKLDNN ON) + return() + endif() endif() option(DNNL_BUILD_TESTS "" OFF) option(DNNL_BUILD_EXAMPLES "" OFF) -# we do not want to use OMP now, so config to CPU mode -# if set to OMP, some dnnl algo will be more fast -set(DNNL_CPU_RUNTIME "SEQ" CACHE STRING "config dnnl to DNNL_RUNTIME_SEQ") +# we do not want to use OMP now, so config to CPU mode if set to OMP, some dnnl algo +# will be more fast +set(DNNL_CPU_RUNTIME + "SEQ" + CACHE STRING "config dnnl to DNNL_RUNTIME_SEQ") if(MGE_BLAS STREQUAL "MKL") - option(_DNNL_USE_MKL "" ON) - set(MKLROOT 
${MKL_ROOT_DIR} CACHE STRING "MKL ROOT FOR DNNL") - set(MKLLIB libmkl) + option(_DNNL_USE_MKL "" ON) + set(MKLROOT + ${MKL_ROOT_DIR} + CACHE STRING "MKL ROOT FOR DNNL") + set(MKLLIB libmkl) else() - option(_DNNL_USE_MKL "" OFF) + option(_DNNL_USE_MKL "" OFF) endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-parameter -Wno-extra") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-extra") -set(DNNL_LIBRARY_TYPE STATIC CACHE STRING "config dnnl to STATIC") +set(DNNL_LIBRARY_TYPE + STATIC + CACHE STRING "config dnnl to STATIC") add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/intel-mkl-dnn) diff --git a/cmake/Modules/FindNumPy.cmake b/cmake/Modules/FindNumPy.cmake index 248f8c21..25c767f7 100644 --- a/cmake/Modules/FindNumPy.cmake +++ b/cmake/Modules/FindNumPy.cmake @@ -1,30 +1,28 @@ -# - Find the NumPy libraries -# This module finds if NumPy is installed, and sets the following variables -# indicating where it is. +# * Find the NumPy libraries This module finds if NumPy is installed, and sets the +# following variables indicating where it is. # # TODO: Update to provide the libraries and paths for linking npymath lib. # -# NUMPY_FOUND - was NumPy found -# NUMPY_VERSION - the version of NumPy found as a string -# NUMPY_VERSION_MAJOR - the major version number of NumPy -# NUMPY_VERSION_MINOR - the minor version number of NumPy -# NUMPY_VERSION_PATCH - the patch version number of NumPy -# NUMPY_VERSION_DECIMAL - e.g. version 1.6.1 is 10601 -# NUMPY_INCLUDE_DIR - path to the NumPy include files +# NUMPY_FOUND - was NumPy found NUMPY_VERSION - the version of +# NumPy found as a string NUMPY_VERSION_MAJOR - the major version number of NumPy +# NUMPY_VERSION_MINOR - the minor version number of NumPy NUMPY_VERSION_PATCH - +# the patch version number of NumPy NUMPY_VERSION_DECIMAL - e.g. 
version 1.6.1 is +# 10601 NUMPY_INCLUDE_DIR - path to the NumPy include files unset(NUMPY_VERSION) unset(NUMPY_INCLUDE_DIR) if(PYTHONINTERP_FOUND) - execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" - "import numpy as n; print(n.__version__); print(n.get_include());" + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import numpy as n; print(n.__version__); print(n.get_include());" RESULT_VARIABLE __result OUTPUT_VARIABLE __output OUTPUT_STRIP_TRAILING_WHITESPACE) if(__result MATCHES 0) string(REGEX REPLACE ";" "\\\\;" __values ${__output}) - string(REGEX REPLACE "\r?\n" ";" __values ${__values}) + string(REGEX REPLACE "\r?\n" ";" __values ${__values}) list(GET __values 0 NUMPY_VERSION) list(GET __values 1 NUMPY_INCLUDE_DIR) @@ -33,13 +31,18 @@ if(PYTHONINTERP_FOUND) set(NUMPY_VERSION_MAJOR ${CMAKE_MATCH_1}) set(NUMPY_VERSION_MINOR ${CMAKE_MATCH_2}) set(NUMPY_VERSION_PATCH ${CMAKE_MATCH_3}) - math(EXPR NUMPY_VERSION_DECIMAL - "(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}") - string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIR ${NUMPY_INCLUDE_DIR}) + math( + EXPR + NUMPY_VERSION_DECIMAL + "(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}" + ) + string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIR ${NUMPY_INCLUDE_DIR}) else() - unset(NUMPY_VERSION) - unset(NUMPY_INCLUDE_DIR) - message(STATUS "Requested NumPy version and include path, but got instead:\n${__output}\n") + unset(NUMPY_VERSION) + unset(NUMPY_INCLUDE_DIR) + message( + STATUS + "Requested NumPy version and include path, but got instead:\n${__output}\n") endif() endif() else() @@ -47,8 +50,10 @@ else() endif() include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(NumPy REQUIRED_VARS NUMPY_INCLUDE_DIR NUMPY_VERSION - VERSION_VAR NUMPY_VERSION) +find_package_handle_standard_args( + NumPy + REQUIRED_VARS NUMPY_INCLUDE_DIR NUMPY_VERSION + VERSION_VAR NUMPY_VERSION) if(NUMPY_FOUND) message(STATUS "NumPy ver. 
${NUMPY_VERSION} found (include: ${NUMPY_INCLUDE_DIR})") diff --git a/cmake/OpenBLAS.cmake b/cmake/OpenBLAS.cmake index 3216522f..1562f88a 100644 --- a/cmake/OpenBLAS.cmake +++ b/cmake/OpenBLAS.cmake @@ -1,48 +1,50 @@ -if (MGE_USE_SYSTEM_LIB) - find_package(OpenBLAS) - set (MGE_USE_SYSTEM_OPENBLAS ON) - - message(STATUS "Using system provided OpenBLAS ${OpenBLAS_VERSION}") - add_library(libopenblas IMPORTED GLOBAL) - set_target_properties( - libopenblas PROPERTIES - IMPORTED_LOCATION ${OpenBLAS_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${OpenBLAS_INCLUDE_DIRS} - ) - return() +if(MGE_USE_SYSTEM_LIB) + find_package(OpenBLAS) + set(MGE_USE_SYSTEM_OPENBLAS ON) + + message(STATUS "Using system provided OpenBLAS ${OpenBLAS_VERSION}") + add_library(libopenblas IMPORTED GLOBAL) + set_target_properties( + libopenblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${OpenBLAS_INCLUDE_DIRS}) + return() endif() include(ExternalProject) include(GNUInstallDirs) -set(OPENBLAS_DIR "${PROJECT_SOURCE_DIR}/third_party/OpenBLAS" CACHE STRING "OpenBLAS directory") +set(OPENBLAS_DIR + "${PROJECT_SOURCE_DIR}/third_party/OpenBLAS" + CACHE STRING "OpenBLAS directory") set(OPENBLAS_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/OpenBLAS) set(OPENBLAS_INC ${OPENBLAS_BUILD_DIR}/include) set(OPENBLAS_LIB ${OPENBLAS_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libopenblas.a) if(${CMAKE_GENERATOR} STREQUAL "Ninja") - set(MAKE_COMMAND make) + set(MAKE_COMMAND make) else() - set(MAKE_COMMAND "$(MAKE)") + set(MAKE_COMMAND "$(MAKE)") endif() -ExternalProject_add( - openblas - SOURCE_DIR ${OPENBLAS_DIR} - PREFIX ${OPENBLAS_BUILD_DIR} - CMAKE_GENERATOR "Unix Makefiles" - CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${OPENBLAS_BUILD_DIR} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} -DCMAKE_POSITION_INDEPENDENT_CODE=ON - BUILD_COMMAND ${MAKE_COMMAND} - BUILD_BYPRODUCTS ${OPENBLAS_LIB} ${OPENBLAS_PROTOC_EXECUTABLE} -) +ExternalProject_Add( + openblas + SOURCE_DIR ${OPENBLAS_DIR} + PREFIX ${OPENBLAS_BUILD_DIR} + CMAKE_GENERATOR "Unix Makefiles" + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${OPENBLAS_BUILD_DIR} + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + BUILD_COMMAND ${MAKE_COMMAND} + BUILD_BYPRODUCTS ${OPENBLAS_LIB} ${OPENBLAS_PROTOC_EXECUTABLE}) file(MAKE_DIRECTORY ${OPENBLAS_INC}) add_library(libopenblas STATIC IMPORTED GLOBAL) add_dependencies(libopenblas openblas) set_target_properties( - libopenblas PROPERTIES - IMPORTED_LOCATION ${OPENBLAS_LIB} - INTERFACE_INCLUDE_DIRECTORIES ${OPENBLAS_BUILD_DIR}/include -) + libopenblas PROPERTIES IMPORTED_LOCATION ${OPENBLAS_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${OPENBLAS_BUILD_DIR}/include) diff --git a/cmake/aclrt.cmake b/cmake/aclrt.cmake index d9d3d190..ab2201e5 100644 --- a/cmake/aclrt.cmake +++ b/cmake/aclrt.cmake @@ -1,31 +1,31 @@ -find_library(ACLRT_LIBRARY - NAMES libascendcl.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{ACLRT_HOME}/lib64/stub" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES stub - DOC "ACL library." 
) +find_library( + ACLRT_LIBRARY + NAMES libascendcl.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{ACLRT_HOME}/lib64/stub" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES stub + DOC "ACL library.") if(ACLRT_LIBRARY STREQUAL "ACLRT_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find ACLRT Library") + message(FATAL_ERROR "Can not find ACLRT Library") endif() get_filename_component(__found_aclrt_root "${ACLRT_LIBRARY}/../../../" REALPATH) -find_path(ACLRT_INCLUDE_DIR - NAMES acl/acl.h - HINTS "$ENV{ACLRT_HOME}/include" ${__found_aclrt_root} - PATH_SUFFIXES include - DOC "Path to ACLRT include directory." ) +find_path( + ACLRT_INCLUDE_DIR + NAMES acl/acl.h + HINTS "$ENV{ACLRT_HOME}/include" ${__found_aclrt_root} + PATH_SUFFIXES include + DOC "Path to ACLRT include directory.") if(ACLRT_INCLUDE_DIR STREQUAL "ACLRT_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find ACLRT Library") + message(FATAL_ERROR "Can not find ACLRT Library") endif() add_library(libascendcl SHARED IMPORTED) -set_target_properties(libascendcl PROPERTIES - IMPORTED_LOCATION ${ACLRT_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${ACLRT_INCLUDE_DIR} -) +set_target_properties( + libascendcl PROPERTIES IMPORTED_LOCATION ${ACLRT_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${ACLRT_INCLUDE_DIR}) message(STATUS "Found ACLRT: ${__found_aclrt_root}") - diff --git a/cmake/cndev.cmake b/cmake/cndev.cmake index 0b85297f..e9bde247 100644 --- a/cmake/cndev.cmake +++ b/cmake/cndev.cmake @@ -1,44 +1,57 @@ -find_library(CNDEV_LIBRARY - NAMES libcndev.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNDEV library." ) +find_library( + CNDEV_LIBRARY + NAMES libcndev.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNDEV library.") if(CNDEV_LIBRARY STREQUAL "CNDEV_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNDEV Library") + message(FATAL_ERROR "Can not find CNDEV Library") endif() get_filename_component(__found_cndev_root ${CNDEV_LIBRARY}/../.. REALPATH) -find_path(CNDEV_INCLUDE_DIR - NAMES cndev.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cndev_root} - PATH_SUFFIXES include - DOC "Path to CNDEV include directory." 
) +find_path( + CNDEV_INCLUDE_DIR + NAMES cndev.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cndev_root} + PATH_SUFFIXES include + DOC "Path to CNDEV include directory.") if(CNDEV_INCLUDE_DIR STREQUAL "CNDEV_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNDEV Library") + message(FATAL_ERROR "Can not find CNDEV Library") endif() -file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_1 REGEX "^#define CNDEV_VERSION_1 [0-9]+.*$") -file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_2 REGEX "^#define CNDEV_VERSION_2 [0-9]+.*$") -file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_3 REGEX "^#define CNDEV_VERSION_3 [0-9]+.*$") -file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_4 REGEX "^#define CNDEV_VERSION_4 [0-9]+.*$") -file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_5 REGEX "^#define CNDEV_VERSION_5 [0-9]+.*$") - -string(REGEX REPLACE "^#define CNDEV_VERSION_1 ([0-9]+).*$" "\\1" CNDEV_VERSION_1 "${CNDEV_1}") -string(REGEX REPLACE "^#define CNDEV_VERSION_2 ([0-9]+).*$" "\\1" CNDEV_VERSION_2 "${CNDEV_2}") -string(REGEX REPLACE "^#define CNDEV_VERSION_3 ([0-9]+).*$" "\\1" CNDEV_VERSION_3 "${CNDEV_3}") -string(REGEX REPLACE "^#define CNDEV_VERSION_4 ([0-9]+).*$" "\\1" CNDEV_VERSION_4 "${CNDEV_4}") -string(REGEX REPLACE "^#define CNDEV_VERSION_5 ([0-9]+).*$" "\\1" CNDEV_VERSION_5 "${CNDEV_5}") -set(CNDEV_VERSION_STRING "${CNDEV_VERSION_1}.${CNDEV_VERSION_2}.${CNDEV_VERSION_3}.${CNDEV_VERSION_4}.${CNDEV_VERSION_5}") +file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_1 + REGEX "^#define CNDEV_VERSION_1 [0-9]+.*$") +file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_2 + REGEX "^#define CNDEV_VERSION_2 [0-9]+.*$") +file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_3 + REGEX "^#define CNDEV_VERSION_3 [0-9]+.*$") +file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_4 + REGEX "^#define CNDEV_VERSION_4 [0-9]+.*$") +file(STRINGS "${CNDEV_INCLUDE_DIR}/cndev.h" CNDEV_5 + REGEX "^#define CNDEV_VERSION_5 [0-9]+.*$") + +string(REGEX REPLACE "^#define CNDEV_VERSION_1 ([0-9]+).*$" "\\1" CNDEV_VERSION_1 + "${CNDEV_1}") +string(REGEX REPLACE "^#define CNDEV_VERSION_2 ([0-9]+).*$" "\\1" CNDEV_VERSION_2 + "${CNDEV_2}") +string(REGEX REPLACE "^#define CNDEV_VERSION_3 ([0-9]+).*$" "\\1" CNDEV_VERSION_3 + "${CNDEV_3}") +string(REGEX REPLACE "^#define CNDEV_VERSION_4 ([0-9]+).*$" "\\1" CNDEV_VERSION_4 + "${CNDEV_4}") +string(REGEX REPLACE "^#define CNDEV_VERSION_5 ([0-9]+).*$" "\\1" CNDEV_VERSION_5 + "${CNDEV_5}") +set(CNDEV_VERSION_STRING + "${CNDEV_VERSION_1}.${CNDEV_VERSION_2}.${CNDEV_VERSION_3}.${CNDEV_VERSION_4}.${CNDEV_VERSION_5}" +) add_library(libcndev SHARED IMPORTED) -set_target_properties(libcndev PROPERTIES - IMPORTED_LOCATION ${CNDEV_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNDEV_INCLUDE_DIR} -) - -message(STATUS "Found CNDEV: ${__found_cndev_root} (found version: ${CNDEV_VERSION_STRING})") +set_target_properties( + libcndev PROPERTIES IMPORTED_LOCATION ${CNDEV_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${CNDEV_INCLUDE_DIR}) +message( + STATUS "Found CNDEV: ${__found_cndev_root} (found version: ${CNDEV_VERSION_STRING})") diff --git a/cmake/cnlight.cmake b/cmake/cnlight.cmake index 4c18c1d8..725d913d 100644 --- a/cmake/cnlight.cmake +++ b/cmake/cnlight.cmake @@ -1,40 +1,49 @@ -find_library(CNLIGHT_LIBRARY - NAMES libcnlight.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNLIGHT library." 
) +find_library( + CNLIGHT_LIBRARY + NAMES libcnlight.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNLIGHT library.") if(CNLIGHT_LIBRARY STREQUAL "CNLIGHT_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNLIGHT Library") + message(FATAL_ERROR "Can not find CNLIGHT Library") endif() get_filename_component(__found_cnlight_root "${CNLIGHT_LIBRARY}/../.." REALPATH) -find_path(CNLIGHT_INCLUDE_DIR - NAMES cnlight.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnlight_root} - PATH_SUFFIXES include - DOC "Path to CNLIGHT include directory." ) +find_path( + CNLIGHT_INCLUDE_DIR + NAMES cnlight.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnlight_root} + PATH_SUFFIXES include + DOC "Path to CNLIGHT include directory.") if(CNLIGHT_INCLUDE_DIR STREQUAL "CNLIGHT_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNLIGHT Library") + message(FATAL_ERROR "Can not find CNLIGHT Library") endif() -file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_MAJOR REGEX "^#define CNLIGHT_MAJOR_VERSION [0-9]+.*$") -file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_MINOR REGEX "^#define CNLIGHT_MINOR_VERSION [0-9]+.*$") -file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_PATCH REGEX "^#define CNLIGHT_PATCH_VERSION [0-9]+.*$") - -string(REGEX REPLACE "^#define CNLIGHT_MAJOR_VERSION ([0-9]+).*$" "\\1" CNLIGHT_VERSION_MAJOR "${CNLIGHT_MAJOR}") -string(REGEX REPLACE "^#define CNLIGHT_MINOR_VERSION ([0-9]+).*$" "\\1" CNLIGHT_VERSION_MINOR "${CNLIGHT_MINOR}") -string(REGEX REPLACE "^#define CNLIGHT_PATCH_VERSION ([0-9]+).*$" "\\1" CNLIGHT_VERSION_PATCH "${CNLIGHT_PATCH}") -set(CNLIGHT_VERSION_STRING "${CNLIGHT_VERSION_MAJOR}.${CNLIGHT_VERSION_MINOR}.${CNLIGHT_VERSION_PATCH}") +file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_MAJOR + REGEX "^#define CNLIGHT_MAJOR_VERSION [0-9]+.*$") +file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_MINOR + REGEX "^#define CNLIGHT_MINOR_VERSION [0-9]+.*$") +file(STRINGS "${CNLIGHT_INCLUDE_DIR}/cnlight.h" CNLIGHT_PATCH + REGEX "^#define CNLIGHT_PATCH_VERSION [0-9]+.*$") + +string(REGEX REPLACE "^#define CNLIGHT_MAJOR_VERSION ([0-9]+).*$" "\\1" + CNLIGHT_VERSION_MAJOR "${CNLIGHT_MAJOR}") +string(REGEX REPLACE "^#define CNLIGHT_MINOR_VERSION ([0-9]+).*$" "\\1" + CNLIGHT_VERSION_MINOR "${CNLIGHT_MINOR}") +string(REGEX REPLACE "^#define CNLIGHT_PATCH_VERSION ([0-9]+).*$" "\\1" + CNLIGHT_VERSION_PATCH "${CNLIGHT_PATCH}") +set(CNLIGHT_VERSION_STRING + "${CNLIGHT_VERSION_MAJOR}.${CNLIGHT_VERSION_MINOR}.${CNLIGHT_VERSION_PATCH}") add_library(libcnlight SHARED IMPORTED) -set_target_properties(libcnlight PROPERTIES - IMPORTED_LOCATION ${CNLIGHT_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNLIGHT_INCLUDE_DIR} -) - -message(STATUS "Found CNLIGHT: ${__found_cnlight_root} (found version: ${CNLIGHT_VERSION_STRING})") +set_target_properties( + libcnlight PROPERTIES IMPORTED_LOCATION ${CNLIGHT_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CNLIGHT_INCLUDE_DIR}) +message( + STATUS + "Found CNLIGHT: ${__found_cnlight_root} (found version: ${CNLIGHT_VERSION_STRING})") diff --git a/cmake/cnml.cmake b/cmake/cnml.cmake index 067572d1..7b0ed901 100644 --- a/cmake/cnml.cmake +++ b/cmake/cnml.cmake @@ -1,40 +1,48 @@ -find_library(CNML_LIBRARY - NAMES libcnml.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNML library." 
) +find_library( + CNML_LIBRARY + NAMES libcnml.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNML library.") if(CNML_LIBRARY STREQUAL "CNML_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNML Library") + message(FATAL_ERROR "Can not find CNML Library") endif() get_filename_component(__found_cnml_root "${CNML_LIBRARY}/../.." REALPATH) -find_path(CNML_INCLUDE_DIR - NAMES cnml.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnml_root} - PATH_SUFFIXES include - DOC "Path to CNML include directory." ) +find_path( + CNML_INCLUDE_DIR + NAMES cnml.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnml_root} + PATH_SUFFIXES include + DOC "Path to CNML include directory.") if(CNML_INCLUDE_DIR STREQUAL "CNML_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNML Library") + message(FATAL_ERROR "Can not find CNML Library") endif() -file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_MAJOR REGEX "^#define CNML_MAJOR_VERSION [0-9]+.*$") -file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_MINOR REGEX "^#define CNML_MINOR_VERSION [0-9]+.*$") -file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_PATCH REGEX "^#define CNML_PATCH_VERSION [0-9]+.*$") - -string(REGEX REPLACE "^#define CNML_MAJOR_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_MAJOR "${CNML_MAJOR}") -string(REGEX REPLACE "^#define CNML_MINOR_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_MINOR "${CNML_MINOR}") -string(REGEX REPLACE "^#define CNML_PATCH_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_PATCH "${CNML_PATCH}") -set(CNML_VERSION_STRING "${CNML_VERSION_MAJOR}.${CNML_VERSION_MINOR}.${CNML_VERSION_PATCH}") +file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_MAJOR + REGEX "^#define CNML_MAJOR_VERSION [0-9]+.*$") +file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_MINOR + REGEX "^#define CNML_MINOR_VERSION [0-9]+.*$") +file(STRINGS "${CNML_INCLUDE_DIR}/cnml.h" CNML_PATCH + REGEX "^#define CNML_PATCH_VERSION [0-9]+.*$") + +string(REGEX REPLACE "^#define CNML_MAJOR_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_MAJOR + "${CNML_MAJOR}") +string(REGEX REPLACE "^#define CNML_MINOR_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_MINOR + "${CNML_MINOR}") +string(REGEX REPLACE "^#define CNML_PATCH_VERSION ([0-9]+).*$" "\\1" CNML_VERSION_PATCH + "${CNML_PATCH}") +set(CNML_VERSION_STRING + "${CNML_VERSION_MAJOR}.${CNML_VERSION_MINOR}.${CNML_VERSION_PATCH}") add_library(libcnml SHARED IMPORTED) -set_target_properties(libcnml PROPERTIES - IMPORTED_LOCATION ${CNML_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNML_INCLUDE_DIR} -) - -message(STATUS "Found CNML: ${__found_cnml_root} (found version: ${CNML_VERSION_STRING})") +set_target_properties( + libcnml PROPERTIES IMPORTED_LOCATION ${CNML_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${CNML_INCLUDE_DIR}) +message( + STATUS "Found CNML: ${__found_cnml_root} (found version: ${CNML_VERSION_STRING})") diff --git a/cmake/cnnl.cmake b/cmake/cnnl.cmake index 4d6cf973..6822adf8 100644 --- a/cmake/cnnl.cmake +++ b/cmake/cnnl.cmake @@ -1,80 +1,100 @@ -find_library(CNNL_LIBRARY - NAMES libcnnl.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNNL library." 
) +find_library( + CNNL_LIBRARY + NAMES libcnnl.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNNL library.") if(CNNL_LIBRARY STREQUAL "CNNL_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNNL Library") + message(FATAL_ERROR "Can not find CNNL Library") endif() get_filename_component(__found_cnnl_root "${CNNL_LIBRARY}/../.." REALPATH) -find_path(CNNL_INCLUDE_DIR - NAMES cnnl.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnnl_root} - PATH_SUFFIXES include - DOC "Path to CNNL include directory." ) +find_path( + CNNL_INCLUDE_DIR + NAMES cnnl.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnnl_root} + PATH_SUFFIXES include + DOC "Path to CNNL include directory.") if(CNNL_INCLUDE_DIR STREQUAL "CNNL_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNNL Library") + message(FATAL_ERROR "Can not find CNNL Library") endif() -file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_MAJOR REGEX "^#define CNNL_MAJOR [0-9]+.*$") -file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_MINOR REGEX "^#define CNNL_MINOR [0-9]+.*$") -file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_PATCH REGEX "^#define CNNL_PATCHLEVEL [0-9]+.*$") - -string(REGEX REPLACE "^#define CNNL_MAJOR ([0-9]+).*$" "\\1" CNNL_VERSION_MAJOR "${CNNL_MAJOR}") -string(REGEX REPLACE "^#define CNNL_MINOR ([0-9]+).*$" "\\1" CNNL_VERSION_MINOR "${CNNL_MINOR}") -string(REGEX REPLACE "^#define CNNL_PATCHLEVEL ([0-9]+).*$" "\\1" CNNL_VERSION_PATCH "${CNNL_PATCH}") -set(CNNL_VERSION_STRING "${CNNL_VERSION_MAJOR}.${CNNL_VERSION_MINOR}.${CNNL_VERSION_PATCH}") +file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_MAJOR + REGEX "^#define CNNL_MAJOR [0-9]+.*$") +file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_MINOR + REGEX "^#define CNNL_MINOR [0-9]+.*$") +file(STRINGS "${CNNL_INCLUDE_DIR}/cnnl.h" CNNL_PATCH + REGEX "^#define CNNL_PATCHLEVEL [0-9]+.*$") + +string(REGEX REPLACE "^#define CNNL_MAJOR ([0-9]+).*$" "\\1" CNNL_VERSION_MAJOR + "${CNNL_MAJOR}") +string(REGEX REPLACE "^#define CNNL_MINOR ([0-9]+).*$" "\\1" CNNL_VERSION_MINOR + "${CNNL_MINOR}") +string(REGEX REPLACE "^#define CNNL_PATCHLEVEL ([0-9]+).*$" "\\1" CNNL_VERSION_PATCH + "${CNNL_PATCH}") +set(CNNL_VERSION_STRING + "${CNNL_VERSION_MAJOR}.${CNNL_VERSION_MINOR}.${CNNL_VERSION_PATCH}") add_library(libcnnl SHARED IMPORTED) -set_target_properties(libcnnl PROPERTIES - IMPORTED_LOCATION ${CNNL_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNNL_INCLUDE_DIR} -) +set_target_properties( + libcnnl PROPERTIES IMPORTED_LOCATION ${CNNL_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${CNNL_INCLUDE_DIR}) -message(STATUS "Found CNNL: ${__found_cnnl_root} (found version: ${CNNL_VERSION_STRING})") +message( + STATUS "Found CNNL: ${__found_cnnl_root} (found version: ${CNNL_VERSION_STRING})") -find_library(CNNL_EXTRA_LIBRARY - NAMES libcnnl_extra.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNNL_EXTRA library." ) +find_library( + CNNL_EXTRA_LIBRARY + NAMES libcnnl_extra.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNNL_EXTRA library.") if(CNNL_EXTRA_LIBRARY STREQUAL "CNNL_EXTRA_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNNL_EXTRA Library") + message(FATAL_ERROR "Can not find CNNL_EXTRA Library") endif() get_filename_component(__found_cnnl_extra_root "${CNNL_EXTRA_LIBRARY}/../.." 
REALPATH) -find_path(CNNL_EXTRA_INCLUDE_DIR - NAMES cnnl_extra.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnnl_extra_root} - PATH_SUFFIXES include - DOC "Path to CNNL_EXTRA include directory." ) +find_path( + CNNL_EXTRA_INCLUDE_DIR + NAMES cnnl_extra.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnnl_extra_root} + PATH_SUFFIXES include + DOC "Path to CNNL_EXTRA include directory.") if(CNNL_EXTRA_INCLUDE_DIR STREQUAL "CNNL_EXTRA_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNNL_EXTRA Library") + message(FATAL_ERROR "Can not find CNNL_EXTRA Library") endif() -file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_MAJOR REGEX "^#define CNNL_EXTRA_MAJOR [0-9]+.*$") -file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_MINOR REGEX "^#define CNNL_EXTRA_MINOR [0-9]+.*$") -file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_PATCH REGEX "^#define CNNL_EXTRA_PATCHLEVEL [0-9]+.*$") - -string(REGEX REPLACE "^#define CNNL_EXTRA_MAJOR ([0-9]+).*$" "\\1" CNNL_EXTRA_VERSION_MAJOR "${CNNL_EXTRA_MAJOR}") -string(REGEX REPLACE "^#define CNNL_EXTRA_MINOR ([0-9]+).*$" "\\1" CNNL_EXTRA_VERSION_MINOR "${CNNL_EXTRA_MINOR}") -string(REGEX REPLACE "^#define CNNL_EXTRA_PATCHLEVEL ([0-9]+).*$" "\\1" CNNL_EXTRA_VERSION_PATCH "${CNNL_EXTRA_PATCH}") -set(CNNL_EXTRA_VERSION_STRING "${CNNL_EXTRA_VERSION_MAJOR}.${CNNL_EXTRA_VERSION_MINOR}.${CNNL_EXTRA_VERSION_PATCH}") +file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_MAJOR + REGEX "^#define CNNL_EXTRA_MAJOR [0-9]+.*$") +file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_MINOR + REGEX "^#define CNNL_EXTRA_MINOR [0-9]+.*$") +file(STRINGS "${CNNL_EXTRA_INCLUDE_DIR}/cnnl_extra.h" CNNL_EXTRA_PATCH + REGEX "^#define CNNL_EXTRA_PATCHLEVEL [0-9]+.*$") + +string(REGEX REPLACE "^#define CNNL_EXTRA_MAJOR ([0-9]+).*$" "\\1" + CNNL_EXTRA_VERSION_MAJOR "${CNNL_EXTRA_MAJOR}") +string(REGEX REPLACE "^#define CNNL_EXTRA_MINOR ([0-9]+).*$" "\\1" + CNNL_EXTRA_VERSION_MINOR "${CNNL_EXTRA_MINOR}") +string(REGEX REPLACE "^#define CNNL_EXTRA_PATCHLEVEL ([0-9]+).*$" "\\1" + CNNL_EXTRA_VERSION_PATCH "${CNNL_EXTRA_PATCH}") +set(CNNL_EXTRA_VERSION_STRING + "${CNNL_EXTRA_VERSION_MAJOR}.${CNNL_EXTRA_VERSION_MINOR}.${CNNL_EXTRA_VERSION_PATCH}" +) add_library(libcnnl_extra SHARED IMPORTED) -set_target_properties(libcnnl_extra PROPERTIES - IMPORTED_LOCATION ${CNNL_EXTRA_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNNL_EXTRA_INCLUDE_DIR} -) - -message(STATUS "Found CNNL_EXTRA: ${__found_cnnl_extra_root} (found version: ${CNNL_EXTRA_VERSION_STRING})") +set_target_properties( + libcnnl_extra PROPERTIES IMPORTED_LOCATION ${CNNL_EXTRA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CNNL_EXTRA_INCLUDE_DIR}) +message( + STATUS + "Found CNNL_EXTRA: ${__found_cnnl_extra_root} (found version: ${CNNL_EXTRA_VERSION_STRING})" +) diff --git a/cmake/cnrt.cmake b/cmake/cnrt.cmake index fec07cce..c4b98756 100644 --- a/cmake/cnrt.cmake +++ b/cmake/cnrt.cmake @@ -1,40 +1,48 @@ -find_library(CNRT_LIBRARY - NAMES libcnrt.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CNRT library." 
) +find_library( + CNRT_LIBRARY + NAMES libcnrt.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CNRT library.") if(CNRT_LIBRARY STREQUAL "CNRT_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CNRT Library") + message(FATAL_ERROR "Can not find CNRT Library") endif() get_filename_component(__found_cnrt_root ${CNRT_LIBRARY}/../../ REALPATH) -find_path(CNRT_INCLUDE_DIR - NAMES cnrt.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnrt_root} - PATH_SUFFIXES include - DOC "Path to CNRT include directory." ) +find_path( + CNRT_INCLUDE_DIR + NAMES cnrt.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_cnrt_root} + PATH_SUFFIXES include + DOC "Path to CNRT include directory.") if(CNRT_INCLUDE_DIR STREQUAL "CNRT_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CNRT Library") + message(FATAL_ERROR "Can not find CNRT Library") endif() -file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_MAJOR REGEX "^#define CNRT_MAJOR_VERSION [0-9]+.*$") -file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_MINOR REGEX "^#define CNRT_MINOR_VERSION [0-9]+.*$") -file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_PATCH REGEX "^#define CNRT_PATCH_VERSION [0-9]+.*$") - -string(REGEX REPLACE "^#define CNRT_MAJOR_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_MAJOR "${CNRT_MAJOR}") -string(REGEX REPLACE "^#define CNRT_MINOR_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_MINOR "${CNRT_MINOR}") -string(REGEX REPLACE "^#define CNRT_PATCH_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_PATCH "${CNRT_PATCH}") -set(CNRT_VERSION_STRING "${CNRT_VERSION_MAJOR}.${CNRT_VERSION_MINOR}.${CNRT_VERSION_PATCH}") +file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_MAJOR + REGEX "^#define CNRT_MAJOR_VERSION [0-9]+.*$") +file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_MINOR + REGEX "^#define CNRT_MINOR_VERSION [0-9]+.*$") +file(STRINGS "${CNRT_INCLUDE_DIR}/cnrt.h" CNRT_PATCH + REGEX "^#define CNRT_PATCH_VERSION [0-9]+.*$") + +string(REGEX REPLACE "^#define CNRT_MAJOR_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_MAJOR + "${CNRT_MAJOR}") +string(REGEX REPLACE "^#define CNRT_MINOR_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_MINOR + "${CNRT_MINOR}") +string(REGEX REPLACE "^#define CNRT_PATCH_VERSION ([0-9]+).*$" "\\1" CNRT_VERSION_PATCH + "${CNRT_PATCH}") +set(CNRT_VERSION_STRING + "${CNRT_VERSION_MAJOR}.${CNRT_VERSION_MINOR}.${CNRT_VERSION_PATCH}") add_library(libcnrt SHARED IMPORTED) -set_target_properties(libcnrt PROPERTIES - IMPORTED_LOCATION ${CNRT_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CNRT_INCLUDE_DIR} -) - -message(STATUS "Found CNRT: ${__found_cnrt_root} (found version: ${CNRT_VERSION_STRING})") +set_target_properties( + libcnrt PROPERTIES IMPORTED_LOCATION ${CNRT_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${CNRT_INCLUDE_DIR}) +message( + STATUS "Found CNRT: ${__found_cnrt_root} (found version: ${CNRT_VERSION_STRING})") diff --git a/cmake/cpp_redis.cmake b/cmake/cpp_redis.cmake index d7b642e4..f4a88758 100644 --- a/cmake/cpp_redis.cmake +++ b/cmake/cpp_redis.cmake @@ -1,2 +1,5 @@ -file(GLOB_RECURSE CPP_REDIS_SRCS ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp) -set(CPP_REDIS_INCLUDES ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) \ No newline at end of file +file(GLOB_RECURSE CPP_REDIS_SRCS + ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp + ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp) +set(CPP_REDIS_INCLUDES 
${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes + ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) diff --git a/cmake/cpuinfo.cmake b/cmake/cpuinfo.cmake index 97647e38..cf220e06 100644 --- a/cmake/cpuinfo.cmake +++ b/cmake/cpuinfo.cmake @@ -1,20 +1,20 @@ -if (MGE_USE_SYSTEM_LIB) - find_package(Cpuinfo) - message(STATUS "Using system provided cpuinfo ${cpuinfo_VERSION}") - add_library(libcpuinfo IMPORTED GLOBAL) - set_target_properties( - libcpuinfo PROPERTIES - IMPORTED_LOCATION ${cpuinfo_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${cpuinfo_INCLUDE_DIRS} - ) - return() +if(MGE_USE_SYSTEM_LIB) + find_package(Cpuinfo) + message(STATUS "Using system provided cpuinfo ${cpuinfo_VERSION}") + add_library(libcpuinfo IMPORTED GLOBAL) + set_target_properties( + libcpuinfo PROPERTIES IMPORTED_LOCATION ${cpuinfo_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${cpuinfo_INCLUDE_DIRS}) + return() endif() -SET(CPUINFO_LIBRARY_TYPE "static" CACHE STRING "Type of cpuinfo library (shared, static, or default) to build") -OPTION(CPUINFO_BUILD_TOOLS "Build command-line tools" OFF) -OPTION(CPUINFO_BUILD_UNIT_TESTS "Build cpuinfo unit tests" OFF) -OPTION(CPUINFO_BUILD_MOCK_TESTS "Build cpuinfo mock tests" OFF) -OPTION(CPUINFO_BUILD_BENCHMARKS "Build cpuinfo micro-benchmarks" OFF) +set(CPUINFO_LIBRARY_TYPE + "static" + CACHE STRING "Type of cpuinfo library (shared, static, or default) to build") +option(CPUINFO_BUILD_TOOLS "Build command-line tools" OFF) +option(CPUINFO_BUILD_UNIT_TESTS "Build cpuinfo unit tests" OFF) +option(CPUINFO_BUILD_MOCK_TESTS "Build cpuinfo mock tests" OFF) +option(CPUINFO_BUILD_BENCHMARKS "Build cpuinfo micro-benchmarks" OFF) include_directories("${PROJECT_SOURCE_DIR}/third_party/cpuinfo/include") -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/cpuinfo ${CMAKE_CURRENT_BINARY_DIR}/cpuinfo EXCLUDE_FROM_ALL) - +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/cpuinfo + ${CMAKE_CURRENT_BINARY_DIR}/cpuinfo EXCLUDE_FROM_ALL) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 9f262c50..d256b8fe 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -1,73 +1,83 @@ find_package(PkgConfig) if(${PkgConfig_FOUND}) - pkg_check_modules(PC_CUDNN QUIET CUDNN) + pkg_check_modules(PC_CUDNN QUIET CUDNN) endif() -if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") - set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) +if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") + set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) endif() if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) - find_library(CUDNN_LIBRARY - NAMES libcudnn_static.a cudnn.lib - PATHS ${ALTER_LD_LIBRARY_PATHS} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CUDNN library." ) + find_library( + CUDNN_LIBRARY + NAMES libcudnn_static.a cudnn.lib + PATHS ${ALTER_LD_LIBRARY_PATHS} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} + ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CUDNN library.") else() - find_library(CUDNN_LIBRARY - NAMES libcudnn.so libcudnn.dylib cudnn64.dll - PATHS ${ALTER_LD_LIBRARY_PATHS} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "CUDNN library." 
) + find_library( + CUDNN_LIBRARY + NAMES libcudnn.so libcudnn.dylib cudnn64.dll + PATHS ${ALTER_LD_LIBRARY_PATHS} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} + ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "CUDNN library.") endif() if(CUDNN_LIBRARY STREQUAL "CUDNN_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find CuDNN Library, please refer to scripts/cmake-build/BUILD_README.md to init CUDNN env") + message( + FATAL_ERROR + "Can not find CuDNN Library, please refer to scripts/cmake-build/BUILD_README.md to init CUDNN env" + ) endif() get_filename_component(__found_cudnn_root ${CUDNN_LIBRARY}/../.. REALPATH) -find_path(CUDNN_INCLUDE_DIR - NAMES cudnn.h - HINTS $ENV{PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root} - PATH_SUFFIXES include - DOC "Path to CUDNN include directory." ) +find_path( + CUDNN_INCLUDE_DIR + NAMES cudnn.h + HINTS $ENV{PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} + ${__found_cudnn_root} + PATH_SUFFIXES include + DOC "Path to CUDNN include directory.") if(CUDNN_INCLUDE_DIR STREQUAL "CUDNN_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find CuDNN INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init CUDNN env") + message( + FATAL_ERROR + "Can not find CuDNN INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init CUDNN env" + ) endif() if(EXISTS ${CUDNN_INCLUDE_DIR}/cudnn_version.h) - file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS) + file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS) else() - file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) + file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) endif() -string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" - CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") -string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" - CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}") -string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)" - CUDNN_MINOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") -string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1" - CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}") -string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" - CUDNN_PATCH_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") -string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" - CUDNN_PATCH_VERSION "${CUDNN_PATCH_VERSION}") +string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" CUDNN_MAJOR_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") +string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" CUDNN_MAJOR_VERSION + "${CUDNN_MAJOR_VERSION}") +string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)" CUDNN_MINOR_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") +string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1" CUDNN_MINOR_VERSION + "${CUDNN_MINOR_VERSION}") +string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" CUDNN_PATCH_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") +string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" CUDNN_PATCH_VERSION + "${CUDNN_PATCH_VERSION}") set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCH_VERSION}) - - if(MGE_CUDA_USE_STATIC) - add_library(libcudnn STATIC IMPORTED) + add_library(libcudnn STATIC IMPORTED) else() - add_library(libcudnn SHARED IMPORTED) + add_library(libcudnn SHARED IMPORTED) endif() -set_target_properties(libcudnn PROPERTIES - IMPORTED_LOCATION ${CUDNN_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR}) +set_target_properties( + 
libcudnn PROPERTIES IMPORTED_LOCATION ${CUDNN_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${CUDNN_INCLUDE_DIR}) message(STATUS "Found CuDNN: ${__found_cudnn_root} (found version: ${CUDNN_VERSION})") diff --git a/cmake/flatbuffers.cmake b/cmake/flatbuffers.cmake index 0f895930..61cb0c0f 100644 --- a/cmake/flatbuffers.cmake +++ b/cmake/flatbuffers.cmake @@ -1,27 +1,47 @@ -if (MGE_USE_SYSTEM_LIB) - find_package(Flatbuffers REQUIRED) - message(STATUS "Using system provided Flatbuffers ${Flatbuffers_VERSION}") - include(cmake/BuildFlatBuffers.cmake) - return() +if(MGE_USE_SYSTEM_LIB) + find_package(Flatbuffers REQUIRED) + message(STATUS "Using system provided Flatbuffers ${Flatbuffers_VERSION}") + include(cmake/BuildFlatBuffers.cmake) + return() endif() if(MSVC OR WIN32) - message(DEBUG "add flags flatc for clang-cl build") - set(FLATC_FLAGS "") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=unknown-argument -Wno-error=c++98-compat -Wno-error=reserved-id-macro") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=sign-conversion -Wno-error=exceptions -Wno-error=argument-outside-range") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=delete-non-virtual-dtor -Wno-error=ignored-attributes -Wno-error=format") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=sign-compare -Wno-error=unused-private-field -Wno-error=braced-scalar-init") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=return-type-c-linkage -Wno-error=invalid-noreturn -Wno-error=c++98-compat-pedantic") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=extra-semi-stmt -Wno-error=missing-prototypes -Wno-error=documentation-unknown-command") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=missing-variable-declarations -Wno-error=nonportable-system-include-path") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=exit-time-destructors -Wno-error=unused-macros -Wno-error=global-constructors") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=switch-enum -Wno-error=missing-noreturn -Wno-error=float-equal") - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0.0") - set(FLATC_FLAGS "${FLATC_FLAGS} -Wno-error=suggest-override -Wno-error=suggest-destructor-override") - endif() + message(DEBUG "add flags flatc for clang-cl build") + set(FLATC_FLAGS "") + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=unknown-argument -Wno-error=c++98-compat -Wno-error=reserved-id-macro" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=sign-conversion -Wno-error=exceptions -Wno-error=argument-outside-range" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=delete-non-virtual-dtor -Wno-error=ignored-attributes -Wno-error=format" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=sign-compare -Wno-error=unused-private-field -Wno-error=braced-scalar-init" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=return-type-c-linkage -Wno-error=invalid-noreturn -Wno-error=c++98-compat-pedantic" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=extra-semi-stmt -Wno-error=missing-prototypes -Wno-error=documentation-unknown-command" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=missing-variable-declarations -Wno-error=nonportable-system-include-path" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=exit-time-destructors -Wno-error=unused-macros -Wno-error=global-constructors" + ) + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=switch-enum -Wno-error=missing-noreturn -Wno-error=float-equal" + ) + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0.0") + set(FLATC_FLAGS + "${FLATC_FLAGS} -Wno-error=suggest-override -Wno-error=suggest-destructor-override" + ) + endif() - set(CMAKE_C_FLAGS 
"${CMAKE_C_FLAGS} ${FLATC_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATC_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${FLATC_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATC_FLAGS}") endif() option(FLATBUFFERS_BUILD_TESTS "" OFF) diff --git a/cmake/gflags.cmake b/cmake/gflags.cmake index 9dbb8035..a645ecdd 100644 --- a/cmake/gflags.cmake +++ b/cmake/gflags.cmake @@ -1 +1,2 @@ -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags ${CMAKE_CURRENT_BINARY_DIR}/gflags) \ No newline at end of file +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags + ${CMAKE_CURRENT_BINARY_DIR}/gflags) diff --git a/cmake/gtest.cmake b/cmake/gtest.cmake index d2be2f35..a3071f8f 100644 --- a/cmake/gtest.cmake +++ b/cmake/gtest.cmake @@ -1,2 +1,2 @@ -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gtest ${CMAKE_CURRENT_BINARY_DIR}/gtest EXCLUDE_FROM_ALL) - +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gtest + ${CMAKE_CURRENT_BINARY_DIR}/gtest EXCLUDE_FROM_ALL) diff --git a/cmake/llvm-project.cmake b/cmake/llvm-project.cmake index bbea20d7..ce2599ac 100644 --- a/cmake/llvm-project.cmake +++ b/cmake/llvm-project.cmake @@ -1,88 +1,136 @@ -# - Find the llvm/mlir libraries -# This module finds if llvm/mlir is installed, or build llvm/mlir from source. -# This module sets the following variables. +# * Find the llvm/mlir libraries This module finds if llvm/mlir is installed, or build +# llvm/mlir from source. This module sets the following variables. # -# MLIR_LLVM_INCLUDE_DIR - path to the LLVM/MLIR include files -# MLIR_LLVM_LIBS - path to the LLVM/MLIR libraries +# MLIR_LLVM_INCLUDE_DIR - path to the LLVM/MLIR include files MLIR_LLVM_LIBS - path +# to the LLVM/MLIR libraries # # This module define the following functions. # -# external_tablegen_library - created interface library which depends on tablegen outputs +# external_tablegen_library - created interface library which depends on tablegen +# outputs include(CMakeParseArguments) function(external_tablegen_library) - cmake_parse_arguments( - _RULE - "TESTONLY" - "NAME;TBLGEN" - "SRCS;INCLUDES;OUTS" - ${ARGN} - ) + cmake_parse_arguments(_RULE "TESTONLY" "NAME;TBLGEN" "SRCS;INCLUDES;OUTS" ${ARGN}) - if(_RULE_TESTONLY AND NOT MGE_WITH_TEST) - return() - endif() + if(_RULE_TESTONLY AND NOT MGE_WITH_TEST) + return() + endif() - set(_NAME ${_RULE_NAME}) + set(_NAME ${_RULE_NAME}) - set(LLVM_TARGET_DEFINITIONS ${_RULE_SRCS}) - set(_INCLUDE_DIRS ${_RULE_INCLUDES}) - list(TRANSFORM _INCLUDE_DIRS PREPEND "-I") - set(_OUTPUTS) - while(_RULE_OUTS) - list(GET _RULE_OUTS 0 _COMMAND) - list(REMOVE_AT _RULE_OUTS 0) - list(GET _RULE_OUTS 0 _FILE) - list(REMOVE_AT _RULE_OUTS 0) - tablegen(${_RULE_TBLGEN} ${_FILE} ${_COMMAND} ${_INCLUDE_DIRS}) - list(APPEND _OUTPUTS ${CMAKE_CURRENT_BINARY_DIR}/${_FILE}) - endwhile() - add_custom_target(${_NAME}_target DEPENDS ${_OUTPUTS}) + set(LLVM_TARGET_DEFINITIONS ${_RULE_SRCS}) + set(_INCLUDE_DIRS ${_RULE_INCLUDES}) + list(TRANSFORM _INCLUDE_DIRS PREPEND "-I") + set(_OUTPUTS) + while(_RULE_OUTS) + list(GET _RULE_OUTS 0 _COMMAND) + list(REMOVE_AT _RULE_OUTS 0) + list(GET _RULE_OUTS 0 _FILE) + list(REMOVE_AT _RULE_OUTS 0) + tablegen(${_RULE_TBLGEN} ${_FILE} ${_COMMAND} ${_INCLUDE_DIRS}) + list(APPEND _OUTPUTS ${CMAKE_CURRENT_BINARY_DIR}/${_FILE}) + endwhile() + add_custom_target(${_NAME}_target DEPENDS ${_OUTPUTS}) - add_library(${_NAME} INTERFACE) - add_dependencies(${_NAME} ${_NAME}_target) + add_library(${_NAME} INTERFACE) + add_dependencies(${_NAME} ${_NAME}_target) - 
target_include_directories(${_NAME} INTERFACE - "$") + target_include_directories(${_NAME} INTERFACE "$") - install(TARGETS ${_NAME} EXPORT ${MGE_EXPORT_TARGETS}) + install(TARGETS ${_NAME} EXPORT ${MGE_EXPORT_TARGETS}) endfunction() -set(LLVM_LIBS LLVMCore LLVMSupport LLVMX86CodeGen LLVMOrcJIT LLVMNVPTXCodeGen LLVMNVPTXDesc LLVMNVPTXInfo) -set(MLIR_CORE_LIBS MLIRAnalysis MLIRExecutionEngine MLIRIR MLIRParser MLIRPass MLIRSideEffectInterfaces MLIRTransforms) -set(MLIR_DIALECT_LIBS MLIRAsync MLIRAVX512 MLIRGPU MLIRLLVMAVX512 MLIRNVVMIR MLIROpenACC MLIRPDL MLIRPDLInterp MLIRQuant MLIRROCDLIR MLIRSDBM MLIRShape MLIRSPIRV MLIRStandardOpsTransforms MLIRTosa) -set(MLIR_CONVERSION_LIBS MLIRAffineToStandard MLIRAVX512ToLLVM MLIRGPUToGPURuntimeTransforms MLIRGPUToNVVMTransforms MLIRSCFToStandard) +set(LLVM_LIBS + LLVMCore + LLVMSupport + LLVMX86CodeGen + LLVMOrcJIT + LLVMNVPTXCodeGen + LLVMNVPTXDesc + LLVMNVPTXInfo) +set(MLIR_CORE_LIBS + MLIRAnalysis + MLIRExecutionEngine + MLIRIR + MLIRParser + MLIRPass + MLIRSideEffectInterfaces + MLIRTransforms) +set(MLIR_DIALECT_LIBS + MLIRAsync + MLIRAVX512 + MLIRGPU + MLIRLLVMAVX512 + MLIRNVVMIR + MLIROpenACC + MLIRPDL + MLIRPDLInterp + MLIRQuant + MLIRROCDLIR + MLIRSDBM + MLIRShape + MLIRSPIRV + MLIRStandardOpsTransforms + MLIRTosa) +set(MLIR_CONVERSION_LIBS + MLIRAffineToStandard MLIRAVX512ToLLVM MLIRGPUToGPURuntimeTransforms + MLIRGPUToNVVMTransforms MLIRSCFToStandard) set(MLIR_TRANSLATION_LIBS MLIRTargetLLVMIR MLIRTargetNVVMIR) -set(MLIR_LIBS ${MLIR_CORE_LIBS} ${MLIR_DIALECT_LIBS} ${MLIR_CONVERSION_LIBS} ${MLIR_TRANSLATION_LIBS}) +set(MLIR_LIBS ${MLIR_CORE_LIBS} ${MLIR_DIALECT_LIBS} ${MLIR_CONVERSION_LIBS} + ${MLIR_TRANSLATION_LIBS}) set(MLIR_LLVM_LIBS ${LLVM_LIBS} ${MLIR_LIBS}) function(add_mge_mlir_src_dep llvm_monorepo_path) - set(_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}") - string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) - if(NOT uppercase_CMAKE_BUILD_TYPE MATCHES "^(DEBUG|RELEASE|RELWITHDEBINFO|MINSIZEREL)$") - set(CMAKE_BUILD_TYPE "Debug") - endif() - set(_CMAKE_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE) + set(_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}") + string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) + if(NOT uppercase_CMAKE_BUILD_TYPE MATCHES + "^(DEBUG|RELEASE|RELWITHDEBINFO|MINSIZEREL)$") + set(CMAKE_BUILD_TYPE "Debug") + endif() + set(_CMAKE_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) + set(BUILD_SHARED_LIBS + OFF + CACHE BOOL "" FORCE) - add_subdirectory("${llvm_monorepo_path}/llvm" ${LLVM_BUILD_DIR} EXCLUDE_FROM_ALL) + add_subdirectory("${llvm_monorepo_path}/llvm" ${LLVM_BUILD_DIR} EXCLUDE_FROM_ALL) - # Reset CMAKE_BUILD_TYPE to its previous setting - set(CMAKE_BUILD_TYPE "${_CMAKE_BUILD_TYPE}" CACHE STRING "Build type" FORCE) - # Reset BUILD_SHARED_LIBS to its previous setting - set(BUILD_SHARED_LIBS ${_CMAKE_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libraries" FORCE) + # Reset CMAKE_BUILD_TYPE to its previous setting + set(CMAKE_BUILD_TYPE + "${_CMAKE_BUILD_TYPE}" + CACHE STRING "Build type" FORCE) + # Reset BUILD_SHARED_LIBS to its previous setting + set(BUILD_SHARED_LIBS + ${_CMAKE_BUILD_SHARED_LIBS} + CACHE BOOL "Build shared libraries" FORCE) endfunction() # llvm build options -set(LLVM_INCLUDE_EXAMPLES OFF CACHE BOOL "" FORCE) -set(LLVM_INCLUDE_TESTS OFF CACHE BOOL "" FORCE) -set(LLVM_INCLUDE_DOCS OFF CACHE BOOL "" FORCE) -set(LLVM_ENABLE_BINDINGS OFF CACHE BOOL "" FORCE) -set(LLVM_INCLUDE_BENCHMARKS OFF CACHE BOOL "" FORCE) -set(LLVM_ENABLE_RTTI 
${MGE_ENABLE_RTTI} CACHE BOOL "" FORCE) -set(LLVM_TARGETS_TO_BUILD "X86;NVPTX;AArch64;ARM" CACHE STRING "" FORCE) -set(LLVM_ENABLE_PROJECTS "mlir" CACHE STRING "" FORCE) +set(LLVM_INCLUDE_EXAMPLES + OFF + CACHE BOOL "" FORCE) +set(LLVM_INCLUDE_TESTS + OFF + CACHE BOOL "" FORCE) +set(LLVM_INCLUDE_DOCS + OFF + CACHE BOOL "" FORCE) +set(LLVM_ENABLE_BINDINGS + OFF + CACHE BOOL "" FORCE) +set(LLVM_INCLUDE_BENCHMARKS + OFF + CACHE BOOL "" FORCE) +set(LLVM_ENABLE_RTTI + ${MGE_ENABLE_RTTI} + CACHE BOOL "" FORCE) +set(LLVM_TARGETS_TO_BUILD + "X86;NVPTX;AArch64;ARM" + CACHE STRING "" FORCE) +set(LLVM_ENABLE_PROJECTS + "mlir" + CACHE STRING "" FORCE) set(LLVM_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/llvm-project/llvm) add_mge_mlir_src_dep("third_party/llvm-project") @@ -91,6 +139,5 @@ set(MLIR_LLVM_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/third_party/llvm-project/llvm/include ${PROJECT_BINARY_DIR}/third_party/llvm-project/llvm/include ${PROJECT_SOURCE_DIR}/third_party/llvm-project/mlir/include - ${PROJECT_BINARY_DIR}/third_party/llvm-project/llvm/tools/mlir/include - ) + ${PROJECT_BINARY_DIR}/third_party/llvm-project/llvm/tools/mlir/include) set(MLIR_TABLEGEN_EXE mlir-tblgen) diff --git a/cmake/magicmind.cmake b/cmake/magicmind.cmake index 0dd3d050..37ae170e 100644 --- a/cmake/magicmind.cmake +++ b/cmake/magicmind.cmake @@ -1,54 +1,64 @@ -find_library(MAGICMIND_LIBRARY - NAMES libmagicmind.so - PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "MAGICMIND library." ) +find_library( + MAGICMIND_LIBRARY + NAMES libmagicmind.so + PATHS ${ALTER_LD_LIBRARY_PATHS} "$ENV{NEUWARE_HOME}/lib64" ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "MAGICMIND library.") if(MAGICMIND_LIBRARY STREQUAL "MAGICMIND_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find MAGICMIND Library") + message(FATAL_ERROR "Can not find MAGICMIND Library") endif() get_filename_component(__found_magicmind_root "${MAGICMIND_LIBRARY}/../../" REALPATH) -find_path(MAGICMIND_INCLUDE_DIR - NAMES common.h - HINTS "$ENV{NEUWARE_HOME}/include" ${__found_magicmind_root} - PATH_SUFFIXES include - DOC "Path to MAGICMIND include directory." 
) +find_path( + MAGICMIND_INCLUDE_DIR + NAMES common.h + HINTS "$ENV{NEUWARE_HOME}/include" ${__found_magicmind_root} + PATH_SUFFIXES include + DOC "Path to MAGICMIND include directory.") if(MAGICMIND_INCLUDE_DIR STREQUAL "MAGICMIND_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find MAGICMIND Library") + message(FATAL_ERROR "Can not find MAGICMIND Library") endif() -file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_MAJOR REGEX "^#define MM_MAJOR_VERSION [0-9]+.*$") -file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_MINOR REGEX "^#define MM_MINOR_VERSION [0-9]+.*$") -file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_PATCH REGEX "^#define MM_PATCH_VERSION [0-9]+.*$") +file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_MAJOR + REGEX "^#define MM_MAJOR_VERSION [0-9]+.*$") +file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_MINOR + REGEX "^#define MM_MINOR_VERSION [0-9]+.*$") +file(STRINGS "${MAGICMIND_INCLUDE_DIR}/common.h" MAGICMIND_PATCH + REGEX "^#define MM_PATCH_VERSION [0-9]+.*$") -string(REGEX REPLACE "^#define MM_MAJOR_VERSION ([0-9]+).*$" "\\1" MAGICMIND_VERSION_MAJOR "${MAGICMIND_MAJOR}") -string(REGEX REPLACE "^#define MM_MINOR_VERSION ([0-9]+).*$" "\\1" MAGICMIND_VERSION_MINOR "${MAGICMIND_MINOR}") -string(REGEX REPLACE "^#define MM_PATCH_VERSION ([0-9]+).*$" "\\1" MAGICMIND_VERSION_PATCH "${MAGICMIND_PATCH}") -set(MAGICMIND_VERSION_STRING "${MAGICMIND_VERSION_MAJOR}.${MAGICMIND_VERSION_MINOR}.${MAGICMIND_VERSION_PATCH}") +string(REGEX REPLACE "^#define MM_MAJOR_VERSION ([0-9]+).*$" "\\1" + MAGICMIND_VERSION_MAJOR "${MAGICMIND_MAJOR}") +string(REGEX REPLACE "^#define MM_MINOR_VERSION ([0-9]+).*$" "\\1" + MAGICMIND_VERSION_MINOR "${MAGICMIND_MINOR}") +string(REGEX REPLACE "^#define MM_PATCH_VERSION ([0-9]+).*$" "\\1" + MAGICMIND_VERSION_PATCH "${MAGICMIND_PATCH}") +set(MAGICMIND_VERSION_STRING + "${MAGICMIND_VERSION_MAJOR}.${MAGICMIND_VERSION_MINOR}.${MAGICMIND_VERSION_PATCH}") add_library(libmagicmind SHARED IMPORTED) -set_target_properties(libmagicmind PROPERTIES - IMPORTED_LOCATION ${MAGICMIND_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${MAGICMIND_INCLUDE_DIR} -) +set_target_properties( + libmagicmind PROPERTIES IMPORTED_LOCATION ${MAGICMIND_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${MAGICMIND_INCLUDE_DIR}) -message(STATUS "Found MAGICMIND: ${__found_magicmind_root} (found version: ${MAGICMIND_VERSION_STRING})") +message( + STATUS + "Found MAGICMIND: ${__found_magicmind_root} (found version: ${MAGICMIND_VERSION_STRING})" +) -find_library(MAGICMIND_RUNTIME_LIBRARY - NAMES libmagicmind_runtime.so - PATHS "${__found_magicmind_root}/lib64" - ) +find_library( + MAGICMIND_RUNTIME_LIBRARY + NAMES libmagicmind_runtime.so + PATHS "${__found_magicmind_root}/lib64") if(MAGICMIND_RUNTIME_LIBRARY STREQUAL "MAGICMIND_RUNTIME_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find MAGICMIND_RUNTIME Library") + message(FATAL_ERROR "Can not find MAGICMIND_RUNTIME Library") else() - message(STATUS "Found MAGICMIND_RUNTIME: ${MAGICMIND_RUNTIME_LIBRARY}") + message(STATUS "Found MAGICMIND_RUNTIME: ${MAGICMIND_RUNTIME_LIBRARY}") endif() add_library(libmagicmind_runtime SHARED IMPORTED) -set_target_properties(libmagicmind_runtime PROPERTIES - IMPORTED_LOCATION ${MAGICMIND_RUNTIME_LIBRARY} -) +set_target_properties(libmagicmind_runtime PROPERTIES IMPORTED_LOCATION + ${MAGICMIND_RUNTIME_LIBRARY}) diff --git a/cmake/mkl.cmake b/cmake/mkl.cmake index 8315f583..cbb81c5c 100644 --- a/cmake/mkl.cmake +++ b/cmake/mkl.cmake @@ -1,77 +1,83 @@ 
-find_path(MKL_ROOT_DIR - include/mkl_cblas.h - PATHS - ${PROJECT_SOURCE_DIR}/third_party/mkl/${MGE_ARCH} - ${PROJECT_SOURCE_DIR}/third_party/mkl/${MGE_ARCH}/Library - ${PROJECT_SOURCE_DIR}/third_party/mkl/x86_32/Library - ${PROJECT_SOURCE_DIR}/third_party/mkl/x86_32 - $ENV{MKLDIR} - /opt/intel/mkl/*/ - /opt/intel/cmkl/*/ - /Library/Frameworks/Intel_MKL.framework/Versions/Current/lib/universal -) +find_path( + MKL_ROOT_DIR include/mkl_cblas.h + PATHS ${PROJECT_SOURCE_DIR}/third_party/mkl/${MGE_ARCH} + ${PROJECT_SOURCE_DIR}/third_party/mkl/${MGE_ARCH}/Library + ${PROJECT_SOURCE_DIR}/third_party/mkl/x86_32/Library + ${PROJECT_SOURCE_DIR}/third_party/mkl/x86_32 + $ENV{MKLDIR} + /opt/intel/mkl/*/ + /opt/intel/cmkl/*/ + /Library/Frameworks/Intel_MKL.framework/Versions/Current/lib/universal) if(${MKL_ROOT_DIR} STREQUAL "MKL_ROOT_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find MKL") + message(FATAL_ERROR "Can not find MKL") endif() message(STATUS "Build with MKL in ${MKL_ROOT_DIR}") -find_path(MKL_INCLUDE_DIR - mkl_cblas.h - PATHS - ${MKL_ROOT_DIR}/include - ${INCLUDE_INSTALL_DIR} -) +find_path(MKL_INCLUDE_DIR mkl_cblas.h PATHS ${MKL_ROOT_DIR}/include + ${INCLUDE_INSTALL_DIR}) option(MGE_MKL_USE_STATIC "Build MegEngine with static MKL" ON) if(MGE_MKL_USE_STATIC) - find_library(MKL_CORE_LIBRARY - NAMES libmkl_core.a mkl_core.lib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + find_library( + MKL_CORE_LIBRARY + NAMES libmkl_core.a mkl_core.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - find_library(MKL_SEQUENTIAL_LIBRARY - NAMES libmkl_sequential.a mkl_sequential.lib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + find_library( + MKL_SEQUENTIAL_LIBRARY + NAMES libmkl_sequential.a mkl_sequential.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - if(${MGE_ARCH} STREQUAL "x86_64") - find_library(MKL_IPL_LIBRARY - NAMES libmkl_intel_ilp64.a mkl_intel_ilp64.lib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - elseif(${MGE_ARCH} STREQUAL "i386") - find_library(MKL_IPL_LIBRARY - NAMES libmkl_intel_32.a mkl_intel_32.lib mkl_intel_c.lib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - endif() + if(${MGE_ARCH} STREQUAL "x86_64") + find_library( + MKL_IPL_LIBRARY + NAMES libmkl_intel_ilp64.a mkl_intel_ilp64.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + elseif(${MGE_ARCH} STREQUAL "i386") + find_library( + MKL_IPL_LIBRARY + NAMES libmkl_intel_32.a mkl_intel_32.lib mkl_intel_c.lib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + endif() - add_library(libmkl INTERFACE IMPORTED) - if(UNIX AND NOT APPLE) - target_link_libraries(libmkl INTERFACE -Wl,--start-group ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} ${MKL_IPL_LIBRARY} -Wl,--end-group) - else() - target_link_libraries(libmkl INTERFACE ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} ${MKL_IPL_LIBRARY}) - endif() - target_include_directories(libmkl INTERFACE ${MKL_INCLUDE_DIR}) + add_library(libmkl INTERFACE IMPORTED) + if(UNIX AND NOT APPLE) + target_link_libraries( + libmkl INTERFACE -Wl,--start-group ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} + ${MKL_IPL_LIBRARY} -Wl,--end-group) + else() + target_link_libraries(libmkl INTERFACE ${MKL_CORE_LIBRARY} + ${MKL_SEQUENTIAL_LIBRARY} ${MKL_IPL_LIBRARY}) + endif() + target_include_directories(libmkl INTERFACE ${MKL_INCLUDE_DIR}) else() - find_library(MKL_CORE_LIBRARY - NAMES libmkl_core.so libmkl_core.dylib - PATHS 
${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + find_library( + MKL_CORE_LIBRARY + NAMES libmkl_core.so libmkl_core.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - find_library(MKL_SEQUENTIAL_LIBRARY - NAMES libmkl_sequential.so libmkl_sequential.dylib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + find_library( + MKL_SEQUENTIAL_LIBRARY + NAMES libmkl_sequential.so libmkl_sequential.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - if(${MGE_ARCH} STREQUAL "x86_64") - find_library(MKL_IPL_LIBRARY - NAMES libmkl_intel_ilp64.so libmkl_intel_ilp64.dylib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - elseif(${MGE_ARCH} STREQUAL "x86_32") - find_library(MKL_IPL_LIBRARY - NAMES libmkl_intel_32.so libmkl_intel_32.dylib - PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) - endif() - target_link_libraries(libmkl INTERFACE ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} ${MKL_IPL_LIBRARY}) - target_include_directories(libmkl INTERFACE ${MKL_INCLUDE_DIR}) + if(${MGE_ARCH} STREQUAL "x86_64") + find_library( + MKL_IPL_LIBRARY + NAMES libmkl_intel_ilp64.so libmkl_intel_ilp64.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + elseif(${MGE_ARCH} STREQUAL "x86_32") + find_library( + MKL_IPL_LIBRARY + NAMES libmkl_intel_32.so libmkl_intel_32.dylib + PATHS ${MKL_ROOT_DIR}/lib/${MKL_ARCH_DIR} ${MKL_ROOT_DIR}/lib/) + endif() + target_link_libraries(libmkl INTERFACE ${MKL_CORE_LIBRARY} ${MKL_SEQUENTIAL_LIBRARY} + ${MKL_IPL_LIBRARY}) + target_include_directories(libmkl INTERFACE ${MKL_INCLUDE_DIR}) endif() if(${MGE_ARCH} STREQUAL "x86_64") - target_compile_definitions(libmkl INTERFACE -DMKL_ILP64) + target_compile_definitions(libmkl INTERFACE -DMKL_ILP64) endif() diff --git a/cmake/protobuf.cmake b/cmake/protobuf.cmake index 5802b25f..6ac0892d 100644 --- a/cmake/protobuf.cmake +++ b/cmake/protobuf.cmake @@ -1,70 +1,83 @@ function(PROTOBUF_GENERATE_CPP_WITH_ROOT SRCS HDRS ROOT_DIR) - if(NOT ARGN) - message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP_WITH_ROOT() called without any proto files") - return() - endif() + if(NOT ARGN) + message( + SEND_ERROR + "Error: PROTOBUF_GENERATE_CPP_WITH_ROOT() called without any proto files") + return() + endif() - set(${SRCS}) - set(${HDRS}) - foreach(FIL ${ARGN}) - set(ABS_FIL ${ROOT_DIR}/${FIL}) - get_filename_component(FIL_WE ${FIL} NAME_WE) - get_filename_component(FIL_DIR ${ABS_FIL} PATH) - file(RELATIVE_PATH REL_DIR ${ROOT_DIR} ${FIL_DIR}) + set(${SRCS}) + set(${HDRS}) + foreach(FIL ${ARGN}) + set(ABS_FIL ${ROOT_DIR}/${FIL}) + get_filename_component(FIL_WE ${FIL} NAME_WE) + get_filename_component(FIL_DIR ${ABS_FIL} PATH) + file(RELATIVE_PATH REL_DIR ${ROOT_DIR} ${FIL_DIR}) - list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") - list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") + list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc" - "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} -I ${FIL_DIR} ${ABS_FIL} -I ${PROTOBUF_INCLUDE_DIRS} - DEPENDS ${ABS_FIL} libprotobuf - COMMENT "Running C++ protocol buffer compiler on ${FIL}" - VERBATIM) - endforeach() + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h" + 
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} + -I ${FIL_DIR} ${ABS_FIL} -I ${PROTOBUF_INCLUDE_DIRS} + DEPENDS ${ABS_FIL} libprotobuf + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM) + endforeach() - set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) - set(${SRCS} ${${SRCS}} PARENT_SCOPE) - set(${HDRS} ${${HDRS}} PARENT_SCOPE) + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} + ${${SRCS}} + PARENT_SCOPE) + set(${HDRS} + ${${HDRS}} + PARENT_SCOPE) endfunction() if(MGE_USE_SYSTEM_LIB) - find_package(Protobuf) - if(Protobuf_FOUND) - add_library(libprotobuf INTERFACE) - target_link_libraries(libprotobuf INTERFACE ${Protobuf_LIBRARIES}) - target_include_directories(libprotobuf INTERFACE ${Protobuf_INCLUDE_DIRS}) - get_filename_component(Protobuf_ROOT ${Protobuf_INCLUDE_DIR} DIRECTORY) - set(PROTOBUF_ROOT ${Protobuf_ROOT}) - set(PROTOBUF_PROTOC_EXECUTABLE ${Protobuf_PROTOC_EXECUTABLE}) - set(PROTOBUF_INCLUDE_DIRS ${Protobuf_INCLUDE_DIRS}) - return() - endif() + find_package(Protobuf) + if(Protobuf_FOUND) + add_library(libprotobuf INTERFACE) + target_link_libraries(libprotobuf INTERFACE ${Protobuf_LIBRARIES}) + target_include_directories(libprotobuf INTERFACE ${Protobuf_INCLUDE_DIRS}) + get_filename_component(Protobuf_ROOT ${Protobuf_INCLUDE_DIR} DIRECTORY) + set(PROTOBUF_ROOT ${Protobuf_ROOT}) + set(PROTOBUF_PROTOC_EXECUTABLE ${Protobuf_PROTOC_EXECUTABLE}) + set(PROTOBUF_INCLUDE_DIRS ${Protobuf_INCLUDE_DIRS}) + return() + endif() endif() - include(ExternalProject) include(GNUInstallDirs) -set(PROTOBUF_DIR "${PROJECT_SOURCE_DIR}/third_party/protobuf" CACHE STRING "protobuf directory") +set(PROTOBUF_DIR + "${PROJECT_SOURCE_DIR}/third_party/protobuf" + CACHE STRING "protobuf directory") set(PROTOBUF_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/protobuf) if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(PROTOBUF_LIB ${PROTOBUF_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libprotobufd.a) + set(PROTOBUF_LIB ${PROTOBUF_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libprotobufd.a) else() - set(PROTOBUF_LIB ${PROTOBUF_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libprotobuf.a) + set(PROTOBUF_LIB ${PROTOBUF_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libprotobuf.a) endif() set(PROTOBUF_PROTOC_EXECUTABLE ${PROTOBUF_BUILD_DIR}/bin/protoc) -ExternalProject_add( - protobuf - SOURCE_DIR ${PROTOBUF_DIR}/cmake - PREFIX ${PROTOBUF_BUILD_DIR} - CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_BUILD_DIR} -Dprotobuf_BUILD_EXAMPLES=OFF -Dprotobuf_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON - BUILD_BYPRODUCTS ${PROTOBUF_LIB} ${PROTOBUF_PROTOC_EXECUTABLE} -) +ExternalProject_Add( + protobuf + SOURCE_DIR ${PROTOBUF_DIR}/cmake + PREFIX ${PROTOBUF_BUILD_DIR} + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_BUILD_DIR} + -Dprotobuf_BUILD_EXAMPLES=OFF + -Dprotobuf_BUILD_TESTS=OFF + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + BUILD_BYPRODUCTS ${PROTOBUF_LIB} ${PROTOBUF_PROTOC_EXECUTABLE}) set(PROTOBUF_INC ${PROTOBUF_BUILD_DIR}/include) file(MAKE_DIRECTORY ${PROTOBUF_INC}) @@ -72,19 +85,14 @@ file(MAKE_DIRECTORY ${PROTOBUF_INC}) add_library(libprotobuf STATIC IMPORTED GLOBAL) 
add_dependencies(libprotobuf protobuf) set_target_properties( - libprotobuf PROPERTIES - IMPORTED_LOCATION ${PROTOBUF_LIB} - INTERFACE_INCLUDE_DIRECTORIES ${PROTOBUF_BUILD_DIR}/include -) + libprotobuf PROPERTIES IMPORTED_LOCATION ${PROTOBUF_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${PROTOBUF_BUILD_DIR}/include) add_executable(protoc IMPORTED GLOBAL) add_dependencies(protoc protobuf) -set_target_properties( - protoc PROPERTIES - IMPORTED_LOCATION ${PROTOBUF_BUILD_DIR}/bin/protoc -) +set_target_properties(protoc PROPERTIES IMPORTED_LOCATION + ${PROTOBUF_BUILD_DIR}/bin/protoc) set(PROTOBUF_ROOT ${PROTOBUF_BUILD_DIR}) set(PROTOBUF_PROTOC_EXECUTABLE protoc) set(PROTOBUF_INCLUDE_DIRS ${PROTOBUF_BUILD_DIR}/include) - diff --git a/cmake/rocm.cmake b/cmake/rocm.cmake index 3bd5897a..b48f1d3c 100644 --- a/cmake/rocm.cmake +++ b/cmake/rocm.cmake @@ -1,28 +1,34 @@ if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH + "/opt/rocm/hip" + CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH + $ENV{HIP_PATH} + CACHE PATH "Path to which HIP has been installed") + endif() endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) find_package(HIP QUIET) -if (HIP_FOUND) - message(STATUS "Found HIP: " ${HIP_VERSION}) +if(HIP_FOUND) + message(STATUS "Found HIP: " ${HIP_VERSION}) else() - message(FATAL_ERROR "Could not find HIP. Ensure that HIP is either installed in /opt/rocm/hip or the variable HIP_PATH is set to point to the right location.") + message( + FATAL_ERROR + "Could not find HIP. Ensure that HIP is either installed in /opt/rocm/hip or the variable HIP_PATH is set to point to the right location." + ) endif() -if (${HIP_VERSION} VERSION_LESS 3.0) - message(FATAL_ERROR "ROCM version needed 3. Please update ROCM.") +if(${HIP_VERSION} VERSION_LESS 3.0) + message(FATAL_ERROR "ROCM version needed 3. 
Please update ROCM.") endif() macro(hipconfig_get_option variable option) - if(NOT DEFINED ${variable}) - execute_process( - COMMAND ${HIP_HIPCONFIG_EXECUTABLE} ${option} - OUTPUT_VARIABLE ${variable}) - endif() + if(NOT DEFINED ${variable}) + execute_process(COMMAND ${HIP_HIPCONFIG_EXECUTABLE} ${option} + OUTPUT_VARIABLE ${variable}) + endif() endmacro() hipconfig_get_option(HIP_COMPILER "--compiler") @@ -31,30 +37,33 @@ hipconfig_get_option(HIP_CPP_CONFIG "--cpp_config") separate_arguments(HIP_CPP_CONFIG) foreach(hip_config_item ${HIP_CPP_CONFIG}) - foreach(macro_name "__HIP_PLATFORM_HCC__" "__HIP_ROCclr__") - if(${hip_config_item} STREQUAL "-D${macro_name}=") - set(HIP_CPP_DEFINE "${HIP_CPP_DEFINE}#define ${macro_name}\n") - set(HIP_CPP_UNDEFINE "${HIP_CPP_UNDEFINE}\ + foreach(macro_name "__HIP_PLATFORM_HCC__" "__HIP_ROCclr__") + if(${hip_config_item} STREQUAL "-D${macro_name}=") + set(HIP_CPP_DEFINE "${HIP_CPP_DEFINE}#define ${macro_name}\n") + set(HIP_CPP_UNDEFINE + "${HIP_CPP_UNDEFINE}\ #ifdef ${macro_name}\n#undef ${macro_name}\n\ #else\n#error\n\ #endif\n") - elseif(${hip_config_item} STREQUAL "-D${macro_name}") - set(HIP_CPP_DEFINE "${HIP_CPP_DEFINE}#define ${macro_name} 1\n") - set(HIP_CPP_UNDEFINE "${HIP_CPP_UNDEFINE}\ + elseif(${hip_config_item} STREQUAL "-D${macro_name}") + set(HIP_CPP_DEFINE "${HIP_CPP_DEFINE}#define ${macro_name} 1\n") + set(HIP_CPP_UNDEFINE + "${HIP_CPP_UNDEFINE}\ #ifdef ${macro_name}\n#undef ${macro_name}\n\ #else\n#error\n\ #endif\n") - endif() - endforeach() + endif() + endforeach() endforeach() message(STATUS "Using HIP compiler ${HIP_COMPILER}") if(${HIP_COMPILER} STREQUAL "hcc") - set(MGE_ROCM_LIBS hip_hcc) - message(WARNING "hcc is not well supported, please modify link.txt to link with hipcc") -elseif (${HIP_COMPILER} STREQUAL "clang") - set(MGE_ROCM_LIBS amdhip64) + set(MGE_ROCM_LIBS hip_hcc) + message( + WARNING "hcc is not well supported, please modify link.txt to link with hipcc") +elseif(${HIP_COMPILER} STREQUAL "clang") + set(MGE_ROCM_LIBS amdhip64) endif() list(APPEND MGE_ROCM_LIBS amdocl64 MIOpen rocblas rocrand) @@ -63,26 +72,28 @@ set(HIP_INCLUDE_DIR ${HIP_ROOT_DIR}/../include) set(HIP_LIBRARY_DIR ${HIP_ROOT_DIR}/../lib) function(find_rocm_library name dirname include library) - find_path(${name}_LIBRARY_DIR - NAMES ${library} - HINTS "${${name}_ROOT_DIR}" "${HIP_ROOT_DIR}/../${dirname}" - PATH_SUFFIXES lib lib/x86_64 - DOC "Path to ${name} library directory") + find_path( + ${name}_LIBRARY_DIR + NAMES ${library} + HINTS "${${name}_ROOT_DIR}" "${HIP_ROOT_DIR}/../${dirname}" + PATH_SUFFIXES lib lib/x86_64 + DOC "Path to ${name} library directory") - if(${${name}_LIBRARY_DIR} MATCHES "NOTFOUND$") - message(FATAL_ERROR "Can not find ${name} library") - endif() + if(${${name}_LIBRARY_DIR} MATCHES "NOTFOUND$") + message(FATAL_ERROR "Can not find ${name} library") + endif() - find_path(${name}_INCLUDE_DIR - NAMES ${include} - HINTS "${${name}_ROOT_DIR}" "${HIP_ROOT_DIR}/../${dirname}" - PATH_SUFFIXES include - DOC "Path to ${name} include directory") + find_path( + ${name}_INCLUDE_DIR + NAMES ${include} + HINTS "${${name}_ROOT_DIR}" "${HIP_ROOT_DIR}/../${dirname}" + PATH_SUFFIXES include + DOC "Path to ${name} include directory") - if(${name}_INCLUDE_DIR MATCHES "NOTFOUND$") - message(FATAL_ERROR "Can not find ${name} include") - endif() - message(DEBUG "Found lib ${${name}_LIBRARY_DIR}, include ${${name}_INCLUDE_DIR}") + if(${name}_INCLUDE_DIR MATCHES "NOTFOUND$") + message(FATAL_ERROR "Can not find ${name} include") + endif() + message(DEBUG 
"Found lib ${${name}_LIBRARY_DIR}, include ${${name}_INCLUDE_DIR}") endfunction() find_rocm_library(MIOPEN miopen miopen libMIOpen.so) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 53f0f433..9bcba8d6 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -1,166 +1,189 @@ -if("${TRT_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{TRT_ROOT_DIR}" STREQUAL "") - set(TRT_ROOT_DIR $ENV{TRT_ROOT_DIR}) +if("${TRT_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{TRT_ROOT_DIR}" STREQUAL "") + set(TRT_ROOT_DIR $ENV{TRT_ROOT_DIR}) endif() if(MGE_CUDA_USE_STATIC) - find_library(TRT_LIBRARY - NAMES libnvinfer_static.a nvinfer.lib - PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "TRT library." ) - find_library(TRT_PLUGIN_LIBRARY - NAMES libnvinfer_plugin_static.a nvinfer_plugin.lib - PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "TRT plugin library." ) + find_library( + TRT_LIBRARY + NAMES libnvinfer_static.a nvinfer.lib + PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "TRT library.") + find_library( + TRT_PLUGIN_LIBRARY + NAMES libnvinfer_plugin_static.a nvinfer_plugin.lib + PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "TRT plugin library.") else() - find_library(TRT_LIBRARY - NAMES libnvinfer.so libnvinfer.dylib nvinfer.dll - PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "TRT library." ) - find_library(TRT_PLUGIN_LIBRARY - NAMES libnvinfer_plugin.so libnvinfer_plugin.dylib nvinfer_plugin.dll - PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} - HINTS ${ALTER_LIBRARY_PATHS} - PATH_SUFFIXES lib lib64 - DOC "TRT plugin library." ) + find_library( + TRT_LIBRARY + NAMES libnvinfer.so libnvinfer.dylib nvinfer.dll + PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "TRT library.") + find_library( + TRT_PLUGIN_LIBRARY + NAMES libnvinfer_plugin.so libnvinfer_plugin.dylib nvinfer_plugin.dll + PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} + HINTS ${ALTER_LIBRARY_PATHS} + PATH_SUFFIXES lib lib64 + DOC "TRT plugin library.") endif() if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find TensorRT Library, please refer to scripts/cmake-build/BUILD_README.md to init TRT env") + message( + FATAL_ERROR + "Can not find TensorRT Library, please refer to scripts/cmake-build/BUILD_README.md to init TRT env" + ) endif() if(TRT_PLUGIN_LIBRARY STREQUAL "TRT_PLUGIN_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find TensorRT Plugin Library, please refer to scripts/cmake-build/BUILD_README.md to init TRT env") + message( + FATAL_ERROR + "Can not find TensorRT Plugin Library, please refer to scripts/cmake-build/BUILD_README.md to init TRT env" + ) endif() get_filename_component(__found_trt_root ${TRT_LIBRARY}/../.. REALPATH) -find_path(TRT_INCLUDE_DIR - NAMES NvInfer.h - HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root} - PATH_SUFFIXES include - DOC "Path to TRT include directory." 
) -find_path(TRT_PLUGIN_INCLUDE_DIR - NAMES NvInferPlugin.h - HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root} - PATH_SUFFIXES include - DOC "Path to TRT plugin include directory." ) +find_path( + TRT_INCLUDE_DIR + NAMES NvInfer.h + HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root} + PATH_SUFFIXES include + DOC "Path to TRT include directory.") +find_path( + TRT_PLUGIN_INCLUDE_DIR + NAMES NvInferPlugin.h + HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root} + PATH_SUFFIXES include + DOC "Path to TRT plugin include directory.") if(TRT_INCLUDE_DIR STREQUAL "TRT_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find TensorRT INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init TRT env") + message( + FATAL_ERROR + "Can not find TensorRT INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init TRT env" + ) endif() if(TRT_PLUGIN_INCLUDE_DIR STREQUAL "TRT_PLUGIN_INCLUDE_DIR-NOTFOUND") - message(FATAL_ERROR "Can not find TensorRT Plugin INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init TRT env") + message( + FATAL_ERROR + "Can not find TensorRT Plugin INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init TRT env" + ) endif() -file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") -file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") -file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") +file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR + REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") +file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR + REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") +file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH + REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") -if (TensorRT_MAJOR STREQUAL "") - file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") - file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") - file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") +if(TensorRT_MAJOR STREQUAL "") + file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MAJOR + REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") + file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MINOR + REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") + file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_PATCH + REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") endif() -string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") -string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") -string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") -set(TRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") +string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" + TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") +string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" + TensorRT_VERSION_MINOR "${TensorRT_MINOR}") +string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" + TensorRT_VERSION_PATCH "${TensorRT_PATCH}") +set(TRT_VERSION_STRING + "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") 
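# --- illustrative sketch (editor addition, not part of the patch): the hunk above only
# --- reflows the existing NV_TENSORRT_* parsing, so the extraction logic is unchanged.
# --- A minimal standalone version of that technique follows; run it with `cmake -P`.
# --- The sample header contents and scratch file path are hypothetical stand-ins for a
# --- real NvInfer.h / NvInferVersion.h.
set(_sample_header "${CMAKE_CURRENT_BINARY_DIR}/NvInferVersion_sample.h")
file(WRITE "${_sample_header}"
     "#define NV_TENSORRT_MAJOR 7\n#define NV_TENSORRT_MINOR 2\n#define NV_TENSORRT_PATCH 1\n")
# pick out the #define lines, then strip them down to the bare numbers
file(STRINGS "${_sample_header}" _major REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$")
file(STRINGS "${_sample_header}" _minor REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$")
file(STRINGS "${_sample_header}" _patch REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$")
string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" _major "${_major}")
string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" _minor "${_minor}")
string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" _patch "${_patch}")
message(STATUS "parsed sample TRT version: ${_major}.${_minor}.${_patch}") # 7.2.1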
if(MGE_CUDA_USE_STATIC) - add_library(libnvinfer STATIC IMPORTED) - add_library(libnvinfer_plugin STATIC IMPORTED) + add_library(libnvinfer STATIC IMPORTED) + add_library(libnvinfer_plugin STATIC IMPORTED) else() - add_library(libnvinfer SHARED IMPORTED) - add_library(libnvinfer_plugin SHARED IMPORTED) + add_library(libnvinfer SHARED IMPORTED) + add_library(libnvinfer_plugin SHARED IMPORTED) endif() -set_target_properties(libnvinfer PROPERTIES - IMPORTED_LOCATION ${TRT_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${TRT_INCLUDE_DIR} -) -set_target_properties(libnvinfer_plugin PROPERTIES - IMPORTED_LOCATION ${TRT_PLUGIN_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${TRT_PLUGIN_INCLUDE_DIR} -) +set_target_properties( + libnvinfer PROPERTIES IMPORTED_LOCATION ${TRT_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + ${TRT_INCLUDE_DIR}) +set_target_properties( + libnvinfer_plugin PROPERTIES IMPORTED_LOCATION ${TRT_PLUGIN_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${TRT_PLUGIN_INCLUDE_DIR}) -message(STATUS "Found TensorRT: ${__found_trt_root} (found version: ${TRT_VERSION_STRING})") +message( + STATUS "Found TensorRT: ${__found_trt_root} (found version: ${TRT_VERSION_STRING})") if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) - if(MGE_CUDA_USE_STATIC) - find_library(LIBMYELIN_COMPILER - NAMES libmyelin_compiler_static.a myelin_compiler_static.lib - PATHS ${__found_trt_root}/lib - ) - if(LIBMYELIN_COMPILER STREQUAL "LIBMYELIN_COMPILER-NOTFOUND") - message(FATAL_ERROR "Can not find LIBMYELIN_COMPILER Library") - else() - message(STATUS "Found TensorRT myelin_compiler: ${LIBMYELIN_COMPILER}") - endif() - add_library(libmyelin_compiler STATIC IMPORTED) - set_target_properties(libmyelin_compiler PROPERTIES - IMPORTED_LOCATION ${LIBMYELIN_COMPILER} - ) + if(MGE_CUDA_USE_STATIC) + find_library( + LIBMYELIN_COMPILER + NAMES libmyelin_compiler_static.a myelin_compiler_static.lib + PATHS ${__found_trt_root}/lib) + if(LIBMYELIN_COMPILER STREQUAL "LIBMYELIN_COMPILER-NOTFOUND") + message(FATAL_ERROR "Can not find LIBMYELIN_COMPILER Library") + else() + message(STATUS "Found TensorRT myelin_compiler: ${LIBMYELIN_COMPILER}") + endif() + add_library(libmyelin_compiler STATIC IMPORTED) + set_target_properties(libmyelin_compiler PROPERTIES IMPORTED_LOCATION + ${LIBMYELIN_COMPILER}) - find_library(LIBMYELIN_EXECUTOR - NAMES libmyelin_executor_static.a myelin_executor_static.lib - PATHS ${__found_trt_root}/lib - ) - if(LIBMYELIN_EXECUTOR STREQUAL "LIBMYELIN_EXECUTOR-NOTFOUND") - message(FATAL_ERROR "Can not find LIBMYELIN_EXECUTOR Library") - else() - message(STATUS "Found TensorRT libmyelin_executor: ${LIBMYELIN_EXECUTOR}") - endif() - add_library(libmyelin_executor STATIC IMPORTED) - set_target_properties(libmyelin_executor PROPERTIES - IMPORTED_LOCATION ${LIBMYELIN_EXECUTOR} - ) + find_library( + LIBMYELIN_EXECUTOR + NAMES libmyelin_executor_static.a myelin_executor_static.lib + PATHS ${__found_trt_root}/lib) + if(LIBMYELIN_EXECUTOR STREQUAL "LIBMYELIN_EXECUTOR-NOTFOUND") + message(FATAL_ERROR "Can not find LIBMYELIN_EXECUTOR Library") + else() + message(STATUS "Found TensorRT libmyelin_executor: ${LIBMYELIN_EXECUTOR}") + endif() + add_library(libmyelin_executor STATIC IMPORTED) + set_target_properties(libmyelin_executor PROPERTIES IMPORTED_LOCATION + ${LIBMYELIN_EXECUTOR}) - find_library(LIBMYELIN_PATTERN_RUNTIME - NAMES libmyelin_pattern_runtime_static.a myelin_pattern_runtime_static.lib - PATHS ${__found_trt_root}/lib - ) - if(LIBMYELIN_PATTERN_RUNTIME STREQUAL "LIBMYELIN_PATTERN_RUNTIME-NOTFOUND") - message(FATAL_ERROR "Can not find 
LIBMYELIN_PATTERN_RUNTIME Library") - else() - message(STATUS "Found TensorRT libmyelin_pattern_runtime: ${LIBMYELIN_PATTERN_RUNTIME}") - endif() - add_library(libmyelin_pattern_runtime STATIC IMPORTED) - set_target_properties(libmyelin_pattern_runtime PROPERTIES - IMPORTED_LOCATION ${LIBMYELIN_PATTERN_RUNTIME} - ) + find_library( + LIBMYELIN_PATTERN_RUNTIME + NAMES libmyelin_pattern_runtime_static.a myelin_pattern_runtime_static.lib + PATHS ${__found_trt_root}/lib) + if(LIBMYELIN_PATTERN_RUNTIME STREQUAL "LIBMYELIN_PATTERN_RUNTIME-NOTFOUND") + message(FATAL_ERROR "Can not find LIBMYELIN_PATTERN_RUNTIME Library") + else() + message( + STATUS "Found TensorRT libmyelin_pattern_runtime: ${LIBMYELIN_PATTERN_RUNTIME}") + endif() + add_library(libmyelin_pattern_runtime STATIC IMPORTED) + set_target_properties(libmyelin_pattern_runtime + PROPERTIES IMPORTED_LOCATION ${LIBMYELIN_PATTERN_RUNTIME}) - find_library(LIBMYELIN_PATTERN_LIBRARY - NAMES libmyelin_pattern_library_static.a myelin_pattern_library_static.lib - PATHS ${__found_trt_root}/lib - ) - if(LIBMYELIN_PATTERN_LIBRARY STREQUAL "LIBMYELIN_PATTERN_LIBRARY-NOTFOUND") - message(FATAL_ERROR "Can not find LIBMYELIN_PATTERN_LIBRARY Library") - else() - message(STATUS "Found TensorRT libmyelin_pattern_library: ${LIBMYELIN_PATTERN_LIBRARY}") - endif() - add_library(libmyelin_pattern_library STATIC IMPORTED) - set_target_properties(libmyelin_pattern_library PROPERTIES - IMPORTED_LOCATION ${LIBMYELIN_PATTERN_LIBRARY} - ) + find_library( + LIBMYELIN_PATTERN_LIBRARY + NAMES libmyelin_pattern_library_static.a myelin_pattern_library_static.lib + PATHS ${__found_trt_root}/lib) + if(LIBMYELIN_PATTERN_LIBRARY STREQUAL "LIBMYELIN_PATTERN_LIBRARY-NOTFOUND") + message(FATAL_ERROR "Can not find LIBMYELIN_PATTERN_LIBRARY Library") else() - find_library(LIBMYELIN_SHARED - NAMES libmyelin.so myelin.dll - PATHS ${__found_trt_root}/lib - ) + message( + STATUS "Found TensorRT libmyelin_pattern_library: ${LIBMYELIN_PATTERN_LIBRARY}") + endif() + add_library(libmyelin_pattern_library STATIC IMPORTED) + set_target_properties(libmyelin_pattern_library + PROPERTIES IMPORTED_LOCATION ${LIBMYELIN_PATTERN_LIBRARY}) + else() + find_library( + LIBMYELIN_SHARED + NAMES libmyelin.so myelin.dll + PATHS ${__found_trt_root}/lib) - if(LIBMYELIN_SHARED STREQUAL "LIBMYELIN_SHARED-NOTFOUND") - message(FATAL_ERROR "Can not find LIBMYELIN_SHARED Library") - else() - message(STATUS "Found TensorRT libmyelin_shared: ${LIBMYELIN_SHARED}") - endif() - add_library(libmyelin SHARED IMPORTED) - set_target_properties(libmyelin PROPERTIES - IMPORTED_LOCATION ${LIBMYELIN_SHARED} - ) + if(LIBMYELIN_SHARED STREQUAL "LIBMYELIN_SHARED-NOTFOUND") + message(FATAL_ERROR "Can not find LIBMYELIN_SHARED Library") + else() + message(STATUS "Found TensorRT libmyelin_shared: ${LIBMYELIN_SHARED}") endif() + add_library(libmyelin SHARED IMPORTED) + set_target_properties(libmyelin PROPERTIES IMPORTED_LOCATION ${LIBMYELIN_SHARED}) + endif() endif() diff --git a/cmake/zmq.cmake b/cmake/zmq.cmake index d4677553..71f40cd2 100644 --- a/cmake/zmq.cmake +++ b/cmake/zmq.cmake @@ -1,17 +1,26 @@ include(ExternalProject) include(GNUInstallDirs) -set(ZMQ_DIR ${PROJECT_SOURCE_DIR}/third_party/libzmq CACHE STRING "ZMQ directory") +set(ZMQ_DIR + ${PROJECT_SOURCE_DIR}/third_party/libzmq + CACHE STRING "ZMQ directory") set(ZMQ_BUILD_DIR ${PROJECT_BINARY_DIR}/third_party/libzmq) set(ZMQ_LIB ${ZMQ_BUILD_DIR}/${CMAKE_INSTALL_LIBDIR}/libzmq.a) -ExternalProject_add( - zmq - SOURCE_DIR ${ZMQ_DIR} - PREFIX ${ZMQ_BUILD_DIR} - 
CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} -DCMAKE_INSTALL_PREFIX=${ZMQ_BUILD_DIR} -DWITH_PERF_TOOL=OFF -DZMQ_BUILD_TESTS=OFF -DENABLE_CPACK=OFF -DENABLE_CURVE=OFF - BUILD_BYPRODUCTS ${ZMQ_LIB} -) +ExternalProject_Add( + zmq + SOURCE_DIR ${ZMQ_DIR} + PREFIX ${ZMQ_BUILD_DIR} + CMAKE_ARGS -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} + -DCMAKE_INSTALL_PREFIX=${ZMQ_BUILD_DIR} + -DWITH_PERF_TOOL=OFF + -DZMQ_BUILD_TESTS=OFF + -DENABLE_CPACK=OFF + -DENABLE_CURVE=OFF + BUILD_BYPRODUCTS ${ZMQ_LIB}) set(ZMQ_INC ${ZMQ_BUILD_DIR}/include) include_directories(${ZMQ_INC}) @@ -19,8 +28,5 @@ file(MAKE_DIRECTORY ${ZMQ_INC}) add_library(libzmq STATIC IMPORTED GLOBAL) add_dependencies(libzmq zmq) -set_target_properties( - libzmq PROPERTIES - IMPORTED_LOCATION ${ZMQ_LIB} - INTERFACE_INCLUDE_DIRECTORIES ${ZMQ_INC} -) +set_target_properties(libzmq PROPERTIES IMPORTED_LOCATION ${ZMQ_LIB} + INTERFACE_INCLUDE_DIRECTORIES ${ZMQ_INC}) diff --git a/dnn/CMakeLists.txt b/dnn/CMakeLists.txt index 6270da98..dfa4a97a 100644 --- a/dnn/CMakeLists.txt +++ b/dnn/CMakeLists.txt @@ -4,66 +4,61 @@ set(OPR_PARAM_DEFS_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/scripts/gen_param_defs.py) set(OPR_PARAM_DEFS_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/include/) file(MAKE_DIRECTORY ${OPR_PARAM_DEFS_OUT_DIR}/megdnn) add_custom_command( - OUTPUT - ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h - ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h - COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} - ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h - COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} ${OPR_PARAM_DEFS_SRCS} - tmp_unuse.log --write-cppjson ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h - DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} - VERBATIM -) - -list(APPEND OPR_PARAM_DEFS_OUTS - ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + OUTPUT ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h + COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} + ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + COMMAND + ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} + ${OPR_PARAM_DEFS_SRCS} tmp_unuse.log --write-cppjson ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h -) + DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} + VERBATIM) + +list(APPEND OPR_PARAM_DEFS_OUTS ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_defs.h + ${OPR_PARAM_DEFS_OUT_DIR}/megdnn/opr_param_json.h) list(APPEND OPR_PARAM_DEFS_INC ${OPR_PARAM_DEFS_OUT_DIR}) set(OPR_PARAM_DEFS_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) file(MAKE_DIRECTORY ${OPR_PARAM_DEFS_OUT_DIR}/src/common) add_custom_command( - OUTPUT - ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh - COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} - --enumv ${OPR_PARAM_DEFS_SRCS} - ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh - DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} - VERBATIM -) + OUTPUT ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh + COMMAND + ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${OPR_PARAM_DEFS_SCRIPT} --enumv + 
${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh + DEPENDS ${OPR_PARAM_DEFS_SRCS} ${OPR_PARAM_DEFS_SCRIPT} + VERBATIM) list(APPEND OPR_PARAM_DEFS_OUTS - ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh -) + ${OPR_PARAM_DEFS_OUT_DIR}/src/common/opr_param_defs_enumv.cuh) list(APPEND OPR_PARAM_DEFS_INC ${OPR_PARAM_DEFS_OUT_DIR}) -install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/megdnn DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h") +install( + DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/megdnn + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING + PATTERN "*.h") add_custom_target(_opr_param_defs DEPENDS ${OPR_PARAM_DEFS_OUTS}) add_library(opr_param_defs INTERFACE) target_include_directories(opr_param_defs - INTERFACE - $ -) -foreach (INCPATH IN LISTS OPR_PARAM_DEFS_INC) - target_include_directories(opr_param_defs - INTERFACE $ - ) + INTERFACE $) +foreach(INCPATH IN LISTS OPR_PARAM_DEFS_INC) + target_include_directories(opr_param_defs INTERFACE $) endforeach() add_dependencies(opr_param_defs _opr_param_defs) install(TARGETS opr_param_defs EXPORT ${MGE_EXPORT_TARGETS}) if(MGE_WITH_CUDA) - add_library(cutlass INTERFACE) - target_include_directories(cutlass - INTERFACE - $) + add_library(cutlass INTERFACE) + target_include_directories( + cutlass + INTERFACE $) endif() if(MGE_WITH_TEST) - add_subdirectory(test) + add_subdirectory(test) endif() add_subdirectory(src) diff --git a/dnn/atlas-stub/CMakeLists.txt b/dnn/atlas-stub/CMakeLists.txt index f6bffb1a..7be656a4 100644 --- a/dnn/atlas-stub/CMakeLists.txt +++ b/dnn/atlas-stub/CMakeLists.txt @@ -1,6 +1,8 @@ add_library(atlas-stub STATIC src/libatlas-wrap.cpp) -target_include_directories(atlas-stub PUBLIC $) +target_include_directories( + atlas-stub PUBLIC $) install(TARGETS atlas-stub EXPORT ${MGE_EXPORT_TARGETS}) add_library(acl-cblas STATIC src/libacl_cblas-wrap.cpp) -target_include_directories(acl-cblas PUBLIC $) +target_include_directories( + acl-cblas PUBLIC $) diff --git a/dnn/cuda-stub/CMakeLists.txt b/dnn/cuda-stub/CMakeLists.txt index e89dddbd..8d71e835 100644 --- a/dnn/cuda-stub/CMakeLists.txt +++ b/dnn/cuda-stub/CMakeLists.txt @@ -1,26 +1,27 @@ -file (GLOB_RECURSE CUDA_STUB src/libcuda.cpp) -file (GLOB_RECURSE NVRTC_STUB src/libnvrtc.cpp) +file(GLOB_RECURSE CUDA_STUB src/libcuda.cpp) +file(GLOB_RECURSE NVRTC_STUB src/libnvrtc.cpp) if(MGE_WITH_CUDA_STUB) - list(APPEND STUB_SRC ${CUDA_STUB}) + list(APPEND STUB_SRC ${CUDA_STUB}) endif() if(MGE_WITH_NVRTC_STUB) - list(APPEND STUB_SRC ${NVRTC_STUB}) + list(APPEND STUB_SRC ${NVRTC_STUB}) endif() if(MSVC OR WIN32) - add_library (cuda-stub STATIC ${STUB_SRC}) + add_library(cuda-stub STATIC ${STUB_SRC}) else() - add_library (cuda-stub SHARED ${STUB_SRC}) + add_library(cuda-stub SHARED ${STUB_SRC}) endif() set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda_stub) target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) -if (MSVC OR WIN32) - target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) +if(MSVC OR WIN32) + target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) else() - target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) + target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) endif() -target_include_directories(cuda-stub PRIVATE $) -install (TARGETS cuda-stub EXPORT ${MGE_EXPORT_TARGETS}) +target_include_directories(cuda-stub + PRIVATE $) +install(TARGETS cuda-stub EXPORT ${MGE_EXPORT_TARGETS}) diff --git a/dnn/include/megdnn/common.h 
b/dnn/include/megdnn/common.h index f0073c59..09cb70a0 100644 --- a/dnn/include/megdnn/common.h +++ b/dnn/include/megdnn/common.h @@ -12,6 +12,7 @@ #pragma once #include "megbrain_build_config.h" +#include "megdnn/oprs/base.h" #if MGB_ENABLE_GETENV #define MGB_GETENV ::std::getenv @@ -36,6 +37,11 @@ bool has_available_algo(Opr* opr, Args&&... args) { return !all_algos.empty(); } +template +bool has_no_naive_heuristic_algo(Opr* opr, Args&&... args) { + auto&& algo = opr->get_algorithm_info_heuristic(std::forward(args)...); + return !static_cast(algo.attribute & detail::Algorithm::Attribute::NAIVE); +} } // namespace megdnn // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h index e5ea399c..7188d941 100644 --- a/dnn/include/megdnn/oprs/nn.h +++ b/dnn/include/megdnn/oprs/nn.h @@ -1936,6 +1936,119 @@ protected: const TensorLayout& grad_s, size_t workspace_in_bytes); }; +class LayerNormBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(LayerNormBase, OperatorBase); + DEF_OPR_PARAM(LayerNorm); + +protected: + void deduce_layout_fwd( + const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& bias, TensorLayout& dst, TensorLayout& mean, + TensorLayout& rstd); + void check_layout_fwd( + const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& bias, const TensorLayout& dst, const TensorLayout& mean, + const TensorLayout& rstd); +}; + +class LayerNormForward : public LayerNormBase { + DEF_OPR_IMPL(LayerNormForward, LayerNormBase, 3, 3); + +public: + virtual void exec( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + _megdnn_workspace workspace) = 0; + void deduce_layout( + const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& bias, TensorLayout& dst, TensorLayout& mean, + TensorLayout& rstd); + virtual size_t get_workspace_in_bytes( + const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& bias, const TensorLayout& dst, const TensorLayout& mean, + const TensorLayout& rstd) = 0; + +protected: + void check_exec( + const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& bias, const TensorLayout& dst, const TensorLayout& mean, + const TensorLayout& rstd, size_t workspace_in_bytes); +}; +using LayerNorm = LayerNormForward; + +class LayerNormBackward : public LayerNormBase { + DEF_OPR_IMPL(LayerNormBackward, LayerNormBase, 5, 3); + +public: + virtual void exec( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, + _megdnn_workspace workspace) = 0; + void deduce_layout( + const TensorLayout& diff, const TensorLayout& data, + const TensorLayout& weight, const TensorLayout& mean, + const TensorLayout& rstd, TensorLayout& ddata, TensorLayout& dweight, + TensorLayout& dbias); + virtual size_t get_workspace_in_bytes( + const TensorLayout& diff, const TensorLayout& data, + const TensorLayout& weight, const TensorLayout& mean, + const TensorLayout& rstd, const TensorLayout& ddata, + const TensorLayout& dweight, const TensorLayout& dbias) = 0; + +protected: + void check_exec( + const TensorLayout& diff, const TensorLayout& data, + const TensorLayout& weight, const TensorLayout& mean, + const TensorLayout& rstd, const TensorLayout& ddata, + const TensorLayout& dweight, const 
TensorLayout& dbias, + size_t workspace_in_bytes); +}; + +class DropoutBase : public OperatorBase { + DEF_OPR_IMPL_CTOR(DropoutBase, OperatorBase); + DEF_OPR_PARAM(Dropout); +}; + +class DropoutForward : public DropoutBase { + DEF_OPR_IMPL(DropoutForward, DropoutBase, 1, 2); + +public: + void deduce_layout(const TensorLayout& inp, TensorLayout& oup, TensorLayout& mask); + virtual void exec( + _megdnn_tensor_in inp, _megdnn_tensor_out oup, _megdnn_tensor_out mask, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes( + const TensorLayout& inp, const TensorLayout& oup, + const TensorLayout& mask) = 0; + virtual size_t get_mask_size_in_bytes(const TensorLayout& inp) = 0; + +protected: + void check_exec( + const TensorLayout& inp, const TensorLayout& oup, const TensorLayout& mask, + size_t workspace_in_bytes); +}; +using Dropout = DropoutForward; + +class DropoutBackward : public DropoutBase { + DEF_OPR_IMPL(DropoutBackward, DropoutBase, 2, 1); + +public: + void deduce_layout( + const TensorLayout& doup, const TensorLayout& mask, TensorLayout& dinp); + virtual void exec( + _megdnn_tensor_in doup, _megdnn_tensor_in mask, _megdnn_tensor_out dinp, + _megdnn_workspace workspace) = 0; + virtual size_t get_workspace_in_bytes( + const TensorLayout& doup, const TensorLayout& mask, + const TensorLayout& dinp) = 0; + +protected: + void check_exec( + const TensorLayout& doup, const TensorLayout& mask, + const TensorLayout& dinp, size_t workspace_in_bytes); +}; + } // namespace megdnn #include "megdnn/internal/opr_header_epilogue.h" diff --git a/dnn/scripts/opr_param_defs.py b/dnn/scripts/opr_param_defs.py index 76220c99..9da6bbe9 100755 --- a/dnn/scripts/opr_param_defs.py +++ b/dnn/scripts/opr_param_defs.py @@ -1212,3 +1212,15 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'), member_alias=[(i, 'PADDING_{}'.format(i)) for i in PADDING_MODES] ) ) + +(pdef('LayerNorm') + .add_fields('bool', 'affine', 'true') + .add_fields('float32', 'eps', '1e-5f') + .add_fields('uint64', 'normalized_dim', '1') + .add_fields('uint64', 'normalized_size', '1') +) + +(pdef('Dropout') + .add_fields('float32', 'drop_prob', '0') + .add_fields('uint64', 'seed', '0') + ) diff --git a/dnn/src/CMakeLists.txt b/dnn/src/CMakeLists.txt index 4e20c9a5..d0566165 100644 --- a/dnn/src/CMakeLists.txt +++ b/dnn/src/CMakeLists.txt @@ -5,168 +5,190 @@ file(GLOB_RECURSE SOURCES common/*.cpp naive/*.cpp) list(APPEND SOURCES ${PROJECT_BINARY_DIR}/genfiles/megbrain_build_config.h) if(NOT ${MGE_ARCH} STREQUAL "naive") - file(GLOB_RECURSE SOURCES_ fallback/*.cpp) + file(GLOB_RECURSE SOURCES_ fallback/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + if(${MGE_ARCH} STREQUAL "fallback") + message(WARNING "build only with fallback") + elseif(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") + file(GLOB_RECURSE SOURCES_ x86/*.cpp) list(APPEND SOURCES ${SOURCES_}) - if(${MGE_ARCH} STREQUAL "fallback") - message(WARNING "build only with fallback") - elseif(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") - file(GLOB_RECURSE SOURCES_ x86/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - if(NOT MSVC) - file(GLOB_RECURSE SOURCES_ x86/*.S) - set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) - list(APPEND SOURCES ${SOURCES_}) - endif() - elseif(${MGE_ARCH} STREQUAL "armv7") - file(GLOB_RECURSE SOURCES_ armv7/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE SOURCES_ arm_common/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE SOURCES_ armv7/*.S) - 
set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) - list(APPEND SOURCES ${SOURCES_}) - elseif(${MGE_ARCH} STREQUAL "aarch64") - file(GLOB_RECURSE SOURCES_ aarch64/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE SOURCES_ arm_common/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE SOURCES_ aarch64/*.S) - set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) - list(APPEND SOURCES ${SOURCES_}) + if(NOT MSVC) + file(GLOB_RECURSE SOURCES_ x86/*.S) + set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) + list(APPEND SOURCES ${SOURCES_}) endif() + elseif(${MGE_ARCH} STREQUAL "armv7") + file(GLOB_RECURSE SOURCES_ armv7/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ arm_common/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ armv7/*.S) + set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) + list(APPEND SOURCES ${SOURCES_}) + elseif(${MGE_ARCH} STREQUAL "aarch64") + file(GLOB_RECURSE SOURCES_ aarch64/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ arm_common/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ aarch64/*.S) + set_source_files_properties(${SOURCES_} PROPERTIES LANGUAGE C) + list(APPEND SOURCES ${SOURCES_}) + endif() endif() if(MGE_WITH_MIDOUT_PROFILE) - list(APPEND SOURCES ${PROJECT_SOURCE_DIR}/third_party/midout/src/midout.cpp) + list(APPEND SOURCES ${PROJECT_SOURCE_DIR}/third_party/midout/src/midout.cpp) endif() -############################################################################### +# ###################################################################################### # HIP_COMPILE -############################################################################### -macro (HIP_COMPILE _hip_target _hip_objs) - # Separate the sources from the options - HIP_GET_SOURCES_AND_OPTIONS(_sources - _cmake_options - _hipcc_options - _hcc_options - _nvcc_options - ${ARGN}) - HIP_PREPARE_TARGET_COMMANDS(${_hip_target} - OBJ _generated_files _source_files ${_sources} ${_cmake_options} - HIPCC_OPTIONS ${_hipcc_options} - HCC_OPTIONS ${_hcc_options} - NVCC_OPTIONS ${_nvcc_options}) - if(_source_files) - list(REMOVE_ITEM _sources ${_source_files}) - endif() +# ###################################################################################### +macro(HIP_COMPILE _hip_target _hip_objs) + # Separate the sources from the options + hip_get_sources_and_options(_sources _cmake_options _hipcc_options _hcc_options + _nvcc_options ${ARGN}) + hip_prepare_target_commands( + ${_hip_target} + OBJ + _generated_files + _source_files + ${_sources} + ${_cmake_options} + HIPCC_OPTIONS + ${_hipcc_options} + HCC_OPTIONS + ${_hcc_options} + NVCC_OPTIONS + ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() - add_custom_target(${_hip_target}) + add_custom_target(${_hip_target}) - # set return value - set(${_hip_objs} ${_generated_files}) + # set return value + set(${_hip_objs} ${_generated_files}) endmacro() -if (MGE_WITH_ROCM) - file (GLOB_RECURSE SOURCES_ rocm/*.cpp) - list (APPEND SOURCES ${SOURCES_}) - - # FIXME rocm may lost the first hip file, so currently we just create an - # empty file to bypass this error. 
- file(GLOB start.cpp.hip "" ) - list(APPEND HIP_SOURCES start.cpp.hip) - configure_file( - ${PROJECT_SOURCE_DIR}/dnn/include/hcc_detail/hcc_defs_prologue.h.in - ${PROJECT_BINARY_DIR}/dnn/include/hcc_detail/hcc_defs_prologue.h) - - configure_file( - ${PROJECT_SOURCE_DIR}/dnn/include/hcc_detail/hcc_defs_epilogue.h.in - ${PROJECT_BINARY_DIR}/dnn/include/hcc_detail/hcc_defs_epilogue.h) - - file(GLOB_RECURSE HIP_SOURCES_ rocm/*.cpp.hip) - set(HIP_TARGET_NAME megdnn_hip_kernel) - set(_HIPCC_OPTIONS "-fPIC") - set(_HCC_OPTIONS "-fPIC") - set(_NVCC_OPTIONS "-fPIC") - - list(APPEND HIP_SOURCES ${HIP_SOURCES_}) - set_source_files_properties(${HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - HIP_INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/dnn - ${PROJECT_SOURCE_DIR}/dnn/include - ${PROJECT_BINARY_DIR}/dnn - ${PROJECT_BINARY_DIR}/genfiles - ${PROJECT_BINARY_DIR}/dnn/include - ${HIP_INCLUDE_DIR} - ${MIOPEN_INCLUDE_DIR} - ${ROCBLAS_INCLUDE_DIR} - ${ROCRAND_INCLUDE_DIR} - ${AMDOCL_INCLUDE_DIR}) - hip_compile( - ${HIP_TARGET_NAME} HIPOBJS ${HIP_SOURCES} - HIPCC_OPTIONS ${_HIPCC_OPTIONS} - HCC_OPTIONS ${_HCC_OPTIONS} - NVCC_OPTIONS ${_NVCC_OPTIONS}) - list(APPEND SOURCES ${HIPOBJS}) -endif () +if(MGE_WITH_ROCM) + file(GLOB_RECURSE SOURCES_ rocm/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + + # FIXME rocm may lost the first hip file, so currently we just create an empty file to + # bypass this error. + file(GLOB start.cpp.hip "") + list(APPEND HIP_SOURCES start.cpp.hip) + configure_file(${PROJECT_SOURCE_DIR}/dnn/include/hcc_detail/hcc_defs_prologue.h.in + ${PROJECT_BINARY_DIR}/dnn/include/hcc_detail/hcc_defs_prologue.h) + + configure_file(${PROJECT_SOURCE_DIR}/dnn/include/hcc_detail/hcc_defs_epilogue.h.in + ${PROJECT_BINARY_DIR}/dnn/include/hcc_detail/hcc_defs_epilogue.h) + + file(GLOB_RECURSE HIP_SOURCES_ rocm/*.cpp.hip) + set(HIP_TARGET_NAME megdnn_hip_kernel) + set(_HIPCC_OPTIONS "-fPIC") + set(_HCC_OPTIONS "-fPIC") + set(_NVCC_OPTIONS "-fPIC") + + list(APPEND HIP_SOURCES ${HIP_SOURCES_}) + set_source_files_properties(${HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + hip_include_directories( + ${PROJECT_SOURCE_DIR}/dnn + ${PROJECT_SOURCE_DIR}/dnn/include + ${PROJECT_BINARY_DIR}/dnn + ${PROJECT_BINARY_DIR}/genfiles + ${PROJECT_BINARY_DIR}/dnn/include + ${HIP_INCLUDE_DIR} + ${MIOPEN_INCLUDE_DIR} + ${ROCBLAS_INCLUDE_DIR} + ${ROCRAND_INCLUDE_DIR} + ${AMDOCL_INCLUDE_DIR}) + hip_compile( + ${HIP_TARGET_NAME} + HIPOBJS + ${HIP_SOURCES} + HIPCC_OPTIONS + ${_HIPCC_OPTIONS} + HCC_OPTIONS + ${_HCC_OPTIONS} + NVCC_OPTIONS + ${_NVCC_OPTIONS}) + list(APPEND SOURCES ${HIPOBJS}) +endif() if(MGE_WITH_CUDA) - file(GLOB_RECURSE SOURCES_ cuda/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ cuda/*.cpp) + list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE CUSOURCES cuda/*.cu) - - set(CUTLASS_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/cutlass_generator/generator.py) - set(CUTLASS_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuda/cutlass/generated) - set(CUTLASS_SOURCES "") - function(gen_cutlass_kimpl op type gen_files) - set(CURRENT_CUTLASS_STAGE_DIR ${CUTLASS_GEN_DIR}/${op}_${type}.stage) - set(CURRENT_CUTLASS_GEN_DIR ${CUTLASS_GEN_DIR}/${op}_${type}) - - set_directory_properties(PROPERTIES CMAKE_CONFIGURE_DEPENDS ${CUTLASS_GEN_SCRIPT}) - - file(REMOVE_RECURSE ${CURRENT_CUTLASS_STAGE_DIR}) - file(MAKE_DIRECTORY ${CURRENT_CUTLASS_STAGE_DIR}) - file(MAKE_DIRECTORY ${CURRENT_CUTLASS_GEN_DIR}) - execute_process( - COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${CUTLASS_GEN_SCRIPT} --operations 
${op} --type ${type} ${CURRENT_CUTLASS_STAGE_DIR} - RESULT_VARIABLE gen_cutlass_result - OUTPUT_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log - ERROR_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log - ) - if (NOT gen_cutlass_result EQUAL 0) - message(FATAL_ERROR "Error generating library instances. See ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log") - endif() - file(GLOB CUTLASS_GEN_FILES RELATIVE "${CURRENT_CUTLASS_GEN_DIR}/" "${CURRENT_CUTLASS_GEN_DIR}/*.cu") - foreach(FILE ${CUTLASS_GEN_FILES}) - if (NOT EXISTS "${CURRENT_CUTLASS_STAGE_DIR}/${FILE}") - file(REMOVE "${CURRENT_CUTLASS_GEN_DIR}/${FILE}") - endif() - endforeach() - file(GLOB CUTLASS_GEN_FILES RELATIVE "${CURRENT_CUTLASS_STAGE_DIR}" "${CURRENT_CUTLASS_STAGE_DIR}/*.cu") - foreach(FILE ${CUTLASS_GEN_FILES}) - execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CURRENT_CUTLASS_STAGE_DIR}/${FILE}" "${CURRENT_CUTLASS_GEN_DIR}") - endforeach() - file(REMOVE_RECURSE ${CURRENT_CUTLASS_STAGE_DIR}) - file(GLOB_RECURSE CUTLASS_GEN_FILES "${CURRENT_CUTLASS_GEN_DIR}/*.cu") - list(APPEND ${gen_files} ${CUTLASS_GEN_FILES}) - set(${gen_files} "${${gen_files}}" PARENT_SCOPE) - endfunction() - gen_cutlass_kimpl(gemm simt CUTLASS_SOURCES) - gen_cutlass_kimpl(gemm tensorop884 CUTLASS_SOURCES) - gen_cutlass_kimpl(gemm tensorop1688 CUTLASS_SOURCES) - gen_cutlass_kimpl(gemv simt CUTLASS_SOURCES) - gen_cutlass_kimpl(deconv simt CUTLASS_SOURCES) - gen_cutlass_kimpl(deconv tensorop8816 CUTLASS_SOURCES) - gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES) - gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES) - gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES) - list(APPEND SOURCES ${CUTLASS_SOURCES}) - list(APPEND SOURCES ${CUSOURCES}) + file(GLOB_RECURSE CUSOURCES cuda/*.cu) + + set(CUTLASS_GEN_SCRIPT + ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/cutlass_generator/generator.py) + set(CUTLASS_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuda/cutlass/generated) + set(CUTLASS_SOURCES "") + function(gen_cutlass_kimpl op type gen_files) + set(CURRENT_CUTLASS_STAGE_DIR ${CUTLASS_GEN_DIR}/${op}_${type}.stage) + set(CURRENT_CUTLASS_GEN_DIR ${CUTLASS_GEN_DIR}/${op}_${type}) + + set_directory_properties(PROPERTIES CMAKE_CONFIGURE_DEPENDS ${CUTLASS_GEN_SCRIPT}) + + file(REMOVE_RECURSE ${CURRENT_CUTLASS_STAGE_DIR}) + file(MAKE_DIRECTORY ${CURRENT_CUTLASS_STAGE_DIR}) + file(MAKE_DIRECTORY ${CURRENT_CUTLASS_GEN_DIR}) + execute_process( + COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${CUTLASS_GEN_SCRIPT} --operations + ${op} --type ${type} ${CURRENT_CUTLASS_STAGE_DIR} + RESULT_VARIABLE gen_cutlass_result + OUTPUT_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log + ERROR_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log) + if(NOT gen_cutlass_result EQUAL 0) + message( + FATAL_ERROR + "Error generating library instances. 
See ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log" + ) + endif() + file( + GLOB CUTLASS_GEN_FILES + RELATIVE "${CURRENT_CUTLASS_GEN_DIR}/" + "${CURRENT_CUTLASS_GEN_DIR}/*.cu") + foreach(FILE ${CUTLASS_GEN_FILES}) + if(NOT EXISTS "${CURRENT_CUTLASS_STAGE_DIR}/${FILE}") + file(REMOVE "${CURRENT_CUTLASS_GEN_DIR}/${FILE}") + endif() + endforeach() + file( + GLOB CUTLASS_GEN_FILES + RELATIVE "${CURRENT_CUTLASS_STAGE_DIR}" + "${CURRENT_CUTLASS_STAGE_DIR}/*.cu") + foreach(FILE ${CUTLASS_GEN_FILES}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${CURRENT_CUTLASS_STAGE_DIR}/${FILE}" "${CURRENT_CUTLASS_GEN_DIR}") + endforeach() + file(REMOVE_RECURSE ${CURRENT_CUTLASS_STAGE_DIR}) + file(GLOB_RECURSE CUTLASS_GEN_FILES "${CURRENT_CUTLASS_GEN_DIR}/*.cu") + list(APPEND ${gen_files} ${CUTLASS_GEN_FILES}) + set(${gen_files} + "${${gen_files}}" + PARENT_SCOPE) + endfunction() + gen_cutlass_kimpl(gemm simt CUTLASS_SOURCES) + gen_cutlass_kimpl(gemm tensorop884 CUTLASS_SOURCES) + gen_cutlass_kimpl(gemm tensorop1688 CUTLASS_SOURCES) + gen_cutlass_kimpl(gemv simt CUTLASS_SOURCES) + gen_cutlass_kimpl(deconv simt CUTLASS_SOURCES) + gen_cutlass_kimpl(deconv tensorop8816 CUTLASS_SOURCES) + gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES) + gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES) + gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES) + list(APPEND SOURCES ${CUTLASS_SOURCES}) + list(APPEND SOURCES ${CUSOURCES}) endif() if(MGE_WITH_ATLAS) - file(GLOB_RECURSE SOURCES_ atlas/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - list(APPEND LIBMEGDNN_DEF -DMEGDNN_WITH_ATLAS=1) + file(GLOB_RECURSE SOURCES_ atlas/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + list(APPEND LIBMEGDNN_DEF -DMEGDNN_WITH_ATLAS=1) endif() add_definitions(${LIBMEGDNN_DEF}) @@ -174,81 +196,85 @@ add_library(megdnn EXCLUDE_FROM_ALL OBJECT ${SOURCES}) target_link_libraries(megdnn PUBLIC opr_param_defs) if(MGE_WITH_CUDA) - target_link_libraries(megdnn PRIVATE $) - target_include_directories(megdnn PRIVATE ${CUDNN_INCLUDE_DIR}) + target_link_libraries(megdnn PRIVATE $) + target_include_directories(megdnn PRIVATE ${CUDNN_INCLUDE_DIR}) endif() if(MGE_WITH_ROCM) - target_include_directories(megdnn PUBLIC - ${HIP_INCLUDE_DIR} - ${MIOPEN_INCLUDE_DIR} - ${ROCBLAS_INCLUDE_DIR} - ${ROCRAND_INCLUDE_DIR} - ${AMDOCL_INCLUDE_DIR}) - target_link_directories(megdnn PUBLIC - ${HIP_LIBRARY_DIR} - ${MIOPEN_LIBRARY_DIR} - ${ROCBLAS_LIBRARY_DIR} - ${ROCRAND_LIBRARY_DIR} - ${AMDOCL_LIBRARY_DIR}) + target_include_directories( + megdnn PUBLIC ${HIP_INCLUDE_DIR} ${MIOPEN_INCLUDE_DIR} ${ROCBLAS_INCLUDE_DIR} + ${ROCRAND_INCLUDE_DIR} ${AMDOCL_INCLUDE_DIR}) + target_link_directories( + megdnn + PUBLIC + ${HIP_LIBRARY_DIR} + ${MIOPEN_LIBRARY_DIR} + ${ROCBLAS_LIBRARY_DIR} + ${ROCRAND_LIBRARY_DIR} + ${AMDOCL_LIBRARY_DIR}) endif() -if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} STREQUAL "armv7" OR ${MGE_ARCH} STREQUAL "aarch64") - if(MGE_ENABLE_CPUINFO) - target_link_libraries(megdnn PRIVATE $) - endif() +if(${MGE_ARCH} STREQUAL "x86_64" + OR ${MGE_ARCH} STREQUAL "i386" + OR ${MGE_ARCH} STREQUAL "armv7" + OR ${MGE_ARCH} STREQUAL "aarch64") + if(MGE_ENABLE_CPUINFO) + target_link_libraries(megdnn PRIVATE $) + endif() endif() -target_include_directories(megdnn - PUBLIC - $ - $ - $ - PRIVATE - ${PROJECT_SOURCE_DIR}/dnn - ${PROJECT_SOURCE_DIR}/third_party/midout/src -) +target_include_directories( + megdnn + PUBLIC $ + $ + $ + PRIVATE ${PROJECT_SOURCE_DIR}/dnn ${PROJECT_SOURCE_DIR}/third_party/midout/src) -install(DIRECTORY 
${PROJECT_SOURCE_DIR}/dnn/include DESTINATION . FILES_MATCHING PATTERN "*.h*") +install( + DIRECTORY ${PROJECT_SOURCE_DIR}/dnn/include + DESTINATION . + FILES_MATCHING + PATTERN "*.h*") if(CXX_SUPPORT_WCLASS_MEMACCESS) - if(MGE_WITH_CUDA) - target_compile_options(megdnn PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" - "$<$>:-Wno-class-memaccess>") - else() - target_compile_options(megdnn PRIVATE "-Wno-class-memaccess") - endif() + if(MGE_WITH_CUDA) + target_compile_options( + megdnn PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" + "$<$>:-Wno-class-memaccess>") + else() + target_compile_options(megdnn PRIVATE "-Wno-class-memaccess") + endif() endif() target_compile_definitions(megdnn INTERFACE ${LIBMEGDNN_DEF}) if(MGE_WITH_MKLDNN AND ${MGE_ARCH} STREQUAL "x86_64") - if (BUILD_SHARED_LIBS) - target_link_libraries(megdnn PRIVATE $) - else() - target_link_libraries(megdnn PRIVATE dnnl) - endif() + if(BUILD_SHARED_LIBS) + target_link_libraries(megdnn PRIVATE $) + else() + target_link_libraries(megdnn PRIVATE dnnl) + endif() endif() -if (BUILD_SHARED_LIBS) - target_link_libraries(megdnn PRIVATE $) +if(BUILD_SHARED_LIBS) + target_link_libraries(megdnn PRIVATE $) else() - target_link_libraries(megdnn PRIVATE ${MGE_BLAS_LIBS}) + target_link_libraries(megdnn PRIVATE ${MGE_BLAS_LIBS}) endif() -if (MGE_WITH_ROCM) - target_link_libraries(megdnn PRIVATE ${HIPOBJS} ${MGE_ROCM_LIBS}) -endif () +if(MGE_WITH_ROCM) + target_link_libraries(megdnn PRIVATE ${HIPOBJS} ${MGE_ROCM_LIBS}) +endif() if(MGE_WITH_ATLAS) - if (BUILD_SHARED_LIBS) - target_link_libraries(megdnn PRIVATE $) - else() - target_link_libraries(megdnn PRIVATE ${MGE_ATLAS_LIBS}) - endif() + if(BUILD_SHARED_LIBS) + target_link_libraries(megdnn PRIVATE $) + else() + target_link_libraries(megdnn PRIVATE ${MGE_ATLAS_LIBS}) + endif() endif() if(CMAKE_THREAD_LIBS_INIT) - target_link_libraries(megdnn PRIVATE Threads::Threads) + target_link_libraries(megdnn PRIVATE Threads::Threads) endif() install(TARGETS megdnn EXPORT ${MGE_EXPORT_TARGETS}) diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_bias.cpp index 30874d33..200769d6 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" -INSTANTIATION_CONV_S1(2); \ No newline at end of file +INSTANTIATION_CONV_S1_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..c6c974a5 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_no_bias.cpp new file mode 100644 index 00000000..6f075a54 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_NO_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_bias.cpp index be37fc1c..a9728847 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" -INSTANTIATION_CONV_S2(5); \ No newline at end of file +INSTANTIATION_CONV_S2_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..ae899e2c --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_no_bias.cpp new file mode 100644 index 00000000..94c09aea --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_NO_BIAS(2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_bias.cpp index 689df883..0047c51e 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" -INSTANTIATION_CONV_S1(5); \ No newline at end of file +INSTANTIATION_CONV_S1_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..c273dede --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_no_bias.cpp new file mode 100644 index 00000000..719dbd1d --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_NO_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_bias.cpp index 355c9ed6..01209f9c 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_2x2s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" -INSTANTIATION_CONV_S2(2); \ No newline at end of file +INSTANTIATION_CONV_S2_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..7bed53e2 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_no_bias.cpp new file mode 100644 index 00000000..9aa190df --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_NO_BIAS(3); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_bias.cpp index 6d99b01b..5cbcb78a 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" -INSTANTIATION_CONV_S1(3); \ No newline at end of file +INSTANTIATION_CONV_S1_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..bcf92bab --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_no_bias.cpp new file mode 100644 index 00000000..d944b02b --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_NO_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_bias.cpp index a7571939..a75f159a 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" -INSTANTIATION_CONV_S2(7); +INSTANTIATION_CONV_S2_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..ff9653ea --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_no_bias.cpp new file mode 100644 index 00000000..a2705bde --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_5x5s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_NO_BIAS(5); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_bias.cpp index db2be268..47cbf3d7 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" -INSTANTIATION_CONV_S1(7); \ No newline at end of file +INSTANTIATION_CONV_S1_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..f8fa2c29 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_no_bias.cpp new file mode 100644 index 00000000..c7824aad --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h" +INSTANTIATION_CONV_S1_NO_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_bias.cpp index 6075a8ca..fd603f3b 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_3x3s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" -INSTANTIATION_CONV_S2(3); \ No newline at end of file +INSTANTIATION_CONV_S2_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..bd1e5c29 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_no_bias.cpp new file mode 100644 index 00000000..0caee33c --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_7x7s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h" +INSTANTIATION_CONV_S2_NO_BIAS(7); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h index bf9418b4..3915caea 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h @@ -469,9 +469,12 @@ void conv_bias::conv_direct_fp32_nchw44( INSTANTIATION(filter_size, bias, HSwishOp) \ INSTANTIATION(filter_size, bias, SigmoidOp) -#define INSTANTIATION_CONV_S1(filter_size) \ - FOR_OP(filter_size, BiasMode::NO_BIAS) \ - FOR_OP(filter_size, BiasMode::BROADCAST_CHANNEL_BIAS) \ - FOR_OP(filter_size, BiasMode::BIAS) +#define INSTANTIATION_CONV_S1_NO_BIAS(filter_size) \ + FOR_OP(filter_size, BiasMode::NO_BIAS) -// vim: syntax=cpp.doxygen \ No newline at end of file +#define INSTANTIATION_CONV_S1_BROADCAST_CHANNEL_BIAS(filter_size) \ + FOR_OP(filter_size, BiasMode::BROADCAST_CHANNEL_BIAS) + +#define INSTANTIATION_CONV_S1_BIAS(filter_size) FOR_OP(filter_size, BiasMode::BIAS) + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h index b31cd438..cbbf047a 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h @@ -550,9 +550,12 @@ void conv_bias::conv_direct_fp32_nchw44( INSTANTIATION(filter_size, bias, HSwishOp) \ INSTANTIATION(filter_size, bias, SigmoidOp) -#define INSTANTIATION_CONV_S2(filter_size) \ - FOR_OP(filter_size, BiasMode::NO_BIAS) \ - FOR_OP(filter_size, BiasMode::BROADCAST_CHANNEL_BIAS) \ - FOR_OP(filter_size, BiasMode::BIAS) +#define INSTANTIATION_CONV_S2_NO_BIAS(filter_size) \ + FOR_OP(filter_size, BiasMode::NO_BIAS) -// vim: syntax=cpp.doxygen \ No newline at end of file +#define INSTANTIATION_CONV_S2_BROADCAST_CHANNEL_BIAS(filter_size) \ + FOR_OP(filter_size, BiasMode::BROADCAST_CHANNEL_BIAS) + +#define INSTANTIATION_CONV_S2_BIAS(filter_size) FOR_OP(filter_size, BiasMode::BIAS) + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_bias.cpp index 291741b9..e82c464e 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" -INSTANCE_CONV(2, 1); \ No newline at end of file +INSTANCE_CONV_BIAS(2, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..fbdb5ec7 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(2, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_no_bias.cpp new file mode 100644 index 00000000..97f2595c --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(2, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_bias.cpp index d3f360c5..3acba768 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" -INSTANCE_CONV(2, 2); +INSTANCE_CONV_BIAS(2, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..1e4c7197 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(2, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_no_bias.cpp new file mode 100644 index 00000000..03bc548f --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_2x2s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(2, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_bias.cpp index 432708e7..89bc21d5 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" -INSTANCE_CONV(3, 1); +INSTANCE_CONV_BIAS(3, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..fe811030 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(3, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_no_bias.cpp new file mode 100644 index 00000000..88cbe5f8 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s1_no_bias + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(3, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_bias.cpp similarity index 86% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2.cpp rename to dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_bias.cpp index 38ffe8ef..f2f02815 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_bias.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2.cpp + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_bias.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -11,4 +11,5 @@ * implied. 
*/ #include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" -INSTANCE_CONV(3, 2); +INSTANCE_CONV_BIAS(3, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..416e9839 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(3, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_no_bias.cpp new file mode 100644 index 00000000..bf3f792d --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_3x3s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(3, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_bias.cpp new file mode 100644 index 00000000..f4d38c4e --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BIAS(5, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..1dcb60b0 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(5, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_no_bias.cpp new file mode 100644 index 00000000..e32fbccb --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(5, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_bias.cpp new file mode 100644 index 00000000..a818401f --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BIAS(5, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..be387827 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(5, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_no_bias.cpp new file mode 100644 index 00000000..64c9db59 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(5, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_bias.cpp new file mode 100644 index 00000000..6fb2e117 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BIAS(7, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_broadcast_channel_bias.cpp new file mode 100644 index 00000000..74ad5102 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(7, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_no_bias.cpp new file mode 100644 index 00000000..94af0cbd --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(7, 1); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_bias.cpp new file mode 100644 index 00000000..576213bc --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BIAS(7, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_broadcast_channel_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_broadcast_channel_bias.cpp new file mode 100644 index 00000000..58890e90 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_broadcast_channel_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_broadcast_channel_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(7, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_no_bias.cpp b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_no_bias.cpp new file mode 100644 index 00000000..4a4e0f35 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_no_bias.cpp @@ -0,0 +1,15 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2_no_bias.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +INSTANCE_CONV_NO_BIAS(7, 2); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h index 36daabc8..1869f2ff 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h +++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h @@ -928,9 +928,11 @@ void fp32_direct_nchw_nchw44::conv_direct_fp32_nchw_nchw44( INSTANTIATION(stride, filter, bias, ReluOp) \ INSTANTIATION(stride, filter, bias, HSwishOp) -#define INSTANCE_CONV(filter, stride) \ - FOR_OP(stride, filter, BiasMode::NO_BIAS) \ - FOR_OP(stride, filter, BiasMode::BROADCAST_CHANNEL_BIAS) \ - FOR_OP(stride, filter, BiasMode::BIAS) +#define INSTANCE_CONV_NO_BIAS(filter, stride) FOR_OP(stride, filter, BiasMode::NO_BIAS) + +#define INSTANCE_CONV_BROADCAST_CHANNEL_BIAS(filter, stride) \ + FOR_OP(stride, filter, BiasMode::BROADCAST_CHANNEL_BIAS) + +#define INSTANCE_CONV_BIAS(filter, stride) FOR_OP(stride, filter, BiasMode::BIAS) // vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h similarity index 97% rename from dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.cpp rename to dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h index cc9624f3..48d7bd1d 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.cpp +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.cpp + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
@@ -265,7 +265,8 @@ void conv_direct_sdot_int8_nchw44( #define INSTANTIATION(dst_type, stride, filter_size, bias_mode, Op) \ template void \ - conv_direct_sdot_int8_nchw44( \ + megdnn::arm_common::direct_dotprod_nchw44::conv_direct_sdot_int8_nchw44< \ + dst_type, stride, bias_mode, Op, filter_size>( \ dst_type * dst, const int oh, const int ow, const int8_t* src, \ const int ih, const int iw, const int8_t* weight, const int32_t* bias, \ const int oh_size, const int oc, const int ic, const Op& op); @@ -284,22 +285,6 @@ void conv_direct_sdot_int8_nchw44( FOR_OP(stride, i, BiasMode::NO_BIAS) \ FOR_OP(stride, i, BiasMode::BROADCAST_CHANNEL_BIAS) -#define FOR_FILTER(stride) \ - FOR_BIAS(stride, 2) \ - FOR_BIAS(stride, 3) \ - FOR_BIAS(stride, 5) \ - FOR_BIAS(stride, 7) - -FOR_FILTER(1) - -#undef FOR_STRIDE -#undef FOR_FILTER -#undef FOR_IC -#undef FOR_BIAS -#undef FOR_NONLINEAR -#undef FOR_REMAIN -#undef INSTANTIATION - } // namespace direct_dotprod_nchw44 } // namespace arm_common } // namespace megdnn diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_2x2.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_2x2.cpp new file mode 100644 index 00000000..66ae2846 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_2x2.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_2x2.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(1, 2); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_3x3.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_3x3.cpp new file mode 100644 index 00000000..faf8f46c --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_3x3.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_3x3.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
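In dot_direct_nchw44_s1.h (formerly the .cpp), the INSTANTIATION macro now spells out the fully qualified template name megdnn::arm_common::direct_dotprod_nchw44::conv_direct_sdot_int8_nchw44<...>, and the FOR_FILTER driver plus the trailing #undef block are removed so that FOR_BIAS stays defined for the new per-filter-size sources (dot_direct_nchw44_s1_2x2.cpp and friends). The qualification is what lets those sources emit the explicit instantiations themselves: they invoke FOR_BIAS outside the original namespace, where an unqualified template name would not be accepted. A minimal illustration of the C++ rule involved, with illustrative names:

    namespace ns_sketch {
    template <int filter>
    void kern_sketch(int) {}
    }  // namespace ns_sketch

    // An explicit instantiation written outside the template's namespace must
    // use a qualified name:
    template void ns_sketch::kern_sketch<3>(int);  // OK
    // template void kern_sketch<3>(int);          // ill-formed: an unqualified-id
    //                                             // would have to appear inside ns_sketch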
+ */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(1, 3); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_5x5.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_5x5.cpp new file mode 100644 index 00000000..94fe0811 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_5x5.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_5x5.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(1, 5); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_7x7.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_7x7.cpp new file mode 100644 index 00000000..001c58a9 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_7x7.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1_7x7.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s1.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(1, 7); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h similarity index 97% rename from dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.cpp rename to dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h index 841e868f..21ce451b 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.cpp +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.cpp + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
@@ -266,7 +266,8 @@ void conv_direct_sdot_int8_nchw44( #define INSTANTIATION(dst_type, stride, filter_size, bias_mode, Op) \ template void \ - conv_direct_sdot_int8_nchw44( \ + megdnn::arm_common::direct_dotprod_nchw44::conv_direct_sdot_int8_nchw44< \ + dst_type, stride, bias_mode, Op, filter_size>( \ dst_type * dst, const int oh, const int ow, const int8_t* src, \ const int ih, const int iw, const int8_t* weight, const int32_t* bias, \ const int oh_size, const int oc, const int ic, const Op& op); @@ -285,22 +286,6 @@ void conv_direct_sdot_int8_nchw44( FOR_OP(stride, i, BiasMode::NO_BIAS) \ FOR_OP(stride, i, BiasMode::BROADCAST_CHANNEL_BIAS) -#define FOR_FILTER(stride) \ - FOR_BIAS(stride, 2) \ - FOR_BIAS(stride, 3) \ - FOR_BIAS(stride, 5) \ - FOR_BIAS(stride, 7) - -FOR_FILTER(2) - -#undef FOR_STRIDE -#undef FOR_FILTER -#undef FOR_IC -#undef FOR_BIAS -#undef FOR_NONLINEAR -#undef FOR_REMAIN -#undef INSTANTIATION - } // namespace direct_dotprod_nchw44 } // namespace arm_common } // namespace megdnn diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_2x2.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_2x2.cpp new file mode 100644 index 00000000..521ce682 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_2x2.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_2x2.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(2, 2); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_3x3.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_3x3.cpp new file mode 100644 index 00000000..d2af6eca --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_3x3.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_3x3.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(2, 3); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_5x5.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_5x5.cpp new file mode 100644 index 00000000..949c105e --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_5x5.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_5x5.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(2, 5); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_7x7.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_7x7.cpp new file mode 100644 index 00000000..4337a3e5 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_7x7.cpp @@ -0,0 +1,21 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2_7x7.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw44_s2.h" +#if MGB_ENABLE_DOT +using namespace megdnn; +using namespace arm_common; + +FOR_BIAS(2, 7); + +#endif +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h index 74a11e4f..174c48c3 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp + * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
@@ -45,4 +45,4 @@ public: } // namespace arm_common } // namespace megdnn -// vim: syntax=cpp.doxygen \ No newline at end of file +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp index f238a1fc..3373cf9a 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp @@ -13,336 +13,9 @@ #include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h" #include "src/arm_common/conv_bias/int8/direct_nchw_nchw44_kern.h" + namespace megdnn { namespace arm_common { -namespace { -/** - * @brief core code for calculation patten - * - * @tparam src_idx is offset of src reg - * @tparam weight_idx is offset of weight reg - * @tparam c_dim is output channel - * @tparam Func mla operation funcion - * @tparam stride - * @tparam T outpur regs type - * @tparam T2 src regs type - * @tparam T3 weight regs type - * @tparam T4 temp regs type - */ - -template < - int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, - typename T3, typename T4> -struct ShiftCalHelper { - static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp); - static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); -}; -template < - int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, - typename T3, typename T4> -MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight, T4& temp) { - ShiftCalHelper::impl( - c, src, weight, temp); -} -template < - int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, - typename T3> -MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { - ShiftCalHelper::impl( - c, src, weight); -}; -template < - int src_idx, int weight_idx, typename T, typename T2, typename T3, typename T4> -struct ShiftCalHelper { - static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { - c[0][0] = vdotq_s32_h( - src[(0 + src_idx) % 8], weight[0][weight_idx], c[0][0], temp[0]); - c[1][0] = vdotq_s32_h( - src[(0 + src_idx) % 8], weight[1][weight_idx], c[1][0], temp[1]); - c[0][1] = vdotq_s32_h( - src[(1 + src_idx) % 8], weight[0][weight_idx], c[0][1], temp[2]); - c[1][1] = vdotq_s32_h( - src[(1 + src_idx) % 8], weight[1][weight_idx], c[1][1], temp[3]); - c[0][2] = vdotq_s32_h( - src[(2 + src_idx) % 8], weight[0][weight_idx], c[0][2], temp[0]); - c[1][2] = vdotq_s32_h( - src[(2 + src_idx) % 8], weight[1][weight_idx], c[1][2], temp[1]); - c[0][3] = vdotq_s32_h( - src[(3 + src_idx) % 8], weight[0][weight_idx], c[0][3], temp[2]); - c[1][3] = vdotq_s32_h( - src[(3 + src_idx) % 8], weight[1][weight_idx], c[1][3], temp[3]); - - c[0][4] = vdotq_s32_h( - src[(4 + src_idx) % 8], weight[0][weight_idx], c[0][4], temp[0]); - c[1][4] = vdotq_s32_h( - src[(4 + src_idx) % 8], weight[1][weight_idx], c[1][4], temp[1]); - c[0][5] = vdotq_s32_h( - src[(5 + src_idx) % 8], weight[0][weight_idx], c[0][5], temp[2]); - c[1][5] = vdotq_s32_h( - src[(5 + src_idx) % 8], weight[1][weight_idx], c[1][5], temp[3]); - c[0][6] = vdotq_s32_h( - src[(6 + src_idx) % 8], weight[0][weight_idx], c[0][6], temp[0]); - c[1][6] = vdotq_s32_h( - src[(6 + src_idx) % 8], weight[1][weight_idx], c[1][6], temp[1]); - c[0][7] = vdotq_s32_h( - src[(7 + src_idx) % 8], weight[0][weight_idx], c[0][7], temp[2]); - c[1][7] = vdotq_s32_h( - src[(7 + src_idx) % 8], 
weight[1][weight_idx], c[1][7], temp[3]); - } - static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&); -}; -template < - int src_idx, int weight_idx, typename T, typename T2, typename T3, typename T4> -struct ShiftCalHelper { - static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { - c[0][0] = vdotq_s32_h( - src[(0 + src_idx) % 8], weight[0][weight_idx], c[0][0], temp[0]); - c[0][1] = vdotq_s32_h( - src[(1 + src_idx) % 8], weight[0][weight_idx], c[0][1], temp[1]); - c[0][2] = vdotq_s32_h( - src[(2 + src_idx) % 8], weight[0][weight_idx], c[0][2], temp[2]); - c[0][3] = vdotq_s32_h( - src[(3 + src_idx) % 8], weight[0][weight_idx], c[0][3], temp[3]); - c[0][4] = vdotq_s32_h( - src[(4 + src_idx) % 8], weight[0][weight_idx], c[0][4], temp[0]); - c[0][5] = vdotq_s32_h( - src[(5 + src_idx) % 8], weight[0][weight_idx], c[0][5], temp[1]); - c[0][6] = vdotq_s32_h( - src[(6 + src_idx) % 8], weight[0][weight_idx], c[0][6], temp[2]); - c[0][7] = vdotq_s32_h( - src[(7 + src_idx) % 8], weight[0][weight_idx], c[0][7], temp[3]); - } - static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&); -}; - -template -struct KerNeonXXs2NchwNchw44 { - static void impl( - const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, - int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { - constexpr int stride = 1; - constexpr int filter_height = 1; - constexpr int filter_width = 4; - constexpr int oc_step = 4; - constexpr int loop_ic_step = 1; - constexpr int simd_len = 16; - constexpr int pack_iw_len = 16; - constexpr int src_reg = 8; - constexpr int weight_reg = 1; - - const int ic_stride = ih * iw * pack_iw_len; - const int ld_weight_oc = oc_step * filter_height * filter_width * ic; - constexpr int c_dim = OCHelper::val; - int32x4_t c[c_dim][8]; - init_ocx_ow8(c, bias_ptr, oc_step); - - for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; - int8x16_t src[src_reg]; - int8x16_t dot4_weight[c_dim][weight_reg]; - int16x8_t temp_c[4]; - load_helper( - dot4_weight, weight_ptr, ld_weight_oc); - load_helper( - src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - - weight_ptr += oc_step * filter_height * filter_width; - } - - store_ocx_ow8_remain_static_dt( - c, op, dst_ptr, ld_dst_oc); - } -}; - -template -struct KerNeonXXs2NchwNchw44 { - static void impl( - const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, - int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { - constexpr int stride = 1; - constexpr int filter_height = 2; - constexpr int filter_width = 4; - constexpr int oc_step = 4; - constexpr int loop_ic_step = 1; - constexpr int simd_len = 16; - constexpr int pack_iw_len = 16; - constexpr int src_reg = 8; - constexpr int weight_reg = 1; - - const int ic_stride = ih * iw * pack_iw_len; - const int ld_weight_oc = oc_step * filter_height * filter_width * ic; - constexpr int c_dim = OCHelper::val; - int32x4_t c[c_dim][8]; - init_ocx_ow8(c, bias_ptr, oc_step); - - for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; - int8x16_t src[src_reg]; - int8x16_t dot4_weight[c_dim][weight_reg]; - int16x8_t temp_c[4]; - load_helper( - dot4_weight, weight_ptr, ld_weight_oc); - load_helper( - src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - - load_helper( - dot4_weight, weight_ptr + 1 * 
filter_width * oc_step, ld_weight_oc); - load_helper( - src, nchw_src_ptr + 1 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - - weight_ptr += oc_step * filter_height * filter_width; - } - - store_ocx_ow8_remain_static_dt( - c, op, dst_ptr, ld_dst_oc); - } -}; - -template -struct KerNeonXXs2NchwNchw44 { - static void impl( - const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, - int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { - constexpr int stride = 1; - constexpr int filter_height = 3; - constexpr int filter_width = 4; - constexpr int oc_step = 4; - constexpr int loop_ic_step = 1; - constexpr int simd_len = 16; - constexpr int pack_iw_len = 16; - constexpr int src_reg = 8; - constexpr int weight_reg = 1; - - const int ic_stride = ih * iw * pack_iw_len; - const int ld_weight_oc = oc_step * filter_height * filter_width * ic; - constexpr int c_dim = OCHelper::val; - int32x4_t c[c_dim][8]; - init_ocx_ow8(c, bias_ptr, oc_step); - - for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; - int8x16_t src[src_reg]; - int8x16_t dot4_weight[c_dim][weight_reg]; - int16x8_t temp_c[4]; - load_helper( - dot4_weight, weight_ptr, ld_weight_oc); - - load_helper( - src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - load_helper( - dot4_weight, weight_ptr + 1 * filter_width * oc_step, ld_weight_oc); - - load_helper( - src, nchw_src_ptr + 1 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - - load_helper( - dot4_weight, weight_ptr + 2 * filter_width * oc_step, ld_weight_oc); - load_helper( - src, nchw_src_ptr + 2 * iw * pack_iw_len, 0); - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); - - weight_ptr += oc_step * filter_height * filter_width; - } - store_ocx_ow8_remain_static_dt( - c, op, dst_ptr, ld_dst_oc); - } -}; - -template -struct KerNeonXXs2NchwNchw44 { - static void impl( - const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, - int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { - constexpr int stride = 1; - constexpr int filter_height = 5; - constexpr int filter_width = 8; - constexpr int oc_step = 4; - constexpr int loop_ic_step = 1; - constexpr int simd_len = 16; - constexpr int pack_iw_len = 16; - constexpr int src_reg = 8; - constexpr int weight_reg = 2; - - const int ic_stride = ih * iw * pack_iw_len; - const int ld_weight_oc = oc_step * filter_height * filter_width * ic; - constexpr int c_dim = OCHelper::val; - int32x4_t c[c_dim][8]; - init_ocx_ow8(c, bias_ptr, oc_step); - - for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; - int8x16_t src[src_reg]; - int8x16_t dot4_weight[c_dim][weight_reg]; - int16x8_t temp_c[4]; -#define cb(step) \ - load_helper( \ - dot4_weight, weight_ptr + step * filter_width * oc_step, ld_weight_oc); \ - load_helper( \ - src, nchw_src_ptr + step * iw * pack_iw_len, 0); \ - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); \ - load_helper<4, 0, simd_len, 0, Vld1q_s8>( \ - src, nchw_src_ptr + step * iw * pack_iw_len + src_reg * pack_iw_len, 0); \ - cal_helper<4, 1, c_dim, stride>(c, src, dot4_weight, temp_c); - UNROLL_CALL_RAW(5, cb); -#undef cb - weight_ptr += oc_step * filter_height * filter_width; - } - store_ocx_ow8_remain_static_dt( - c, op, dst_ptr, ld_dst_oc); - } -}; - -template 
-struct KerNeonXXs2NchwNchw44 { - static void impl( - const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, - int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { - constexpr int stride = 1; - constexpr int filter_height = 7; - constexpr int filter_width = 8; - constexpr int oc_step = 4; - constexpr int loop_ic_step = 1; - constexpr int simd_len = 16; - constexpr int pack_iw_len = 16; - constexpr int src_reg = 8; - constexpr int weight_reg = 2; - - const int ic_stride = ih * iw * pack_iw_len; - const int ld_weight_oc = oc_step * filter_height * filter_width * ic; - constexpr int c_dim = OCHelper::val; - int32x4_t c[c_dim][8]; - init_ocx_ow8(c, bias_ptr, oc_step); - - for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; - int8x16_t src[src_reg]; - int8x16_t dot4_weight[c_dim][weight_reg]; - int16x8_t temp_c[4]; -#define cb(step) \ - load_helper( \ - dot4_weight, weight_ptr + step * filter_width * oc_step, ld_weight_oc); \ - load_helper( \ - src, nchw_src_ptr + step * iw * pack_iw_len, 0); \ - cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); \ - load_helper<4, 0, simd_len, 0, Vld1q_s8>( \ - src, nchw_src_ptr + step * iw * pack_iw_len + src_reg * pack_iw_len, 0); \ - cal_helper<4, 1, c_dim, stride>(c, src, dot4_weight, temp_c); - - UNROLL_CALL_RAW(7, cb); -#undef cb - weight_ptr += oc_step * filter_height * filter_width; - } - store_ocx_ow8_remain_static_dt( - c, op, dst_ptr, ld_dst_oc); - } -}; -} // namespace - namespace int8_direct_nchw_nchw44 { /** * pack {oc / 4, fh, fw, ic, 4(oc)} to {oc / 4, ic, fh ,fw/4, 4(oc)*4(fw)} @@ -444,115 +117,9 @@ void pack_nchw_src_for_nchw44_conv<1>( } } -template -struct ConvDiectStrideInt8NchwNchw44 { - static void impl( - const int8_t* src, const int8_t* filter, const int32_t* bias, int32_t* temp, - int8_t* dst, const size_t oc, const size_t ic, const size_t ih, - const size_t iw, const size_t oh, const size_t ow, const Op& op) { - MEGDNN_MARK_USED_VAR(temp); - constexpr int stride = 1; - constexpr size_t fh = filter_size; - constexpr size_t fw = (filter_size + 3) / 4 * 4; - constexpr size_t ic_step = 1; - constexpr size_t big_oc_step = 8; - constexpr size_t oc_step = 4; - constexpr size_t ih_step = 1; - constexpr size_t oh_step = 1; - constexpr size_t ow_step = 8; - constexpr size_t stride_h = stride; - constexpr size_t stride_w = stride; - constexpr int pack_iw_len = 16; - - const size_t img_stride = oh * ow; - const size_t ow_end = ow / ow_step * ow_step; - const size_t ow_remain = ow - ow_end; - const size_t oc_end = oc / big_oc_step * big_oc_step; - const size_t oc_remain = oc - oc_end; - const int ld_dst_oc = oc_step * img_stride; - - using remain_fun = std::function; - remain_fun kern_big_oc_remain = nullptr; - remain_fun kern_small_oc_remain = nullptr; - switch (ow_remain) { -#define cb(step) \ - case step: \ - kern_big_oc_remain = KerNeonXXs2NchwNchw44< \ - bias_mode, Op, step, filter_size, big_oc_step, stride>::impl; \ - kern_small_oc_remain = KerNeonXXs2NchwNchw44< \ - bias_mode, Op, step, filter_size, oc_step, stride>::impl; \ - break; - - UNROLL_CALL_RAW(8, cb); - default: - megdnn_assert(0, "no remain %zu for kern", ow_remain); - } - - for (size_t oc_idx = 0; oc_idx < oc_end; oc_idx += big_oc_step) { - const size_t weight_offset = oc_idx * ic * fh * fw; - for (size_t oh_idx = 0; oh_idx < oh; oh_idx += oh_step) { - for (size_t ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { - const size_t src_offset = - (oh_idx * stride_h 
* iw + ow_idx * stride_w * ih_step) * - ic_step * pack_iw_len; - const size_t dst_offset = - oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; - - KerNeonXXs2NchwNchw44< - bias_mode, Op, ow_step, filter_size, big_oc_step, stride>:: - impl(src + src_offset, filter + weight_offset, - bias + oc_idx, dst + dst_offset, ic, ih, iw, ld_dst_oc, - op); - } - if (ow_remain > 0) { - const size_t src_offset = - (oh_idx * stride_h * iw + ow_end * stride_w * ih_step) * - ic_step * pack_iw_len; - const size_t dst_offset = - oc_idx * img_stride + (oh_idx * ow + ow_end) * oc_step; - kern_big_oc_remain( - src + src_offset, filter + weight_offset, bias + oc_idx, - dst + dst_offset, ic, ih, iw, ld_dst_oc, op); - } - } - } - - if (oc_remain > 0) { - size_t oc_idx = oc_end; - const size_t weight_offset = oc_idx * ic * fh * fw; - for (size_t oh_idx = 0; oh_idx < oh; oh_idx += oh_step) { - for (size_t ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { - const size_t src_offset = - (oh_idx * stride_h * iw + ow_idx * stride_w * ih_step) * - ic_step * pack_iw_len; - const size_t dst_offset = - oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; - KerNeonXXs2NchwNchw44< - bias_mode, Op, ow_step, filter_size, oc_step, stride>:: - impl(src + src_offset, filter + weight_offset, - bias + oc_idx, dst + dst_offset, ic, ih, iw, ld_dst_oc, - op); - } - if (ow_remain > 0) { - const size_t src_offset = - (oh_idx * stride_h * iw + ow_end * stride_w * ih_step) * - ic_step * pack_iw_len; - const size_t dst_offset = - oc_idx * img_stride + (oh_idx * ow + ow_end) * oc_step; - kern_small_oc_remain( - src + src_offset, filter + weight_offset, bias + oc_idx, - dst + dst_offset, ic, ih, iw, ld_dst_oc, op); - } - } - } - } -}; - #define INSTANCE_CONV_KERN_FUN(stride, filter_size, bias_mode, Op) \ - template struct ConvDiectStrideInt8NchwNchw44; + template struct megdnn::arm_common::int8_direct_nchw_nchw44:: \ + ConvDiectStrideInt8NchwNchw44; #define INSTANCE_OP_PARAM(stride, filter, bias_mode) \ INSTANCE_CONV_KERN_FUN( \ @@ -566,17 +133,10 @@ struct ConvDiectStrideInt8NchwNchw44 { INSTANCE_OP_PARAM(stride, filter, BiasMode::NO_BIAS) \ INSTANCE_OP_PARAM(stride, filter, BiasMode::BROADCAST_CHANNEL_BIAS) -#define INSTANCE_CONV_KERN(stride) \ - INSTANCE_BIAS_MODE_PARAM(stride, 1) \ - INSTANCE_BIAS_MODE_PARAM(stride, 2) \ - INSTANCE_BIAS_MODE_PARAM(stride, 3) \ - INSTANCE_BIAS_MODE_PARAM(stride, 5) \ - INSTANCE_BIAS_MODE_PARAM(stride, 7) - -INSTANCE_CONV_KERN(1); +#define INSTANCE_CONV_KERN(stride, filter) INSTANCE_BIAS_MODE_PARAM(stride, filter) } // namespace int8_direct_nchw_nchw44 } // namespace arm_common } // namespace megdnn -// vim: syntax=cpp.doxygen \ No newline at end of file +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h new file mode 100644 index 00000000..4ae7f4f7 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h @@ -0,0 +1,481 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
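The ShiftCalHelper blocks above route every output column through vdotq_s32_h. As a rough scalar reference — assuming each call accumulates a four-element signed int8 dot product into each of the four int32 lanes of the accumulator, which is what the surrounding register layout suggests rather than a documented contract — the per-call arithmetic can be sketched as:

#include <cassert>
#include <cstdint>

// Scalar model of one vdotq_s32_h-style accumulation, under the assumption
// that lane i gathers a dot product over four consecutive int8 pairs:
// c[i] += sum_k src[4*i+k] * weight[4*i+k].
void dot4_accumulate(const int8_t src[16], const int8_t weight[16], int32_t c[4]) {
    for (int lane = 0; lane < 4; ++lane)
        for (int k = 0; k < 4; ++k)
            c[lane] += int32_t(src[4 * lane + k]) * int32_t(weight[4 * lane + k]);
}

int main() {
    int8_t src[16], weight[16];
    int32_t c[4] = {0, 0, 0, 0};
    for (int i = 0; i < 16; ++i) {
        src[i] = int8_t(i);
        weight[i] = 1;
    }
    dot4_accumulate(src, weight, c);
    assert(c[0] == 0 + 1 + 2 + 3);      // lane 0 sums src[0..3]
    assert(c[3] == 12 + 13 + 14 + 15);  // lane 3 sums src[12..15]
    return 0;
}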
+ */ + +#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_common.h" +#include "src/arm_common/conv_bias/int8/direct_nchw_nchw44_kern.h" + +namespace megdnn { +namespace arm_common { +namespace { +/** + * @brief core code for calculation patten + * + * @tparam src_idx is offset of src reg + * @tparam weight_idx is offset of weight reg + * @tparam c_dim is output channel + * @tparam Func mla operation funcion + * @tparam stride + * @tparam T outpur regs type + * @tparam T2 src regs type + * @tparam T3 weight regs type + * @tparam T4 temp regs type + */ + +template < + int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, + typename T3, typename T4> +struct ShiftCalHelper { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp); + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight); +}; +template < + int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, + typename T3, typename T4> +MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight, T4& temp) { + ShiftCalHelper::impl( + c, src, weight, temp); +} +template < + int src_idx, int weight_idx, int c_dim, int stride, typename T, typename T2, + typename T3> +MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) { + ShiftCalHelper::impl( + c, src, weight); +}; +template < + int src_idx, int weight_idx, typename T, typename T2, typename T3, typename T4> +struct ShiftCalHelper { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { + c[0][0] = vdotq_s32_h( + src[(0 + src_idx) % 8], weight[0][weight_idx], c[0][0], temp[0]); + c[1][0] = vdotq_s32_h( + src[(0 + src_idx) % 8], weight[1][weight_idx], c[1][0], temp[1]); + c[0][1] = vdotq_s32_h( + src[(1 + src_idx) % 8], weight[0][weight_idx], c[0][1], temp[2]); + c[1][1] = vdotq_s32_h( + src[(1 + src_idx) % 8], weight[1][weight_idx], c[1][1], temp[3]); + c[0][2] = vdotq_s32_h( + src[(2 + src_idx) % 8], weight[0][weight_idx], c[0][2], temp[0]); + c[1][2] = vdotq_s32_h( + src[(2 + src_idx) % 8], weight[1][weight_idx], c[1][2], temp[1]); + c[0][3] = vdotq_s32_h( + src[(3 + src_idx) % 8], weight[0][weight_idx], c[0][3], temp[2]); + c[1][3] = vdotq_s32_h( + src[(3 + src_idx) % 8], weight[1][weight_idx], c[1][3], temp[3]); + + c[0][4] = vdotq_s32_h( + src[(4 + src_idx) % 8], weight[0][weight_idx], c[0][4], temp[0]); + c[1][4] = vdotq_s32_h( + src[(4 + src_idx) % 8], weight[1][weight_idx], c[1][4], temp[1]); + c[0][5] = vdotq_s32_h( + src[(5 + src_idx) % 8], weight[0][weight_idx], c[0][5], temp[2]); + c[1][5] = vdotq_s32_h( + src[(5 + src_idx) % 8], weight[1][weight_idx], c[1][5], temp[3]); + c[0][6] = vdotq_s32_h( + src[(6 + src_idx) % 8], weight[0][weight_idx], c[0][6], temp[0]); + c[1][6] = vdotq_s32_h( + src[(6 + src_idx) % 8], weight[1][weight_idx], c[1][6], temp[1]); + c[0][7] = vdotq_s32_h( + src[(7 + src_idx) % 8], weight[0][weight_idx], c[0][7], temp[2]); + c[1][7] = vdotq_s32_h( + src[(7 + src_idx) % 8], weight[1][weight_idx], c[1][7], temp[3]); + } + static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&); +}; +template < + int src_idx, int weight_idx, typename T, typename T2, typename T3, typename T4> +struct ShiftCalHelper { + static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight, T4& temp) { + c[0][0] = vdotq_s32_h( + src[(0 + src_idx) % 8], weight[0][weight_idx], c[0][0], temp[0]); + c[0][1] = vdotq_s32_h( + src[(1 + src_idx) % 8], weight[0][weight_idx], c[0][1], temp[1]); + c[0][2] = vdotq_s32_h( + src[(2 + src_idx) % 8], 
weight[0][weight_idx], c[0][2], temp[2]); + c[0][3] = vdotq_s32_h( + src[(3 + src_idx) % 8], weight[0][weight_idx], c[0][3], temp[3]); + c[0][4] = vdotq_s32_h( + src[(4 + src_idx) % 8], weight[0][weight_idx], c[0][4], temp[0]); + c[0][5] = vdotq_s32_h( + src[(5 + src_idx) % 8], weight[0][weight_idx], c[0][5], temp[1]); + c[0][6] = vdotq_s32_h( + src[(6 + src_idx) % 8], weight[0][weight_idx], c[0][6], temp[2]); + c[0][7] = vdotq_s32_h( + src[(7 + src_idx) % 8], weight[0][weight_idx], c[0][7], temp[3]); + } + static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&); +}; + +template +struct KerNeonXXs2NchwNchw44 { + static void impl( + const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, + int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { + constexpr int stride = 1; + constexpr int filter_height = 1; + constexpr int filter_width = 4; + constexpr int oc_step = 4; + constexpr int loop_ic_step = 1; + constexpr int simd_len = 16; + constexpr int pack_iw_len = 16; + constexpr int src_reg = 8; + constexpr int weight_reg = 1; + + const int ic_stride = ih * iw * pack_iw_len; + const int ld_weight_oc = oc_step * filter_height * filter_width * ic; + constexpr int c_dim = OCHelper::val; + int32x4_t c[c_dim][8]; + init_ocx_ow8(c, bias_ptr, oc_step); + + for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; + int8x16_t src[src_reg]; + int8x16_t dot4_weight[c_dim][weight_reg]; + int16x8_t temp_c[4]; + load_helper( + dot4_weight, weight_ptr, ld_weight_oc); + load_helper( + src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + + weight_ptr += oc_step * filter_height * filter_width; + } + + store_ocx_ow8_remain_static_dt( + c, op, dst_ptr, ld_dst_oc); + } +}; + +template +struct KerNeonXXs2NchwNchw44 { + static void impl( + const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, + int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { + constexpr int stride = 1; + constexpr int filter_height = 2; + constexpr int filter_width = 4; + constexpr int oc_step = 4; + constexpr int loop_ic_step = 1; + constexpr int simd_len = 16; + constexpr int pack_iw_len = 16; + constexpr int src_reg = 8; + constexpr int weight_reg = 1; + + const int ic_stride = ih * iw * pack_iw_len; + const int ld_weight_oc = oc_step * filter_height * filter_width * ic; + constexpr int c_dim = OCHelper::val; + int32x4_t c[c_dim][8]; + init_ocx_ow8(c, bias_ptr, oc_step); + + for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; + int8x16_t src[src_reg]; + int8x16_t dot4_weight[c_dim][weight_reg]; + int16x8_t temp_c[4]; + load_helper( + dot4_weight, weight_ptr, ld_weight_oc); + load_helper( + src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + + load_helper( + dot4_weight, weight_ptr + 1 * filter_width * oc_step, ld_weight_oc); + load_helper( + src, nchw_src_ptr + 1 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + + weight_ptr += oc_step * filter_height * filter_width; + } + + store_ocx_ow8_remain_static_dt( + c, op, dst_ptr, ld_dst_oc); + } +}; + +template +struct KerNeonXXs2NchwNchw44 { + static void impl( + const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, + int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { + constexpr int stride = 1; + 
constexpr int filter_height = 3; + constexpr int filter_width = 4; + constexpr int oc_step = 4; + constexpr int loop_ic_step = 1; + constexpr int simd_len = 16; + constexpr int pack_iw_len = 16; + constexpr int src_reg = 8; + constexpr int weight_reg = 1; + + const int ic_stride = ih * iw * pack_iw_len; + const int ld_weight_oc = oc_step * filter_height * filter_width * ic; + constexpr int c_dim = OCHelper::val; + int32x4_t c[c_dim][8]; + init_ocx_ow8(c, bias_ptr, oc_step); + + for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; + int8x16_t src[src_reg]; + int8x16_t dot4_weight[c_dim][weight_reg]; + int16x8_t temp_c[4]; + load_helper( + dot4_weight, weight_ptr, ld_weight_oc); + + load_helper( + src, nchw_src_ptr + 0 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + load_helper( + dot4_weight, weight_ptr + 1 * filter_width * oc_step, ld_weight_oc); + + load_helper( + src, nchw_src_ptr + 1 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + + load_helper( + dot4_weight, weight_ptr + 2 * filter_width * oc_step, ld_weight_oc); + load_helper( + src, nchw_src_ptr + 2 * iw * pack_iw_len, 0); + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); + + weight_ptr += oc_step * filter_height * filter_width; + } + store_ocx_ow8_remain_static_dt( + c, op, dst_ptr, ld_dst_oc); + } +}; + +template +struct KerNeonXXs2NchwNchw44 { + static void impl( + const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, + int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { + constexpr int stride = 1; + constexpr int filter_height = 5; + constexpr int filter_width = 8; + constexpr int oc_step = 4; + constexpr int loop_ic_step = 1; + constexpr int simd_len = 16; + constexpr int pack_iw_len = 16; + constexpr int src_reg = 8; + constexpr int weight_reg = 2; + + const int ic_stride = ih * iw * pack_iw_len; + const int ld_weight_oc = oc_step * filter_height * filter_width * ic; + constexpr int c_dim = OCHelper::val; + int32x4_t c[c_dim][8]; + init_ocx_ow8(c, bias_ptr, oc_step); + + for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; + int8x16_t src[src_reg]; + int8x16_t dot4_weight[c_dim][weight_reg]; + int16x8_t temp_c[4]; +#define cb(step) \ + load_helper( \ + dot4_weight, weight_ptr + step * filter_width * oc_step, ld_weight_oc); \ + load_helper( \ + src, nchw_src_ptr + step * iw * pack_iw_len, 0); \ + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); \ + load_helper<4, 0, simd_len, 0, Vld1q_s8>( \ + src, nchw_src_ptr + step * iw * pack_iw_len + src_reg * pack_iw_len, 0); \ + cal_helper<4, 1, c_dim, stride>(c, src, dot4_weight, temp_c); + UNROLL_CALL_RAW(5, cb); +#undef cb + weight_ptr += oc_step * filter_height * filter_width; + } + store_ocx_ow8_remain_static_dt( + c, op, dst_ptr, ld_dst_oc); + } +}; + +template +struct KerNeonXXs2NchwNchw44 { + static void impl( + const int8_t* src_ptr, const int8_t* weight_ptr, const int32_t* bias_ptr, + int8_t* dst_ptr, int ic, int ih, int iw, int ld_dst_oc, const Op& op) { + constexpr int stride = 1; + constexpr int filter_height = 7; + constexpr int filter_width = 8; + constexpr int oc_step = 4; + constexpr int loop_ic_step = 1; + constexpr int simd_len = 16; + constexpr int pack_iw_len = 16; + constexpr int src_reg = 8; + constexpr int weight_reg = 2; + + const int ic_stride = ih * iw * pack_iw_len; + const int 
ld_weight_oc = oc_step * filter_height * filter_width * ic; + constexpr int c_dim = OCHelper::val; + int32x4_t c[c_dim][8]; + init_ocx_ow8(c, bias_ptr, oc_step); + + for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { + const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride; + int8x16_t src[src_reg]; + int8x16_t dot4_weight[c_dim][weight_reg]; + int16x8_t temp_c[4]; +#define cb(step) \ + load_helper( \ + dot4_weight, weight_ptr + step * filter_width * oc_step, ld_weight_oc); \ + load_helper( \ + src, nchw_src_ptr + step * iw * pack_iw_len, 0); \ + cal_helper<0, 0, c_dim, stride>(c, src, dot4_weight, temp_c); \ + load_helper<4, 0, simd_len, 0, Vld1q_s8>( \ + src, nchw_src_ptr + step * iw * pack_iw_len + src_reg * pack_iw_len, 0); \ + cal_helper<4, 1, c_dim, stride>(c, src, dot4_weight, temp_c); + + UNROLL_CALL_RAW(7, cb); +#undef cb + weight_ptr += oc_step * filter_height * filter_width; + } + store_ocx_ow8_remain_static_dt( + c, op, dst_ptr, ld_dst_oc); + } +}; +} // namespace + +namespace int8_direct_nchw_nchw44 { +/** + * pack {oc / 4, fh, fw, ic, 4(oc)} to {oc / 4, ic, fh ,fw/4, 4(oc)*4(fw)} + * pack interleave two adjacent row in filter to one row + * */ +template +struct ConvDiectStrideInt8NchwNchw44 { + static void impl( + const int8_t* src, const int8_t* filter, const int32_t* bias, int32_t* temp, + int8_t* dst, const size_t oc, const size_t ic, const size_t ih, + const size_t iw, const size_t oh, const size_t ow, const Op& op) { + MEGDNN_MARK_USED_VAR(temp); + constexpr int stride = 1; + constexpr size_t fh = filter_size; + constexpr size_t fw = (filter_size + 3) / 4 * 4; + constexpr size_t ic_step = 1; + constexpr size_t big_oc_step = 8; + constexpr size_t oc_step = 4; + constexpr size_t ih_step = 1; + constexpr size_t oh_step = 1; + constexpr size_t ow_step = 8; + constexpr size_t stride_h = stride; + constexpr size_t stride_w = stride; + constexpr int pack_iw_len = 16; + + const size_t img_stride = oh * ow; + const size_t ow_end = ow / ow_step * ow_step; + const size_t ow_remain = ow - ow_end; + const size_t oc_end = oc / big_oc_step * big_oc_step; + const size_t oc_remain = oc - oc_end; + const int ld_dst_oc = oc_step * img_stride; + + using remain_fun = std::function; + remain_fun kern_big_oc_remain = nullptr; + remain_fun kern_small_oc_remain = nullptr; + switch (ow_remain) { +#define cb(step) \ + case step: \ + kern_big_oc_remain = KerNeonXXs2NchwNchw44< \ + bias_mode, Op, step, filter_size, big_oc_step, stride>::impl; \ + kern_small_oc_remain = KerNeonXXs2NchwNchw44< \ + bias_mode, Op, step, filter_size, oc_step, stride>::impl; \ + break; + + UNROLL_CALL_RAW(8, cb); + default: + megdnn_assert(0, "no remain %zu for kern", ow_remain); + } + + for (size_t oc_idx = 0; oc_idx < oc_end; oc_idx += big_oc_step) { + const size_t weight_offset = oc_idx * ic * fh * fw; + for (size_t oh_idx = 0; oh_idx < oh; oh_idx += oh_step) { + for (size_t ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { + const size_t src_offset = + (oh_idx * stride_h * iw + ow_idx * stride_w * ih_step) * + ic_step * pack_iw_len; + const size_t dst_offset = + oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; + + KerNeonXXs2NchwNchw44< + bias_mode, Op, ow_step, filter_size, big_oc_step, stride>:: + impl(src + src_offset, filter + weight_offset, + bias + oc_idx, dst + dst_offset, ic, ih, iw, ld_dst_oc, + op); + } + if (ow_remain > 0) { + const size_t src_offset = + (oh_idx * stride_h * iw + ow_end * stride_w * ih_step) * + ic_step * pack_iw_len; + const size_t dst_offset = + oc_idx * img_stride + 
(oh_idx * ow + ow_end) * oc_step; + kern_big_oc_remain( + src + src_offset, filter + weight_offset, bias + oc_idx, + dst + dst_offset, ic, ih, iw, ld_dst_oc, op); + } + } + } + + if (oc_remain > 0) { + size_t oc_idx = oc_end; + const size_t weight_offset = oc_idx * ic * fh * fw; + for (size_t oh_idx = 0; oh_idx < oh; oh_idx += oh_step) { + for (size_t ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { + const size_t src_offset = + (oh_idx * stride_h * iw + ow_idx * stride_w * ih_step) * + ic_step * pack_iw_len; + const size_t dst_offset = + oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step; + KerNeonXXs2NchwNchw44< + bias_mode, Op, ow_step, filter_size, oc_step, stride>:: + impl(src + src_offset, filter + weight_offset, + bias + oc_idx, dst + dst_offset, ic, ih, iw, ld_dst_oc, + op); + } + if (ow_remain > 0) { + const size_t src_offset = + (oh_idx * stride_h * iw + ow_end * stride_w * ih_step) * + ic_step * pack_iw_len; + const size_t dst_offset = + oc_idx * img_stride + (oh_idx * ow + ow_end) * oc_step; + kern_small_oc_remain( + src + src_offset, filter + weight_offset, bias + oc_idx, + dst + dst_offset, ic, ih, iw, ld_dst_oc, op); + } + } + } + } +}; + +#define INSTANCE_CONV_KERN_FUN(stride, filter_size, bias_mode, Op) \ + template struct megdnn::arm_common::int8_direct_nchw_nchw44:: \ + ConvDiectStrideInt8NchwNchw44; + +#define INSTANCE_OP_PARAM(stride, filter, bias_mode) \ + INSTANCE_CONV_KERN_FUN( \ + stride, filter, bias_mode, TypeCvtOp) \ + INSTANCE_CONV_KERN_FUN( \ + stride, filter, bias_mode, ReluOp) \ + INSTANCE_CONV_KERN_FUN( \ + stride, filter, bias_mode, HSwishOp) + +#define INSTANCE_BIAS_MODE_PARAM(stride, filter) \ + INSTANCE_OP_PARAM(stride, filter, BiasMode::NO_BIAS) \ + INSTANCE_OP_PARAM(stride, filter, BiasMode::BROADCAST_CHANNEL_BIAS) + +#define INSTANCE_CONV_KERN(stride, filter) INSTANCE_BIAS_MODE_PARAM(stride, filter) + +} // namespace int8_direct_nchw_nchw44 +} // namespace arm_common +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_1x1.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_1x1.cpp new file mode 100644 index 00000000..14763c96 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_1x1.cpp @@ -0,0 +1,19 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_1x1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */
+#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h"
+using namespace megdnn;
+using namespace arm_common;
+
+INSTANCE_CONV_KERN(1, 1);
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_2x2.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_2x2.cpp
new file mode 100644
index 00000000..10d46268
--- /dev/null
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_2x2.cpp
@@ -0,0 +1,19 @@
+/**
+ * \file
+ * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_2x2.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h"
+using namespace megdnn;
+using namespace arm_common;
+
+INSTANCE_CONV_KERN(1, 2);
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_3x3.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_3x3.cpp
new file mode 100644
index 00000000..87553278
--- /dev/null
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_3x3.cpp
@@ -0,0 +1,19 @@
+/**
+ * \file
+ * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_3x3.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h"
+using namespace megdnn;
+using namespace arm_common;
+
+INSTANCE_CONV_KERN(1, 3);
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_5x5.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_5x5.cpp
new file mode 100644
index 00000000..d7deb345
--- /dev/null
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_5x5.cpp
@@ -0,0 +1,19 @@
+/**
+ * \file
+ * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_5x5.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */ +#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h" +using namespace megdnn; +using namespace arm_common; + +INSTANCE_CONV_KERN(1, 5); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_7x7.cpp b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_7x7.cpp new file mode 100644 index 00000000..37cb7679 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_7x7.cpp @@ -0,0 +1,19 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1_7x7.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.h" +using namespace megdnn; +using namespace arm_common; + +INSTANCE_CONV_KERN(1, 7); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s1.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s1.cpp similarity index 83% rename from dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s1.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s1.cpp index 8b959f54..e6b8576d 100644 --- a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s1.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s1.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s1.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s1.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -12,8 +12,5 @@ */ #include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(2, 1); -INSTANCE_CONV(3, 1); -INSTANCE_CONV(5, 1); -INSTANCE_CONV(7, 1); -// vim: syntax=cpp.doxygen \ No newline at end of file +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s2.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s2.cpp similarity index 83% rename from dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s2.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s2.cpp index 050a7df8..7f78d864 100644 --- a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s2.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s2.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_s2.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_2x2s2.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
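Both kernel families above are reorganised so that each filter size is explicitly instantiated from its own small source file instead of one translation unit expanding every size. A minimal self-contained sketch of that pattern follows; the Kernel template and INSTANCE_KERNEL macro are hypothetical stand-ins for the real ones, and everything is collapsed into one file so it builds as-is.

// Sketch only: a heavy kernel template plus an explicit-instantiation macro.
// In the real tree each INSTANCE_KERNEL(...) line lives in its own .cpp that
// includes the shared header.
#include <cstdio>

template <int filter_size, int stride>
struct Kernel {
    // Stand-in for the real convolution body.
    static void run() { std::printf("filter=%d stride=%d\n", filter_size, stride); }
};

// Mirrors the INSTANCE_CONV_KERN(stride, filter) idea: force the compiler to
// emit code for exactly one specialization in this translation unit.
#define INSTANCE_KERNEL(stride, filter) template struct Kernel<filter, stride>;

INSTANCE_KERNEL(1, 2)  // would be the 2x2, stride-1 source file
INSTANCE_KERNEL(1, 3)  // would be the 3x3, stride-1 source file

int main() {
    Kernel<2, 1>::run();
    Kernel<3, 1>::run();
    return 0;
}

Splitting the expansions this way is the usual way to keep each object file small and let the heavy specializations compile in parallel.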
@@ -12,8 +12,5 @@ */ #include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(2, 2); -INSTANCE_CONV(3, 2); -INSTANCE_CONV(5, 2); -INSTANCE_CONV(7, 2); -// vim: syntax=cpp.doxygen \ No newline at end of file +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s1.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s1.cpp new file mode 100644 index 00000000..a970478d --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s1.cpp @@ -0,0 +1,16 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" +INSTANCE_CONV(3, 1); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s2.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s2.cpp new file mode 100644 index 00000000..532351d8 --- /dev/null +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s2.cpp @@ -0,0 +1,16 @@ +/** + * \file + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_3x3s2.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" +INSTANCE_CONV(3, 2); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s1.cpp similarity index 66% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s1.cpp index 2bd616d6..ffe5decc 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s1.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s1.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s1.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -10,5 +10,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. 
*/ -#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(5, 1); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s2.cpp similarity index 66% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s2.cpp index 8433d0de..7a64dbe4 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s2.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_5x5s2.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_5x5s2.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -10,5 +10,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. */ -#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(5, 2); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s1.cpp similarity index 66% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s1.cpp index deb839a8..154903f9 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s1.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s1.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s1.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -10,5 +10,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. 
*/ -#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(7, 1); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2.cpp b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s2.cpp similarity index 66% rename from dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2.cpp rename to dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s2.cpp index c0a18167..83b9e21b 100644 --- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2.cpp +++ b/dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s2.cpp @@ -1,6 +1,6 @@ /** * \file - * dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_7x7s2.cpp + * dnn/src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl_7x7s2.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -10,5 +10,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. */ -#include "src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h" +#include "src/arm_common/conv_bias/int8x8x16/kernel/direct_nchw_nchw44_kern_impl.h" INSTANCE_CONV(7, 2); + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/dropout.cpp b/dnn/src/common/dropout.cpp new file mode 100644 index 00000000..7327ca99 --- /dev/null +++ b/dnn/src/common/dropout.cpp @@ -0,0 +1,74 @@ +/** + * \file dnn/src/common/dropout.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ + +#include +#include "megdnn/oprs.h" +#include "src/common/utils.h" + +namespace megdnn { + +void DropoutForward::deduce_layout( + const TensorLayout& inp, TensorLayout& oup, TensorLayout& mask) { + oup = inp; + size_t mask_size = get_mask_size_in_bytes(inp); + mask = TensorLayout(TensorShape({mask_size}), dtype::Byte()); +} + +void DropoutForward::check_exec( + const TensorLayout& inp, const TensorLayout& oup, const TensorLayout& mask, + size_t workspace_in_bytes) { + auto errmsg = [&]() { + return megdnn_layout_msg(inp) + ", " + megdnn_layout_msg(oup) + ", " + + megdnn_layout_msg(mask); + }; + MEGDNN_MARK_USED_VAR(errmsg); + + megdnn_assert_contiguous(inp); + megdnn_assert_contiguous(oup); + megdnn_assert_contiguous(mask); + megdnn_assert(inp.eq_layout(oup), "%s", errmsg().c_str()); + megdnn_assert(inp.dtype.category() == DTypeCategory::FLOAT); + + auto required_workspace_in_bytes = get_workspace_in_bytes(inp, oup, mask); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + auto required_mask_size_in_bytes = get_mask_size_in_bytes(inp); + megdnn_assert(mask.total_nr_elems() >= required_mask_size_in_bytes); + megdnn_assert(mask.dtype == dtype::Byte()); +} + +void DropoutBackward::deduce_layout( + const TensorLayout& doup, const TensorLayout&, TensorLayout& dinp) { + dinp = doup; +} + +void DropoutBackward::check_exec( + const TensorLayout& doup, const TensorLayout& mask, const TensorLayout& dinp, + size_t workspace_in_bytes) { + auto errmsg = [&]() { + return megdnn_layout_msg(doup) + ", " + megdnn_layout_msg(mask) + ", " + + megdnn_layout_msg(dinp); + }; + MEGDNN_MARK_USED_VAR(errmsg); + + megdnn_assert_contiguous(doup); + megdnn_assert_contiguous(mask); + megdnn_assert_contiguous(dinp); + megdnn_assert(doup.eq_layout(dinp), "%s", errmsg().c_str()); + + auto required_workspace_in_bytes = get_workspace_in_bytes(doup, mask, dinp); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + megdnn_assert(doup.dtype.category() == DTypeCategory::FLOAT); + megdnn_assert(mask.dtype == dtype::Byte()); + megdnn_assert(mask.ndim == 1); +} + +} // namespace megdnn diff --git a/dnn/src/common/handle_impl.h b/dnn/src/common/handle_impl.h index 7c3e01a1..ff030f25 100644 --- a/dnn/src/common/handle_impl.h +++ b/dnn/src/common/handle_impl.h @@ -209,7 +209,11 @@ private: cb(LSQBackward) \ cb(Fill) \ cb(PaddingForward) \ - cb(PaddingBackward) + cb(PaddingBackward) \ + cb(LayerNormForward) \ + cb(LayerNormBackward) \ + cb(DropoutForward) \ + cb(DropoutBackward) // clang-format on /*! diff --git a/dnn/src/common/layer_norm.cpp b/dnn/src/common/layer_norm.cpp new file mode 100644 index 00000000..44bb16e1 --- /dev/null +++ b/dnn/src/common/layer_norm.cpp @@ -0,0 +1,180 @@ +/** + * \file dnn/src/common/layer_norm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
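DropoutForward::deduce_layout above keeps the output layout identical to the input and shapes the mask as a flat byte tensor whose length the backend reports through get_mask_size_in_bytes. A dependency-free sketch of that contract — the Layout type and the one-byte-per-element sizing are illustrative assumptions, not the megdnn types:

#include <cassert>
#include <cstddef>
#include <vector>

// Toy stand-in for TensorLayout: shape only; dtype is implied by context.
struct Layout {
    std::vector<std::size_t> shape;
    std::size_t total_elems() const {
        std::size_t n = 1;
        for (std::size_t s : shape)
            n *= s;
        return n;
    }
};

// Assumption for the sketch: one mask byte per input element. In the real
// operator the backend decides this via get_mask_size_in_bytes().
std::size_t mask_size_in_bytes(const Layout& inp) {
    return inp.total_elems();
}

void deduce_dropout_layout(const Layout& inp, Layout& oup, Layout& mask) {
    oup = inp;                                 // output mirrors the input layout
    mask = Layout{{mask_size_in_bytes(inp)}};  // flat 1-D byte mask
}

int main() {
    Layout inp{{32, 128}}, oup, mask;
    deduce_dropout_layout(inp, oup, mask);
    assert(oup.shape == inp.shape);
    assert(mask.shape.size() == 1 && mask.shape[0] == 32u * 128u);
    return 0;
}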
+ */ +#include "megdnn/oprs.h" + +#include "src/common/utils.h" + +namespace megdnn { + +void LayerNormBase::deduce_layout_fwd( + const TensorLayout& data, const TensorLayout& weight, const TensorLayout& bias, + TensorLayout& dst, TensorLayout& mean, TensorLayout& rstd) { + MEGDNN_MARK_USED_VAR(weight); + MEGDNN_MARK_USED_VAR(bias); + auto p = param(); + TensorShape unnormalized_shape; + unnormalized_shape.ndim = data.ndim - p.normalized_dim; + for (size_t i = 0; i < unnormalized_shape.ndim; ++i) { + unnormalized_shape.shape[i] = data.shape[i]; + } + TensorLayout unnormalized_layout = + TensorLayout(unnormalized_shape, dtype::Float32()); + dst = data; + mean = unnormalized_layout; + rstd = unnormalized_layout; +} + +void LayerNormBase::check_layout_fwd( + const TensorLayout& data, const TensorLayout& weight, const TensorLayout& bias, + const TensorLayout& dst, const TensorLayout& mean, const TensorLayout& rstd) { + megdnn_assert_contiguous(data); + megdnn_assert_contiguous(weight); + megdnn_assert_contiguous(bias); + megdnn_assert_contiguous(dst); + megdnn_assert_contiguous(mean); + megdnn_assert_contiguous(rstd); + auto errmsg = [&]() { + return megdnn_layout_msg(data) + ", " + megdnn_layout_msg(weight) + ", " + + megdnn_layout_msg(bias) + ", " + megdnn_layout_msg(dst) + ", " + + megdnn_layout_msg(mean) + ", " + megdnn_layout_msg(rstd); + }; + MEGDNN_MARK_USED_VAR(errmsg); + + auto equal_layout = [](const TensorLayout& lhs, const TensorLayout& rhs) -> bool { + if (!(lhs.ndim == rhs.ndim && lhs.dtype == rhs.dtype && + lhs.format == rhs.format)) + return false; + for (size_t i = 0; i < lhs.ndim; ++i) { + if (lhs.shape[i] != rhs.shape[i] || lhs.stride[i] != rhs.stride[i]) { + return false; + } + } + return true; + }; + + megdnn_assert(equal_layout(data, dst), "%s", errmsg().c_str()); + megdnn_assert(equal_layout(weight, bias), "%s", errmsg().c_str()); + megdnn_assert(equal_layout(mean, rstd), "%s", errmsg().c_str()); + + auto p = param(); + uint64_t normalized_dim = p.normalized_dim; + size_t unnormalized_dim = data.ndim - normalized_dim; + megdnn_assert( + normalized_dim < data.ndim, + "the dims of normalized shape should smaller than input dims"); + + for (size_t i = 0; i < unnormalized_dim; ++i) { + megdnn_assert(data.shape[i] == mean.shape[i], "%s", errmsg().c_str()); + } + if (p.affine) { + for (size_t i = 0; i < normalized_dim; ++i) { + megdnn_assert( + data.shape[unnormalized_dim + i] == weight.shape[i], "%s", + errmsg().c_str()); + } + } +} + +void LayerNormForward::deduce_layout( + const TensorLayout& data, const TensorLayout& weight, const TensorLayout& bias, + TensorLayout& dst, TensorLayout& mean, TensorLayout& rstd) { + deduce_layout_fwd(data, weight, bias, dst, mean, rstd); +} + +void LayerNormForward::check_exec( + const TensorLayout& data, const TensorLayout& weight, const TensorLayout& bias, + const TensorLayout& dst, const TensorLayout& mean, const TensorLayout& rstd, + size_t workspace_in_bytes) { + check_layout_fwd(data, weight, bias, dst, mean, rstd); + auto required_workspace_in_bytes = + get_workspace_in_bytes(data, weight, bias, dst, mean, rstd); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); +} + +void LayerNormBackward::deduce_layout( + const TensorLayout& diff, const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& mean, const TensorLayout& rstd, TensorLayout& ddata, + TensorLayout& dweight, TensorLayout& dbias) { + MEGDNN_MARK_USED_VAR(diff); + MEGDNN_MARK_USED_VAR(mean); + MEGDNN_MARK_USED_VAR(rstd); + ddata = data; + 
dweight = weight; + dbias = weight; +} + +void LayerNormBackward::check_exec( + const TensorLayout& diff, const TensorLayout& data, const TensorLayout& weight, + const TensorLayout& mean, const TensorLayout& rstd, const TensorLayout& ddata, + const TensorLayout& dweight, const TensorLayout& dbias, + size_t workspace_in_bytes) { + auto p = param(); + auto required_workspace_in_bytes = get_workspace_in_bytes( + diff, data, weight, mean, rstd, ddata, dweight, dbias); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + + megdnn_assert_contiguous(diff); + megdnn_assert_contiguous(data); + megdnn_assert_contiguous(mean); + megdnn_assert_contiguous(rstd); + megdnn_assert_contiguous(ddata); + if (p.affine) { + megdnn_assert_contiguous(weight); + megdnn_assert_contiguous(dweight); + megdnn_assert_contiguous(dbias); + } + + auto errmsg = [&]() { + return megdnn_layout_msg(diff) + ", " + megdnn_layout_msg(data) + ", " + + megdnn_layout_msg(weight) + ", " + megdnn_layout_msg(mean) + ", " + + megdnn_layout_msg(rstd) + ", " + megdnn_layout_msg(ddata) + ", " + + megdnn_layout_msg(dweight) + ", " + megdnn_layout_msg(dbias); + }; + MEGDNN_MARK_USED_VAR(errmsg); + + auto equal_layout = [](const TensorLayout& lhs, const TensorLayout& rhs) -> bool { + if (!(lhs.ndim == rhs.ndim && lhs.dtype == rhs.dtype && + lhs.format == rhs.format)) + return false; + for (size_t i = 0; i < lhs.ndim; ++i) { + if (lhs.shape[i] != rhs.shape[i] || lhs.stride[i] != rhs.stride[i]) { + return false; + } + } + return true; + }; + + megdnn_assert(equal_layout(data, ddata), "%s", errmsg().c_str()); + megdnn_assert(equal_layout(mean, rstd), "%s", errmsg().c_str()); + if (p.affine) { + megdnn_assert(equal_layout(weight, dweight), "%s", errmsg().c_str()); + megdnn_assert(equal_layout(weight, dbias), "%s", errmsg().c_str()); + } + + size_t normalized_dim = p.normalized_dim; + size_t unnormalized_dim = data.ndim - normalized_dim; + + for (size_t i = 0; i < unnormalized_dim; ++i) { + megdnn_assert(data.shape[i] == mean.shape[i], "%s", errmsg().c_str()); + } + if (p.affine) { + for (size_t i = 0; i < normalized_dim; ++i) { + megdnn_assert( + data.shape[unnormalized_dim + i] == weight.shape[i], "%s", + errmsg().c_str()); + } + } +} + +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/common/opr_trait.h b/dnn/src/common/opr_trait.h index 8999b736..8b6145a4 100644 --- a/dnn/src/common/opr_trait.h +++ b/dnn/src/common/opr_trait.h @@ -135,6 +135,10 @@ DEF(CheckNonFinite, 2, true, true); DEF(LSQForward, 5, true, true); DEF(LSQBackward, 7, true, false); DEF(Fill, 1, true, false); +DEF(LayerNormForward, 6, true, true); +DEF(LayerNormBackward, 8, true, true); +DEF(DropoutForward, 3, true, true); +DEF(DropoutBackward, 3, true, true); } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/src/common/pooling.cpp b/dnn/src/common/pooling.cpp index 5d99b4d8..b2fed318 100644 --- a/dnn/src/common/pooling.cpp +++ b/dnn/src/common/pooling.cpp @@ -93,7 +93,7 @@ void PoolingBase::deduce_layout_fwd(const TensorLayout& src, TensorLayout& dst) size_t ph = this->param().pad_h; size_t pw = this->param().pad_w; if (ph >= fh || pw >= fw) { - megdnn_log_error( + megdnn_log_warn( "pooling padding size (%zu %zu) should not be bigger than " "window size (%zu %zu), it only can be used in CaffePooling", pw, ph, fw, fh); diff --git a/dnn/src/cuda/dropout/opr_impl.cpp b/dnn/src/cuda/dropout/opr_impl.cpp new file mode 100644 index 00000000..d7349114 --- /dev/null +++ b/dnn/src/cuda/dropout/opr_impl.cpp @@ -0,0 +1,118 @@ +/** 
+ * \file dnn/src/cuda/dropout/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/cuda/dropout/opr_impl.h" + +namespace megdnn { +namespace cuda { + +using Param = megdnn::Dropout::Param; + +struct DropoutTensorDesc : public TensorDesc { +public: + DropoutTensorDesc(const TensorLayout& layout) : TensorDesc() { + set_dropout_desc(layout); + } + void set_dropout_desc(const TensorLayout& layout) { + cudnnDataType_t cudnn_dtype; + switch (layout.dtype.enumv()) { + case DTypeEnum::Float32: + cudnn_dtype = CUDNN_DATA_FLOAT; + break; + case DTypeEnum::Float16: + cudnn_dtype = CUDNN_DATA_HALF; + break; + default: + megdnn_throw("dtype must be float16/float32"); + } + cudnn_check(cudnnSetTensor4dDescriptor( + desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, 1, 1, + layout.total_nr_elems())); + } +}; + +size_t DropoutForwardImpl::get_mask_size_in_bytes(const TensorLayout& inp) { + size_t reserve_space_size_in_bytes = 0; + DropoutTensorDesc ddesc(inp); + cudnn_check( + cudnnDropoutGetReserveSpaceSize(ddesc.desc, &reserve_space_size_in_bytes)); + return reserve_space_size_in_bytes; +} + +void DropoutForwardImpl::exec( + _megdnn_tensor_in inp, _megdnn_tensor_out oup, _megdnn_tensor_out mask, + _megdnn_workspace workspace) { + check_exec(inp.layout, oup.layout, mask.layout, workspace.size); + uint64_t seed = param().seed; + float drop_prob = param().drop_prob; + + if (!dropout_status.initialized()) { + dropout_status.set(cudnn_handle(this->handle()), seed, drop_prob); + } + if (dropout_status.drop_prob != drop_prob) { + dropout_status.drop_prob = drop_prob; + dropout_status.restore_desc(cudnn_handle(this->handle())); + } + megdnn_assert(dropout_status.seed == seed); + + DropoutTensorDesc inp_desc(inp.layout), oup_desc(oup.layout); + auto&& op_desc = dropout_status.desc; + + cudnn_check(cudnnDropoutForward( + cudnn_handle(this->handle()), op_desc.desc, inp_desc.desc, inp.raw_ptr(), + oup_desc.desc, oup.raw_ptr(), mask.raw_ptr(), + mask.layout.total_nr_elems())); +} + +void DropoutBackwardImpl::exec( + _megdnn_tensor_in doup, _megdnn_tensor_in mask, _megdnn_tensor_out dinp, + _megdnn_workspace workspace) { + check_exec(doup.layout, mask.layout, dinp.layout, workspace.size); + +#if CUDNN_VERSION >= 7000 + size_t status_size_in_bytes = 0; + cudnn_check(cudnnDropoutGetStatesSize( + cudnn_handle(this->handle()), &status_size_in_bytes)); + + DropoutTensorDesc doup_desc(doup.layout), dinp_desc(dinp.layout); + op_desc.restore( + cudnn_handle(this->handle()), param().drop_prob, nullptr, + status_size_in_bytes, 0); + cudnn_check(cudnnDropoutBackward( + cudnn_handle(this->handle()), op_desc.desc, doup_desc.desc, doup.raw_ptr(), + dinp_desc.desc, dinp.raw_ptr(), mask.raw_ptr(), + mask.layout.total_nr_elems())); +#else + uint64_t seed = param().seed; + float drop_prob = param().drop_prob; + + if (!dropout_status.initialized()) { + dropout_status.set(cudnn_handle(this->handle()), seed, drop_prob); + } + if (dropout_status.drop_prob != drop_prob) { + dropout_status.drop_prob = drop_prob; + dropout_status.restore_desc(cudnn_handle(this->handle())); + } + + auto&& op_desc = dropout_status.desc; + DropoutTensorDesc doup_desc(doup.layout), dinp_desc(dinp.layout); + + 
cudnn_check(cudnnDropoutBackward( + cudnn_handle(this->handle()), op_desc.desc, doup_desc.desc, doup.raw_ptr(), + dinp_desc.desc, dinp.raw_ptr(), mask.raw_ptr(), + mask.layout.total_nr_elems())); +#endif +} + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/dropout/opr_impl.h b/dnn/src/cuda/dropout/opr_impl.h new file mode 100644 index 00000000..4db5df47 --- /dev/null +++ b/dnn/src/cuda/dropout/opr_impl.h @@ -0,0 +1,116 @@ +/** + * \file dnn/src/cuda/dropout/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include "megdnn/oprs.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +class DropoutDesc { +public: + DropoutDesc() { cudnn_check(cudnnCreateDropoutDescriptor(&desc)); } + ~DropoutDesc() { cudnn_check(cudnnDestroyDropoutDescriptor(desc)); } + void set( + cudnnHandle_t handle, void* status, size_t states_size_in_bytes, + uint64_t seed, float drop_prob) { + cudnn_check(cudnnSetDropoutDescriptor( + desc, handle, drop_prob, status, states_size_in_bytes, seed)); + } + void restore( + cudnnHandle_t handle, float drop_prob, void* status, + size_t states_size_in_bytes, uint64_t seed) { +#if CUDNN_VERSION >= 7000 + cudnn_check(cudnnRestoreDropoutDescriptor( + desc, handle, drop_prob, status, states_size_in_bytes, 0)); +#else + // cudnnDropoutRestore is not support when cudnn version < 7000 + // so we set the dropoutDesc rather than restore + cudnn_check(cudnnSetDropoutDescriptor( + desc, handle, drop_prob, status, states_size_in_bytes, seed)); +#endif + } + cudnnDropoutDescriptor_t desc; +}; + +class DropoutStatus { + void* status; + uint64_t status_size; + uint64_t seed; + float drop_prob; + DropoutDesc desc; + +public: + DropoutStatus() { + status = nullptr; + status_size = 0; + } + ~DropoutStatus() { + if (status != nullptr) + cuda_check(cudaFree(status)); + } + void set(cudnnHandle_t handle, uint64_t seed, float drop_prob) { + this->seed = seed; + this->drop_prob = drop_prob; + cudnn_check(cudnnDropoutGetStatesSize(handle, &status_size)); + cuda_check(cudaMalloc(&status, status_size)); + desc.set(handle, status, status_size, seed, drop_prob); + } + void restore_desc(cudnnHandle_t handle) { + desc.restore(handle, drop_prob, status, status_size, seed); + } + bool initialized() { return status != nullptr; } + friend class DropoutForwardImpl; + friend class DropoutBackwardImpl; +}; + +// similar to RNG operator, dropout operator also have status +class DropoutForwardImpl final : public DropoutForward { + DropoutStatus dropout_status; + +public: + using DropoutForward::DropoutForward; + void exec( + _megdnn_tensor_in inp, _megdnn_tensor_out oup, _megdnn_tensor_out mask, + _megdnn_workspace workspace) override; + size_t get_mask_size_in_bytes(const TensorLayout& inp) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +class DropoutBackwardImpl final : public DropoutBackward { +#if CUDNN_VERSION >= 7000 + DropoutDesc op_desc; +#else + // cudnnDropoutRestore is not support when cudnn version < 7000 + // so we need save the dropout status and set the 
dropoutDesc + // rather than restore it + DropoutStatus dropout_status; +#endif + +public: + using DropoutBackward::DropoutBackward; + void exec( + _megdnn_tensor_in doup, _megdnn_tensor_in mask, _megdnn_tensor_out dinp, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/handle.cpp b/dnn/src/cuda/handle.cpp index 39bd56b3..f00006ea 100644 --- a/dnn/src/cuda/handle.cpp +++ b/dnn/src/cuda/handle.cpp @@ -52,7 +52,9 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle) // Get stream from MegCore computing handle. megdnn_assert( CUDNN_VERSION == cudnnGetVersion(), - "cudnn version mismatch: compiled with %d; detected %zu at runtime", + "cudnn version mismatch: compiled with %d; detected %zu at runtime; this " + "may be caused by a customized environment, for example LD_LIBRARY_PATH " + "on Linux or PATH on Windows!", CUDNN_VERSION, cudnnGetVersion()); #if CUDA_VERSION >= 10010 megdnn_assert( diff --git a/dnn/src/cuda/handle_create.cpp b/dnn/src/cuda/handle_create.cpp index 03858f5f..df3c5ccb 100644 --- a/dnn/src/cuda/handle_create.cpp +++ b/dnn/src/cuda/handle_create.cpp @@ -34,6 +34,7 @@ #include "src/cuda/deformable_conv/opr_impl.h" #include "src/cuda/deformable_ps_roi_pooling/opr_impl.h" #include "src/cuda/dot/opr_impl.h" +#include "src/cuda/dropout/opr_impl.h" #include "src/cuda/elemwise/opr_impl.h" #include "src/cuda/elemwise_multi_type/opr_impl.h" #include "src/cuda/eye/opr_impl.h" @@ -45,6 +46,7 @@ #include "src/cuda/images2neibs/opr_impl.h" #include "src/cuda/indexing_multi_axis_vec/opr_impl.h" #include "src/cuda/indexing_one_hot/opr_impl.h" +#include "src/cuda/layer_norm/opr_impl.h" #include "src/cuda/linspace/opr_impl.h" #include "src/cuda/local/opr_impl.h" #include "src/cuda/local_share/opr_impl.h" diff --git a/dnn/src/cuda/layer_norm/layer_norm_cuda.cu b/dnn/src/cuda/layer_norm/layer_norm_cuda.cu new file mode 100644 index 00000000..2cca694a --- /dev/null +++ b/dnn/src/cuda/layer_norm/layer_norm_cuda.cu @@ -0,0 +1,664 @@ +/** + * \file dnn/src/cuda/layer_norm/layer_norm_cuda.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied.
+ */ +#include +#include +#include +#include "megdnn/arch.h" +#include "megdnn/dtype.h" +#include "src/cuda/cuda_shfl_compat.cuh" +#include "src/cuda/layer_norm/layer_norm_cuda.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace layer_norm { + +constexpr int kCUDANumThreads = 256; +constexpr int vec_size = 4; + +// warp size may be used as array length, or used in host function, +// so we define WARP_SIZE rather than using warpSize +#define WARP_SIZE 32 + +#if defined(__clang__) +#define __ubsan_ignore_float_divide_by_zero__ \ + __attribute__((no_sanitize("float-divide-by-zero"))) +#else +#define __ubsan_ignore_float_divide_by_zero__ +#endif + +struct WelfordStat { + float mean; + float sigma2; + float count; + MEGDNN_HOST MEGDNN_DEVICE WelfordStat() : mean(0.f), sigma2(0.f), count(0.f) {} + MEGDNN_HOST MEGDNN_DEVICE WelfordStat(float mean, float sigma2, float count) + : mean(mean), sigma2(sigma2), count(count) {} +}; + +template +struct WelfordData { + T mean; + T sigma2; + combine_t count; + + MEGDNN_HOST MEGDNN_DEVICE WelfordData() : mean(0), sigma2(0), count(0) {} + + MEGDNN_HOST MEGDNN_DEVICE WelfordData(T mean, T sigma2, combine_t count) + : mean(mean), sigma2(sigma2), count(count) {} +}; + +template +struct WelfordOps { +public: + using WelfordData_T = WelfordData; + inline MEGDNN_DEVICE WelfordData_T reduce(WelfordData_T acc, T data) const { + T delta = data - acc.mean; + T new_mean = static_cast(acc.mean + delta / (acc.count + 1)); + T new_delta = static_cast(data - new_mean); + return { + new_mean, + acc.sigma2 + delta * new_delta, + combine_t(acc.count + 1), + }; + } + inline MEGDNN_DEVICE WelfordData_T + combine(WelfordData_T lhs, WelfordData_T rhs) const { + if (lhs.count != 0 && rhs.count != 0) { + T delta = rhs.mean - lhs.mean; + combine_t new_count = lhs.count + rhs.count; + T nb_over_n = rhs.count / new_count; + return {lhs.mean + delta * nb_over_n, + lhs.sigma2 + rhs.sigma2 + delta * delta * lhs.count * nb_over_n, + new_count}; + } else { + return (lhs.count != 0) ? 
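+        // The branch above is Chan et al.'s parallel Welford merge: with
+        //   delta = mean_B - mean_A  and  n = n_A + n_B,
+        //   mean = mean_A + delta * n_B / n
+        //   M2   = M2_A + M2_B + delta^2 * n_A * n_B / n
+        // e.g. merging (mean=1, M2=0, n=2) with (mean=3, M2=0, n=2) gives
+        // (mean=2, M2=4), i.e. variance 1 over the four samples.
+        // When either side is empty, the other partial result is returned: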
lhs : rhs; + } + } + inline MEGDNN_DEVICE res_t + project(WelfordData_T acc) const __ubsan_ignore_float_divide_by_zero__ { + const auto mean = static_cast(acc.mean); + const combine_t divisor = static_cast(acc.count); + const auto var = acc.sigma2 / divisor; + res_t results(var, mean); + return results; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline MEGDNN_DEVICE WelfordData_T + warp_shfl_down(WelfordData_T acc, int offset) const { + return {__shfl_down(acc.mean, offset, warpSize), + __shfl_down(acc.sigma2, offset, warpSize), + __shfl_down(acc.count, offset, warpSize)}; + } +#endif + MEGDNN_HOST MEGDNN_DEVICE WelfordOps() {} +}; + +template +struct alignas(sizeof(T) * vec_size) aligned_vector { + T val[vec_size]; +}; + +template +using acc_type = T; + +template +MEGDNN_DEVICE WelfordStat +update_welford_stat_online(const U val, const WelfordStat& curr_sum) { + U delta = static_cast(val - curr_sum.mean); + U new_count = static_cast(curr_sum.count + 1.f); + U new_mean = static_cast(curr_sum.mean + delta * (1.f / new_count)); + return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count}; +} + +MEGDNN_DEVICE WelfordStat +combine_welford_stat(const WelfordStat lhs, const WelfordStat rhs) { + using U = decltype(lhs.count); + U delta = lhs.mean - rhs.mean; + U count = rhs.count + lhs.count; + U mean, sigma2; + if (count > decltype(lhs.count){0}) { + auto coef = 1.f / count; + auto nA = rhs.count * coef; + auto nB = lhs.count * coef; + mean = nA * rhs.mean + nB * lhs.mean; + sigma2 = rhs.sigma2 + lhs.sigma2 + delta * delta * rhs.count * nB; + } else { + mean = U(0); + sigma2 = U(0); + } + return {mean, sigma2, count}; +} + +template +MEGDNN_DEVICE WelfordStat +compute_stats(const T* __restrict__ X, const int slice_len, float* buf) { + using vec_t = aligned_vector; + using acc_t = acc_type; + const vec_t* X_vec = reinterpret_cast(X); + const int numx = blockDim.x * blockDim.y; + const int thrx = threadIdx.x + threadIdx.y * blockDim.x; + const int n_vec_to_read = slice_len / vec_size; + WelfordStat w_stat(0.f, 0.f, 0.f); + for (int i = thrx; i < n_vec_to_read; i += numx) { + vec_t data = X_vec[i]; +#pragma unroll + for (int ii = 0; ii < vec_size; ii++) { + w_stat = update_welford_stat_online( + static_cast(data.val[ii]), w_stat); + } + } + // intra-warp reduction +#pragma unroll + for (int offset = (warpSize >> 1); offset > 0; offset >>= 1) { + WelfordStat w_tmp{ + __shfl_down(w_stat.mean, offset, warpSize), + __shfl_down(w_stat.sigma2, offset, warpSize), + __shfl_down(w_stat.count, offset, warpSize)}; + w_stat = combine_welford_stat(w_stat, w_tmp); + } + + // threadIdx.x == 0 has correct values for each warp + // inter-warp reductions + if (blockDim.y > 1) { + float* mean_sigma_buf = buf; + float* count_buf = buf + blockDim.y; + for (int offset = blockDim.y / 2; offset > 0; offset /= 2) { + // upper half of warps write to shared + if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) { + const int wrt_y = threadIdx.y - offset; + mean_sigma_buf[2 * wrt_y] = w_stat.mean; + mean_sigma_buf[2 * wrt_y + 1] = w_stat.sigma2; + count_buf[wrt_y] = w_stat.count; + } + __syncthreads(); + + // lower half merges + if (threadIdx.x == 0 && threadIdx.y < offset) { + WelfordStat w_tmp{ + mean_sigma_buf[2 * threadIdx.y], + mean_sigma_buf[2 * threadIdx.y + 1], count_buf[threadIdx.y]}; + w_stat = combine_welford_stat(w_stat, w_tmp); + } + __syncthreads(); + } + if (threadIdx.x == 0 && threadIdx.y == 0) { + mean_sigma_buf[0] = w_stat.mean; + mean_sigma_buf[1] = 
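+            // Store M2 normalized by the slice length, i.e. the (biased)
+            // variance, so callers can form rstd = rsqrt(variance + eps)
+            // without another pass over the data: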
w_stat.sigma2 / float(slice_len); + } + __syncthreads(); + return WelfordStat{mean_sigma_buf[0], mean_sigma_buf[1], 0.f}; + + } else { + return WelfordStat{ + __shfl(w_stat.mean, 0, warpSize), + __shfl(w_stat.sigma2, 0, warpSize) / float(slice_len), 0.f}; + } +} + +template +__global__ void vectorized_layer_norm_forward_affine_kernel( + const int slice_len, T_ACC eps, const T* __restrict__ X, const T* weight, + const T* bias, T_ACC* mean, T_ACC* rstd, T* Y) { + // if we made smem WelfordStat type, there would be bank conflicts, + // as one thread would have to write 3 consecutive floats + extern __shared__ float s_data[]; + + auto slice_id = blockIdx.x; + const T* slice = X + slice_id * slice_len; + WelfordStat slice_w_stat = compute_stats(slice, slice_len, s_data); + using vec_t = aligned_vector; + const vec_t* X_vec = reinterpret_cast(slice); + vec_t* Y_vec = reinterpret_cast(Y + slice_id * slice_len); + const int numx = blockDim.x * blockDim.y; + const int thrx = threadIdx.x + threadIdx.y * blockDim.x; + const int n_vec_to_read = slice_len / vec_size; + T_ACC rstd_val = static_cast(rsqrt(slice_w_stat.sigma2 + eps)); + + for (int i = thrx; i < n_vec_to_read; i += numx) { + vec_t data = X_vec[i]; + vec_t out; + // computation is performed in T_ACC, X is cast to T_ACC and result is + // implicitly cast to T + +#pragma unroll + for (int ii = 0; ii < vec_size; ii++) { + out.val[ii] = static_cast(weight[i * vec_size + ii]) * + (rstd_val * (static_cast(data.val[ii]) - + slice_w_stat.mean)) + + static_cast(bias[i * vec_size + ii]); + } + Y_vec[i] = out; + } + if (thrx == 0) { + mean[slice_id] = slice_w_stat.mean; + rstd[slice_id] = rstd_val; + } +} + +template +__global__ void vectorized_layer_norm_forward_kernel( + const int slice_len, T_ACC eps, const T* __restrict__ X, const T* weight, + const T* bias, T_ACC* mean, T_ACC* rstd, T* Y) { + extern __shared__ float s_data[]; + + auto slice_id = blockIdx.x; + const T* slice = X + slice_id * slice_len; + WelfordStat slice_w_stat = compute_stats(slice, slice_len, s_data); + using vec_t = aligned_vector; + const vec_t* X_vec = reinterpret_cast(slice); + vec_t* Y_vec = reinterpret_cast(Y + slice_id * slice_len); + const int numx = blockDim.x * blockDim.y; + const int thrx = threadIdx.x + threadIdx.y * blockDim.x; + const int n_vec_to_read = slice_len / vec_size; + T_ACC rstd_val = static_cast(rsqrt(slice_w_stat.sigma2 + eps)); + + for (int i = thrx; i < n_vec_to_read; i += numx) { + vec_t data = X_vec[i]; + vec_t out; + +#pragma unroll + for (int ii = 0; ii < vec_size; ii++) { + out.val[ii] = + rstd_val * (static_cast(data.val[ii]) - slice_w_stat.mean); + } + Y_vec[i] = out; + } + if (thrx == 0) { + mean[slice_id] = slice_w_stat.mean; + rstd[slice_id] = rstd_val; + } +} + +template +void launch_vectorized_layer_norm_forward_kernel( + int64_t slice_len, int64_t slice_num, T_ACC eps, const T* X_data, + const T* weight_data, const T* bias_data, T* Y_data, T_ACC* mean_data, + T_ACC* rstd_data, cudaStream_t stream) { + const int num_threads = 128; + const dim3 threads(WARP_SIZE, num_threads / WARP_SIZE, 1); + const dim3 blocks(slice_num); + int nshared = threads.y > 1 ? 
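+    // Shared memory for compute_stats' inter-warp reduction: blockDim.y floats
+    // for the interleaved (mean, sigma2) pairs plus blockDim.y / 2 floats for
+    // the counts written by the upper warps, i.e. 3/2 * threads.y values: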
threads.y * 3 / 2 * sizeof(T_ACC) : 0; + + if (weight_data == nullptr && bias_data == nullptr) { + vectorized_layer_norm_forward_kernel<<>>( + slice_len, eps, X_data, weight_data, bias_data, mean_data, rstd_data, + Y_data); + } else { + vectorized_layer_norm_forward_affine_kernel<<< + blocks, threads, nshared, stream>>>( + slice_len, eps, X_data, weight_data, bias_data, mean_data, rstd_data, + Y_data); + } + after_kernel_launch(); +} + +template +__inline__ MEGDNN_DEVICE T welford_warp_reduce(T val, const ReduceOp& op) { +#pragma unroll + for (int offset = (warpSize >> 1); offset > 0; offset >>= 1) { + val = op.combine(val, op.warp_shfl_down(val, offset)); + } + return val; +} + +template +__inline__ MEGDNN_DEVICE T +welford_block_reduce(T val, const ReduceOp& op, const T& identity_element, T* shared) { + const int lid = threadIdx.x % warpSize; + const int wid = threadIdx.x / warpSize; + val = welford_warp_reduce(val, op); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lid] : identity_element; + if (wid == 0) { + val = welford_warp_reduce(val, op); + } + return val; +} + +template +__global__ void get_input_mean_and_rstd_kernel( + int64_t slice_len, T_ACC eps, const T* X, T_ACC* mean, T_ACC* rstd) { + using WelfordType = WelfordData; + using WelfordOp = WelfordOps>; + + __shared__ typename std::aligned_storage< + sizeof(WelfordType), alignof(WelfordType)>::type val_shared[WARP_SIZE]; + WelfordType* val_shared_ptr = reinterpret_cast(val_shared); + + const int64_t i = blockIdx.x; + WelfordOp welford_op; + WelfordType val( + static_cast(0), static_cast(0), static_cast(0)); + + for (int64_t j = threadIdx.x; j < slice_len; j += blockDim.x) { + const int64_t index = i * slice_len + j; + val = welford_op.reduce(val, static_cast(X[index])); + } + val = welford_block_reduce( + val, welford_op, + WelfordType( + static_cast(0), static_cast(0), + static_cast(0)), + val_shared_ptr); + + if (threadIdx.x == 0) { + T_ACC slice_mean; + T_ACC slice_sigma2; + thrust::tie(slice_sigma2, slice_mean) = welford_op.project(val); + mean[i] = slice_mean; + rstd[i] = rsqrt(slice_sigma2 + eps); + } +} + +template +__global__ void layer_norm_forward_kernel( + int64_t slice_len, const T* X, const T_ACC* mean, const T_ACC* rstd, + const T* weight, const T* bias, T* Y) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < slice_len; j += blockDim.x) { + const int64_t index = i * slice_len + j; + const T_ACC weight_v = + weight == nullptr ? T_ACC(1) : static_cast(weight[j]); + const T_ACC bias_v = bias == nullptr ? 
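+        // Non-vectorized fallback: Y = (X - mean) * rstd * weight + bias,
+        // with weight and bias defaulting to 1 and 0 when no affine transform
+        // is supplied: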
T_ACC(0) : static_cast(bias[j]); + Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]) * weight_v + + bias_v; + } +} + +template +void forward( + T* X, T* weight, T* bias, int64_t slice_num, int64_t slice_len, T_ACC eps, T* Y, + T_ACC* mean, T_ACC* rstd, cudaStream_t stream) { + auto can_vectorize = [&](const T* ptr, int alignment) { + uint64_t addr = reinterpret_cast(ptr); + return addr % alignment == 0; + }; + constexpr int num_vec_elems = vec_size; + constexpr int alignment = num_vec_elems * sizeof(T); + if ((std::is_same::value || std::is_same::value || + std::is_same::value) && + slice_len <= static_cast(1ULL << std::numeric_limits::digits) && + slice_len % num_vec_elems == 0 && can_vectorize(X, alignment) && + can_vectorize(Y, alignment)) { + launch_vectorized_layer_norm_forward_kernel( + slice_len, slice_num, static_cast(eps), X, weight, bias, Y, mean, + rstd, stream); + after_kernel_launch(); + } else { + get_input_mean_and_rstd_kernel + <<>>(slice_len, eps, X, mean, rstd); + after_kernel_launch(); + layer_norm_forward_kernel<<>>( + slice_len, X, mean, rstd, weight, bias, Y); + after_kernel_launch(); + } +} + +template +__inline__ MEGDNN_DEVICE T warp_reduce_sum(T val) { +#pragma unroll + for (int offset = (warpSize >> 1); offset > 0; offset >>= 1) { + val += __shfl_down(val, offset, warpSize); + } + return val; +} + +template +__inline__ MEGDNN_DEVICE T block_reduce_sum(T val, T* shared) { + const int lid = threadIdx.x % warpSize; + const int wid = threadIdx.x / warpSize; + val = warp_reduce_sum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lid] : T(0); + if (wid == 0) { + val = warp_reduce_sum(val); + } + return val; +} + +template +__inline__ MEGDNN_DEVICE void layer_norm_grad_input_kernel_impl( + const T* __restrict__ dY, const T* __restrict__ X, + const T_ACC* __restrict__ mean, const T_ACC* __restrict__ rstd, + const T* __restrict__ weight, T* dX, const int slice_len, T_ACC* buf) { + const auto slice_id = blockIdx.x; + const T_ACC mean_val = mean[slice_id]; + const T_ACC rstd_val = rstd[slice_id]; + T_ACC stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + const T* X_i = X + slice_id * slice_len; + const T* dY_i = dY + slice_id * slice_len; + T* dX_i = dX + slice_id * slice_len; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < slice_len; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T_ACC weight_val = + (weight != nullptr) ? static_cast(weight[l + k]) : T_ACC(1); + const T_ACC c_h = static_cast(X_i[l + k]); + const T_ACC c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * weight_val; + stats_x2 += c_loss * weight_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < slice_len; l++) { + T_ACC weight_val = + (weight != nullptr) ? 
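+    // Tail elements not covered by the unrolled loop above. Both loops
+    // accumulate stats_x1 = sum(dy * w) and
+    // stats_x2 = sum(dy * w * (x - mean) * rstd); the final loop combines them
+    // into dX = rstd / N * (N * w * dy - stats_x1 - (x - mean) * rstd * stats_x2).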
static_cast(weight[l]) : T_ACC(1); + const T_ACC c_h = static_cast(X_i[l]); + const T_ACC c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * weight_val; + stats_x2 += c_loss * weight_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = block_reduce_sum(stats_x1, buf); + stats_x2 = block_reduce_sum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T_ACC fH = slice_len; + T_ACC term1 = (T_ACC(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < slice_len; l += blockDim.x) { + const T_ACC x = X_i[l]; + const T_ACC dy = dY_i[l]; + T_ACC weight_val = + (weight != nullptr) ? static_cast(weight[l]) : T_ACC(1); + T_ACC f_grad_input = fH * weight_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel( + const T* __restrict__ dY, const T* __restrict__ X, + const T_ACC* __restrict__ mean, const T_ACC* __restrict__ rstd, + const T* __restrict__ weight, T* dX, const int slice_len) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T_ACC* buf = reinterpret_cast(&s_data1); + + layer_norm_grad_input_kernel_impl(dY, X, mean, rstd, weight, dX, slice_len, buf); +} + +template +__global__ void layer_norm_grad_weight_bias_simple_kernel( + int64_t slice_num, int64_t slice_len, const T* dY, const T* X, + const T_ACC* mean, const T_ACC* rstd, T* dweight, T* dbias) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < slice_len) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < slice_num; ++i) { + const int64_t index = i * slice_len + j; + sum1 += dweight == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += dbias == nullptr ? 
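+            // Per-column parameter gradients:
+            //   dweight[j] = sum_i dY[i][j] * (X[i][j] - mean[i]) * rstd[i]
+            //   dbias[j]   = sum_i dY[i][j]
+            // sum1 / sum2 accumulate these over all slices i for column j: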
T_ACC(0) : static_cast(dY[index]); + } + if (dweight != nullptr) { + dweight[j] = sum1; + } + if (dbias != nullptr) { + dbias[j] = sum2; + } + } +} + +template +__global__ void layer_norm_grad_weight_bias_kernel( + int64_t slice_num, int64_t slice_len, const T* dY, const T* X, + const T_ACC* mean, const T_ACC* rstd, T* dweight, T* dbias) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T_ACC* s_data_typed = reinterpret_cast(&s_data1); + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int unroll = 8; + T dYs[unroll]; + T Xs[unroll]; + T_ACC* means = s_data_typed; + T_ACC* rstds = s_data_typed + unroll * blockDim.y; + T_ACC dg_sum = 0; + T_ACC db_sum = 0; + if (j < slice_len) { + int bcounter; + for (bcounter = 0; bcounter < slice_num / (blockDim.y * unroll); bcounter++) { + int offset = (bcounter * blockDim.y + threadIdx.y) * unroll; +#pragma unroll + for (int ii = 0; ii < unroll; ii++) { + if (threadIdx.x == 0) { + means[ii * blockDim.y + threadIdx.y] = mean[offset + ii]; + rstds[ii * blockDim.y + threadIdx.y] = rstd[offset + ii]; + } + dYs[ii] = dY[(offset + ii) * slice_len + j]; + Xs[ii] = X[(offset + ii) * slice_len + j]; + } + __syncthreads(); +#pragma unroll + for (int ii = 0; ii < unroll; ii++) { + dg_sum += dYs[ii] * (Xs[ii] - means[ii * blockDim.y + threadIdx.y]) * + rstds[ii * blockDim.y + threadIdx.y]; + db_sum += dYs[ii]; + } + __syncthreads(); + } + int offset = (bcounter * blockDim.y + threadIdx.y) * unroll; + for (int ii = 0; ii < 8; ii++) { + T_ACC mean_val, rstd_val; // we don't use smem in the tail to avoid awkward + // synchronizations, perf penalty is negligible + if ((offset + ii) < slice_num) { + mean_val = mean[offset + ii]; + rstd_val = rstd[offset + ii]; + dYs[0] = dY[(offset + ii) * slice_len + j]; + Xs[0] = X[(offset + ii) * slice_len + j]; + dg_sum += dYs[0] * (Xs[0] - mean_val) * rstd_val; + db_sum += dYs[0]; + } + } + s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] = dg_sum; + s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] = + db_sum; + __syncthreads(); + for (int offset = blockDim.y / 2; offset >= 1; offset /= 2) { + if (threadIdx.y < offset) { + s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] += + s_data_typed[(threadIdx.y + offset) * blockDim.x + threadIdx.x]; + s_data_typed + [blockDim.x * blockDim.y + threadIdx.y * blockDim.x + + threadIdx.x] += s_data_typed + [blockDim.x * blockDim.y + + (threadIdx.y + offset) * blockDim.x + threadIdx.x]; + } + __syncthreads(); + } + if (threadIdx.y == 0) { + if (dweight) { + dweight[j] = s_data_typed[threadIdx.x]; + } + if (dbias) { + dbias[j] = s_data_typed[threadIdx.x + blockDim.x * blockDim.y]; + } + } + } +} + +template +void backward( + const T* dY_data, const T* X_data, const T_ACC* mean_data, + const T_ACC* rstd_data, const T* weight_data, int64_t slice_num, + int64_t slice_len, T* dX_data, T* dweight_data, T* dbias_data, + cudaStream_t stream) { + if (dX_data != nullptr) { + const int num_threads = 128; + const dim3 blocks(slice_num); + int nshared = (num_threads / WARP_SIZE) * sizeof(T_ACC); + layer_norm_grad_input_kernel<<>>( + dY_data, X_data, mean_data, rstd_data, weight_data, dX_data, slice_len); + after_kernel_launch(); + } + if (dweight_data || dbias_data) { + if (slice_num < 512) { + const int64_t B = (slice_len + kCUDANumThreads - 1) / kCUDANumThreads; + layer_norm_grad_weight_bias_simple_kernel + <<>>( + slice_num, slice_len, dY_data, X_data, mean_data, rstd_data, + dweight_data, dbias_data); + after_kernel_launch(); + } else 
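+        // With many slices, the per-column loop in the simple kernel becomes
+        // the bottleneck, so a tiled kernel is used instead: 16x32 thread
+        // blocks stage dY/X in registers, accumulate partial dweight/dbias
+        // sums over slices, and reduce them in shared memory along blockDim.y.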
{ + dim3 threads{16, 32}; + int blocks = (slice_len + threads.x - 1) / threads.x; + layer_norm_grad_weight_bias_kernel + <<>>( + slice_num, slice_len, dY_data, X_data, mean_data, rstd_data, + dweight_data, dbias_data); + after_kernel_launch(); + } + } +} + +#define INST(T, T_ACC) \ + template void forward( \ + T*, T*, T*, int64_t, int64_t, T_ACC, T*, T_ACC*, T_ACC*, cudaStream_t); \ + template void backward( \ + const T*, const T*, const T_ACC*, const T_ACC*, const T*, int64_t, \ + int64_t, T*, T*, T*, cudaStream_t); + +INST(dt_float32, dt_float32) +INST(dt_float16, dt_float32) +INST(dt_bfloat16, dt_float32) +#undef INST + +} // namespace layer_norm +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/layer_norm/layer_norm_cuda.cuh b/dnn/src/cuda/layer_norm/layer_norm_cuda.cuh new file mode 100644 index 00000000..8e14de34 --- /dev/null +++ b/dnn/src/cuda/layer_norm/layer_norm_cuda.cuh @@ -0,0 +1,34 @@ +/** + * \file dnn/src/cuda/layer_norm/layer_norm.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include + +namespace megdnn { +namespace cuda { +namespace layer_norm { + +template +void forward( + T* X, T* gamma, T* beta, int64_t M, int64_t N, T_ACC eps, T* Y, T_ACC* mean, + T_ACC* rstd, cudaStream_t stream); + +template +void backward( + const T* dY_data, const T* X_data, const T_ACC* mean_data, + const T_ACC* rstd_data, const T* gamma_data, int64_t M, int64_t N, T* dX_data, + T* dgamma_data, T* dbeta_data, cudaStream_t stream); + +} // namespace layer_norm +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/layer_norm/opr_impl.cpp b/dnn/src/cuda/layer_norm/opr_impl.cpp new file mode 100644 index 00000000..426de527 --- /dev/null +++ b/dnn/src/cuda/layer_norm/opr_impl.cpp @@ -0,0 +1,94 @@ +/** + * \file dnn/src/cuda/layer_norm/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ + +#include "src/cuda/layer_norm/opr_impl.h" +#include "src/cuda/layer_norm/layer_norm_cuda.cuh" +#include "src/cuda/utils.h" + +namespace megdnn { +namespace cuda { + +void LayerNormForwardImpl::exec( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + _megdnn_workspace workspace) { + check_exec( + data.layout, weight.layout, bias.layout, dst.layout, mean.layout, + rstd.layout, workspace.size); + + auto p = param(); + float eps = p.eps; + bool affine = p.affine; + uint64_t slice_length = p.normalized_size; + uint64_t slice_dim = p.normalized_dim; + uint64_t n_slices = 1; + for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) { + n_slices = n_slices * data.layout.shape[i]; + } + + auto stream = cuda_stream(handle()); + using namespace ::megdnn::cuda::layer_norm; + +#define cb(DType) \ + if (data.layout.dtype == DType()) { \ + using T = typename DTypeTrait::ctype; \ + using T_ACC = float; \ + forward( \ + data.ptr(), affine ? weight.ptr() : nullptr, \ + affine ? bias.ptr() : nullptr, static_cast(n_slices), \ + static_cast(slice_length), static_cast(eps), \ + dst.ptr(), mean.ptr(), rstd.ptr(), stream); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +void LayerNormBackwardImpl::exec( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, + _megdnn_workspace workspace) { + check_exec( + diff.layout, data.layout, weight.layout, mean.layout, rstd.layout, + ddata.layout, dweight.layout, dbias.layout, workspace.size); + auto p = param(); + bool affine = p.affine; + uint64_t slice_length = p.normalized_size; + uint64_t slice_dim = p.normalized_dim; + uint64_t n_slices = 1; + for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) { + n_slices = n_slices * data.layout.shape[i]; + } + + auto stream = cuda_stream(handle()); + using namespace ::megdnn::cuda::layer_norm; +#define cb(DType) \ + if (data.layout.dtype == DType()) { \ + using T = typename DTypeTrait::ctype; \ + using T_ACC = float; \ + backward( \ + diff.ptr(), data.ptr(), mean.ptr(), rstd.ptr(), \ + affine ? weight.ptr() : nullptr, n_slices, slice_length, \ + ddata.ptr(), affine ? dweight.ptr() : nullptr, \ + affine ? dbias.ptr() : nullptr, stream); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +} // namespace cuda +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/layer_norm/opr_impl.h b/dnn/src/cuda/layer_norm/opr_impl.h new file mode 100644 index 00000000..8bca6a75 --- /dev/null +++ b/dnn/src/cuda/layer_norm/opr_impl.h @@ -0,0 +1,53 @@ +/** + * \file dnn/src/cuda/layer_norm/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#pragma once +#include "megdnn/oprs.h" + +#include "src/cuda/cudnn_wrapper.h" + +namespace megdnn { +namespace cuda { + +class LayerNormForwardImpl final : public LayerNormForward { +public: + using LayerNormForward::LayerNormForward; + void exec( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +class LayerNormBackwardImpl final : public LayerNormBackward { +public: + using LayerNormBackward::LayerNormBackward; + void exec( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/elemwise/opr_binary_impl.cpp b/dnn/src/fallback/elemwise/opr_binary_impl.cpp new file mode 100644 index 00000000..9acda94e --- /dev/null +++ b/dnn/src/fallback/elemwise/opr_binary_impl.cpp @@ -0,0 +1,275 @@ +/** + * \file dnn/src/fallback/elemwise/opr_binary_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "./opr_impl.h" + +#include "src/common/elemwise/kern_defs.cuh" +#include "src/common/utils.h" +#include "src/naive/handle.h" + +#include "midout.h" + +MIDOUT_DECL(megdnn_fallback_elemwise_binary) + +namespace megdnn { +namespace fallback { + +template +void ElemwiseImpl::binary_kern(const ElemwiseOpParamN<2>& param) { + using ctype = typename DTypeTrait::ctype; + using Kern = ElemwiseKern; + + MIDOUT_BEGIN(megdnn_fallback_elemwise_binary, ctype, midout_iv(mode)) { + if (param.max_ndim == 1) { + MIDOUT_BEGIN( + megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), + midout_iv(1)) { + auto tot = param.size; + auto as = param[0].layout.stride[0], bs = param[1].layout.stride[0]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); + for (size_t i = 0; i < tot; ++i) { + dst[i] = Kern::apply(a[i * as], b[i * bs]); + } + }); + return; + } + MIDOUT_END(); + } + + if (std::min(param[0].layout.ndim, param[1].layout.ndim) > 1) { + return naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); + } + + if (param.max_ndim == 2) { + if (param[0].layout.ndim == 1) { + MIDOUT_BEGIN( + megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), + midout_iv(21)) { + auto as = param[0].layout.stride[0], + bs0 = param[1].layout.stride[0], + bs1 = param[1].layout.stride[1]; + auto n0 = param[1].layout.shape[0], n1 = param[1].layout.shape[1]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = + static_cast(dst_tensor.raw_ptr()); + ptrdiff_t toff = 0; + for (size_t i = 0; i < n0; ++i) { + for (size_t j = 0; j < n1; ++j) { + dst[toff] = + Kern::apply(a[as * toff], b[bs0 * i + bs1 * j]); + ++toff; + } + } + }); + return; + } + MIDOUT_END(); + } + + MIDOUT_BEGIN( + megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), + midout_iv(22)) { + megdnn_assert(param[1].layout.ndim == 1); + auto bs = param[1].layout.stride[0], as0 = param[0].layout.stride[0], + as1 = param[0].layout.stride[1]; + auto n0 = param[0].layout.shape[0], n1 = param[0].layout.shape[1]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); + ptrdiff_t toff = 0; + for (size_t i = 0; i < n0; ++i) { + for (size_t j = 0; j < n1; ++j) { + dst[toff] = Kern::apply(a[as0 * i + as1 * j], b[toff * bs]); + ++toff; + } + } + }); + return; + } + MIDOUT_END(); + } + + if (param.max_ndim == 3) { + auto brd_101 = [](const TensorND& t) { + auto&& l = t.layout; + return l.ndim == 3 && l.stride[0] == 0 && l.stride[2] == 0; + }; + if (param[0].layout.ndim == 1 && brd_101(param[1])) { + MIDOUT_BEGIN( + megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), + midout_iv(31)) { + auto as = param[0].layout.stride[0], bs = param[1].layout.stride[1]; + auto n0 = param[1].layout.shape[0], n1 = param[1].layout.shape[1], + n2 = param[1].layout.shape[2]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = 
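+                    // brd_101 marks a (1, n1, 1)-broadcast operand: a 3-dim
+                    // layout whose strides on axes 0 and 2 are zero, e.g. a
+                    // per-channel bias combined with an (N, C, spatial) tensor;
+                    // only b[bs * j] varies with the middle index below.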
static_cast(src1.raw_ptr()); + ctype* __restrict dst = + static_cast(dst_tensor.raw_ptr()); + size_t toff = 0; + for (size_t i = 0; i < n0; ++i) { + for (size_t j = 0; j < n1; ++j) { + for (size_t k = 0; k < n2; ++k) { + dst[toff] = Kern::apply(a[as * toff], b[bs * j]); + ++toff; + } + } + } + }); + return; + } + MIDOUT_END(); + } + if (param[1].layout.ndim == 1 && brd_101(param[0])) { + MIDOUT_BEGIN( + megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), + midout_iv(32)) { + auto as = param[0].layout.stride[1], bs = param[1].layout.stride[0]; + auto n0 = param[0].layout.shape[0], n1 = param[0].layout.shape[1], + n2 = param[0].layout.shape[2]; + auto src0 = param[0]; + auto src1 = param[1]; + auto dst_tensor = *m_dst; + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict a = static_cast(src0.raw_ptr()); + ctype* __restrict b = static_cast(src1.raw_ptr()); + ctype* __restrict dst = + static_cast(dst_tensor.raw_ptr()); + size_t toff = 0; + for (size_t i = 0; i < n0; ++i) { + for (size_t j = 0; j < n1; ++j) { + for (size_t k = 0; k < n2; ++k) { + dst[toff] = Kern::apply(a[as * j], b[bs * toff]); + ++toff; + } + } + } + }); + return; + } + MIDOUT_END(); + } + } + + naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); + } + MIDOUT_END(); +} + +#define SWITCH_DTYPE(_cat, _cb) \ + switch (m_dst->layout.dtype.enumv()) { \ + MEGDNN_FOREACH_COMPUTING_DTYPE_##_cat(_cb) default \ + : megdnn_throw("bad dtype"); \ + } + +template +void ElemwiseImpl::exec_BINARY_INT() { + auto param = make_elemwise_op_param<2>(); +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return binary_kern<_dt, mode>(param); + + SWITCH_DTYPE(INT, cb) + +#undef cb +} + +template +void ElemwiseImpl::exec_BINARY_FLOAT() { + auto param = make_elemwise_op_param<2>(); +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return binary_kern<_dt, mode>(param); + + SWITCH_DTYPE(FLOAT, cb) + +#undef cb +} + +#undef SWITCH_DTYPE + +#undef SWITCH_DTYPE +using Mode = param_enumv::Elemwise::Mode; +#define INST(mode) template void megdnn::fallback::ElemwiseImpl::exec_BINARY_INT() +INST(Mode::ABS_GRAD); +INST(Mode::ADD); +INST(Mode::FLOOR_DIV); +INST(Mode::MAX); +INST(Mode::MIN); +INST(Mode::MOD); +INST(Mode::MUL); +INST(Mode::SIGMOID_GRAD); +INST(Mode::SUB); +INST(Mode::SWITCH_GT0); +INST(Mode::TANH_GRAD); +INST(Mode::LT); +INST(Mode::LEQ); +INST(Mode::EQ); +INST(Mode::SHL); +INST(Mode::SHR); +INST(Mode::FUSE_ADD_RELU); +INST(Mode::RMULH); +#undef INST + +#define INST(mode) \ + template void megdnn::fallback::ElemwiseImpl::exec_BINARY_FLOAT() +INST(Mode::ABS_GRAD); +INST(Mode::ADD); +INST(Mode::FLOOR_DIV); +INST(Mode::MAX); +INST(Mode::MIN); +INST(Mode::MOD); +INST(Mode::MUL); +INST(Mode::POW); +INST(Mode::SIGMOID_GRAD); +INST(Mode::SUB); +INST(Mode::SWITCH_GT0); +INST(Mode::TANH_GRAD); +INST(Mode::TRUE_DIV); +INST(Mode::LOG_SUM_EXP); +INST(Mode::LT); +INST(Mode::LEQ); +INST(Mode::EQ); +INST(Mode::FUSE_ADD_RELU); +INST(Mode::FUSE_ADD_SIGMOID); +INST(Mode::FUSE_ADD_TANH); +INST(Mode::FAST_TANH_GRAD); +INST(Mode::ATAN2); +INST(Mode::H_SWISH_GRAD); +INST(Mode::FUSE_ADD_H_SWISH); +INST(Mode::SILU_GRAD); +INST(Mode::GELU_GRAD); +#undef INST +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/elemwise/opr_impl.cpp b/dnn/src/fallback/elemwise/opr_impl.cpp index a9fd7815..eb4b2d9c 100644 --- a/dnn/src/fallback/elemwise/opr_impl.cpp +++ b/dnn/src/fallback/elemwise/opr_impl.cpp @@ -16,8 +16,6 @@ #include "midout.h" -MIDOUT_DECL(megdnn_fallback_elemwise_unary) 
-MIDOUT_DECL(megdnn_fallback_elemwise_binary) MIDOUT_DECL(megdnn_fallback_elemwise_exec_UNARY_INT) MIDOUT_DECL(megdnn_fallback_elemwise_exec_UNARY_FLOAT) MIDOUT_DECL(megdnn_fallback_elemwise_exec_BINARY_INT) @@ -26,200 +24,6 @@ MIDOUT_DECL(megdnn_fallback_elemwise_exec_BINARY_FLOAT) namespace megdnn { namespace fallback { -template -void ElemwiseImpl::unary_kern(const ElemwiseOpParamN<1>& param) { - using ctype = typename DTypeTrait::ctype; - using Kern = ElemwiseKern; - MIDOUT_BEGIN(megdnn_fallback_elemwise_unary, ctype, midout_iv(mode)) { - // only specialize for the most common 1-dim case - auto tot = param.size; - auto stride = param[0].layout.stride[0]; - auto src0 = param[0]; - auto dst_tensor = *m_dst; - if (param.max_ndim == 1) { - MIDOUT_BEGIN( - megdnn_fallback_elemwise_unary, ctype, midout_iv(mode), - midout_iv(1)) { - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict src = static_cast(src0.raw_ptr()); - ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); - for (size_t i = 0; i < tot; ++i) { - dst[i] = Kern::apply(src[i * stride]); - } - }); - return; - } - MIDOUT_END(); - } - naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); - } - MIDOUT_END(); -} - -template -void ElemwiseImpl::binary_kern(const ElemwiseOpParamN<2>& param) { - using ctype = typename DTypeTrait::ctype; - using Kern = ElemwiseKern; - - MIDOUT_BEGIN(megdnn_fallback_elemwise_binary, ctype, midout_iv(mode)) { - if (param.max_ndim == 1) { - MIDOUT_BEGIN( - megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), - midout_iv(1)) { - auto tot = param.size; - auto as = param[0].layout.stride[0], bs = param[1].layout.stride[0]; - auto src0 = param[0]; - auto src1 = param[1]; - auto dst_tensor = *m_dst; - - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict a = static_cast(src0.raw_ptr()); - ctype* __restrict b = static_cast(src1.raw_ptr()); - ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); - for (size_t i = 0; i < tot; ++i) { - dst[i] = Kern::apply(a[i * as], b[i * bs]); - } - }); - return; - } - MIDOUT_END(); - } - - if (std::min(param[0].layout.ndim, param[1].layout.ndim) > 1) { - return naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); - } - - if (param.max_ndim == 2) { - if (param[0].layout.ndim == 1) { - MIDOUT_BEGIN( - megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), - midout_iv(21)) { - auto as = param[0].layout.stride[0], - bs0 = param[1].layout.stride[0], - bs1 = param[1].layout.stride[1]; - auto n0 = param[1].layout.shape[0], n1 = param[1].layout.shape[1]; - auto src0 = param[0]; - auto src1 = param[1]; - auto dst_tensor = *m_dst; - - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict a = static_cast(src0.raw_ptr()); - ctype* __restrict b = static_cast(src1.raw_ptr()); - ctype* __restrict dst = - static_cast(dst_tensor.raw_ptr()); - ptrdiff_t toff = 0; - for (size_t i = 0; i < n0; ++i) { - for (size_t j = 0; j < n1; ++j) { - dst[toff] = - Kern::apply(a[as * toff], b[bs0 * i + bs1 * j]); - ++toff; - } - } - }); - return; - } - MIDOUT_END(); - } - - MIDOUT_BEGIN( - megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), - midout_iv(22)) { - megdnn_assert(param[1].layout.ndim == 1); - auto bs = param[1].layout.stride[0], as0 = param[0].layout.stride[0], - as1 = param[0].layout.stride[1]; - auto n0 = param[0].layout.shape[0], n1 = param[0].layout.shape[1]; - auto src0 = param[0]; - auto src1 = param[1]; - auto dst_tensor = *m_dst; - - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict a = static_cast(src0.raw_ptr()); - ctype* __restrict b = static_cast(src1.raw_ptr()); - ctype* 
__restrict dst = static_cast(dst_tensor.raw_ptr()); - ptrdiff_t toff = 0; - for (size_t i = 0; i < n0; ++i) { - for (size_t j = 0; j < n1; ++j) { - dst[toff] = Kern::apply(a[as0 * i + as1 * j], b[toff * bs]); - ++toff; - } - } - }); - return; - } - MIDOUT_END(); - } - - if (param.max_ndim == 3) { - auto brd_101 = [](const TensorND& t) { - auto&& l = t.layout; - return l.ndim == 3 && l.stride[0] == 0 && l.stride[2] == 0; - }; - if (param[0].layout.ndim == 1 && brd_101(param[1])) { - MIDOUT_BEGIN( - megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), - midout_iv(31)) { - auto as = param[0].layout.stride[0], bs = param[1].layout.stride[1]; - auto n0 = param[1].layout.shape[0], n1 = param[1].layout.shape[1], - n2 = param[1].layout.shape[2]; - auto src0 = param[0]; - auto src1 = param[1]; - auto dst_tensor = *m_dst; - - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict a = static_cast(src0.raw_ptr()); - ctype* __restrict b = static_cast(src1.raw_ptr()); - ctype* __restrict dst = - static_cast(dst_tensor.raw_ptr()); - size_t toff = 0; - for (size_t i = 0; i < n0; ++i) { - for (size_t j = 0; j < n1; ++j) { - for (size_t k = 0; k < n2; ++k) { - dst[toff] = Kern::apply(a[as * toff], b[bs * j]); - ++toff; - } - } - } - }); - return; - } - MIDOUT_END(); - } - if (param[1].layout.ndim == 1 && brd_101(param[0])) { - MIDOUT_BEGIN( - megdnn_fallback_elemwise_binary, ctype, midout_iv(mode), - midout_iv(32)) { - auto as = param[0].layout.stride[1], bs = param[1].layout.stride[0]; - auto n0 = param[0].layout.shape[0], n1 = param[0].layout.shape[1], - n2 = param[0].layout.shape[2]; - auto src0 = param[0]; - auto src1 = param[1]; - auto dst_tensor = *m_dst; - MEGDNN_DISPATCH_CPU_KERN_OPR({ - ctype* __restrict a = static_cast(src0.raw_ptr()); - ctype* __restrict b = static_cast(src1.raw_ptr()); - ctype* __restrict dst = - static_cast(dst_tensor.raw_ptr()); - size_t toff = 0; - for (size_t i = 0; i < n0; ++i) { - for (size_t j = 0; j < n1; ++j) { - for (size_t k = 0; k < n2; ++k) { - dst[toff] = Kern::apply(a[as * j], b[bs * toff]); - ++toff; - } - } - } - }); - return; - } - MIDOUT_END(); - } - } - - naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); - } - MIDOUT_END(); -} - void ElemwiseImpl::exec(const TensorNDArray& srcs, _megdnn_tensor_out dst) { if (!dst.layout.is_contiguous()) { return naive::ElemwiseForwardImpl::exec(srcs, dst); @@ -278,62 +82,6 @@ void ElemwiseImpl::exec(const TensorNDArray& srcs, _megdnn_tensor_out dst) { naive::ElemwiseForwardImpl::exec(srcs, dst); } -#define SWITCH_DTYPE(_cat, _cb) \ - switch (m_dst->layout.dtype.enumv()) { \ - MEGDNN_FOREACH_COMPUTING_DTYPE_##_cat(_cb) default \ - : megdnn_throw("bad dtype"); \ - } - -template -void ElemwiseImpl::exec_UNARY_INT() { - auto param = make_elemwise_op_param<1>(); -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return unary_kern<_dt, mode>(param); - - SWITCH_DTYPE(INT, cb) - -#undef cb -} - -template -void ElemwiseImpl::exec_UNARY_FLOAT() { - auto param = make_elemwise_op_param<1>(); -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return unary_kern<_dt, mode>(param); - - SWITCH_DTYPE(FLOAT, cb) - -#undef cb -} - -template -void ElemwiseImpl::exec_BINARY_INT() { - auto param = make_elemwise_op_param<2>(); -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return binary_kern<_dt, mode>(param); - - SWITCH_DTYPE(INT, cb) - -#undef cb -} - -template -void ElemwiseImpl::exec_BINARY_FLOAT() { - auto param = make_elemwise_op_param<2>(); -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return binary_kern<_dt, 
mode>(param); - - SWITCH_DTYPE(FLOAT, cb) - -#undef cb -} - -#undef SWITCH_DTYPE - } // namespace fallback } // namespace megdnn diff --git a/dnn/src/fallback/elemwise/opr_unary_impl.cpp b/dnn/src/fallback/elemwise/opr_unary_impl.cpp new file mode 100644 index 00000000..af829358 --- /dev/null +++ b/dnn/src/fallback/elemwise/opr_unary_impl.cpp @@ -0,0 +1,122 @@ +/** + * \file dnn/src/fallback/elemwise/opr_unary_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "./opr_impl.h" + +#include "src/common/elemwise/kern_defs.cuh" +#include "src/common/utils.h" +#include "src/naive/handle.h" + +#include "midout.h" + +MIDOUT_DECL(megdnn_fallback_elemwise_unary) + +namespace megdnn { +namespace fallback { + +template +void ElemwiseImpl::unary_kern(const ElemwiseOpParamN<1>& param) { + using ctype = typename DTypeTrait::ctype; + using Kern = ElemwiseKern; + MIDOUT_BEGIN(megdnn_fallback_elemwise_unary, ctype, midout_iv(mode)) { + // only specialize for the most common 1-dim case + auto tot = param.size; + auto stride = param[0].layout.stride[0]; + auto src0 = param[0]; + auto dst_tensor = *m_dst; + if (param.max_ndim == 1) { + MIDOUT_BEGIN( + megdnn_fallback_elemwise_unary, ctype, midout_iv(mode), + midout_iv(1)) { + MEGDNN_DISPATCH_CPU_KERN_OPR({ + ctype* __restrict src = static_cast(src0.raw_ptr()); + ctype* __restrict dst = static_cast(dst_tensor.raw_ptr()); + for (size_t i = 0; i < tot; ++i) { + dst[i] = Kern::apply(src[i * stride]); + } + }); + return; + } + MIDOUT_END(); + } + naive::ElemwiseForwardImpl::exec(*m_src, *m_dst); + } + MIDOUT_END(); +} + +#define SWITCH_DTYPE(_cat, _cb) \ + switch (m_dst->layout.dtype.enumv()) { \ + MEGDNN_FOREACH_COMPUTING_DTYPE_##_cat(_cb) default \ + : megdnn_throw("bad dtype"); \ + } + +template +void ElemwiseImpl::exec_UNARY_INT() { + auto param = make_elemwise_op_param<1>(); +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return unary_kern<_dt, mode>(param); + + SWITCH_DTYPE(INT, cb) + +#undef cb +} + +template +void ElemwiseImpl::exec_UNARY_FLOAT() { + auto param = make_elemwise_op_param<1>(); +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: \ + return unary_kern<_dt, mode>(param); + + SWITCH_DTYPE(FLOAT, cb) + +#undef cb +} + +#undef SWITCH_DTYPE +using Mode = param_enumv::Elemwise::Mode; +#define INST(mode) template void megdnn::fallback::ElemwiseImpl::exec_UNARY_INT(); +INST(Mode::RELU); +INST(Mode::ABS); +INST(Mode::NEGATE); +#undef INST + +#define INST(mode) \ + template void megdnn::fallback::ElemwiseImpl::exec_UNARY_FLOAT(); +INST(Mode::RELU); +INST(Mode::ABS); +INST(Mode::ACOS); +INST(Mode::ASIN); +INST(Mode::CEIL); +INST(Mode::COS); +INST(Mode::EXP); +INST(Mode::EXPM1); +INST(Mode::FLOOR); +INST(Mode::LOG); +INST(Mode::LOG1P); +INST(Mode::NEGATE); +INST(Mode::SIGMOID); +INST(Mode::SIN); +INST(Mode::TANH); +INST(Mode::FAST_TANH); +INST(Mode::ROUND); +INST(Mode::ERF); +INST(Mode::ERFINV); +INST(Mode::ERFC); +INST(Mode::ERFCINV); +INST(Mode::H_SWISH); +INST(Mode::SILU); +INST(Mode::GELU); +#undef INST +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/dropout/opr_impl.cpp b/dnn/src/naive/dropout/opr_impl.cpp new file mode 100644 index 00000000..64b359d4 --- 
/dev/null +++ b/dnn/src/naive/dropout/opr_impl.cpp @@ -0,0 +1,110 @@ +/** + * \file dnn/src/naive/dropout/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/naive/dropout/opr_impl.h" +#include +#include +#include +#include "src/common/utils.h" +#include "src/naive/handle.h" + +using namespace megdnn; +using namespace naive; +using namespace std; +namespace { + +using Param = megdnn::Dropout::Param; + +dt_float32 get_random_number(uint64_t x) { + union { + uint32_t i; + dt_float32 f; + } u; + u.i = (0x7F << 23) | (x >> 41); + return 2 - u.f; +} + +template +void forward( + T* inp, T* oup, void* raw_reserved, size_t len, Xoroshiro128plus& rng, + float drop_prob) { + uint8_t* reserved = reinterpret_cast(raw_reserved); + float scale = 1.0f / (1.0f - drop_prob); + for (size_t i = 0; i < len; ++i) { + float rn = get_random_number(rng()); + reserved[i] = rn < drop_prob ? 0 : 1; + oup[i] = static_cast(reserved[i] ? static_cast(inp[i]) * scale : 0.f); + } +} + +template +void backward(T* doup, T* dinp, void* raw_reserved, size_t len, float drop_prob) { + uint8_t* reserved = reinterpret_cast(raw_reserved); + float scale = 1.0f / (1.0f - drop_prob); + for (size_t i = 0; i < len; ++i) { + dinp[i] = + static_cast(reserved[i] ? static_cast(doup[i]) * scale : 0.f); + } +} + +} // namespace + +namespace megdnn { +namespace naive { + +size_t DropoutForwardImpl::get_mask_size_in_bytes(const TensorLayout& inp) { + return inp.total_nr_elems(); +} + +void DropoutForwardImpl::exec( + _megdnn_tensor_in inp, _megdnn_tensor_out oup, _megdnn_tensor_out mask, + _megdnn_workspace workspace) { + check_exec(inp.layout, oup.layout, mask.layout, workspace.size); + size_t length = inp.layout.total_nr_elems(); + uint64_t seed = param().seed; + + m_rng.ensure_seed(seed); + +#define cb(DType) \ + if (inp.layout.dtype == DType()) { \ + using T = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(forward( \ + inp.ptr(), oup.ptr(), mask.raw_ptr(), length, m_rng, \ + param().drop_prob)); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +void DropoutBackwardImpl::exec( + _megdnn_tensor_in doup, _megdnn_tensor_in mask, _megdnn_tensor_out dinp, + _megdnn_workspace workspace) { + check_exec(doup.layout, mask.layout, dinp.layout, workspace.size); + size_t length = doup.layout.total_nr_elems(); + +#define cb(DType) \ + if (doup.layout.dtype == DType()) { \ + using T = typename DTypeTrait::ctype; \ + MEGDNN_DISPATCH_CPU_KERN_OPR(backward( \ + doup.ptr(), dinp.ptr(), mask.raw_ptr(), length, \ + param().drop_prob)); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +} // namespace naive +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/dropout/opr_impl.h b/dnn/src/naive/dropout/opr_impl.h new file mode 100644 index 00000000..f40de4e2 --- /dev/null +++ b/dnn/src/naive/dropout/opr_impl.h @@ -0,0 +1,49 @@ +/** + * \file dnn/src/naive/dropout/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include "megdnn/oprs.h" +#include "src/naive/rng/opr_impl.h" + +namespace megdnn { +namespace naive { + +class DropoutForwardImpl final : public DropoutForward { + Xoroshiro128plus m_rng; + +public: + using DropoutForward::DropoutForward; + void exec( + _megdnn_tensor_in inp, _megdnn_tensor_out oup, _megdnn_tensor_out mask, + _megdnn_workspace workspace) override; + size_t get_mask_size_in_bytes(const TensorLayout& inp) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +class DropoutBackwardImpl final : public DropoutBackward { +public: + using DropoutBackward::DropoutBackward; + void exec( + _megdnn_tensor_in doup, _megdnn_tensor_in mask, _megdnn_tensor_out dinp, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +} // namespace naive +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/elemwise_multi_type/opr_impl_1.cpp b/dnn/src/naive/elemwise_multi_type/opr_impl_1.cpp new file mode 100644 index 00000000..1c135917 --- /dev/null +++ b/dnn/src/naive/elemwise_multi_type/opr_impl_1.cpp @@ -0,0 +1,138 @@ +/** + * \file dnn/src/naive/elemwise_multi_type/opr_impl_1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "./opr_impl.h" +#include "megdnn/tensor_iter.h" +#include "src/common/elemwise/kern_defs.cuh" +#include "src/common/elemwise_multi_type/kern_defs.cuh" +#include "src/naive/handle.h" + +using namespace megdnn; +using namespace naive; + +void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32( + const ElemwiseOpParamN<3>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [src0, src1, src2, size, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + dst_ptr[i] = (*i0) * (*i1) + (*i2); + ++i0; + ++i1; + ++i2; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16xf32xf32xf32( + const ElemwiseOpParamN<3>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [src0, src1, src2, size, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + dst_ptr[i] = (*i0) * (*i1) + (*i2); + ++i0; + ++i1; + ++i2; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +void ElemwiseMultiTypeImpl::on_fuse_mul_add3_uint8xf32xf32xf32( + const ElemwiseOpParamN<3>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [src0, src1, src2, size, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + dst_ptr[i] = (*i0) * (*i1) + (*i2); + ++i0; + ++i1; + ++i2; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +void ElemwiseMultiTypeImpl::on_mul_int16xf32xf32( + const ElemwiseOpParamN<2>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto work = [src0, src1, size, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + dst_ptr[i] = (*i0) * (*i1); + ++i0; + ++i1; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( + const ElemwiseOpParamN<3>& param, const TensorND& dst) { + switch (param[0].layout.dtype.enumv()) { +#define cb(t) \ + case DTypeTrait::enumv: \ + return dispatch_fma3_iXxf32xf32xi8::ctype>(param, dst); + MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) +#undef cb + default: + megdnn_throw("unsupported src dtype"); + } +} + +template +void ElemwiseMultiTypeImpl::dispatch_fma3_iXxf32xf32xi8( + const ElemwiseOpParamN<3>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto work = [src0, src1, src2, size, dst]() { + elemwise_multi_type::Fma3iXxf32xf32xiYOp op; + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + dst_ptr[i] = op(*i0, *i1, *i2); + ++i0; + ++i1; + ++i2; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +// vim: 
syntax=cpp.doxygen diff --git a/dnn/src/naive/elemwise_multi_type/opr_impl_2.cpp b/dnn/src/naive/elemwise_multi_type/opr_impl_2.cpp new file mode 100644 index 00000000..45861c78 --- /dev/null +++ b/dnn/src/naive/elemwise_multi_type/opr_impl_2.cpp @@ -0,0 +1,115 @@ +/** + * \file dnn/src/naive/elemwise_multi_type/opr_impl_2.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./opr_impl.h" +#include "megdnn/tensor_iter.h" +#include "src/common/elemwise/kern_defs.cuh" +#include "src/common/elemwise_multi_type/kern_defs.cuh" +#include "src/naive/handle.h" + +using namespace megdnn; +using namespace naive; + +void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( + const ElemwiseOpParamN<2>& param, const TensorND& dst) { + switch (param[0].layout.dtype.enumv()) { +#define cb(t) \ + case DTypeTrait::enumv: \ + return dispatch_round_shr_saturate_iXxi8xiX::ctype, dt_int8>( \ + param, dst); + MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) +#undef cb + default: + megdnn_throw("unsupported src dtype"); + } +} + +template +void ElemwiseMultiTypeImpl::dispatch_round_shr_saturate_iXxi8xiX( + const ElemwiseOpParamN<2>& param, const TensorND& dst) { + auto src0 = param[0]; + auto src1 = param[1]; + auto size = param.size; + auto work = [src0, src1, size, dst]() { + // This is needed as these iterators are captured as const value. + auto iA = tensor_iter_valonly(src0).begin(); + auto iB = tensor_iter_valonly(src1).begin(); + auto pD = dst.ptr(); + for (size_t i = 0; i < size; i++) { + *pD = elemwise_multi_type::round_shr_saturate(*iA, *iB); + ++iA; + ++iB; + ++pD; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} +template +void ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_round_shr_saturate( + const ElemwiseOpParamN<6>& param, const TensorND& dst) { + auto size = param.size; + auto src0 = param[0]; + auto src1 = param[1]; + auto src2 = param[2]; + auto src3 = param[3]; + auto src4 = param[4]; + auto src5 = param[5]; + auto work = [size, src0, src1, src2, src3, src4, src5, dst]() { + auto i0 = tensor_iter_valonly(src0).begin(); + auto i1 = tensor_iter_valonly(src1).begin(); + auto i2 = tensor_iter_valonly(src2).begin(); + auto ioff = tensor_iter_valonly(src3).begin(); + auto imin = tensor_iter_valonly(src4).begin(); + auto imax = tensor_iter_valonly(src5).begin(); + auto dst_ptr = dst.ptr(); + for (size_t i = 0; i < size; ++i) { + auto res = elemwise_multi_type::round_shr_saturate( + round_mulh_saturate(*i0 + *i1, *i2), *ioff); + res = std::min(res, *imax); + res = std::max(res, *imin); + dst_ptr[i] = res; + ++i0; + ++i1; + ++i2; + ++ioff; + ++imin; + ++imax; + } + }; + MEGDNN_DISPATCH_CPU_KERN_OPR(work()); +} + +void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( + const ElemwiseOpParamN<6>& param, const TensorND& dst) { + dispatch_fuse_add_rmulh_round_shr_saturate(param, dst); +} + +void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( + const ElemwiseOpParamN<6>& param, const TensorND& dst) { + dispatch_fuse_add_rmulh_round_shr_saturate(param, dst); +} + +void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16( + const ElemwiseOpParamN<2>& param, const TensorND& dst) { + switch (param[0].layout.dtype.enumv()) { 
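The integer kernels above are built on a rounding right shift followed by saturation to the narrower destination type. A rough NumPy sketch of that scalar step (assuming round-half-away-from-zero rounding; megdnn's exact tie handling may differ):

    import numpy as np

    def round_shr_saturate(x, shift, dtype=np.int8):
        # right shift by `shift` with rounding, then clamp into the destination range
        x = np.asarray(x, dtype=np.int64)
        if shift > 0:
            half = np.where(x >= 0, 1, -1) * (1 << (shift - 1))
            x = (x + half) >> shift
        info = np.iinfo(dtype)
        return np.clip(x, info.min, info.max).astype(dtype)
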
+#define cb(t) \ + case DTypeTrait::enumv: \ + return dispatch_round_shr_saturate_iXxi8xiX::ctype, dt_int16>( \ + param, dst); + cb(::megdnn::dtype::Int32); + cb(::megdnn::dtype::Int16); +#undef cb + default: + megdnn_throw("unsupported src dtype"); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/elemwise_multi_type/opr_impl.cpp b/dnn/src/naive/elemwise_multi_type/opr_impl_3.cpp similarity index 56% rename from dnn/src/naive/elemwise_multi_type/opr_impl.cpp rename to dnn/src/naive/elemwise_multi_type/opr_impl_3.cpp index 502cef74..67c21b12 100644 --- a/dnn/src/naive/elemwise_multi_type/opr_impl.cpp +++ b/dnn/src/naive/elemwise_multi_type/opr_impl_3.cpp @@ -1,5 +1,5 @@ /** - * \file dnn/src/naive/elemwise_multi_type/opr_impl.cpp + * \file dnn/src/naive/elemwise_multi_type/opr_impl_3.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -18,218 +18,6 @@ using namespace megdnn; using namespace naive; -void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16x32x32x32( - const ElemwiseOpParamN<3>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto src2 = param[2]; - auto work = [src0, src1, src2, size, dst]() { - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto i2 = tensor_iter_valonly(src2).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - dst_ptr[i] = (*i0) * (*i1) + (*i2); - ++i0; - ++i1; - ++i2; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_fuse_mul_add3_int16xf32xf32xf32( - const ElemwiseOpParamN<3>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto src2 = param[2]; - auto work = [src0, src1, src2, size, dst]() { - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto i2 = tensor_iter_valonly(src2).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - dst_ptr[i] = (*i0) * (*i1) + (*i2); - ++i0; - ++i1; - ++i2; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_fuse_mul_add3_uint8xf32xf32xf32( - const ElemwiseOpParamN<3>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto src2 = param[2]; - auto work = [src0, src1, src2, size, dst]() { - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto i2 = tensor_iter_valonly(src2).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - dst_ptr[i] = (*i0) * (*i1) + (*i2); - ++i0; - ++i1; - ++i2; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_mul_int16xf32xf32( - const ElemwiseOpParamN<2>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto work = [src0, src1, size, dst]() { - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - dst_ptr[i] = (*i0) * (*i1); - ++i0; - ++i1; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_fuse_mul_add3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, const TensorND& dst) { - switch (param[0].layout.dtype.enumv()) { -#define cb(t) \ - case DTypeTrait::enumv: \ - return dispatch_fma3_iXxf32xf32xi8::ctype>(param, dst); - 
MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) -#undef cb - default: - megdnn_throw("unsupported src dtype"); - } -} - -template -void ElemwiseMultiTypeImpl::dispatch_fma3_iXxf32xf32xi8( - const ElemwiseOpParamN<3>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto src2 = param[2]; - auto work = [src0, src1, src2, size, dst]() { - elemwise_multi_type::Fma3iXxf32xf32xiYOp op; - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto i2 = tensor_iter_valonly(src2).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - dst_ptr[i] = op(*i0, *i1, *i2); - ++i0; - ++i1; - ++i2; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi8( - const ElemwiseOpParamN<2>& param, const TensorND& dst) { - switch (param[0].layout.dtype.enumv()) { -#define cb(t) \ - case DTypeTrait::enumv: \ - return dispatch_round_shr_saturate_iXxi8xiX::ctype, dt_int8>( \ - param, dst); - MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb) -#undef cb - default: - megdnn_throw("unsupported src dtype"); - } -} - -template -void ElemwiseMultiTypeImpl::dispatch_round_shr_saturate_iXxi8xiX( - const ElemwiseOpParamN<2>& param, const TensorND& dst) { - auto src0 = param[0]; - auto src1 = param[1]; - auto size = param.size; - auto work = [src0, src1, size, dst]() { - // This is needed as these iterators are captured as const value. - auto iA = tensor_iter_valonly(src0).begin(); - auto iB = tensor_iter_valonly(src1).begin(); - auto pD = dst.ptr(); - for (size_t i = 0; i < size; i++) { - *pD = elemwise_multi_type::round_shr_saturate(*iA, *iB); - ++iA; - ++iB; - ++pD; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -template -void ElemwiseMultiTypeImpl::dispatch_fuse_add_rmulh_round_shr_saturate( - const ElemwiseOpParamN<6>& param, const TensorND& dst) { - auto size = param.size; - auto src0 = param[0]; - auto src1 = param[1]; - auto src2 = param[2]; - auto src3 = param[3]; - auto src4 = param[4]; - auto src5 = param[5]; - auto work = [size, src0, src1, src2, src3, src4, src5, dst]() { - auto i0 = tensor_iter_valonly(src0).begin(); - auto i1 = tensor_iter_valonly(src1).begin(); - auto i2 = tensor_iter_valonly(src2).begin(); - auto ioff = tensor_iter_valonly(src3).begin(); - auto imin = tensor_iter_valonly(src4).begin(); - auto imax = tensor_iter_valonly(src5).begin(); - auto dst_ptr = dst.ptr(); - for (size_t i = 0; i < size; ++i) { - auto res = elemwise_multi_type::round_shr_saturate( - round_mulh_saturate(*i0 + *i1, *i2), *ioff); - res = std::min(res, *imax); - res = std::max(res, *imin); - dst_ptr[i] = res; - ++i0; - ++i1; - ++i2; - ++ioff; - ++imin; - ++imax; - } - }; - MEGDNN_DISPATCH_CPU_KERN_OPR(work()); -} - -void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int16x16x16x8( - const ElemwiseOpParamN<6>& param, const TensorND& dst) { - dispatch_fuse_add_rmulh_round_shr_saturate(param, dst); -} - -void ElemwiseMultiTypeImpl::on_fuse_add_rmulh_round_shr_saturate_int32x32x32x8( - const ElemwiseOpParamN<6>& param, const TensorND& dst) { - dispatch_fuse_add_rmulh_round_shr_saturate(param, dst); -} - -void ElemwiseMultiTypeImpl::on_round_shr_saturate_iXxi8xi16( - const ElemwiseOpParamN<2>& param, const TensorND& dst) { - switch (param[0].layout.dtype.enumv()) { -#define cb(t) \ - case DTypeTrait::enumv: \ - return dispatch_round_shr_saturate_iXxi8xiX::ctype, dt_int16>( \ - param, dst); - cb(::megdnn::dtype::Int32); - cb(::megdnn::dtype::Int16); -#undef cb - 
default: - megdnn_throw("unsupported src dtype"); - } -} - template void ElemwiseMultiTypeImpl::dispatch_add_qint_op( const ElemwiseOpParamN<1>& param, const TensorND& dst_tensor) { diff --git a/dnn/src/naive/handle.cpp b/dnn/src/naive/handle.cpp index e38bfead..2a705335 100644 --- a/dnn/src/naive/handle.cpp +++ b/dnn/src/naive/handle.cpp @@ -36,6 +36,7 @@ #include "src/naive/deformable_conv/opr_impl.h" #include "src/naive/deformable_ps_roi_pooling/opr_impl.h" #include "src/naive/dot/opr_impl.h" +#include "src/naive/dropout/opr_impl.h" #include "src/naive/elemwise/opr_impl.h" #include "src/naive/elemwise_multi_type/opr_impl.h" #include "src/naive/eye/opr_impl.h" @@ -47,6 +48,7 @@ #include "src/naive/images2neibs/opr_impl.h" #include "src/naive/indexing_multi_axis_vec/opr_impl.h" #include "src/naive/indexing_one_hot/opr_impl.h" +#include "src/naive/layer_norm/opr_impl.h" #include "src/naive/linspace/opr_impl.h" #include "src/naive/local/opr_impl.h" #include "src/naive/local_share/opr_impl.h" diff --git a/dnn/src/naive/layer_norm/opr_impl.cpp b/dnn/src/naive/layer_norm/opr_impl.cpp new file mode 100644 index 00000000..cc967035 --- /dev/null +++ b/dnn/src/naive/layer_norm/opr_impl.cpp @@ -0,0 +1,170 @@ +/** + * \file dnn/src/naive/layer_norm/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "src/naive/layer_norm/opr_impl.h" +#include +#include "src/common/utils.h" +#include "src/naive/handle.h" + +using namespace megdnn; +using namespace naive; + +namespace { + +using Param = megdnn::LayerNorm::Param; + +template +void forward( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + const Param& param) { + float eps = param.eps; + bool affine = param.affine; + uint64_t slice_length = param.normalized_size; + uint64_t slice_dim = param.normalized_dim; + uint64_t n_slices = 1; + for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) { + n_slices = n_slices * data.layout.shape[i]; + } + + for (size_t i = 0; i < n_slices; i++) { + T_ACC slice_sum = static_cast(0.0f); + for (size_t j = 0; j < slice_length; j++) { + auto value = data.ptr()[i * slice_length + j]; + slice_sum += value; + } + T_ACC slice_mean = static_cast(slice_sum / slice_length); + + T_ACC slice_var = static_cast(0.0f); + for (size_t j = 0; j < slice_length; j++) { + slice_var += (data.ptr()[i * slice_length + j] - slice_mean) * + (data.ptr()[i * slice_length + j] - slice_mean); + } + slice_var = slice_var / slice_length; + + T_ACC slice_std = static_cast(sqrt(slice_var + eps)); + for (size_t j = 0; j < slice_length; j++) { + dst.ptr()[i * slice_length + j] = + (data.ptr()[i * slice_length + j] - slice_mean) / slice_std; + if (affine) { + dst.ptr()[i * slice_length + j] = + dst.ptr()[i * slice_length + j] * weight.ptr()[j] + + bias.ptr()[j]; + } + } + mean.ptr()[i] = static_cast(slice_mean); + rstd.ptr()[i] = static_cast(1.0 / slice_std); + } +} + +template +void backward( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, const Param& 
param) { + bool affine = param.affine; + uint64_t slice_length = param.normalized_size; + uint64_t slice_dim = param.normalized_dim; + uint64_t n_slices = 1; + for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) { + n_slices = n_slices * data.layout.shape[i]; + } + + if (affine) { + for (size_t i = 0; i < slice_length; ++i) { + dweight.ptr()[i] = 0; + dbias.ptr()[i] = 0; + } + + for (size_t i = 0; i < n_slices; ++i) { + for (size_t j = 0; j < slice_length; ++j) { + dweight.ptr()[j] += + (data.ptr()[i * slice_length + j] - mean.ptr()[i]) * + rstd.ptr()[i] * diff.ptr()[i * slice_length + j]; + + dbias.ptr()[j] += diff.ptr()[i * slice_length + j]; + } + } + } + + for (size_t i = 0; i < n_slices; ++i) { + T_ACC ds = static_cast(0.0f); + T_ACC db = static_cast(0.0f); + T_ACC a = static_cast(0.0f); + T_ACC b = static_cast(0.0f); + T_ACC c = static_cast(0.0f); + + for (size_t j = 0; j < slice_length; ++j) { + auto value = data.ptr()[i * slice_length + j]; + auto diff_v = diff.ptr()[i * slice_length + j]; + auto weight_v = affine ? weight.ptr()[j] : static_cast(1.0f); + db += diff_v * weight_v; + ds += diff_v * value * weight_v; + } + + a = rstd.ptr()[i]; + b = (db * mean.ptr()[i] - ds) * a * a * a / slice_length; + c = -b * mean.ptr()[i] - db * a / slice_length; + + for (uint64_t j = 0; j < slice_length; j++) { + auto weight_v = affine ? weight.ptr()[j] : static_cast(1.0f); + ddata.ptr()[i * slice_length + j] = + diff.ptr()[i * slice_length + j] * a * weight_v + + data.ptr()[i * slice_length + j] * b + c; + } + } +} + +} // namespace + +namespace megdnn { +namespace naive { + +void LayerNormForwardImpl::exec( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + _megdnn_workspace workspace) { + check_exec( + data.layout, weight.layout, bias.layout, dst.layout, mean.layout, + rstd.layout, workspace.size); +#define cb(DType) \ + if (data.layout.dtype == DType()) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(forward::ctype>( \ + data, weight, bias, dst, mean, rstd, param())); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +void LayerNormBackwardImpl::exec( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, + _megdnn_workspace workspace) { + check_exec( + diff.layout, data.layout, weight.layout, mean.layout, rstd.layout, + ddata.layout, dweight.layout, dbias.layout, workspace.size); +#define cb(DType) \ + if (data.layout.dtype == DType()) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(backward::ctype>( \ + diff, data, weight, mean, rstd, ddata, dweight, dbias, param())); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + megdnn_throw("bad dtype"); +} + +} // namespace naive +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/layer_norm/opr_impl.h b/dnn/src/naive/layer_norm/opr_impl.h new file mode 100644 index 00000000..99d93e79 --- /dev/null +++ b/dnn/src/naive/layer_norm/opr_impl.h @@ -0,0 +1,51 @@ +/** + * \file dnn/src/naive/layer_norm/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
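As a compact reference for the per-slice statistics the naive LayerNorm forward above computes, a NumPy sketch (assuming the input has already been viewed as (n_slices, slice_length); rstd is the reciprocal of sqrt(var + eps), matching the mean/rstd outputs):

    import numpy as np

    def layer_norm_fwd(x, weight=None, bias=None, eps=1e-5):
        # x: (n_slices, slice_length); each slice is normalized with its own mean/variance
        mean = x.mean(axis=1, keepdims=True)
        var = x.var(axis=1, keepdims=True)        # biased variance, as in the kernel
        rstd = 1.0 / np.sqrt(var + eps)
        y = (x - mean) * rstd
        if weight is not None:                    # affine=True path
            y = y * weight + bias
        return y, mean.squeeze(1), rstd.squeeze(1)
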
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include "megdnn/oprs.h" + +namespace megdnn { +namespace naive { + +class LayerNormForwardImpl final : public LayerNormForward { +public: + using LayerNormForward::LayerNormForward; + void exec( + _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, + _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +class LayerNormBackwardImpl final : public LayerNormBackward { +public: + using LayerNormBackward::LayerNormBackward; + void exec( + _megdnn_tensor_in diff, _megdnn_tensor_in data, _megdnn_tensor_in weight, + _megdnn_tensor_in mean, _megdnn_tensor_in rstd, _megdnn_tensor_out ddata, + _megdnn_tensor_out dweight, _megdnn_tensor_out dbias, + _megdnn_workspace workspace) override; + size_t get_workspace_in_bytes( + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&, const TensorLayout&, + const TensorLayout&, const TensorLayout&) override { + return 0; + } +}; + +} // namespace naive +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/x86/matrix_mul/opr_impl.cpp b/dnn/src/x86/matrix_mul/opr_impl.cpp index 176a428c..605142a0 100644 --- a/dnn/src/x86/matrix_mul/opr_impl.cpp +++ b/dnn/src/x86/matrix_mul/opr_impl.cpp @@ -52,7 +52,6 @@ public: m_all_algos.emplace_back(&algoint8x8x32sse_m4n8k2); m_all_algos.emplace_back(&algoint8x8x16sse_m4n8k2); m_all_algos.emplace_back(&algof32mk8_8x8); - m_all_algos.emplace_back(&algof32_6x16); #if MEGDNN_X86_WITH_MKL_DNN m_all_algos.emplace_back(&algoint8x8x32mkldnn); #endif @@ -60,6 +59,7 @@ public: #if MEGDNN_X86_WITH_MKL && SUPPORT_MKL_PACKED_GEMM m_all_algos.emplace_back(&f32mkl_packa); #endif + m_all_algos.emplace_back(&algof32_6x16); for (auto&& algo : m_all_algos) { m_all_algos_map.emplace(algo->info().desc, algo); diff --git a/dnn/test/CMakeLists.txt b/dnn/test/CMakeLists.txt index 1527dcab..858d1dcc 100644 --- a/dnn/test/CMakeLists.txt +++ b/dnn/test/CMakeLists.txt @@ -5,38 +5,38 @@ file(GLOB SOURCES_ *.cpp) list(APPEND SOURCES ${SOURCES_}) if(NOT ${MGE_ARCH} STREQUAL "naive") - file(GLOB_RECURSE SOURCES_ fallback/*.cpp) + file(GLOB_RECURSE SOURCES_ fallback/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ cpu/*.cpp) + list(APPEND SOURCES ${SOURCES_}) + if(${MGE_ARCH} STREQUAL "fallback") + message(WARNING "build only with fallback") + elseif(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") + file(GLOB_RECURSE SOURCES_ x86/*.cpp) list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE SOURCES_ cpu/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - if(${MGE_ARCH} STREQUAL "fallback") - message(WARNING "build only with fallback") - elseif(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") - file(GLOB_RECURSE SOURCES_ x86/*.cpp) - list(APPEND SOURCES ${SOURCES_}) - endif() + endif() endif() if(MGE_WITH_CUDA) - file(GLOB_RECURSE SOURCES_ cuda/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ cuda/*.cpp) + list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE CUSOURCES cuda/*.cu) - list(APPEND SOURCES 
${CUSOURCES}) + file(GLOB_RECURSE CUSOURCES cuda/*.cu) + list(APPEND SOURCES ${CUSOURCES}) endif() if(MGE_WITH_MIDOUT_PROFILE) - list(APPEND SOURCES ${PROJECT_SOURCE_DIR}/third_party/midout/src/midout.cpp) + list(APPEND SOURCES ${PROJECT_SOURCE_DIR}/third_party/midout/src/midout.cpp) endif() if(MGE_WITH_ATLAS) - file(GLOB_RECURSE SOURCES_ atlas/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ atlas/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() -if (MGE_WITH_ROCM) - file (GLOB_RECURSE SOURCES_ rocm/*.cpp) - list (APPEND SOURCES ${SOURCES_}) +if(MGE_WITH_ROCM) + file(GLOB_RECURSE SOURCES_ rocm/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() add_executable(megdnn_test ${SOURCES}) @@ -44,37 +44,36 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") target_link_libraries(megdnn_test gtest) target_link_libraries(megdnn_test megdnn ${MGE_BLAS_LIBS} ${MGE_CUDA_LIBS}) -if (MGE_WITH_CUDA) - target_link_libraries(megdnn_test cutlass) - target_include_directories(megdnn_test PRIVATE ${CUDNN_INCLUDE_DIR}) +if(MGE_WITH_CUDA) + target_link_libraries(megdnn_test cutlass) + target_include_directories(megdnn_test PRIVATE ${CUDNN_INCLUDE_DIR}) endif() if(MGE_WITH_ATLAS) - target_link_libraries(megdnn_test atlas-stub) + target_link_libraries(megdnn_test atlas-stub) endif() target_include_directories(megdnn_test - PRIVATE - ${PROJECT_SOURCE_DIR}/third_party/midout/src -) + PRIVATE ${PROJECT_SOURCE_DIR}/third_party/midout/src) if(APPLE OR ANDROID) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") else() - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++") + set(CMAKE_EXE_LINKER_FLAGS + "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++") endif() if(MGE_ENABLE_COVERAGE) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage") endif() -if (MEG_WITH_ROCM) - target_link_libraries (megdnn_test ${MGE_ROCM_LIBS}) -endif () +if(MEG_WITH_ROCM) + target_link_libraries(megdnn_test ${MGE_ROCM_LIBS}) +endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(megdnn_test dl) - else() - target_link_libraries(megdnn_test dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(megdnn_test dl) + else() + target_link_libraries(megdnn_test dl rt) + endif() endif() diff --git a/dnn/test/common/deduce_layout_proxy.h b/dnn/test/common/deduce_layout_proxy.h index 17afc1dd..f1067aec 100644 --- a/dnn/test/common/deduce_layout_proxy.h +++ b/dnn/test/common/deduce_layout_proxy.h @@ -58,6 +58,15 @@ struct DeduceLayoutProxy { }; template +struct DeduceLayoutProxy { + static void deduce_layout(Opr* opr, TensorLayoutArray& layouts) { + megdnn_assert(layouts.size() == 6); + opr->deduce_layout( + layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5]); + } +}; + +template struct DeduceLayoutProxy { static void deduce_layout(Opr*, TensorLayoutArray&) {} }; diff --git a/dnn/test/cuda/accuracy_shake.cpp b/dnn/test/cuda/accuracy_shake.cpp index 68463633..41b116b2 100644 --- a/dnn/test/cuda/accuracy_shake.cpp +++ b/dnn/test/cuda/accuracy_shake.cpp @@ -97,7 +97,7 @@ TEST_F(CUDA, SHAKE_CONV_BIAS_FORWARD_QS8_NHWC) { TEST_F(CUDA, SHAKE_CONV_BIAS_FORWARD_QS8_NCHWX) { using Format = ConvBias::Param::Format; - require_compute_capability(6, 1); + require_compute_capability(7, 5); AccuracyShakeChecker checker(handle_cuda()); UniformIntRNG int_rng{-5, 5}; UniformFloatRNG float_rng{-50, 50}; diff --git 
a/dnn/test/cuda/layer_norm.cpp b/dnn/test/cuda/layer_norm.cpp new file mode 100644 index 00000000..b4d04204 --- /dev/null +++ b/dnn/test/cuda/layer_norm.cpp @@ -0,0 +1,94 @@ +/** + * \file dnn/test/cuda/layer_norm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "test/cuda/fixture.h" + +#include "test/common/checker.h" + +namespace megdnn { +namespace test { + +TEST_F(CUDA, LAYERNORM_FORWARD) { + using Param = LayerNormForward::Param; + Param param; + param.affine = true; + param.eps = 1e-6; + param.normalized_dim = 1; + Checker checker(handle_cuda()); + checker.set_epsilon(1e-2); + + auto run = [&](DType d) { + for (size_t n_slices : {10, 30}) + for (size_t slice_len : {10, 30}) { + param.normalized_size = slice_len; + checker.set_param(param) + .set_dtype(0, d) + .set_dtype(1, d) + .set_dtype(2, d) + .set_dtype(3, d) + .set_dtype(4, dtype::Float32()) + .set_dtype(5, dtype::Float32()) + .execs({{n_slices, slice_len}, + {slice_len}, + {slice_len}, + {n_slices, slice_len}, + {n_slices}, + {n_slices}}); + } + }; + + run(dtype::Float32()); + run(dtype::Float16()); + run(dtype::BFloat16()); +} + +TEST_F(CUDA, LAYERNORM_BACKWARD) { + using Param = LayerNormBackward::Param; + Param param; + param.affine = true; + param.eps = 1e-6; + param.normalized_dim = 1; + Checker checker(handle_cuda()); + checker.set_epsilon(1e-1); + + auto run = [&](DType d) { + for (size_t n_slices : {10, 30}) + for (size_t slice_len : {10, 30}) { + param.normalized_size = slice_len; + checker.set_param(param) + .set_dtype(0, d) + .set_dtype(1, d) + .set_dtype(2, d) + .set_dtype(3, dtype::Float32()) + .set_dtype(4, dtype::Float32()) + .set_dtype(5, d) + .set_dtype(6, d) + .set_dtype(7, d) + .execs({{n_slices, slice_len}, + {n_slices, slice_len}, + {slice_len}, + {n_slices}, + {n_slices}, + {n_slices, slice_len}, + {slice_len}, + {slice_len}}); + } + }; + + run(dtype::Float32()); + run(dtype::Float16()); + run(dtype::BFloat16()); +} + +} // namespace test +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/test/cuda/rng.cpp b/dnn/test/cuda/rng.cpp index 0a5a549d..3b0b705d 100644 --- a/dnn/test/cuda/rng.cpp +++ b/dnn/test/cuda/rng.cpp @@ -193,6 +193,70 @@ void run_shuffle(Handle* handle, bool bwd_flag) { run({6, 3}); } +template +void run_dropout(Handle* handle) { + using ctype = typename DTypeTrait::ctype; + auto run = [&](TensorShape shape, float drop_prob) { + auto fwd = handle->create_operator(); + auto bwd = handle->create_operator(); + fwd->param().drop_prob = drop_prob; + bwd->param().drop_prob = drop_prob; + double scale = 1.0 / (1.0 - drop_prob); + + TensorLayout inp_lay{shape, T()}; + TensorLayout oup_lay{shape, T()}; + TensorLayout mask_lay{{fwd->get_mask_size_in_bytes(inp_lay)}, dtype::Byte()}; + TensorLayout doup_lay{shape, T()}; + TensorLayout dinp_lay{shape, T()}; + TensorLayout fwd_ws_lay{ + {fwd->get_workspace_in_bytes(inp_lay, oup_lay, mask_lay)}, + dtype::Byte()}; + TensorLayout bwd_ws_lay{ + {bwd->get_workspace_in_bytes(doup_lay, mask_lay, dinp_lay)}, + dtype::Byte()}; + + SyncedTensor inp(handle, inp_lay); + SyncedTensor oup(handle, oup_lay); + SyncedTensor::ctype> mask(handle, mask_lay); + SyncedTensor doup(handle, doup_lay); + SyncedTensor 
dinp(handle, dinp_lay); + SyncedTensor::ctype> fwd_ws(handle, fwd_ws_lay); + SyncedTensor::ctype> bwd_ws(handle, bwd_ws_lay); + + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + inp.ptr_mutable_host()[i] = 1; + doup.ptr_mutable_host()[i] = 1; + } + + fwd->exec( + inp.tensornd_dev(), oup.tensornd_dev(), mask.tensornd_dev(), + {fwd_ws.ptr_mutable_dev(), fwd_ws.layout().total_nr_elems()}); + size_t droped_cnt = 0; + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + ASSERT_TRUE( + oup.ptr_host()[i] == 0 || + oup.ptr_host()[i] == static_cast(scale)); + if (oup.ptr_host()[i] == 0) { + droped_cnt++; + } + } + float real_drop = droped_cnt * 1.0 / inp.layout().total_nr_elems(); + ASSERT_LT(abs(drop_prob - real_drop), 1e-2); + +#if CUDNN_VERSION >= 7000 + bwd->exec( + doup.tensornd_dev(), mask.tensornd_dev(), dinp.tensornd_dev(), + {bwd_ws.ptr_mutable_dev(), bwd_ws.layout().total_nr_elems()}); + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + ASSERT_TRUE(oup.ptr_host()[i] == dinp.ptr_host()[i]); + } +#endif + }; + + run({32, 32, 32, 32}, 0.2); + run({100000}, 0.3); +} + } // anonymous namespace TEST_F(CUDA, UNIFORM_RNG_F32) { @@ -290,6 +354,14 @@ TEST_F(CUDA, SHUFFLE_RNG_BWD_F16) { run_shuffle(handle_cuda(), true); } +TEST_F(CUDA, DROPOUT_F32) { + run_dropout(handle_cuda()); +} + +TEST_F(CUDA, DROPOUT_F16) { + run_dropout(handle_cuda()); +} + } // namespace test } // namespace megdnn diff --git a/dnn/test/naive/rng.cpp b/dnn/test/naive/rng.cpp index b5a827ad..40f1c520 100644 --- a/dnn/test/naive/rng.cpp +++ b/dnn/test/naive/rng.cpp @@ -231,6 +231,67 @@ void run_shuffle(Handle* handle, bool bwd_flag) { run({10}); run({6, 3}); } + +template +void run_dropout(Handle* handle) { + using ctype = typename DTypeTrait::ctype; + auto run = [&](TensorShape shape, float drop_prob) { + auto fwd = handle->create_operator(); + auto bwd = handle->create_operator(); + fwd->param().drop_prob = drop_prob; + bwd->param().drop_prob = drop_prob; + double scale = 1.0 / (1.0 - drop_prob); + + TensorLayout inp_lay{shape, T()}; + TensorLayout oup_lay{shape, T()}; + TensorLayout mask_lay{{fwd->get_mask_size_in_bytes(inp_lay)}, dtype::Byte()}; + TensorLayout doup_lay{shape, T()}; + TensorLayout dinp_lay{shape, T()}; + TensorLayout fwd_ws_lay{ + {fwd->get_workspace_in_bytes(inp_lay, oup_lay, mask_lay)}, + dtype::Byte()}; + TensorLayout bwd_ws_lay{ + {bwd->get_workspace_in_bytes(doup_lay, mask_lay, dinp_lay)}, + dtype::Byte()}; + + Tensor inp(handle, inp_lay); + Tensor oup(handle, oup_lay); + Tensor::ctype> mask(handle, mask_lay); + Tensor doup(handle, doup_lay); + Tensor dinp(handle, dinp_lay); + Tensor::ctype> fwd_ws(handle, fwd_ws_lay); + Tensor::ctype> bwd_ws(handle, bwd_ws_lay); + + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + inp.ptr()[i] = 1; + doup.ptr()[i] = 1; + } + + fwd->exec( + inp.tensornd(), oup.tensornd(), mask.tensornd(), + {fwd_ws.ptr(), fwd_ws.layout().total_nr_elems()}); + size_t droped_cnt = 0; + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + ASSERT_TRUE(oup.ptr()[i] == 0 || oup.ptr()[i] == static_cast(scale)); + if (oup.ptr()[i] == 0) { + droped_cnt++; + } + } + float real_drop = droped_cnt * 1.0 / inp.layout().total_nr_elems(); + ASSERT_LT(abs(drop_prob - real_drop), 1e-2); + + bwd->exec( + doup.tensornd(), mask.tensornd(), dinp.tensornd(), + {bwd_ws.ptr(), bwd_ws.layout().total_nr_elems()}); + for (size_t i = 0; i < inp.layout().total_nr_elems(); ++i) { + ASSERT_TRUE(oup.ptr()[i] == dinp.ptr()[i]); + } + }; + + run({32, 32, 32, 
32}, 0.2); + run({100000}, 0.3); +} + } // namespace TEST_F(NAIVE, UNIFORM_RNG_F32) { @@ -309,6 +370,14 @@ TEST_F(NAIVE, SHUFFLE_RNG_BWD_F16) { run_shuffle(handle(), true); } +TEST_F(NAIVE, DROPOUT_F32) { + run_dropout(handle()); +} + +TEST_F(NAIVE, DROPOUT_F16) { + run_dropout(handle()); +} + } // namespace test } // namespace megdnn diff --git a/imperative/CMakeLists.txt b/imperative/CMakeLists.txt index cf1a8c35..3d122663 100644 --- a/imperative/CMakeLists.txt +++ b/imperative/CMakeLists.txt @@ -1,21 +1,28 @@ find_package(NumPy REQUIRED) set(PACKAGE_NAME megengine) -set(PACKAGE_NAME ${PACKAGE_NAME} PARENT_SCOPE) +set(PACKAGE_NAME + ${PACKAGE_NAME} + PARENT_SCOPE) set(MODULE_NAME _imperative_rt) -set(MODULE_NAME ${MODULE_NAME} PARENT_SCOPE) +set(MODULE_NAME + ${MODULE_NAME} + PARENT_SCOPE) file(GLOB_RECURSE SRCS src/impl/*.cpp src/include/*.h python/src/*.cpp python/src/*.h) set(SRCS ${SRCS} ${CPP_REDIS_SRCS}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMGB_WITH_IMPERATIVE=1") file(GLOB_RECURSE PYTHON_SRCS python/${PACKAGE_NAME}/*.py) -file(GLOB_RECURSE ALL_HEADERS src/cpp/megbrain_pubapi.h - ${PROJECT_SOURCE_DIR}/src/core/include/* - ${PROJECT_SOURCE_DIR}/src/opr/include/* - ${PROJECT_SOURCE_DIR}/src/serialization/include/* - ${PROJECT_SOURCE_DIR}/src/plugin/include/* - ${PROJECT_SOURCE_DIR}/dnn/include/*) +file( + GLOB_RECURSE + ALL_HEADERS + src/cpp/megbrain_pubapi.h + ${PROJECT_SOURCE_DIR}/src/core/include/* + ${PROJECT_SOURCE_DIR}/src/opr/include/* + ${PROJECT_SOURCE_DIR}/src/serialization/include/* + ${PROJECT_SOURCE_DIR}/src/plugin/include/* + ${PROJECT_SOURCE_DIR}/dnn/include/*) set(MEGENGINE_DIR ${CMAKE_CURRENT_BINARY_DIR}/python/) @@ -23,71 +30,106 @@ add_subdirectory(tablegen) add_custom_target(_version_ld SOURCES ${MGE_VERSION_SCRIPT}) -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11) +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 + ${PROJECT_BINARY_DIR}/third_party/pybind11) pybind11_add_module(${MODULE_NAME} NO_EXTRAS ${SRCS}) -if (APPLE) - target_link_libraries(${MODULE_NAME} PRIVATE megengine_shared) -elseif (MSVC OR WIN32) - target_link_libraries(${MODULE_NAME} PRIVATE megengine_shared) - message(STATUS "CMAKE_MSVC_RUNTIME_LIBRARY: ${CMAKE_MSVC_RUNTIME_LIBRARY}") - set_target_properties(${MODULE_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY "${CMAKE_MSVC_RUNTIME_LIBRARY}") +if(APPLE) + target_link_libraries(${MODULE_NAME} PRIVATE megengine_shared) +elseif(MSVC OR WIN32) + target_link_libraries(${MODULE_NAME} PRIVATE megengine_shared) + message(STATUS "CMAKE_MSVC_RUNTIME_LIBRARY: ${CMAKE_MSVC_RUNTIME_LIBRARY}") + set_target_properties(${MODULE_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY + "${CMAKE_MSVC_RUNTIME_LIBRARY}") else() - # use to fix runtime crash when build both mgb(MGE_WITH_PYTHON_MODULE) and imperative(MGE_BUILD_IMPERATIVE_RT) - target_link_libraries(${MODULE_NAME} PRIVATE megengine_shared -Wl,--version-script=${MGE_VERSION_SCRIPT}) + # use to fix runtime crash when build both mgb(MGE_WITH_PYTHON_MODULE) and + # imperative(MGE_BUILD_IMPERATIVE_RT) + target_link_libraries( + ${MODULE_NAME} PRIVATE megengine_shared -Wl,--version-script=${MGE_VERSION_SCRIPT}) endif() -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3) +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 + ${PROJECT_BINARY_DIR}/third_party/range-v3) target_link_libraries(${MODULE_NAME} PRIVATE range-v3) -add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/Json 
${PROJECT_BINARY_DIR}/third_party/Json) +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/Json + ${PROJECT_BINARY_DIR}/third_party/Json) target_link_libraries(${MODULE_NAME} PRIVATE nlohmann_json::nlohmann_json) -target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR} ${CPP_REDIS_INCLUDES}) +target_include_directories( + ${MODULE_NAME} + PUBLIC src/include + PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR} + ${CPP_REDIS_INCLUDES}) target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME}) target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter) if(CXX_SUPPORT_WCLASS_MEMACCESS) - target_compile_options(${MODULE_NAME} PRIVATE "-Wno-class-memaccess") + target_compile_options(${MODULE_NAME} PRIVATE "-Wno-class-memaccess") endif() -set_target_properties(${MODULE_NAME} PROPERTIES - SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX} - LIBRARY_OUTPUT_DIRECTORY ${MEGENGINE_DIR}/${PACKAGE_NAME}/core -) -if (APPLE OR MSVC OR WIN32) - message(VERBOSE "overwriting SUFFIX at macos and windows before config by set_target_properties") - pybind11_extension(${MODULE_NAME}) +set_target_properties( + ${MODULE_NAME} + PROPERTIES SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX} + LIBRARY_OUTPUT_DIRECTORY ${MEGENGINE_DIR}/${PACKAGE_NAME}/core) +if(APPLE + OR MSVC + OR WIN32) + message( + VERBOSE + "overwriting SUFFIX at macos and windows before config by set_target_properties") + pybind11_extension(${MODULE_NAME}) endif() add_dependencies(${MODULE_NAME} mgb_opdef _version_ld) if(MGE_WITH_TEST AND MGE_ENABLE_RTTI) - add_subdirectory(test) + add_subdirectory(test) endif() add_custom_command( - TARGET ${MODULE_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/LICENSE ${PROJECT_SOURCE_DIR}/ACKNOWLEDGMENTS ${PROJECT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/$ # clean develop - COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/version.py # clean develop - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine ${CMAKE_CURRENT_BINARY_DIR}/python/megengine - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/test ${CMAKE_CURRENT_BINARY_DIR}/python/test - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py ${CMAKE_CURRENT_BINARY_DIR}/python/setup.py - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires.txt - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt -) + TARGET ${MODULE_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/LICENSE + ${PROJECT_SOURCE_DIR}/ACKNOWLEDGMENTS ${PROJECT_BINARY_DIR} + COMMAND + ${CMAKE_COMMAND} -E remove -f + ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/$ # clean + # develop + COMMAND ${CMAKE_COMMAND} -E remove -f + ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/version.py # clean develop + COMMAND ${CMAKE_COMMAND} -E remove -f + ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/include # clean develop + COMMAND ${CMAKE_COMMAND} -E remove -f + ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/lib # clean develop + COMMAND + ${CMAKE_COMMAND} -E copy_directory 
${CMAKE_CURRENT_SOURCE_DIR}/python/megengine + ${CMAKE_CURRENT_BINARY_DIR}/python/megengine + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/test + ${CMAKE_CURRENT_BINARY_DIR}/python/test + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/src/custom/include + ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/core/include + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py + ${CMAKE_CURRENT_BINARY_DIR}/python/setup.py + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires.txt + ${CMAKE_CURRENT_BINARY_DIR}/python/requires.txt + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt + ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt + ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt) if(DEFINED MGB_VER_MAJOR) - set(IS_INTERNAL "--internal") + set(IS_INTERNAL "--internal") else() - set(IS_INTERNAL "") + set(IS_INTERNAL "") endif(DEFINED MGB_VER_MAJOR) if(DEFINED MGE_EXTRA_NAME) - set(RC_NAME "--rc=${MGE_EXTRA_NAME}") + set(RC_NAME "--rc=${MGE_EXTRA_NAME}") else() - set(RC_NAME "") + set(RC_NAME "") endif(DEFINED MGE_EXTRA_NAME) add_custom_command( - TARGET ${MODULE_NAME} POST_BUILD - COMMAND "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/python/gen_version.py --output ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/version.py --major ${MGE_VER_MAJOR} --minor ${MGE_VER_MINOR} --patch ${MGE_VER_PATCH} ${RC_NAME} ${IS_INTERNAL} -) + TARGET ${MODULE_NAME} + POST_BUILD + COMMAND + "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/python/gen_version.py --output + ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/version.py --major ${MGE_VER_MAJOR} + --minor ${MGE_VER_MINOR} --patch ${MGE_VER_PATCH} ${RC_NAME} ${IS_INTERNAL}) diff --git a/imperative/python/megengine/__init__.py b/imperative/python/megengine/__init__.py index 38291bfd..e248be57 100644 --- a/imperative/python/megengine/__init__.py +++ b/imperative/python/megengine/__init__.py @@ -84,7 +84,7 @@ from .logger import enable_debug_log, get_logger, set_log_file, set_log_level from .serialization import load, save from .tensor import Parameter, Tensor, tensor from .utils import comp_graph_tools as cgtools -from .utils import persistent_cache +from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer from .version import __version__ _set_fork_exec_path_for_timed_func( @@ -92,15 +92,13 @@ _set_fork_exec_path_for_timed_func( os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"), ) -atexit.register(_close) - del _set_fork_exec_path_for_timed_func _exit_handlers = [] def _run_exit_handlers(): - for handler in _exit_handlers: + for handler in reversed(_exit_handlers): handler() _exit_handlers.clear() @@ -117,6 +115,13 @@ def _atexit(handler): _exit_handlers.append(handler) +_atexit(_close) + +_persistent_cache = _PersistentCacheOnServer() +_persistent_cache.reg() + +_atexit(_persistent_cache.flush) + # subpackages import megengine.amp import megengine.autodiff @@ -132,5 +137,3 @@ import megengine.quantization import megengine.random import megengine.utils import megengine.traced_module - -persistent_cache.get_manager() diff --git a/imperative/python/megengine/core/ops/custom.py b/imperative/python/megengine/core/ops/custom.py index b1a055fd..b60527c3 100644 --- a/imperative/python/megengine/core/ops/custom.py +++ b/imperative/python/megengine/core/ops/custom.py @@ -7,11 +7,14 
@@ # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import os + from .._imperative_rt.ops._custom import ( _get_custom_op_list, _install, _make_custom_op, _uninstall, + get_custom_op_abi_tag, ) __all__ = ["load"] @@ -25,8 +28,16 @@ def _gen_custom_op_maker(custom_op_name): def load(lib_path): - op_in_this_lib = _install(lib_path[0:-3], lib_path) + lib_path = os.path.abspath(lib_path) + lib_name = os.path.splitext(lib_path)[0] + op_in_this_lib = _install(lib_name, lib_path) for op in op_in_this_lib: op_maker = _gen_custom_op_maker(op) globals()[op] = op_maker __all__.append(op) + + +def unload(lib_path): + lib_path = os.path.abspath(lib_path) + lib_name = os.path.splitext(lib_path)[0] + _uninstall(lib_name) diff --git a/imperative/python/megengine/functional/nn.py b/imperative/python/megengine/functional/nn.py index 9ababceb..025b30ea 100644 --- a/imperative/python/megengine/functional/nn.py +++ b/imperative/python/megengine/functional/nn.py @@ -13,10 +13,12 @@ from typing import NamedTuple, Optional, Sequence, Tuple, Union from ..core import _config from ..core._imperative_rt.core2 import apply, dtype_promotion from ..core._imperative_rt.ops import SubgraphBuilder as _SubgraphBuilder +from ..core._imperative_rt.ops import get_global_rng_seed as _get_global_rng_seed from ..core.ops import builtin from ..core.ops.builtin import ( BatchNorm, Dimshuffle, + Dropout, Elemwise, GetVarShape, Identity, @@ -39,7 +41,6 @@ from ..core.tensor.utils import ( from ..device import get_default_device from ..distributed import WORLD, is_distributed from ..jit import exclude_from_trace -from ..random import uniform from ..tensor import Tensor from ..utils.deprecation import deprecated_func from ..utils.tuple_function import _pair, _pair_nonzero, _triple, _triple_nonzero @@ -77,6 +78,7 @@ __all__ = [ "max_pool2d", "one_hot", "prelu", + "pad", "relu", "relu6", "remap", @@ -1066,57 +1068,6 @@ def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor: return cached / down -@lru_cache(maxsize=None) -def _get_layerNorm(device, dtype, dim, gopt_level=2): - @subgraph("LayerNormAffine", dtype, device, 5, gopt_level=gopt_level) - def layerNormAffine(inputs, f, c): - inp, eps, _flatten_shape, weight, bias = inputs - inp_shape = f(GetVarShape(), inp) - - inp = f(Reshape(axis=dim), inp, _flatten_shape) - mean = f(Reduce(mode="mean", axis=-1), inp) - x2s = f(Reduce(mode="sum_sqr", axis=-1), inp) - reduce_shape = f(GetVarShape(), x2s) - reduce_size = f( - "//", - f(Reduce(mode="product", axis=0), inp_shape), - f(Reduce(mode="product", axis=0), reduce_shape), - ) - reduce_size_f = f(TypeCvt(dtype=dtype), reduce_size) - var = f("-", f("/", x2s, reduce_size_f), f("**", mean, c(2))) - inv_sqrt_var = f("**", f("+", var, eps), c(-0.5)) - oup = f("fma3", inp, inv_sqrt_var, f("*", f("-", mean), inv_sqrt_var)) - affine_oup = f(Reshape(), oup, inp_shape) - affine_oup = f("fma3", affine_oup, weight, bias) - - # NOTE: return oup make backward faster but take more memory - return (affine_oup, oup, mean, x2s), (True, False, False, False) - - @subgraph("LayerNorm", dtype, device, 3, gopt_level=gopt_level) - def layerNorm(inputs, f, c): - inp, eps, _flatten_shape = inputs - inp_shape = f(GetVarShape(), inp) - - inp = f(Reshape(axis=dim), inp, _flatten_shape) - mean = f(Reduce(mode="mean", axis=-1), inp) - x2s = f(Reduce(mode="sum_sqr", axis=-1), inp) - reduce_shape = f(GetVarShape(), x2s) - reduce_size = f( - "//", - 
f(Reduce(mode="product", axis=0), inp_shape), - f(Reduce(mode="product", axis=0), reduce_shape), - ) - reduce_size_f = f(TypeCvt(dtype=dtype), reduce_size) - var = f("-", f("/", x2s, reduce_size_f), f("**", mean, c(2))) - inv_sqrt_var = f("**", f("+", var, eps), c(-0.5)) - oup = f("fma3", inp, inv_sqrt_var, f("*", f("-", mean), inv_sqrt_var)) - oup = f(Reshape(), oup, inp_shape) - - return (oup,), (True,) - - return (layerNorm, layerNormAffine) - - def layer_norm( inp: Tensor, normalized_shape: tuple, @@ -1133,32 +1084,34 @@ def layer_norm( normalized_shape: the shape that you want to be normalizated affine: whether to use weight and bias weight: must not be None when the affine is true - bias: must not be None when the bias is true + bias: must not be None when the affine is true eps: a value added to the denominator for numerical stability. Default: 1e-5 """ - if amp._enabled: inp, weight, bias = cast_tensors(inp, weight, bias, promote=True) - _device = inp.device - _dtype = inp.dtype - _dim = len(inp.shape) - len(normalized_shape) + if isinstance(normalized_shape, int): + normalized_shape = [normalized_shape] - _flatten_shape = concat( - ( - convert_single_value(inp.shape[:_dim], dtype="int32", device=inp.device), - convert_single_value(-1, dtype="int32", device=inp.device), - ) - ) - (layerNorm, layerNormAffine) = _get_layerNorm(_device, _dtype, _dim) + normalized_dim = len(normalized_shape) + assert normalized_dim > 0 - eps = convert_single_value(eps, dtype=inp.dtype, device=inp.device) + normalized_size = 1 + for i in range(normalized_dim): + normalized_size = normalized_size * normalized_shape[i] + + op = builtin.LayerNorm( + affine=affine, + eps=eps, + normalized_dim=normalized_dim, + normalized_size=normalized_size, + ) if affine: - outvar, *_ = apply(layerNormAffine(), inp, eps, _flatten_shape, weight, bias) + assert weight is not None and bias is not None + return apply(op, inp, weight, bias)[0] else: - outvar, *_ = apply(layerNorm(), inp, eps, _flatten_shape) - - return outvar + # assert weight is None and bias is None + return apply(op, inp)[0] def batch_norm( @@ -1552,12 +1505,9 @@ def dropout(inp: Tensor, drop_prob: float, training: bool = True) -> Tensor: return inp # model in training mode, e.g. model.train() - rv = uniform(size=inp.shape) - mask = rv > drop_prob - ret = inp * mask.astype(inp.dtype) - ret *= 1 / (1 - drop_prob) - - return ret + op = Dropout(drop_prob=drop_prob, seed=_get_global_rng_seed(), handle=0) + outputs = apply(op, inp) + return outputs[0] def one_hot(inp: Tensor, num_classes: int) -> Tensor: diff --git a/imperative/python/megengine/functional/tensor.py b/imperative/python/megengine/functional/tensor.py index e8ff2e6a..ac29ae16 100755 --- a/imperative/python/megengine/functional/tensor.py +++ b/imperative/python/megengine/functional/tensor.py @@ -113,7 +113,7 @@ def full( data type must be inferred from ``value``. If the value is an ``int``, the output tensor data type must be the default integer data type. If the value is a ``float``, the output tensor data type must be the default - floating-point data type. If the value is a ``bool``, the output tensor + floating-point data type. If the value is a ``bool``, the output tensor must have boolean data type. Default: ``None``. device: device on which to place the created tensor. Default: ``None``. @@ -195,77 +195,65 @@ def ones( return full(shape, 1.0, dtype=dtype, device=device) -def zeros(shape, dtype="float32", device=None) -> Tensor: - r"""Returns a zero tensor with given shape. 
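A minimal usage sketch for the layer_norm and dropout rewrites above, assuming the signatures introduced in this patch (accessed here through the functional.nn submodule; the exact re-exported name may vary):

    import numpy as np
    import megengine as mge
    import megengine.functional as F

    x = mge.tensor(np.random.randn(4, 16).astype("float32"))
    w = mge.tensor(np.ones(16, dtype="float32"))
    b = mge.tensor(np.zeros(16, dtype="float32"))

    # both calls now dispatch to the built-in LayerNorm / Dropout ops
    y = F.nn.layer_norm(x, (16,), affine=True, weight=w, bias=b, eps=1e-5)
    z = F.nn.dropout(x, drop_prob=0.2, training=True)
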
+def zeros( + shape: Union[int, Tuple[int, ...]], + *, + dtype="float32", + device: Optional[CompNode] = None +) -> Tensor: + r"""Returns a new tensor having a specified shape and filled with zeros. Args: - shape: a list, tuple or integer defining the shape of the output tensor. - dtype: the desired data type of the output tensor. Default: ``float32``. - device: the desired device of the output tensor. Default: if ``None``, - use the default device (see :func:`~.megengine.get_default_device`). + shape (int or sequence of ints): the shape of the output tensor. + + Keyword args: + dtype (:attr:`.Tensor.dtype`): output tensor data type. Default: ``float32``. + device (:attr:`.Tensor.device`): device on which to place the created tensor. Default: ``None``. + + Returns: + a tensor containing zeros. + + Examples: + >>> F.zeros((2, 1)) + Tensor([[0.] + [0.]], device=xpux:0) """ return full(shape, 0.0, dtype=dtype, device=device) def zeros_like(inp: Union[Tensor, SymbolVar]) -> Union[Tensor, SymbolVar]: - r"""Returns a zero tensor with the same shape as input tensor. + r"""Returns a tensor filled with zeros with the same shape and data type as input tensor. Args: - inp: input tensor. + inp (Tensor): input tensor. Return: - output tensor. + a tensor containing zeros. Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.zeros_like(inp) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[0 0 0] - [0 0 0]] - + >>> input = F.arange(9, dtype='int32').reshape(3,3) + >>> F.zeros_like(input) + Tensor([[0 0 0] + [0 0 0] + [0 0 0]], dtype=int32, device=xpux:0) """ return full_like(inp, 0.0) def ones_like(inp: Union[Tensor, SymbolVar]) -> Union[Tensor, SymbolVar]: - r"""Returns a ones tensor with the same shape as input tensor. + r"""Returns a tensor filled with ones with the same shape and data type as input tensor. Args: - inp: input tensor. + inp (Tensor): input tensor. Return: - output tensor. + a tensor containing ones. Examples: - - .. testcode:: - - import numpy as np - from megengine import tensor - import megengine.functional as F - - inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) - out = F.ones_like(inp) - print(out.numpy()) - - Outputs: - - .. testoutput:: - - [[1 1 1] - [1 1 1]] + >>> input = F.arange(6, dtype='int32').reshape(2,3) + >>> F.ones_like(input) + Tensor([[1 1 1] + [1 1 1]], dtype=int32, device=xpux:0) """ return full_like(inp, 1.0) @@ -1094,18 +1082,18 @@ def arange( dtype="float32", device: Optional[CompNode] = None, ) -> Tensor: - r"""Returns evenly spaced values within the half-open interval ``[start, stop)`` as a one-dimensional tensor. + r"""Returns evenly spaced values within the half-open interval ``[start, stop)`` as a one-dimensional tensor. Note: - This function cannot guarantee that the interval does not include the stop value in those cases + This function cannot guarantee that the interval does not include the stop value in those cases where step is not an integer and floating-point rounding errors affect the length of the output tensor. Args: - start: if ``stop`` is specified, the start of interval (inclusive); otherwise, - the end of the interval (exclusive). If ``stop`` is not specified, the default starting value is ``0``. + start: if ``stop`` is specified, the start of interval (inclusive); otherwise, + the end of the interval (exclusive). If ``stop`` is not specified, the default starting value is ``0``. 
stop: the end of the interval. Default: ``None``. - step: the distance between two adjacent elements ( ``out[i+1] - out[i]`` ). Must not be 0 ; - may be negative, this results i an empty tensor if stop >= start . Default: 1 . + step: the distance between two adjacent elements ( ``out[i+1] - out[i]`` ). Must not be 0 ; + may be negative, this results i an empty tensor if stop >= start . Default: 1 . Keyword args: dtype( :attr:`.Tensor.dtype` ): output tensor data type. Default: ``float32``. @@ -1114,7 +1102,7 @@ def arange( Returns: A one-dimensional tensor containing evenly spaced values. - The length of the output tensor must be ``ceil((stop-start)/step)`` + The length of the output tensor must be ``ceil((stop-start)/step)`` if ``stop - start`` and ``step`` have the same sign, and length 0 otherwise. Examples: diff --git a/imperative/python/megengine/functional/vision.py b/imperative/python/megengine/functional/vision.py index 7ab6bf24..0fd905ee 100644 --- a/imperative/python/megengine/functional/vision.py +++ b/imperative/python/megengine/functional/vision.py @@ -420,6 +420,7 @@ def warp_affine( Here all available options for params are listed, however it does not mean that you can use all the combinations. On different platforms, different combinations are supported. + ``warp_affine`` only support forward inference, Please refer to ``warp_perspective`` if backward is needed. """ conv_format = _config._get_actual_op_param(format, _config.__conv_format) diff --git a/imperative/python/megengine/jit/tracing.py b/imperative/python/megengine/jit/tracing.py index b93fa027..19f51544 100644 --- a/imperative/python/megengine/jit/tracing.py +++ b/imperative/python/megengine/jit/tracing.py @@ -104,6 +104,7 @@ class TensorInfo: "shape", "is_const", "bound_data", + "bound_data_numpy", # resources for execution "varnode", "data_setter", @@ -119,12 +120,18 @@ class TensorInfo: self.shape_read = None self.value_read = None self.bound_data = None + self.bound_data_numpy = None self.data_setter = None self.shape_reader = None self.value_reader = None self.data_reader = None + def get_numpy(self): + if self.bound_data_numpy is None: + self.bound_data_numpy = self.bound_data.numpy() + return self.bound_data_numpy + _io_op_types = {AssertEqual, CollectiveComm, RemoteSend, RemoteRecv} @@ -292,7 +299,7 @@ class trace: # Const op is represented by a str assert isinstance(op_, str) and op_ == "Const" - expected = self._tinfo[ohandles[0]].bound_data.numpy() + expected = self._tinfo[ohandles[0]].get_numpy() shape = value.shape if shape != expected.shape or dtype != expected.dtype: eq = False @@ -369,6 +376,7 @@ class trace: info.dtype = x.dtype info.shape = x.shape info.bound_data = x + info.bound_data_numpy = None info.is_const = True x._mixin_handle = h x._recording = True @@ -612,9 +620,7 @@ class trace: assert info.external assert info.bound_data info.varnode = graph.make_const( - info.bound_data.numpy(), - info.bound_data.dtype, - info.bound_data.device, + info.get_numpy(), info.bound_data.dtype, info.bound_data.device, ) continue @@ -627,7 +633,7 @@ class trace: if info.bound_data: if getattr(info, "is_const", False): info.varnode = graph.make_const( - info.bound_data.numpy(), + info.get_numpy(), info.bound_data.dtype, info.bound_data.device, ) @@ -1060,7 +1066,8 @@ class trace: resize_input: whether resize input image to fit input var shape. input_transform: a python expression to transform the input data. Example: data / np.std(data) - dump_format: using different dump formats. 
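The TensorInfo.get_numpy change above is a small memoization: the host copy of a bound constant is materialized once and reused across trace/dump passes instead of calling .numpy() repeatedly. In isolation the pattern looks like this sketch (class and attribute names here are illustrative):

    class CachedHostValue:
        """Illustrative stand-in: convert a device tensor to numpy once, then reuse it."""

        def __init__(self, bound_data):
            self.bound_data = bound_data      # device tensor
            self.bound_data_numpy = None      # lazily filled host copy

        def get_numpy(self):
            if self.bound_data_numpy is None:
                self.bound_data_numpy = self.bound_data.numpy()
            return self.bound_data_numpy
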
+ dump_format: using different dump formats. the open source MegEngine defaults to the FBS + format. internal MegEngine have a choice of FBS and internal proprietary formats Keyword Arguments: @@ -1173,7 +1180,7 @@ class trace: assert info.external assert info.bound_data h2v[h] = graph.make_const( - info.bound_data.numpy(), + info.get_numpy(), dtype=info.dtype, device=dumped_device(info), name=info.name, @@ -1186,7 +1193,7 @@ class trace: assert info.external assert info.bound_data h2v[h] = graph.make_const( - info.bound_data.numpy(), + info.get_numpy(), dtype=info.dtype, device=dumped_device(info), name=info.name, diff --git a/imperative/python/megengine/traced_module/__init__.py b/imperative/python/megengine/traced_module/__init__.py index c906b879..6bbdc668 100644 --- a/imperative/python/megengine/traced_module/__init__.py +++ b/imperative/python/megengine/traced_module/__init__.py @@ -9,6 +9,8 @@ from ..core._imperative_rt.core2 import set_cpp_apply_module_trace from . import compat from ._passes import optimize +from .pytree import register_supported_type +from .tm_config import disable_default_checker, enable_expr_checker from .traced_module import ( TracedModule, _register_all_builtin_module, @@ -23,8 +25,11 @@ set_cpp_apply_module_trace(cpp_apply_module_trace) __all__ = [ "register_as_builtin", + "register_supported_type", "trace_module", "wrap", "TracedModule", "optimize", + "enable_expr_checker", + "disable_default_checker", ] diff --git a/imperative/python/megengine/traced_module/_passes/const_pass.py b/imperative/python/megengine/traced_module/_passes/const_pass.py index 143a704c..0ff3571b 100644 --- a/imperative/python/megengine/traced_module/_passes/const_pass.py +++ b/imperative/python/megengine/traced_module/_passes/const_pass.py @@ -12,7 +12,7 @@ from ...core.ops.builtin import GetVarShape from ...logger import get_logger from ...tensor import Tensor from ..expr import Constant, Expr, is_apply_def, is_constant, is_getattr -from ..node import Node, TensorNode +from ..node import Node, NodeMixin, TensorNode from .matcher import PatternMatcher from .pass_base import BackwardPass, ForwardPass, register_pass from .pattern import is_op @@ -21,6 +21,12 @@ from .utils import get_const_value logger = get_logger(__name__) +def _as_const_node(x): + node = Constant.make(x) + NodeMixin.wrap(x, node) + return node + + @register_pass("AttrToConstant") class AttrToConstant(BackwardPass): r"""Convert :class:`~.GetAttr` to :class:`~.Constant` expr.""" @@ -35,10 +41,10 @@ class AttrToConstant(BackwardPass): orig_node = expr.outputs[0] name = orig_node.name with graph.insert_exprs(expr): - const_node = Constant.make(value, name=name) + const_node = _as_const_node(value) graph.replace_node({orig_node: const_node}) graph.compile() - name = orig_node.name + const_node.name = name return const_node.expr @@ -53,7 +59,7 @@ class FixInputShape(BackwardPass): shape = Tensor(expr.inputs[0].shape, dtype="int32") graph = expr.top_graph with graph.insert_exprs(expr): - const_shape = Constant.make(shape) + const_shape = _as_const_node(shape) graph.replace_node({expr.outputs[0]: const_shape}) graph.compile() const_shape.name = expr.outputs[0].name @@ -73,7 +79,7 @@ class FlodConstant(ForwardPass): const_var = expr.interpret(*[get_const_value(n.expr) for n in expr.inputs])[0] graph = expr.top_graph with graph.insert_exprs(expr): - const_node = Constant.make(const_var) + const_node = _as_const_node(const_var) graph.replace_node({expr.outputs[0]: const_node}) graph.compile() const_node.name = 
expr.outputs[0].name diff --git a/imperative/python/megengine/traced_module/checker.py b/imperative/python/megengine/traced_module/checker.py new file mode 100644 index 00000000..31fa0470 --- /dev/null +++ b/imperative/python/megengine/traced_module/checker.py @@ -0,0 +1,142 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import traceback +from typing import Sequence + +import numpy as np + +from ..core._imperative_rt.core2 import apply +from ..core._imperative_rt.ops import ROIAlign, ROIPooling +from ..core.ops.builtin import Copy +from ..core.tensor.utils import isscalar, setscalar +from ..tensor import Tensor +from .tm_config import _exclude_from_trace + + +class TracedModuleChecker: + def __init__(self, tracer): + self._active_node2values = [] + self.tracer = tracer + + self.node_without_tensor_info = {} + + def push_scope(self): + self._active_node2values.append({}) + + def pop_scope(self): + self._active_node2values.pop() + + def current_node2values(self): + return self._active_node2values[-1] + + def reset_checker(self): + self._active_node2values = [] + + def check_node_not_in_scope(self): + if self.node_without_tensor_info: + for node, info in self.node_without_tensor_info.items(): + for expr in info[0]._exprs: + if node in expr.inputs or node in expr.outputs: + traceback.print_list(info[1]) + raise ValueError( + "node({}) not in the graph:\n{}".format(node, info[0]) + ) + return True + else: + return False + + def check_net_outputs(self, tm_res, gt_res): + if isinstance(tm_res, Tensor): + np.testing.assert_allclose(tm_res.numpy(), gt_res.numpy()) + elif isinstance(tm_res, Sequence): + for i, j in zip(tm_res, gt_res): + np.testing.assert_allclose(i.numpy(), j.numpy()) + else: + for k in tm_res.__dict__.keys(): + np.testing.assert_allclose( + getattr(tm_res, k).numpy(), getattr(gt_res, k).numpy() + ) + + def record_nodemixin(self, node, value): + self.current_node2values()[node] = value + + def record_node2value(self, node, value): + with _exclude_from_trace(): + self.current_node2values()[node] = apply( + Copy(comp_node=value.device), value + )[0] + if isscalar(value): + setscalar(self.current_node2values()[node]) + + def check_apply_special_cases(self, opdef, num_outputs): + indexs = list(range(num_outputs)) + if isinstance(opdef, ROIAlign) and opdef.mode == ROIAlign.Mode.AVERAGE: + indexs.pop(-1) + if isinstance(opdef, ROIPooling) and opdef.mode == ROIPooling.Mode.AVERAGE: + indexs.pop(-1) + return indexs + + def check_expr_results(self, expr_outputs, gt_outputs, indexs=None): + expr_outputs = ( + (expr_outputs,) if not isinstance(expr_outputs, Sequence) else expr_outputs + ) + gt_outputs = ( + (gt_outputs,) if not isinstance(gt_outputs, Sequence) else gt_outputs + ) + if indexs is not None: + for i in indexs: + np.testing.assert_allclose( + expr_outputs[i].numpy(), gt_outputs[i].numpy() + ) + else: + np.testing.assert_allclose(expr_outputs, gt_outputs) + + def get_node2value(self, inputs, start_idx=0): + inp_values = [] + has_node_not_in_scope = False + for i in range(start_idx, len(inputs)): + try: + inp_values.append(self.current_node2values()[inputs[i]]) + except: + has_node_not_in_scope = True + self.node_without_tensor_info[inputs[i]] = [ + 
self.tracer.current_scope(), + traceback.extract_stack(), + ] + return inp_values, has_node_not_in_scope + + def check_expr_interpret(self, expr, gt_outputs): + ori_in, has_node_not_in_scope = self.get_node2value(expr.inputs) + if not has_node_not_in_scope: + expr_res = expr.interpret(*ori_in) + try: + self.check_expr_results(expr_res, gt_outputs) + except: + raise ValueError("Error occurred when checking expr: {}".format(expr)) + + def check_apply(self, expr, gt_outputs, opdef): + ori_in, has_node_not_in_scope = self.get_node2value(expr.inputs) + if not has_node_not_in_scope: + expr_res = expr.interpret(*ori_in) + indexs = self.check_apply_special_cases(opdef, len(gt_outputs)) + try: + self.check_expr_results(expr_res, gt_outputs, indexs=indexs) + except: + raise ValueError("Error occurred when checking expr: {}".format(expr)) + + def check_builtin_module(self, module, expr, gt_outputs): + ori_in, has_node_not_in_scope = self.get_node2value(expr.inputs, start_idx=1) + if not has_node_not_in_scope: + ori_in.insert(0, module) + expr_res = expr.interpret(*ori_in) + try: + self.check_expr_results(expr_res, gt_outputs) + except: + raise ValueError( + "{}, Error occurred when checking expr: {}".format(expr) + ) diff --git a/imperative/python/megengine/traced_module/expr.py b/imperative/python/megengine/traced_module/expr.py index b7e7c077..c22249fc 100644 --- a/imperative/python/megengine/traced_module/expr.py +++ b/imperative/python/megengine/traced_module/expr.py @@ -32,6 +32,7 @@ from .module_tracer import active_module_tracer, module_tracer from .node import ModuleNode, Node, NodeMixin, TensorNode from .pytree import ArgsIndex, TreeDef, _is_const_leaf, _is_leaf, tree_flatten from .serialization import _ModuleState +from .tm_config import _exclude_from_trace, _get_expr_checker from .utils import _check_builtin_module_attr, _check_obj_attr, _convert_kwargs_to_args @@ -611,6 +612,8 @@ class Apply(Expr): inp_nodes = [NodeMixin.get(inputs[0])] for i in inputs[1:]: node = Constant.make(i) + if _get_expr_checker(): + active_module_tracer().checker.record_node2value(node, Tensor(i)) inp_nodes.append(node) apply_node = cls.make(opdef) for n in inp_nodes: @@ -624,11 +627,17 @@ class Apply(Expr): unset_module_tracing() outputs = apply(opdef, *inputs) + outputs = list(map(Tensor, outputs)) set_module_tracing() apply_node.add_outputs(outputs) for n, v in zip(apply_node.outputs, outputs): NodeMixin.wrap_safe(v, n) + + if _get_expr_checker(): + with _exclude_from_trace(): + active_module_tracer().checker.check_apply(apply_node, outputs, opdef) + return list(outputs) @@ -754,6 +763,7 @@ class Constant(Expr): current_graph = active_module_tracer().current_scope() current_graph._namespace.auto_naming_for_outputs(expr) current_graph._insert(expr) + active_module_tracer().current_constant_cache().append(expr.value) return expr.outputs[0] def interpret(self, *inputs): diff --git a/imperative/python/megengine/traced_module/module_tracer.py b/imperative/python/megengine/traced_module/module_tracer.py index db2bf055..70a020f4 100644 --- a/imperative/python/megengine/traced_module/module_tracer.py +++ b/imperative/python/megengine/traced_module/module_tracer.py @@ -12,6 +12,7 @@ from .. 
import functional as F from ..core.tensor.array_method import ArrayMethodMixin from ..module import Module from ..module.qat import QATModule +from .checker import TracedModuleChecker _active_module_tracer = None @@ -92,7 +93,6 @@ BUILTIN_TENSOR_WRAP_METHOD = [ "dtype", "grad", "item", - "name", "ndim", "numpy", "qparams", @@ -129,7 +129,9 @@ class module_tracer: def __init__(self, wrap_fn): self._active_scopes = [] + self.checker = TracedModuleChecker(self) self.patcher = Patcher(wrap_fn) + self._activate_constant_cache = [] @classmethod def register_as_builtin(cls, mod): @@ -143,15 +145,32 @@ class module_tracer: def push_scope(self, scope): self._active_scopes.append(scope) + self.checker.push_scope() + self._activate_constant_cache.append([]) def pop_scope(self): self._active_scopes.pop() + self.checker.pop_scope() + cache = self._activate_constant_cache.pop() + for obj in cache: + if hasattr(obj, "_NodeMixin__node"): + delattr(obj, "_NodeMixin__node") def current_scope(self): if self._active_scopes: return self._active_scopes[-1] return None + def current_constant_cache(self): + if self._activate_constant_cache: + return self._activate_constant_cache[-1] + return None + + def top_scope(self): + if self._active_scopes: + return self._active_scopes[0] + return None + class NotExist: pass diff --git a/imperative/python/megengine/traced_module/node.py b/imperative/python/megengine/traced_module/node.py index 4bfff4af..079ee46e 100644 --- a/imperative/python/megengine/traced_module/node.py +++ b/imperative/python/megengine/traced_module/node.py @@ -18,6 +18,8 @@ from ..core._imperative_rt.core2 import Tensor as RawTensor from ..module import Module from ..quantization.utils import QParams from ..tensor import Tensor +from .module_tracer import active_module_tracer +from .tm_config import _get_expr_checker from .utils import _check_obj_attr logger = get_logger(__name__) @@ -343,6 +345,11 @@ class NodeMixin(abc.ABC): if isinstance(value, NodeMixin): value._record_wrapped_nodes(node) setattr(value, "_NodeMixin__node", node) + if _get_expr_checker(): + if isinstance(value, RawTensor): + active_module_tracer().checker.record_node2value(node, value) + if isinstance(value, NodeMixin): + active_module_tracer().checker.record_nodemixin(node, value) else: assert callable(node) n = node() @@ -352,6 +359,11 @@ class NodeMixin(abc.ABC): if isinstance(value, NodeMixin): value._record_wrapped_nodes(n) setattr(value, "_NodeMixin__node", n) + if _get_expr_checker(): + if isinstance(value, RawTensor): + active_module_tracer().checker.record_node2value(n, value) + if isinstance(value, NodeMixin): + active_module_tracer().checker.record_nodemixin(n, value) @classmethod def wrap_safe(cls, value, node): @@ -359,10 +371,20 @@ class NodeMixin(abc.ABC): if isinstance(value, RawTensor): cls._record_tensornode_property(node, value) setattr(value, "_NodeMixin__node", node) + if _get_expr_checker(): + if isinstance(value, RawTensor): + active_module_tracer().checker.record_node2value(node, value) + if isinstance(value, NodeMixin): + active_module_tracer().checker.record_nodemixin(node, value) if isinstance(value, NodeMixin): value._record_wrapped_nodes(node) @classmethod + def clear_node(cls, value): + if hasattr(value, "_NodeMixin__node"): + delattr(value, "_NodeMixin__node") + + @classmethod def get(cls, value, *default): return getattr(value, "_NodeMixin__node", *default) diff --git a/imperative/python/megengine/traced_module/pytree.py b/imperative/python/megengine/traced_module/pytree.py index 98d19f1e..c4b132fa 
100644 --- a/imperative/python/megengine/traced_module/pytree.py +++ b/imperative/python/megengine/traced_module/pytree.py @@ -10,7 +10,7 @@ import collections from collections import OrderedDict, defaultdict from functools import partial from inspect import FullArgSpec -from typing import Callable, NamedTuple +from typing import Any, Callable, Dict, List, NamedTuple, Tuple import numpy as np @@ -46,6 +46,8 @@ SUPPORTED_LEAF_TYPE = { int, float, bool, + bytes, + bytearray, QuantDtypeMeta, CompNode, Device, @@ -74,18 +76,51 @@ SUPPORTED_LEAF_CLS = [ NodeType = NamedTuple("NodeType", [("flatten", Callable), ("unflatten", Callable)]) -def register_supported_type(type, flatten=None, unflatten=None): +def register_supported_type( + type, + flatten_fn: Callable[[Any], Tuple[List, Any]] = None, + unflatten_fn: Callable[[List, Any], Any] = None, +): + r"""Call this function to register the ``type`` as a built-in type. The registered ``type`` + can be used and serialized correctly in :py:class:`TracedModule`. + + Examples: + .. code-block:: + + def dict_flatten(obj: Dict): + context, values = [], [] + # obj.keys() needs to be sortable + keys = sorted(obj.keys()) + for key in keys: + values.append(obj[key]) + context.append(key) + return values, tuple(context) + + def dict_unflatten(values: List, context: Any): + return dict(zip(context, values)) + + register_supported_type(dict, dict_flatten, dict_unflatten) + + Args: + type: the type that needs to be registered. + flatten_fn: a function that should take an object created from ``type`` and return a + flat list of values. It can also return some context that is used in reconstructing + the object. Default: None + unflatten_fn: a function that should take a flat list of values and some context + (returned by flatten_fn). It returns the object by reconstructing + it from the list and the context. Default: None + """ tp_info = (type.__module__, type.__qualname__) - if flatten and unflatten: + if flatten_fn and unflatten_fn: USER_REGISTERED_CONTAINER_TYPE.append(tp_info) else: USER_REGISTERED_LEAF_TYPE.append(tp_info) - _register_supported_type(type, flatten, unflatten) + _register_supported_type(type, flatten_fn, unflatten_fn) -def _register_supported_type(type, flatten=None, unflatten=None): - if flatten and unflatten: - SUPPORTED_TYPE[type] = NodeType(flatten, unflatten) +def _register_supported_type(type, flatten_fn=None, unflatten_fn=None): + if flatten_fn and unflatten_fn: + SUPPORTED_TYPE[type] = NodeType(flatten_fn, unflatten_fn) else: SUPPORTED_LEAF_CLS.append(type) @@ -131,6 +166,7 @@ _register_supported_type( _register_supported_type( OrderedDict, partial(_dict_flatten, True), partial(_dict_unflatten, OrderedDict) ) + _register_supported_type( slice, lambda x: ([x.start, x.stop, x.step], None), @@ -176,7 +212,11 @@ def tree_flatten( to reconstruct the pytree. 
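Complementing the ``dict`` example in the ``register_supported_type`` docstring above, omitting ``flatten_fn``/``unflatten_fn`` registers the type as a plain leaf; a minimal sketch with a hypothetical user-defined class:

.. code-block:: python

    from megengine.traced_module import register_supported_type

    class PreprocessCfg:                     # hypothetical user-defined type
        def __init__(self, mean, std):
            self.mean, self.std = mean, std

    # no flatten_fn/unflatten_fn: instances are treated as leaf values, so they
    # can appear in traced inputs/attributes and survive TracedModule serialization.
    register_supported_type(PreprocessCfg)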
""" if type(values) not in SUPPORTED_TYPE: - assert is_leaf(values), values + assert is_leaf( + values + ), 'doesn\'t support {} type, MUST use "register_supported_type" method to register self-defined type'.format( + values + ) node = LeafDef(leaf_type(values)) if is_const_leaf(values): node.const_val = values @@ -244,8 +284,43 @@ class TreeDef: and self.children_defs == other.children_defs ) + def _args_kwargs_repr(self): + if ( + len(self.children_defs) == 2 + and issubclass(self.children_defs[0].type, (List, Tuple)) + and issubclass(self.children_defs[1].type, Dict) + ): + args_def = self.children_defs[0] + content = ", ".join(repr(i) for i in args_def.children_defs) + kwargs_def = self.children_defs[1] + if kwargs_def.aux_data: + content += ", " + content += ", ".join( + str(i) + "=" + repr(j) + for i, j in zip(kwargs_def.aux_data, kwargs_def.children_defs) + ) + return content + else: + return repr(self) + def __repr__(self): - return "{}[{}]".format(self.type.__name__, self.children_defs) + format_str = self.type.__name__ + "({})" + aux_data_delimiter = "=" + if issubclass(self.type, List): + format_str = "[{}]" + if issubclass(self.type, Tuple): + format_str = "({})" + if issubclass(self.type, Dict): + format_str = "{{{}}}" + aux_data_delimiter = ":" + if self.aux_data: + content = ", ".join( + repr(i) + aux_data_delimiter + repr(j) + for i, j in zip(self.aux_data, self.children_defs) + ) + else: + content = ", ".join(repr(i) for i in self.children_defs) + return format_str.format(content) class LeafDef(TreeDef): @@ -275,6 +350,9 @@ class LeafDef(TreeDef): return hash(tuple([self.type, self.const_val])) def __repr__(self): - return "Leaf({}[{}])".format( - ", ".join(t.__name__ for t in self.type), self.const_val + + return "{}".format( + self.const_val + if self.const_val is not None or type(None) in self.type + else self.type[0].__name__ ) diff --git a/imperative/python/megengine/traced_module/tm_config.py b/imperative/python/megengine/traced_module/tm_config.py new file mode 100644 index 00000000..6453a05e --- /dev/null +++ b/imperative/python/megengine/traced_module/tm_config.py @@ -0,0 +1,55 @@ +import contextlib + +from ..core._imperative_rt.core2 import ( + is_tracing_module, + set_module_tracing, + unset_module_tracing, +) + +_enable_expr_checker = False +_enable_default_checker = True + + +def _get_expr_checker(): + return _enable_expr_checker + + +def _get_default_checker(): + return _enable_default_checker + + +def enable_expr_checker(): + r"""Call this function to check the result of each expr during tracing.""" + global _enable_expr_checker + _enable_expr_checker = True + _enable_default_checker = False + + +def disable_default_checker(): + r"""Call this function to disable checking the final output of the model after tracing.""" + global _enable_default_checker + _enable_default_checker = False + + +_enable_graph_surgery_mode = False + + +def _graph_surgery_mode(): + return _enable_graph_surgery_mode + + +def _set_graph_surgery_mode(mode: bool): + global _enable_graph_surgery_mode + pre_mode = _enable_graph_surgery_mode + _enable_graph_surgery_mode = mode + return pre_mode + + +@contextlib.contextmanager +def _exclude_from_trace(): + is_tracing = is_tracing_module() + if is_tracing: + unset_module_tracing() + yield + if is_tracing: + set_module_tracing() diff --git a/imperative/python/megengine/traced_module/traced_module.py b/imperative/python/megengine/traced_module/traced_module.py index 8092f6c9..670ab7e9 100644 --- 
a/imperative/python/megengine/traced_module/traced_module.py +++ b/imperative/python/megengine/traced_module/traced_module.py @@ -36,12 +36,16 @@ from .. import get_logger from .. import module as M from ..core._imperative_rt.core2 import Tensor as RawTensor from ..core._imperative_rt.core2 import ( + apply, is_tracing_module, set_module_tracing, unset_module_tracing, ) from ..core._trace_option import set_symbolic_shape +from ..core.ops.builtin import Copy +from ..core.tensor.utils import isscalar, setscalar from ..module import Module +from ..module import external as MExternal from ..module.qat import QATModule from ..quantization.fake_quant import LSQ, TQT, FakeQuantize, _FakeQuantize from ..quantization.observer import ( @@ -54,6 +58,7 @@ from ..quantization.observer import ( SyncMinMaxObserver, ) from ..tensor import Tensor +from ..utils.max_recursion_limit import max_recursion_limit from ..version import __version__ from .expr import ( Apply, @@ -97,6 +102,13 @@ from .serialization import ( load_call_tensor_method_expr, load_functional, ) +from .tm_config import ( + _exclude_from_trace, + _get_default_checker, + _get_expr_checker, + _graph_surgery_mode, + _set_graph_surgery_mode, +) from .utils import ( _check_builtin_module_attr, _check_obj_attr, @@ -116,26 +128,14 @@ def _is_builtin_name(name: str) -> bool: def _is_leaf(node): - assert isinstance(node, RawTensor), "doesn't support {} in return values".format( + assert isinstance( + node, RawTensor + ), 'doesn\'t support {} in return values, MUST use Tensor or use "register_supported_type" method to register self-defined type'.format( type(node) ) return isinstance(node, RawTensor) -_enable_graph_surgery_mode = False - - -def _graph_surgery_mode(): - return _enable_graph_surgery_mode - - -def _set_graph_surgery_mode(mode: bool): - global _enable_graph_surgery_mode - pre_mode = _enable_graph_surgery_mode - _enable_graph_surgery_mode = mode - return pre_mode - - def _node_to_tensor(*args, **kwargs): tensors = [] nodes, tree_def = tree_flatten((args, kwargs)) @@ -179,6 +179,25 @@ def _tensor_to_node(tensors): return nodes +def _name_setter(node: Node, new_name: str): + surgery_mode = _set_graph_surgery_mode(False) + graph = active_module_tracer().current_scope() + + if node.top_graph is not None: + top_graph = active_module_tracer().top_scope() + if node is top_graph._namespace.used_names.get(node._name, None): + graph = top_graph + else: + graph = node.top_graph + + assert ( + graph._namespace.used_names.get(new_name, None) is None + ), "The name(%s) is already in use. Please try a different one again." 
% (new_name) + graph._namespace.unassociate_name_with_obj(node) + node._name = graph._namespace.create_unique_name(new_name, node) + _set_graph_surgery_mode(surgery_mode) + + def _wrap_method_to_tensor_node(): def _any_method(name, func): def _any(*args, **kwargs): @@ -207,10 +226,15 @@ def _wrap_method_to_tensor_node(): for method in get_tensor_wrapable_method(): patch = PatchedFn(TensorNode, method) if type(getattr(Tensor, method)) == property: + # Only support property.getter patch.set_func(property(_any_method(method, patch.origin_fn))) else: patch.set_func(_any_method(method, patch.origin_fn)) tensor_method_patch.append(patch) + + patch = PatchedFn(Node, "name") + patch.set_func(property(patch.origin_fn.fget, _name_setter)) + tensor_method_patch.append(patch) return tensor_method_patch @@ -351,14 +375,14 @@ class _InsertExprs: assert ( node.top_graph == self.graph ), "The input node ({}) is not in the graph ({})".format(node, self.graph) - if isinstance(node, TensorNode) and node.expr in self.graph._exprs: + if node.expr in self.graph._exprs: max_inp_expr_idx = max( max_inp_expr_idx, self.graph._exprs.index(node.expr) ) max_inp_expr_idx += 1 insert_index = -1 - if self.expr is not None: + if self.expr in self.graph._exprs: insert_index = self.graph._exprs.index(self.expr) insert_index += 1 @@ -1224,17 +1248,18 @@ class InternalGraph: return result def __deepcopy__(self, memo): - if id(self) in memo: - return memo[id(self)] - cls = self.__class__ - result = cls.__new__(cls) - state = {} - memo[id(self)] = result - for k, v in self.__dict__.items(): - if not isinstance(v, weakref.ReferenceType): - state[k] = copy.deepcopy(v, memo) - result.__dict__.update(state) - return result + with max_recursion_limit(): + if id(self) in memo: + return memo[id(self)] + cls = self.__class__ + result = cls.__new__(cls) + state = {} + memo[id(self)] = result + for k, v in self.__dict__.items(): + if not isinstance(v, weakref.ReferenceType): + state[k] = copy.deepcopy(v, memo) + result.__dict__.update(state) + return result def _get_meth_name(obj, func): @@ -1270,7 +1295,12 @@ def _wrapped_function(orig_func): return orig_func(*args, **kwargs) if isinstance(args[1], RawTensor): node = NodeMixin.get(inputs[1]) - inputs[1] = copy.copy(inputs[1]) + is_scalar = isscalar(inputs[1]) + inputs[1] = apply( + Copy(comp_node=inputs[1].device), Tensor(inputs[1]) + )[0] + if is_scalar: + setscalar(inputs[1]) # copy inputs[1] to avoid tensor and Tensor(tensor) share same m_tensor, # which will cause they have same _NodeMixin__node in tracing. 
NodeMixin.wrap_safe(inputs[1], node) @@ -1294,6 +1324,13 @@ def _wrapped_function(orig_func): else: outputs = None call_node.add_outputs(outputs) + + if _get_expr_checker(): + with _exclude_from_trace(): + active_module_tracer().checker.check_expr_interpret( + call_node, outputs + ) + set_module_tracing() return rst return orig_func(*args, **kwargs) @@ -1475,6 +1512,12 @@ class TracedModuleBuilder(NodeMixin): unset_module_tracing() rst = self._mod(*args, **kwargs) outputs, out_def = tree_flatten(rst, is_leaf=_is_leaf) + if _get_expr_checker(): + with _exclude_from_trace(): + tmp = self.build() + active_module_tracer().checker.check_builtin_module( + tmp, callnode, outputs + ) set_module_tracing() if self._is_builtin: self._body = None @@ -1649,7 +1692,9 @@ class TracedModuleBuilder(NodeMixin): if not isinstance(mod_attr, (List, Dict, QATModule)): assert mod_attr is wrapped._mod else: - assert mod_attr is wrapped + assert ( + mod_attr is wrapped + ), "TracedModule do not support modify attributes, please check your code." if isinstance(wrapped, (NodeMixin, RawTensor)): NodeMixin.wrap( @@ -1934,7 +1979,15 @@ class TracedModule(Module): if hasattr(self, "argspec") and self.argspec is not None: args, kwargs = _convert_kwargs_to_args(self.argspec, args, kwargs, True) inputs, treedef = tree_flatten(((self, *args), kwargs)) - assert treedef in self.argdef_graph_map + assert ( + treedef in self.argdef_graph_map + ), "support input args kwargs format: \n{}, but get: \n{}".format( + "\n ".join( + "forward({})".format(i._args_kwargs_repr()) + for i in self.argdef_graph_map.keys() + ), + treedef._args_kwargs_repr(), + ) inputs = filter( lambda i: isinstance(i, (Module, TracedModuleBuilder, RawTensor)), inputs ) # allow TracedModuleBuilder for retrace. @@ -2070,7 +2123,8 @@ class TracedModule(Module): for inp_def, graph in self.argdef_graph_map.items(): if top_graph is not None: graph._top_graph = weakref.ref(top_graph) - for n in graph._inputs + graph.outputs: + for n in graph._inputs + graph._outputs: + n.expr._top_graph = weakref.ref(graph) n._top_graph = weakref.ref(graph) graph._inputs[0]._owner = weakref.ref(self) for i, n in enumerate(graph._inputs): @@ -2307,16 +2361,17 @@ class TracedModule(Module): return result def __deepcopy__(self, memo): - cls = self.__class__ - result = cls.__new__(cls) - state = {} - memo[id(self)] = result - for k, v in self.__dict__.items(): - if not isinstance(v, weakref.ReferenceType): - state[k] = copy.deepcopy(v, memo) - result.__dict__.update(state) - result._update_ref() - return result + with max_recursion_limit(): + cls = self.__class__ + result = cls.__new__(cls) + state = {} + memo[id(self)] = result + for k, v in self.__dict__.items(): + if not isinstance(v, weakref.ReferenceType): + state[k] = copy.deepcopy(v, memo) + result.__dict__.update(state) + result._update_ref() + return result def cpp_apply_module_trace(opdef, *args): @@ -2375,7 +2430,7 @@ def wrap(func: Callable): def _register_all_builtin_module(): - for sub_mod in [M, M.qat, M.quantized]: + for sub_mod in [M, M.qat, M.quantized, MExternal]: for m in getmembers(sub_mod): if ( isclass(m[1]) @@ -2443,13 +2498,29 @@ def trace_module( qualname="{}.[{}]".format(net_name, "arg_{}".format(_)), ), ) - builder(*args, **kwargs) + rst = builder(*copy.deepcopy(args), **copy.deepcopy(kwargs)) active_module_tracer().pop_scope() traced_mod = builder.build() traced_mod.argspec = forward_argspec traced_mod.graph._reset_ids() + + has_expr_not_check = False + if _get_expr_checker(): + has_expr_not_check = ( + 
active_module_tracer().checker.check_node_not_in_scope() + ) + if _get_default_checker() or has_expr_not_check: + with _exclude_from_trace(): + tm_res = traced_mod(*args, **kwargs) + tm_res, _ = tree_flatten(tm_res, is_leaf=_is_leaf) + rst, _ = tree_flatten(rst, is_leaf=_is_leaf) + active_module_tracer().checker.check_net_outputs(tm_res, rst) return traced_mod finally: set_symbolic_shape(use_sym_shape) set_active_module_tracer(None) unset_module_tracing() + for t in mod.tensors(recursive=True): + NodeMixin.clear_node(t) + for t in inputs: + NodeMixin.clear_node(t) diff --git a/imperative/python/megengine/traced_module/utils.py b/imperative/python/megengine/traced_module/utils.py index 21ccb35c..d93b658f 100644 --- a/imperative/python/megengine/traced_module/utils.py +++ b/imperative/python/megengine/traced_module/utils.py @@ -5,16 +5,15 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import collections import copy import inspect from collections.abc import MutableMapping, MutableSequence from inspect import FullArgSpec -from typing import Callable, Dict, Iterable, List, Optional, Sequence, Type, Union +from typing import Callable, Dict, Iterable, List, Optional, Sequence, Union from .. import get_logger from ..module import Module -from ..tensor import Parameter, Tensor +from ..tensor import Tensor logger = get_logger(__name__) @@ -126,10 +125,12 @@ def _check_obj_attr(obj): for _, v in obj.items(): leafs, _ = tree_flatten(v, is_leaf=lambda _: True) for leaf in leafs: - assert _check_leaf_type( - leaf - ), "Type {} is not supported by traced module".format( - leaf if isinstance(leaf, type) else type(leaf) + assert _check_leaf_type(leaf), ( + "Type {} is not supported in TracedModule serialization by default. " + "If you want to save this object to file, please call tm.register_supported_type({}) " + "before saving.".format( + leaf if isinstance(leaf, type) else type(leaf), type(leaf).__name__ + ) ) diff --git a/imperative/python/megengine/utils/custom_op_tools.py b/imperative/python/megengine/utils/custom_op_tools.py new file mode 100644 index 00000000..d9150fef --- /dev/null +++ b/imperative/python/megengine/utils/custom_op_tools.py @@ -0,0 +1,909 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import collections +import ctypes +import glob +import os +import re +import subprocess +import sys +import time +from typing import List, Optional, Union + +from ..core.ops.custom import load +from ..logger import get_logger + + +def _get_win_folder_with_ctypes(csidl_name): + csidl_const = { + "CSIDL_APPDATA": 26, + "CSIDL_COMMON_APPDATA": 35, + "CSIDL_LOCAL_APPDATA": 28, + }[csidl_name] + + buf = ctypes.create_unicode_buffer(1024) + ctypes.windll.shell32.SHGetFolderPathW(None, csidl_const, None, 0, buf) + + # Downgrade to short path name if have highbit chars. See + # . 
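To tie together the ``tm_config`` switches and the ``TracedModuleChecker`` wiring introduced above, a minimal usage sketch (the module definition and input shape are placeholders, not part of the patch):

.. code-block:: python

    import numpy as np
    import megengine
    import megengine.functional as F
    import megengine.module as M
    from megengine.traced_module import trace_module, enable_expr_checker

    class Net(M.Module):
        def __init__(self):
            super().__init__()
            self.conv = M.Conv2d(3, 8, 3, padding=1)

        def forward(self, x):
            return F.relu(self.conv(x))

    enable_expr_checker()   # re-run each traced expr eagerly and compare the results
    data = megengine.Tensor(np.random.rand(1, 3, 16, 16).astype("float32"))
    traced_net = trace_module(Net(), data)
    # unless disable_default_checker() was called first, the traced module's final
    # outputs are additionally compared with the eager outputs after tracing.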
+ has_high_char = False + for c in buf: + if ord(c) > 255: + has_high_char = True + break + if has_high_char: + buf2 = ctypes.create_unicode_buffer(1024) + if ctypes.windll.kernel32.GetShortPathNameW(buf.value, buf2, 1024): + buf = buf2 + + return buf.value + + +system = sys.platform +if system == "win32": + _get_win_folder = _get_win_folder_with_ctypes + +PLAT_TO_VCVARS = { + "win-amd64": "x86_amd64", +} + +logger = get_logger() + +# environment varible +ev_custom_op_root_dir = "MGE_CUSTOM_OP_DIR" +ev_cuda_root_dir = "CUDA_ROOT_DIR" +ev_cudnn_root_dir = "CUDNN_ROOT_DIR" + +# operating system +IS_WINDOWS = system == "win32" +IS_LINUX = system == "linux" +IS_MACOS = system == "darwin" + +MGE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +MGE_INC_PATH = os.path.join(MGE_PATH, "core", "include") +MGE_LIB_PATH = os.path.join(MGE_PATH, "core", "lib") +MGE_ABI_VER = 0 + + +# compile version +MINIMUM_GCC_VERSION = (5, 0, 0) +MINIMUM_CLANG_CL_VERSION = (12, 0, 1) + +# compile flags +COMMON_MSVC_FLAGS = [ + "/MD", + "/wd4002", + "/wd4819", + "/EHsc", +] + +MSVC_IGNORE_CUDAFE_WARNINGS = [ + "field_without_dll_interface", +] + +COMMON_NVCC_FLAGS = [] + +# Finds the CUDA install path +def _find_cuda_root_dir() -> Optional[str]: + cuda_root_dir = os.environ.get(ev_cuda_root_dir) + if cuda_root_dir is None: + try: + which = "where" if IS_WINDOWS else "which" + with open(os.devnull, "w") as devnull: + nvcc = ( + subprocess.check_output([which, "nvcc"], stderr=devnull) + .decode() + .rstrip("\r\n") + ) + cuda_root_dir = os.path.dirname(os.path.dirname(nvcc)) + except Exception: + if IS_WINDOWS: + cuda_root_dir = os.environ.get("CUDA_PATH", None) + if cuda_root_dir == None: + cuda_root_dirs = glob.glob( + "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*" + ) + if len(cuda_root_dirs) == 0: + cuda_root_dir = "" + else: + cuda_root_dir = cuda_root_dirs[0] + else: + cuda_root_dir = "/usr/local/cuda" + if not os.path.exists(cuda_root_dir): + cuda_root_dir = None + return cuda_root_dir + + +def _find_cudnn_root_dir() -> Optional[str]: + cudnn_root_dir = os.environ.get(ev_cudnn_root_dir) + return cudnn_root_dir + + +CUDA_ROOT_DIR = _find_cuda_root_dir() +CUDNN_ROOT_DIR = _find_cudnn_root_dir() + +##################################################################### +# Phase 1 +##################################################################### + + +def _is_cuda_file(path: str) -> bool: + valid_ext = [".cu", ".cuh"] + return os.path.splitext(path)[1] in valid_ext + + +# Return full path to the user-specific cache dir for this application. 
+# Typical user cache directories are: +# Mac OS X: ~/Library/Caches/ +# Unix: ~/.cache/ (XDG default) +# Windows: C:\Users\\AppData\Local\\\Cache +def _get_user_cache_dir(appname=None, appauthor=None, version=None, opinion=True): + if system == "win32": + appauthor = appname if appauthor is None else appauthor + path = os.path.normpath(_get_win_folder("CSIDL_LOCAL_APPDATA")) + if appname: + if appauthor is not False: + path = os.path.join(path, appauthor) + else: + path = os.path.join(path, appname) + if opinion: + path = os.path.join(path, "Cache") + elif system == "darwin": + path = os.path.expanduser("~/Library/Caches") + if appname: + path = os.path.join(path, appname) + else: + path = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache")) + if appname: + path = os.path.join(path, appname) + if appname and version: + path = os.path.join(path, version) + return path + + +# Returns the path to the root folder under which custom op will built. +def _get_default_build_root() -> str: + return os.path.realpath(_get_user_cache_dir(appname="mge_custom_op")) + + +def _get_build_dir(name: str) -> str: + custom_op_root_dir = os.environ.get(ev_custom_op_root_dir) + if custom_op_root_dir is None: + custom_op_root_dir = _get_default_build_root() + + build_dir = os.path.join(custom_op_root_dir, name) + return build_dir + + +##################################################################### +# Phase 2 +##################################################################### + + +def update_hash(seed, value): + # using boost::hash_combine + # https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html + return seed ^ (hash(value) + 0x9E3779B9 + (seed << 6) + (seed >> 2)) + + +def hash_source_files(hash_value, source_files): + for filename in source_files: + with open(filename) as file: + hash_value = update_hash(hash_value, file.read()) + return hash_value + + +def hash_build_args(hash_value, build_args): + for group in build_args: + for arg in group: + hash_value = update_hash(hash_value, arg) + return hash_value + + +Entry = collections.namedtuple("Entry", "version, hash") + + +class Versioner(object): + def __init__(self): + self.entries = {} + + def get_version(self, name): + entry = self.entries.get(name) + return None if entry is None else entry.version + + def bump_version_if_changed( + self, name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag + ): + hash_value = 0 + hash_value = hash_source_files(hash_value, sources) + hash_value = hash_build_args(hash_value, build_args) + hash_value = update_hash(hash_value, build_dir) + hash_value = update_hash(hash_value, with_cuda) + hash_value = update_hash(hash_value, with_cudnn) + hash_value = update_hash(hash_value, abi_tag) + + entry = self.entries.get(name) + if entry is None: + self.entries[name] = entry = Entry(0, hash_value) + elif hash_value != entry.hash: + self.entries[name] = entry = Entry(entry.version + 1, hash_value) + + return entry.version + + +custom_op_versioner = Versioner() + + +def version_check( + name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag, +): + old_version = custom_op_versioner.get_version(name) + version = custom_op_versioner.bump_version_if_changed( + name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag, + ) + return version, old_version + + +##################################################################### +# Phase 3 +##################################################################### + + +def _check_ninja_availability(): + try: + 
subprocess.check_output("ninja --version".split()) + except Exception: + raise RuntimeError( + "Ninja is required to build custom op, please install ninja and update your PATH" + ) + + +def _mge_is_built_from_src(): + file_path = os.path.abspath(__file__) + if "site-packages" in file_path: + return False + else: + return True + + +def _accepted_compilers_for_platform(): + if IS_WINDOWS: + return ["clang-cl"] + if IS_MACOS: + return ["clang++", "clang"] + if IS_LINUX: + return ["g++", "gcc", "gnu-c++", "gnu-cc"] + + +# Verifies that the compiler is the expected one for the current platform. +def _check_compiler_existed_for_platform(compiler: str) -> bool: + # there is no suitable cmd like `which` on windows, so we assume the compiler is always true on windows + if IS_WINDOWS: + try: + version_string = subprocess.check_output( + ["clang-cl", "--version"], stderr=subprocess.STDOUT + ).decode() + return True + except Exception: + return False + + # use os.path.realpath to resolve any symlinks, in particular from "c++" to e.g. "g++". + which = subprocess.check_output(["which", compiler], stderr=subprocess.STDOUT) + compiler_path = os.path.realpath(which.decode().strip()) + if any(name in compiler_path for name in _accepted_compilers_for_platform()): + return True + + version_string = subprocess.check_output( + [compiler, "-v"], stderr=subprocess.STDOUT + ).decode() + if sys.platform.startswith("linux"): + pattern = re.compile("^COLLECT_GCC=(.*)$", re.MULTILINE) + results = re.findall(pattern, version_string) + if len(results) != 1: + return False + compiler_path = os.path.realpath(results[0].strip()) + return any(name in compiler_path for name in _accepted_compilers_for_platform()) + + if sys.platform.startswith("darwin"): + return version_string.startswith("Apple clang") + + return False + + +# Verifies that the given compiler is ABI-compatible with MegEngine. +def _check_compiler_abi_compatibility(compiler: str): + # we think if the megengine is built from source, the user will use the same compiler to compile the custom op + if _mge_is_built_from_src() or os.environ.get("MGE_CHECK_ABI", "1") == "0": + return True + + # [TODO] There is no particular minimum version we need for clang, so we"re good here. 
+ if sys.platform.startswith("darwin"): + return True + + try: + if sys.platform.startswith("linux"): + minimum_required_version = MINIMUM_GCC_VERSION + versionstr = subprocess.check_output( + [compiler, "-dumpfullversion", "-dumpversion"] + ) + version = versionstr.decode().strip().split(".") + else: + minimum_required_version = MINIMUM_CLANG_CL_VERSION + compiler_info = subprocess.check_output( + [compiler, "--version"], stderr=subprocess.STDOUT + ) + match = re.search(r"(\d+)\.(\d+)\.(\d+)", compiler_info.decode().strip()) + version = (0, 0, 0) if match is None else match.groups() + except Exception: + _, error, _ = sys.exc_info() + logger.warning( + "Error checking compiler version for {}: {}".format(compiler, error) + ) + return False + + if tuple(map(int, version)) >= minimum_required_version: + return True + + return False + + +def _check_compiler_comatibility(): + # we use clang-cl on windows, refer: https://clang.llvm.org/docs/UsersManual.html#clang-cl + compiler = ( + os.environ.get("CXX", "clang-cl") + if IS_WINDOWS + else os.environ.get("CXX", "c++") + ) + + existed = _check_compiler_existed_for_platform(compiler) + if existed == False: + log_str = ( + "Cannot find compiler which is compatible with the compiler " + "MegEngine was built with for this platform, which is {mge_compiler} on " + "{platform}. Please use {mge_compiler} to to compile your extension. " + "Alternatively, you may compile MegEngine from source using " + "{user_compiler}, and then you can also use {user_compiler} to compile " + "your extension." + ).format( + user_compiler=compiler, + mge_compiler=_accepted_compilers_for_platform()[0], + platform=sys.platform, + ) + + logger.warning(log_str) + return False + + compatible = _check_compiler_abi_compatibility(compiler) + if compatible == False: + log_str = ( + "Your compiler version may be ABI-incompatible with MegEngine! " + "Please use a compiler that is ABI-compatible with GCC 5.0 on Linux " + "and LLVM/Clang 12.0 on Windows ." + ) + logger.warning(log_str) + return True + + +##################################################################### +# Phase 4 +##################################################################### + + +# Quote command-line arguments for DOS/Windows conventions. 
+def _nt_quote_args(args: Optional[List[str]]) -> List[str]: + # Cover None-type + if not args: + return [] + return ['"{}"'.format(arg) if " " in arg else arg for arg in args] + + +# Now we need user to specify the arch of GPU +def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: + return [] + + +def _setup_sys_includes(with_cuda: bool, with_cudnn: bool): + includes = [os.path.join(MGE_INC_PATH)] + if with_cuda: + includes.append(os.path.join(CUDA_ROOT_DIR, "include")) + if with_cudnn: + includes.append(os.path.join(CUDNN_ROOT_DIR, "include")) + return includes + + +def _setup_includes(extra_include_paths: List[str], with_cuda: bool, with_cudnn: bool): + user_includes = [os.path.abspath(path) for path in extra_include_paths] + system_includes = _setup_sys_includes(with_cuda, with_cudnn) + if IS_WINDOWS: + user_includes += system_includes + system_includes.clear() + return user_includes, system_includes + + +def _setup_common_cflags(user_includes: List[str], system_includes: List[str]): + common_cflags = [] + common_cflags += ["-I{}".format(include) for include in user_includes] + common_cflags += ["-isystem {}".format(include) for include in system_includes] + if not IS_WINDOWS: + common_cflags += ["-D_GLIBCXX_USE_CXX11_ABI={}".format(MGE_ABI_VER)] + return common_cflags + + +def _setup_cuda_cflags(cflags: List[str], extra_cuda_cflags: List[str]): + cuda_flags = cflags + COMMON_NVCC_FLAGS + _get_cuda_arch_flags() + if IS_WINDOWS: + for flag in COMMON_MSVC_FLAGS: + cuda_flags = ["-Xcompiler", flag] + cuda_flags + for ignore_warning in MSVC_IGNORE_CUDAFE_WARNINGS: + cuda_flags = ["-Xcudafe", "--diag_suppress=" + ignore_warning] + cuda_flags + cuda_flags = _nt_quote_args(cuda_flags) + cuda_flags += _nt_quote_args(extra_cuda_cflags) + else: + cuda_flags += ["--compiler-options", '"-fPIC"'] + cuda_flags += extra_cuda_cflags + if not any(flag.startswith("-std=") for flag in cuda_flags): + cuda_flags.append("-std=c++14") + if os.getenv("CC") is not None: + cuda_flags = ["-ccbin", os.getenv("CC")] + cuda_flags + return cuda_flags + + +def _setup_ldflags( + extra_ldflags: List[str], with_cuda: bool, with_cudnn: bool +) -> List[str]: + ldflags = extra_ldflags + if IS_WINDOWS: + ldflags.append(os.path.join(MGE_LIB_PATH, "megengine_shared.lib")) + if with_cuda: + ldflags.append(os.path.join(CUDA_ROOT_DIR, "lib", "x64", "cudart.lib")) + if with_cudnn: + ldflags.append(os.path.join(CUDNN_ROOT_DIR, "lib", "x64", "cudnn.lib")) + + else: + ldflags.append("-lmegengine_shared -L{}".format(MGE_LIB_PATH)) + ldflags.append("-Wl,-rpath,{}".format(MGE_LIB_PATH)) + if with_cuda: + ldflags.append("-lcudart") + ldflags.append("-L{}".format(os.path.join(CUDA_ROOT_DIR, "lib64"))) + ldflags.append("-Wl,-rpath,{}".format(os.path.join(CUDA_ROOT_DIR, "lib64"))) + if with_cudnn: + ldflags.append("-L{}".format(os.path.join(CUDNN_ROOT_DIR, "lib64"))) + ldflags.append( + "-Wl,-rpath,{}".format(os.path.join(CUDNN_ROOT_DIR, "lib64")) + ) + + return ldflags + + +def _add_shared_flag(ldflags: List[str]): + ldflags += ["/LD" if IS_WINDOWS else "-shared"] + return ldflags + + +##################################################################### +# Phase 5 +##################################################################### + + +def _obj_file_path(src_file_path: str): + file_name = os.path.splitext(os.path.basename(src_file_path))[0] + if _is_cuda_file(src_file_path): + target = "{}.cuda.o".format(file_name) + else: + target = "{}.o".format(file_name) + return target + + +def _dump_ninja_file( + path, + 
cflags, + post_cflags, + cuda_cflags, + cuda_post_cflags, + sources, + objects, + ldflags, + library_target, + with_cuda, +): + def sanitize_flags(flags): + return [] if flags is None else [flag.strip() for flag in flags] + + cflags = sanitize_flags(cflags) + post_cflags = sanitize_flags(post_cflags) + cuda_cflags = sanitize_flags(cuda_cflags) + cuda_post_cflags = sanitize_flags(cuda_post_cflags) + ldflags = sanitize_flags(ldflags) + + assert len(sources) == len(objects) + assert len(sources) > 0 + + if IS_WINDOWS: + compiler = os.environ.get("CXX", "clang-cl") + else: + compiler = os.environ.get("CXX", "c++") + + # Version 1.3 is required for the `deps` directive. + config = ["ninja_required_version = 1.3"] + config.append("cxx = {}".format(compiler)) + if with_cuda: + nvcc = os.path.join(CUDA_ROOT_DIR, "bin", "nvcc") + config.append("nvcc = {}".format(nvcc)) + + flags = ["cflags = {}".format(" ".join(cflags))] + flags.append("post_cflags = {}".format(" ".join(post_cflags))) + if with_cuda: + flags.append("cuda_cflags = {}".format(" ".join(cuda_cflags))) + flags.append("cuda_post_cflags = {}".format(" ".join(cuda_post_cflags))) + flags.append("ldflags = {}".format(" ".join(ldflags))) + + # Turn into absolute paths so we can emit them into the ninja build + # file wherever it is. + sources = [os.path.abspath(file) for file in sources] + + # See https://ninja-build.org/build.ninja.html for reference. + compile_rule = ["rule compile"] + if IS_WINDOWS: + compile_rule.append( + " command = clang-cl /showIncludes $cflags -c $in /Fo$out $post_cflags" + ) + compile_rule.append(" deps = msvc") + else: + compile_rule.append( + " command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags" + ) + compile_rule.append(" depfile = $out.d") + compile_rule.append(" deps = gcc") + + if with_cuda: + cuda_compile_rule = ["rule cuda_compile"] + nvcc_gendeps = "" + cuda_compile_rule.append( + " command = $nvcc {} $cuda_cflags -c $in -o $out $cuda_post_cflags".format( + nvcc_gendeps + ) + ) + + # Emit one build rule per source to enable incremental build. + build = [] + for source_file, object_file in zip(sources, objects): + is_cuda_source = _is_cuda_file(source_file) and with_cuda + rule = "cuda_compile" if is_cuda_source else "compile" + if IS_WINDOWS: + source_file = source_file.replace(":", "$:") + object_file = object_file.replace(":", "$:") + source_file = source_file.replace(" ", "$ ") + object_file = object_file.replace(" ", "$ ") + build.append("build {}: {} {}".format(object_file, rule, source_file)) + + if library_target is not None: + link_rule = ["rule link"] + if IS_WINDOWS: + link_rule.append(" command = clang-cl $in /nologo $ldflags /out:$out") + else: + link_rule.append(" command = $cxx $in $ldflags -o $out") + + link = ["build {}: link {}".format(library_target, " ".join(objects))] + default = ["default {}".format(library_target)] + else: + link_rule, link, default = [], [], [] + + # 'Blocks' should be separated by newlines, for visual benefit. 
+ blocks = [config, flags, compile_rule] + if with_cuda: + blocks.append(cuda_compile_rule) + blocks += [link_rule, build, link, default] + with open(path, "w") as build_file: + for block in blocks: + lines = "\n".join(block) + build_file.write("{}\n\n".format(lines)) + + +class FileBaton: + def __init__(self, lock_file_path, wait_seconds=0.1): + self.lock_file_path = lock_file_path + self.wait_seconds = wait_seconds + self.fd = None + + def try_acquire(self): + try: + self.fd = os.open(self.lock_file_path, os.O_CREAT | os.O_EXCL) + return True + except FileExistsError: + return False + + def wait(self): + while os.path.exists(self.lock_file_path): + time.sleep(self.wait_seconds) + + def release(self): + if self.fd is not None: + os.close(self.fd) + + os.remove(self.lock_file_path) + + +##################################################################### +# Phase 6 +##################################################################### + + +def _build_with_ninja(build_dir: str, verbose: bool, error_prefix: str): + command = ["ninja", "-v"] + env = os.environ.copy() + try: + sys.stdout.flush() + sys.stderr.flush() + stdout_fileno = 1 + subprocess.run( + command, + stdout=stdout_fileno if verbose else subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=build_dir, + check=True, + env=env, + ) + except subprocess.CalledProcessError as e: + with open(os.path.join(build_dir, "build.ninja")) as f: + lines = f.readlines() + print(lines) + _, error, _ = sys.exc_info() + message = error_prefix + if hasattr(error, "output") and error.output: + message += ": {}".format(error.output.decode()) + raise RuntimeError(message) from e + + +def build( + name: str, + sources: Union[str, List[str]], + extra_cflags: Union[str, List[str]] = [], + extra_cuda_cflags: Union[str, List[str]] = [], + extra_ldflags: Union[str, List[str]] = [], + extra_include_paths: Union[str, List[str]] = [], + with_cuda: Optional[bool] = None, + build_dir: Optional[bool] = None, + verbose: bool = False, + abi_tag: Optional[int] = None, +) -> str: + r"""Build a Custom Op with ninja in the way of just-in-time (JIT). + + To build the custom op, a Ninja build file is emitted, which is used to + compile the given sources into a dynamic library. + + By default, the directory to which the build file is emitted and the + resulting library compiled to is ``/mge_custom_op/``, where + ```` is the temporary folder on the current platform and ```` + the name of the custom op. This location can be overridden in two ways. + First, if the ``MGE_CUSTOM_OP_DIR`` environment variable is set, it + replaces ``/mge_custom_op`` and all custom op will be compiled + into subfolders of this directory. Second, if the ``build_dir`` + argument to this function is supplied, it overrides the entire path, i.e. + the library will be compiled into that folder directly. + + To compile the sources, the default system compiler (``c++``) is used, + which can be overridden by setting the ``CXX`` environment variable. To pass + additional arguments to the compilation process, ``extra_cflags`` or + ``extra_ldflags`` can be provided. For example, to compile your custom op + with optimizations, pass ``extra_cflags=['-O3']``. You can also use + ``extra_cflags`` to pass further include directories. + + CUDA support with mixed compilation is provided. Simply pass CUDA source + files (``.cu`` or ``.cuh``) along with other sources. Such files will be + detected and compiled with nvcc rather than the C++ compiler. 
This includes + passing the CUDA lib64 directory as a library directory, and linking + ``cudart``. You can pass additional flags to nvcc via + ``extra_cuda_cflags``, just like with ``extra_cflags`` for C++. Various + heuristics for finding the CUDA install directory are used, which usually + work fine. If not, setting the ``CUDA_ROOT_DIR`` environment variable is the + safest option. If you use CUDNN, please also setting the ``CUDNN_ROOT_DIR`` + environment variable. + + Args: + name: The name of the custom op to build. + sources: A list of relative or absolute paths to C++ source files. + extra_cflags: optional list of compiler flags to forward to the build. + extra_cuda_cflags: optional list of compiler flags to forward to nvcc + when building CUDA sources. + extra_ldflags: optional list of linker flags to forward to the build. + extra_include_paths: optional list of include directories to forward + to the build. + with_cuda: Determines whether CUDA headers and libraries are added to + the build. If set to ``None`` (default), this value is + automatically determined based on the existence of ``.cu`` or + ``.cuh`` in ``sources``. Set it to `True`` to force CUDA headers + and libraries to be included. + build_dir: optional path to use as build workspace. + verbose: If ``True``, turns on verbose logging of load steps. + abi_tag: Determines the value of MACRO ``_GLIBCXX_USE_CXX11_ABI`` + in gcc compiler, should be ``0`` or ``1``. + + Returns: + the compiled dynamic library path + + """ + + # phase 1: prepare config + if abi_tag != None: + global MGE_ABI_VER + MGE_ABI_VER = abi_tag + + def strlist(args, name): + assert isinstance(args, str) or isinstance( + args, list + ), "{} must be str or list[str]".format(name) + if isinstance(args, str): + return [args] + for arg in args: + assert isinstance(arg, str) + args = [arg.strip() for arg in args] + return args + + sources = strlist(sources, "sources") + extra_cflags = strlist(extra_cflags, "extra_cflags") + extra_cuda_cflags = strlist(extra_cuda_cflags, "extra_cuda_cflags") + extra_ldflags = strlist(extra_ldflags, "extra_ldflags") + extra_include_paths = strlist(extra_include_paths, "extra_include_paths") + + with_cuda = any(map(_is_cuda_file, sources)) if with_cuda is None else with_cuda + with_cudnn = any(["cudnn" in f for f in extra_ldflags]) + + if CUDA_ROOT_DIR == None and with_cuda: + print( + "No CUDA runtime is found, using {}=/path/to/your/cuda_root_dir".format( + ev_cuda_root_dir + ) + ) + if CUDNN_ROOT_DIR == None and with_cudnn: + print( + "Cannot find the root directory of cudnn, using {}=/path/to/your/cudnn_root_dir".format( + ev_cudnn_root_dir + ) + ) + + build_dir = os.path.abspath( + _get_build_dir(name) if build_dir is None else build_dir + ) + if not os.path.exists(build_dir): + os.makedirs(build_dir, exist_ok=True) + + if verbose: + print("Using {} to build megengine custom op".format(build_dir)) + + # phase 2: version check + version, old_version = version_check( + name, + sources, + [extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths], + build_dir, + with_cuda, + with_cudnn, + abi_tag, + ) + if verbose: + if version != old_version and old_version != None: + print( + "Input conditions of custom op {} have changed, bumping to version {}".format( + name, version + ) + ) + print("Building custom op {} with version {}".format(name, version)) + if version == old_version: + if verbose: + print( + "No modifications detected for {}, skipping build step...".format(name) + ) + return + name = "{}_v{}".format(name, 
version) + + # phase 3: compiler and ninja check + _check_ninja_availability() + _check_compiler_comatibility() + + # phase 4: setup the compile flags + user_includes, system_includes = _setup_includes( + extra_include_paths, with_cuda, with_cudnn + ) + common_cflags = _setup_common_cflags(user_includes, system_includes) + cuda_cflags = ( + _setup_cuda_cflags(common_cflags, extra_cuda_cflags) if with_cuda else None + ) + ldflags = _setup_ldflags(extra_ldflags, with_cuda, with_cudnn) + + if IS_WINDOWS: + cflags = common_cflags + COMMON_MSVC_FLAGS + extra_cflags + cflags = _nt_quote_args(cflags) + else: + cflags = common_cflags + ["-fPIC", "-std=c++14"] + extra_cflags + + ldflags = _add_shared_flag(ldflags) + if sys.platform.startswith("darwin"): + ldflags.append("-undefined dynamic_lookup") + elif IS_WINDOWS: + ldflags += ["/link"] + ldflags = _nt_quote_args(ldflags) + + baton = FileBaton(os.path.join(build_dir, "lock")) + if baton.try_acquire(): + try: + # phase 5: generate ninja build file + objs = [_obj_file_path(src) for src in sources] + name += ".dll" if IS_WINDOWS else ".so" + + build_file_path = os.path.join(build_dir, "build.ninja") + if verbose: + print("Emitting ninja build file {}".format(build_file_path)) + _dump_ninja_file( + path=build_file_path, + cflags=cflags, + post_cflags=None, + cuda_cflags=cuda_cflags, + cuda_post_cflags=None, + sources=sources, + objects=objs, + ldflags=ldflags, + library_target=name, + with_cuda=with_cuda, + ) + + # phase 6: build with ninja + if verbose: + print( + "Compiling and linking your custom op {}".format( + os.path.join(build_dir, name) + ) + ) + _build_with_ninja(build_dir, verbose, "compiling error") + finally: + baton.release() + else: + baton.wait() + + return os.path.join(build_dir, name) + + +def build_and_load( + name: str, + sources: Union[str, List[str]], + extra_cflags: Union[str, List[str]] = [], + extra_cuda_cflags: Union[str, List[str]] = [], + extra_ldflags: Union[str, List[str]] = [], + extra_include_paths: Union[str, List[str]] = [], + with_cuda: Optional[bool] = None, + build_dir: Optional[bool] = None, + verbose: bool = False, + abi_tag: Optional[int] = None, +) -> str: + r"""Build and Load a Custom Op with ninja in the way of just-in-time (JIT). + Same as the function ``build()`` but load the built dynamic library. + + Args: + same as ``build()`` + + Returns: + the compiled dynamic library path + + """ + + lib_path = build( + name, + sources, + extra_cflags, + extra_cuda_cflags, + extra_ldflags, + extra_include_paths, + with_cuda, + build_dir, + verbose, + abi_tag, + ) + if verbose: + print("Load the compiled custom op {}".format(lib_path)) + load(lib_path) + return lib_path diff --git a/imperative/python/megengine/utils/persistent_cache.py b/imperative/python/megengine/utils/persistent_cache.py index 3b0f7ae2..b6aadf8a 100644 --- a/imperative/python/megengine/utils/persistent_cache.py +++ b/imperative/python/megengine/utils/persistent_cache.py @@ -8,87 +8,115 @@ # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
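Putting the custom-op helpers above together, a hedged usage sketch of the JIT build path (the op name and source file are hypothetical):

.. code-block:: python

    import os
    from megengine.utils.custom_op_tools import build_and_load

    # optional override of the build workspace root picked up by the helpers;
    # CUDA_ROOT_DIR / CUDNN_ROOT_DIR can likewise be set to locate CUDA and cuDNN.
    os.environ.setdefault("MGE_CUSTOM_OP_DIR", "/tmp/mge_custom_op")

    lib_path = build_and_load(
        "my_op",                   # library is emitted as my_op_v<version>.so (or .dll)
        sources=["my_op.cpp"],     # hypothetical C++ source implementing the op
        extra_cflags=["-O3"],
        verbose=True,
    )
    print(lib_path)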
import argparse +import contextlib import getpass import os import sys import urllib.parse -from ..core._imperative_rt import PersistentCacheManager as _PersistentCacheManager +import filelock + +from ..core._imperative_rt import PersistentCache as _PersistentCache from ..logger import get_logger from ..version import __version__, git_version -class PersistentCacheManager(_PersistentCacheManager): +class PersistentCacheOnServer(_PersistentCache): def __init__(self): super().__init__() - if os.getenv("MGE_FASTRUN_CACHE_TYPE") == "MEMORY": - get_logger().info("fastrun use in-memory cache") - self.open_memory() - elif os.getenv("MGE_FASTRUN_CACHE_TYPE") == "FILE": - self.open_file() - else: - self.open_redis() - - def open_memory(self): - pass + cache_type = os.getenv("MGE_FASTRUN_CACHE_TYPE") + if cache_type not in ("FILE", "MEMORY"): + try: + redis_config = self.get_redis_config() + except Exception as exc: + get_logger().error( + "failed to connect to cache server {!r}; try fallback to " + "in-file cache".format(exc) + ) + else: + if redis_config is not None: + self.add_config( + "redis", + redis_config, + "fastrun use redis cache", + "failed to connect to cache server", + ) + if cache_type != "MEMORY": + path = self.get_cache_file(self.get_cache_dir()) + self.add_config( + "in-file", + {"path": path}, + "fastrun use in-file cache in {}".format(path), + "failed to create cache file in {}".format(path), + ) + self.add_config( + "in-memory", + {}, + "fastrun use in-memory cache", + "failed to create in-memory cache", + ) - def open_file(self): + def get_cache_dir(self): cache_dir = os.getenv("MGE_FASTRUN_CACHE_DIR") - try: - if not cache_dir: - from ..hub.hub import _get_megengine_home + if not cache_dir: + from ..hub.hub import _get_megengine_home - cache_dir = os.path.expanduser( - os.path.join(_get_megengine_home(), "persistent_cache.bin") - ) - os.makedirs(cache_dir, exist_ok=True) - cache_file = os.path.join(cache_dir, "cache") - with open(cache_file, "a"): - pass - assert self.try_open_file(cache_file), "cannot create file" - get_logger().info("fastrun use in-file cache in {}".format(cache_dir)) - except Exception as exc: - get_logger().error( - "failed to create cache file in {} {!r}; fallback to " - "in-memory cache".format(cache_dir, exc) + cache_dir = os.path.expanduser( + os.path.join(_get_megengine_home(), "persistent_cache") ) - self.open_memory() - - def open_redis(self): + os.makedirs(cache_dir, exist_ok=True) + return cache_dir + + def get_cache_file(self, cache_dir): + cache_file = os.path.join(cache_dir, "cache.bin") + with open(cache_file, "a"): + pass + return cache_file + + @contextlib.contextmanager + def lock_cache_file(self, cache_dir): + lock_file = os.path.join(cache_dir, "cache.lock") + with filelock.FileLock(lock_file): + yield + + def get_redis_config(self): + url = os.getenv("MGE_FASTRUN_CACHE_URL") + if url is None: + return None + assert sys.platform != "win32", "redis cache on windows not tested" prefix = "mgbcache:{}:MGB{}:GIT:{}".format( getpass.getuser(), __version__, git_version ) - url = os.getenv("MGE_FASTRUN_CACHE_URL") - if url is None: - self.open_file() - try: - assert sys.platform != "win32", "redis cache on windows not tested" - parse_result = urllib.parse.urlparse(url, scheme="redis") - assert parse_result.scheme == "redis", "unsupported scheme" - assert not parse_result.username, "redis conn with username unsupported" - assert self.try_open_redis( - parse_result.hostname, parse_result.port, parse_result.password, prefix - ), "connect failed" - 
except Exception as exc: - get_logger().error( - "failed to connect to cache server {!r}; try fallback to " - "in-file cache".format(exc) - ) - self.open_file() - - -_manager = None - + parse_result = urllib.parse.urlparse(url) + assert not parse_result.username, "redis conn with username unsupported" + if parse_result.scheme == "redis": + assert parse_result.hostname and parse_result.port, "invalid url" + assert not parse_result.path + config = { + "hostname": parse_result.hostname, + "port": str(parse_result.port), + } + elif parse_result.scheme == "redis+socket": + assert not (parse_result.hostname or parse_result.port) + assert parse_result.path + config = { + "unixsocket": parse_result.path, + } + else: + assert False, "unsupported scheme" + if parse_result.password is not None: + config["password"] = parse_result.password + config["prefix"] = prefix + return config -def get_manager(): - global _manager - if _manager is None: - _manager = PersistentCacheManager() - return _manager + def flush(self): + if self.config is not None and self.config.type == "in-file": + with self.lock_cache_file(self.get_cache_dir()): + super().flush() def _clean(): - nr_del = get_manager().clean() + nr_del = PersistentCacheOnServer().clean() if nr_del is not None: print("{} cache entries deleted".format(nr_del)) diff --git a/imperative/python/requires-test.txt b/imperative/python/requires-test.txt index 05643464..7b33b1ce 100644 --- a/imperative/python/requires-test.txt +++ b/imperative/python/requires-test.txt @@ -2,3 +2,4 @@ pytest==5.3.0 pytest-sphinx==0.3.1 tensorboardX==2.4 six==1.16.0 +redislite ; platform_system == "Linux" or platform_system == "Darwin" diff --git a/imperative/python/requires.txt b/imperative/python/requires.txt index 58a806c0..894b332a 100644 --- a/imperative/python/requires.txt +++ b/imperative/python/requires.txt @@ -4,8 +4,8 @@ pyarrow requests tabulate tqdm -redispy deprecated mprop wheel -megfile>=0.0.10 \ No newline at end of file +megfile>=0.0.10 +filelock diff --git a/imperative/python/src/ops.cpp b/imperative/python/src/ops.cpp index 30f61a0f..be184cff 100644 --- a/imperative/python/src/ops.cpp +++ b/imperative/python/src/ops.cpp @@ -567,7 +567,15 @@ void init_ops(py::module m) { rng::delete_handle(handle); }, py::call_guard()); - m.def("set_global_rng_seed", &rng::set_global_rng_seed); + m.def("set_global_rng_seed", [](uint64_t seed) -> void { + mgb_assert( + python::interpreter_for_py->check_available(), + "set global random seed failed since imperative interpreter has been " + "destroyed"); + python::interpreter_for_py->sync(); + mgb::CompNode::sync_all(); + rng::set_global_rng_seed(seed); + }); m.def("get_global_rng_seed", &rng::get_global_rng_seed); m.def("get_rng_handle_compnode", &rng::get_rng_handle_compnode); @@ -766,6 +774,13 @@ void init_custom(pybind11::module m) { m.def("_install", &install_custom); m.def("_uninstall", &uninstall_custom); m.def("_get_custom_op_list", &get_custom_op_list); + m.def("get_custom_op_abi_tag", [](void) -> int { + int ret = 0; +#ifdef _GLIBCXX_USE_CXX11_ABI + ret = _GLIBCXX_USE_CXX11_ABI; +#endif + return ret; + }); static PyMethodDef method_def = { #ifdef METH_FASTCALL diff --git a/imperative/python/src/tensor.cpp b/imperative/python/src/tensor.cpp index 87f2459d..f415f572 100644 --- a/imperative/python/src/tensor.cpp +++ b/imperative/python/src/tensor.cpp @@ -1074,6 +1074,10 @@ void init_tensor(py::module m) { []() { interpreter_for_py->sync(); CompNode::sync_all(); + CompNode::foreach ([](CompNode cn) { + auto err = 
cn.check_async_error(); + mgb_assert(!err, "%s", err->what()); + }); sync_py_task_q(); }, py::call_guard()); diff --git a/imperative/python/src/utils.cpp b/imperative/python/src/utils.cpp index 5162f488..260c7aa2 100644 --- a/imperative/python/src/utils.cpp +++ b/imperative/python/src/utils.cpp @@ -210,7 +210,7 @@ void init_utils(py::module m) { .def("disable", [](TensorSanityCheck& checker) { checker.disable(); }); #if MGB_ENABLE_OPR_MM - m.def("create_mm_server", &create_zmqrpc_server, py::arg("addr"), + m.def("create_mm_server", &mgb::opr::create_zmqrpc_server, py::arg("addr"), py::arg("port") = 0); #else m.def("create_mm_server", []() {}); @@ -234,51 +234,108 @@ void init_utils(py::module m) { using ExtendedPersistentCache = mgb::imperative::persistent_cache::ExtendedPersistentCache; - struct PersistentCacheManager { - std::shared_ptr instance; + struct ConfigurablePersistentCache : mgb::PersistentCache { + struct Config { + std::string type; + std::unordered_map args; + std::string on_success; + std::string on_fail; + }; - bool try_reg(std::shared_ptr cache) { - if (cache) { - instance = cache; - PersistentCache::set_impl(cache); - return true; - } - return false; - } - bool open_redis( - std::string ip, size_t port, std::string password, std::string prefix) { - return try_reg(mgb::imperative::persistent_cache::make_redis( - ip, port, password, prefix)); + std::shared_ptr impl; + std::optional impl_config; + std::vector configs; + + void add_config( + std::string type, std::unordered_map args, + std::string on_success, std::string on_fail) { + configs.push_back({type, args, on_success, on_fail}); } - bool open_file(std::string path) { - return try_reg(mgb::imperative::persistent_cache::make_in_file(path)); + + std::optional clean() { return get_impl()->clear(); } + + void load_config() { + std::optional err_msg; + for (size_t i = 0; i < configs.size(); ++i) { + auto& config = configs[i]; + if (err_msg) { + mgb_log_warn("try fallback to %s cache", config.type.c_str()); + } else { + err_msg.emplace(); + } + auto cache = ExtendedPersistentCache::make_from_config( + config.type, config.args, *err_msg); + if (!cache) { + mgb_log_warn("%s %s", config.on_fail.c_str(), err_msg->c_str()); + } else { + impl = cache; + impl_config = config; + break; + } + } + mgb_assert(impl_config.has_value(), "not valid config"); } - std::optional clean() { - if (instance) { - return instance->clear(); + + std::shared_ptr get_impl() { + if (!impl) { + load_config(); } - return {}; + return impl; } - void put(std::string category, std::string key, std::string value) { - PersistentCache::inst().put( - category, {key.data(), key.size()}, {value.data(), value.size()}); + + virtual mgb::Maybe get(const std::string& category, const Blob& key) { + return get_impl()->get(category, key); + } + + virtual void put( + const std::string& category, const Blob& key, const Blob& value) { + return get_impl()->put(category, key, value); } - py::object get(std::string category, std::string key) { - auto value = - PersistentCache::inst().get(category, {key.data(), key.size()}); + + virtual bool support_dump_cache() { return get_impl()->support_dump_cache(); } + + py::object py_get(std::string category, std::string key) { + auto value = get_impl()->get(category, {key.data(), key.size()}); if (value.valid()) { return py::bytes(std::string((const char*)value->ptr, value->size)); } else { return py::none(); } } + + void py_put(std::string category, std::string key, std::string value) { + get_impl()->put( + category, {key.data(), 
key.size()}, {value.data(), value.size()}); + } + + void flush() { + if (impl) { + impl->flush(); + } + } }; - py::class_(m, "PersistentCacheManager") - .def(py::init<>()) - .def("try_open_redis", &PersistentCacheManager::open_redis) - .def("try_open_file", &PersistentCacheManager::open_file) - .def("clean", &PersistentCacheManager::clean) - .def("put", &PersistentCacheManager::put) - .def("get", &PersistentCacheManager::get); + auto PyConfigurablePersistentCache = + py::class_< + ConfigurablePersistentCache, + std::shared_ptr>(m, "PersistentCache") + .def(py::init<>()) + .def("add_config", &ConfigurablePersistentCache::add_config) + .def("reg", + [](std::shared_ptr inst) { + PersistentCache::set_impl(inst); + }) + .def("clean", &ConfigurablePersistentCache::clean) + .def("get", &ConfigurablePersistentCache::py_get) + .def("put", &ConfigurablePersistentCache::py_put) + .def_readonly("config", &ConfigurablePersistentCache::impl_config) + .def("flush", &ConfigurablePersistentCache::flush); + + py::class_( + PyConfigurablePersistentCache, "Config") + .def_readwrite("type", &ConfigurablePersistentCache::Config::type) + .def_readwrite("args", &ConfigurablePersistentCache::Config::args) + .def_readwrite("on_fail", &ConfigurablePersistentCache::Config::on_fail) + .def_readwrite( + "on_success", &ConfigurablePersistentCache::Config::on_success); } diff --git a/imperative/python/test/unit/core/custom_opsrc/elem_add.cpp b/imperative/python/test/unit/core/custom_opsrc/elem_add.cpp new file mode 100644 index 00000000..d8f0299d --- /dev/null +++ b/imperative/python/test/unit/core/custom_opsrc/elem_add.cpp @@ -0,0 +1,140 @@ +/** + * \file imperative/python/test/unit/core/custom_opsrc/elem_add.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
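For orientation before the test sources, a rough Python-side sketch of the `PersistentCache` binding defined above and its ordered fallback chain; the redis address is a placeholder and is expected to fail over to the in-memory backend when no server is reachable:

    from megengine.core._imperative_rt import PersistentCache

    pc = PersistentCache()
    # Configs are tried in order; the first backend that opens successfully is kept.
    pc.add_config(
        "redis",
        {"hostname": "127.0.0.1", "port": "6379", "prefix": "mgbcache:demo"},
        "fastrun use redis cache",
        "failed to connect to cache server",
    )
    pc.add_config(
        "in-memory", {}, "fastrun use in-memory cache", "failed to create in-memory cache"
    )
    pc.put("category", "key", "value")   # lazily resolves the first usable backend
    print(pc.get("category", "key"))     # bytes value, or None when missing
    print(pc.config.type)                # e.g. "in-memory" when redis is unavailable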
+ */ + +#include "megbrain/custom/custom.h" + +CUSTOM_OP_REG_BEGIN(ElemAddSmooth) + +void forward_device_infer( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + outputs[0] = inputs[0]; +} + +void forward_shape_infer( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + outputs[0] = inputs[0]; +} + +void forward_dtype_infer( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + outputs[0] = inputs[0]; +} + +void forward_format_infer( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + outputs[0] = inputs[0]; +} + +template +void forward_kernel( + const scalar_t* input0, const scalar_t* input1, scalar_t* output, size_t len, + float smooth) { + for (size_t i = 0; i < len; ++i) { + output[i] = input0[i] + input1[i]; + if (output[i] < 0) + output[i] += smooth; + else + output[i] -= smooth; + } +} + +void forward_compute( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + DISPATCH_SIGN_INT_AND_FLOAT_TYPES( + outputs[0].dtype(), "forward_compute", ([&]() { + forward_kernel( + inputs[0].data(), inputs[1].data(), + outputs[0].data(), outputs[0].size(), + params["smooth"].as()); + })); +} + +CUSTOM_OP_REG(ElemAddSmoothForward) + .set_description( + "Custom ElemAdd Operator With a Smooth Parameter, " + "which is used to verify the CPU kernel") + .add_input("lhs") + .add_input("rhs") + .add_output("output") + .add_param("smooth", 0.f) + .set_device_infer(forward_device_infer) + .set_shape_infer(forward_shape_infer) + .set_dtype_infer(forward_dtype_infer) + .set_format_infer(forward_format_infer) + .set_compute(forward_compute); + +void backward_device_infer( + const std::vector& ograds, const Param& params, + std::vector& igrads) { + igrads[0] = ograds[0]; + igrads[1] = ograds[0]; +} + +void backward_shape_infer( + const std::vector& ograds, const Param& params, + std::vector& igrads) { + igrads[0] = ograds[0]; + igrads[1] = ograds[0]; +} + +void backward_dtype_infer( + const std::vector& ograds, const Param& params, + std::vector& igrads) { + igrads[0] = ograds[0]; + igrads[1] = ograds[0]; +} + +void backward_format_infer( + const std::vector& ograds, const Param& params, + std::vector& igrads) { + igrads[0] = ograds[0]; + igrads[1] = ograds[0]; +} + +template +void backward_kernel( + const scalar_t* ograd, scalar_t* igrad0, scalar_t* igrad1, size_t len) { + for (size_t i = 0; i < len; ++i) { + igrad0[i] = ograd[i]; + igrad1[i] = ograd[i]; + } +} + +void backward_compute( + const std::vector& ograds, const Param& params, + std::vector& igrads) { + DISPATCH_SIGN_INT_AND_FLOAT_TYPES( + igrads[0].dtype(), "backward_compute", ([&]() { + backward_kernel( + ograds[0].data(), igrads[0].data(), + igrads[1].data(), igrads[0].size()); + })); +} + +CUSTOM_OP_REG(ElemAddSmoothBackward) + .set_description( + "Custom ElemAdd Operator With a Smooth Parameter, " + "which is used to verify the CPU kernel") + .add_input("ograd") + .add_output("igrad_lhs") + .add_output("igrad_rhs") + .set_device_infer(backward_device_infer) + .set_shape_infer(backward_shape_infer) + .set_dtype_infer(backward_dtype_infer) + .set_format_infer(backward_format_infer) + .set_compute(backward_compute); + +CUSTOM_OP_REG_END(ElemAddSmooth) diff --git a/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp new file mode 100644 index 00000000..31998dd9 --- /dev/null +++ b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp @@ -0,0 
+1,65 @@ +/** + * \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "./matmul_scale.h" +#include "megbrain/custom/custom.h" + +CUSTOM_OP_REG_BEGIN(MatMulScale) + +void forward_shape_infer( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + outputs[0] = {inputs[0][0], inputs[1][1]}; +} + +void forward_compute( + const std::vector& inputs, const Param& params, + std::vector& outputs) { + matmul_forward_helper( + inputs[0], inputs[1], outputs[0], inputs[0].shape()[0], + inputs[0].shape()[1], inputs[1].shape()[1], params["scale"].as()); +} + +CUSTOM_OP_REG(MatMulScaleForward) + .add_inputs(2) + .add_outputs(1) + .add_param("scale", 1.0f) + .set_shape_infer(forward_shape_infer) + .set_compute("cuda", forward_compute); + +void backward_shape_infer( + const std::vector& ograd_and_inputs, const Param& params, + std::vector& outputs) { + outputs[0] = ograd_and_inputs[1]; + outputs[1] = ograd_and_inputs[2]; +} + +void backward_compute( + const std::vector& ograd_and_inputs, const Param& params, + std::vector& igrads) { + matmul_backward_lhs_helper( + ograd_and_inputs[2], ograd_and_inputs[0], igrads[0], + ograd_and_inputs[1].shape()[0], ograd_and_inputs[1].shape()[1], + ograd_and_inputs[2].shape()[1], params["scale"].as()); + matmul_backward_rhs_helper( + ograd_and_inputs[1], ograd_and_inputs[0], igrads[1], + ograd_and_inputs[1].shape()[0], ograd_and_inputs[1].shape()[1], + ograd_and_inputs[2].shape()[1], params["scale"].as()); +} + +CUSTOM_OP_REG(MatMulScaleBackward) + .add_inputs(3) + .add_outputs(2) + .add_param("scale", 1.0f) + .set_shape_infer(backward_shape_infer) + .set_compute("cuda", backward_compute); + +CUSTOM_OP_REG_END(MatMulScale) diff --git a/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu new file mode 100644 index 00000000..9d847d32 --- /dev/null +++ b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu @@ -0,0 +1,97 @@ +/** + * \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
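This patch registers MatMulScale and ElemAddSmooth but does not show a call site; for what one might look like from Python, the sketch below uses the ElemAddSmooth op added earlier. The calling convention here (constructing the op from the custom namespace with its params as keyword arguments and dispatching through apply) is an assumption, not something this patch demonstrates:

    import numpy as np
    from megengine.core._imperative_rt.core2 import apply
    from megengine.core.ops import custom
    from megengine.tensor import Tensor

    x = Tensor(np.random.randn(4, 8).astype("float32"))
    y = Tensor(np.random.randn(4, 8).astype("float32"))
    op = custom.ElemAddSmoothForward(smooth=0.25)       # param name from elem_add.cpp
    (out,) = apply(op, x, y)                            # assumed dispatch path

    ref = x.numpy() + y.numpy()
    ref = np.where(ref < 0, ref + 0.25, ref - 0.25)     # mirrors forward_kernel
    np.testing.assert_allclose(out.numpy(), ref, rtol=1e-5, atol=1e-5)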
+ */ + +#include +#include +#include +#include "./matmul_scale.h" + +using namespace custom; + +// matmul_forward for Mat_mxk * Mat_k*n +template +__global__ void matmul_forward_naive( + const T* lhs, const T* rhs, T* res, size_t M, size_t K, size_t N, float scale) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + + T acc = 0; + for (int i = 0; i < K; ++i) + acc += lhs[row * K + i] * rhs[i * N + col]; + res[row * N + col] = acc * scale; +} + +// matmul_backward_lhs for Mat_mxk * Mat_k*n = Mat_mxn +// that is Mat_mxn * Mat_nxk +template +__global__ void matmul_backward_lhs_naive( + const T* rhs, const T* ograd, T* lhs_grad, size_t M, size_t K, size_t N, + float scale) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + T acc = 0; + for (int i = 0; i < N; ++i) + acc += ograd[row * N + i] * rhs[col * N + i]; + lhs_grad[row * K + col] = acc / scale; +} + +// matmul_backward_rhs for Mat_mxk * Mat_k*n = Mat_mxn +// that is Mat_kxm * Mat_mxn +template +__global__ void matmul_backward_rhs_naive( + const T* lhs, const T* ograd, T* rhs_grad, size_t M, size_t K, size_t N, + float scale) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + T acc = 0; + for (int i = 0; i < M; ++i) + acc += lhs[i * K + row] * ograd[i * N + col]; + rhs_grad[row * N + col] = acc / scale; +} + +void matmul_forward_helper( + const Tensor& lhs, const Tensor& rhs, Tensor& res, size_t M, size_t K, size_t N, + float scale) { + dim3 block(1, 1); + dim3 grid(N / block.x, M / block.y); + + DISPATCH_INT_AND_FLOAT_TYPES(res.dtype(), "matmul_forward", ([&]() { + matmul_forward_naive<<>>( + lhs.data(), rhs.data(), + res.data(), M, K, N, scale); + })); +} + +void matmul_backward_lhs_helper( + const Tensor& rhs, const Tensor& ograd, Tensor& lhs_grad, size_t M, size_t K, + size_t N, float scale) { + dim3 block(1, 1); + dim3 grid(K / block.x, M / block.y); + DISPATCH_INT_AND_FLOAT_TYPES( + lhs_grad.dtype(), "matmul_backward_lhs", ([&]() { + matmul_backward_lhs_naive<<>>( + rhs.data(), ograd.data(), + lhs_grad.data(), M, K, N, scale); + })); +} + +void matmul_backward_rhs_helper( + const Tensor& lhs, const Tensor& ograd, Tensor& rhs_grad, size_t M, size_t K, + size_t N, float scale) { + dim3 block(1, 1); + dim3 grid(N / block.x, K / block.y); + DISPATCH_INT_AND_FLOAT_TYPES( + rhs_grad.dtype(), "matmul_backward_rhs", ([&]() { + matmul_backward_rhs_naive<<>>( + lhs.data(), ograd.data(), + rhs_grad.data(), M, K, N, scale); + })); +} diff --git a/imperative/python/test/unit/core/custom_opsrc/matmul_scale.h b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.h new file mode 100644 index 00000000..5f7ea8d0 --- /dev/null +++ b/imperative/python/test/unit/core/custom_opsrc/matmul_scale.h @@ -0,0 +1,24 @@ +/** + * \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
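As a CPU oracle for the three CUDA kernels above (each launched with one thread per output element: block(1, 1) and a grid covering the output), a NumPy transcription of their math; the function names are illustrative:

    import numpy as np

    def matmul_scale_forward_ref(lhs, rhs, scale):
        # matmul_forward_naive: res = scale * (lhs @ rhs)
        return (lhs @ rhs) * scale

    def matmul_scale_backward_lhs_ref(rhs, ograd, scale):
        # matmul_backward_lhs_naive: lhs_grad = (ograd @ rhs^T) / scale
        return (ograd @ rhs.T) / scale

    def matmul_scale_backward_rhs_ref(lhs, ograd, scale):
        # matmul_backward_rhs_naive: rhs_grad = (lhs^T @ ograd) / scale
        return (lhs.T @ ograd) / scale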
+ */ + +#include "megbrain/custom/custom.h" + +using Tensor = custom::Tensor; + +void matmul_forward_helper( + const Tensor& lhs, const Tensor& rhs, Tensor& res, size_t M, size_t K, size_t N, + float scale); +void matmul_backward_lhs_helper( + const Tensor& rhs, const Tensor& ograd, Tensor& lhs_grad, size_t M, size_t K, + size_t N, float scale); +void matmul_backward_rhs_helper( + const Tensor& lhs, const Tensor& ograd, Tensor& rhs_grad, size_t M, size_t K, + size_t N, float scale); diff --git a/imperative/python/test/unit/core/test_custom_op.py b/imperative/python/test/unit/core/test_custom_op.py new file mode 100644 index 00000000..e2a9e4b2 --- /dev/null +++ b/imperative/python/test/unit/core/test_custom_op.py @@ -0,0 +1,111 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import platform +import shutil +import sys + +import numpy as np +import pytest + +import megengine +import megengine.functional as F +import megengine.optimizer as optim +from megengine import jit +from megengine.autodiff import Function, GradManager +from megengine.core._imperative_rt.core2 import apply +from megengine.core.ops import custom +from megengine.device import get_device_count +from megengine.module import Conv2d, Linear, Module +from megengine.random import normal +from megengine.tensor import Parameter, Tensor +from megengine.utils import custom_op_tools + + +def compare(ref, real): + if ref.shape != real.shape: + real = real.T + np.testing.assert_allclose(ref, real, rtol=1e-3, atol=1e-5) + + +def build_and_clean(test_func): + def wrapper(): + cur_dir_path = os.path.dirname(os.path.abspath(__file__)) + build_path = os.path.join(cur_dir_path, "custom_opsrc", "build") + mgb_root_path = os.path.dirname( + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(cur_dir_path))) + ) + ) + extra_include_paths = [os.path.join(mgb_root_path, "src", "custom", "include")] + extra_ld_flags = [] + + if sys.platform != "win32": + ld_path = os.environ.get("LD_LIBRARY_PATH") + if ld_path != None: + ld_dirs = ld_path.split(":") + for ld_dir in ld_dirs: + if os.path.exists(ld_dir) and os.path.isdir(ld_dir): + for lib in os.listdir(ld_dir): + if "megengine_shared" in lib: + extra_ld_flags += [ + "-L{} -Wl,-rpath,{}".format(ld_dir, ld_dir) + ] + break + + if get_device_count("gpu") > 0: + custom_opsrc = [ + os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cpp"), + os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cu"), + ] + else: + custom_opsrc = [os.path.join(cur_dir_path, "custom_opsrc", "elem_add.cpp")] + + lib_path = custom_op_tools.build_and_load( + "test_op", + custom_opsrc, + extra_include_paths=extra_include_paths, + extra_ldflags=extra_ld_flags, + build_dir=build_path, + verbose=False, + abi_tag=custom.get_custom_op_abi_tag(), + ) + test_func() + + custom.unload(lib_path) + if os.path.exists(build_path): + shutil.rmtree(build_path) + + return wrapper + + +@pytest.mark.skipif( + get_device_count("gpu") > 0, reason="elem_add operator is only supported on CPU" +) +@build_and_clean +def test_custom_op_cpu_build(): + assert "ElemAddSmoothForward" in custom._get_custom_op_list() + assert "ElemAddSmoothBackward" in custom._get_custom_op_list() + assert hasattr(custom, 
"ElemAddSmoothForward") + assert hasattr(custom, "ElemAddSmoothBackward") + + +@pytest.mark.skipif( + platform.system() == "Darwin", + reason="GPU kernel is only support on Linux and Windows", +) +@pytest.mark.skipif( + get_device_count("gpu") < 1, reason="matmul scale operator is only supported on GPU" +) +@build_and_clean +def test_custom_op_gpu_build(): + assert "MatMulScaleForward" in custom._get_custom_op_list() + assert "MatMulScaleBackward" in custom._get_custom_op_list() + assert hasattr(custom, "MatMulScaleForward") + assert hasattr(custom, "MatMulScaleBackward") diff --git a/imperative/python/test/unit/core/test_interpreter.py b/imperative/python/test/unit/core/test_interpreter.py index 3bbb5caf..2513c36b 100644 --- a/imperative/python/test/unit/core/test_interpreter.py +++ b/imperative/python/test/unit/core/test_interpreter.py @@ -96,6 +96,15 @@ def test_regression_2870(): (x + x).numpy() +@pytest.mark.require_ngpu(1) +def test_async_error_check(): + src = mge.tensor([[1.0, 2.0]]) + index = mge.tensor([3]) + val = F.indexing_one_hot(src, index) + with pytest.raises(RuntimeError): + val.numpy() + + # NOTE: DO NOT REMOVE THIS TEST # This is also a compatibility test for # mge.core.set_option('async_level', 0). diff --git a/imperative/python/test/unit/functional/test_functional.py b/imperative/python/test/unit/functional/test_functional.py index 1a82b30d..3176c057 100644 --- a/imperative/python/test/unit/functional/test_functional.py +++ b/imperative/python/test/unit/functional/test_functional.py @@ -59,14 +59,47 @@ def test_where(): def test_dropout(): - # test training mode - data = tensor(np.ones(10000000, dtype=np.float32)) - out = F.nn.dropout(data, 1.0 / 3.0, training=True) - assert not out.numpy().all() - - # test eval mode - out = F.nn.dropout(data, 1.0 / 3.0, training=False) - assert out.numpy().all() + from megengine.autodiff import GradManager + from megengine.core._imperative_rt.ops import set_global_rng_seed + + def test_dropout_with_shape(shape, rate): + data = tensor(np.ones(shape, dtype=np.float32)) + gm = GradManager().attach([data]) + with gm: + out = F.nn.dropout(data, rate, training=True) + gm.backward(out, tensor(np.ones(shape, dtype=np.float32))) + assert not out.numpy().all() + np.testing.assert_allclose(out.numpy(), data.grad.numpy(), 1e-7, 1e-7) + + def test_multiple_dropout(shape, rate): + data = tensor(np.ones(shape, dtype=np.float32)) + gm = GradManager().attach([data]) + with gm: + out1 = F.nn.dropout(data, rate, training=True) + out2 = F.nn.dropout(out1, rate, training=True) + out3 = F.nn.dropout(out2, rate, training=True) + gm.backward(out3, tensor(np.ones(shape, dtype=np.float32))) + np.testing.assert_allclose(out3.numpy(), data.grad.numpy(), 1e-7, 1e-7) + + def test_dropout_seed(shape, rate): + data = tensor(np.random.randn(*shape), dtype="float32") + set_global_rng_seed(111) + out1 = F.nn.dropout(data, rate, training=True) + out2 = F.nn.dropout(data, rate, training=True) + assert not (out1.numpy() == out2.numpy()).all() + + set_global_rng_seed(111) + out3 = F.nn.dropout(data, rate, training=True) + assert (out1.numpy() == out3.numpy()).all() + + set_global_rng_seed(222) + out4 = F.nn.dropout(data, rate, training=True) + assert not (out1.numpy() == out4.numpy()).all() + + test_dropout_with_shape([13, 17, 63, 21], 0.4) + test_dropout_with_shape([16, 32, 64], 0.3) + test_multiple_dropout([1024], 0.2) + test_dropout_seed([16, 32], 0.2) def test_matinv(): @@ -865,61 +898,6 @@ def test_conv1d(): ) -def test_layer_norm(): - def _layer_norm(x, 
normalized_shape, affine, weight=None, bias=None, eps=1e-5): - __layer_norm = LayerNorm(normalized_shape=normalized_shape, affine=affine) - __layer_norm.weight = weight - __layer_norm.bias = bias - return __layer_norm(x) - - def _layer_norm_numpy( - x, normalized_shape, affine, weight=None, bias=None, eps=1e-5 - ): - x_shape = x.shape - dim_delta = len(x_shape) - len(normalized_shape) - non_flatten_shape = x_shape[:dim_delta] - x = x.reshape(*non_flatten_shape, -1) - - mean = x.mean(axis=-1, keepdims=True) - var = (x ** 2).mean(axis=-1, keepdims=True) - mean * mean - - x = (x - mean) / F.sqrt(var + eps) - x = x.reshape(x_shape) - if affine: - x = weight * x + bias - - return x - - normalized_shape = (28, 28) - inp_feat = Tensor(np.random.randn(32, 64, 28, 28), dtype="float32") - weight = Tensor(np.random.randn(28, 28), dtype="float32") - bias = Tensor(np.random.randn(28, 28), dtype="float32") - - inp_feat = inp_feat + 1 - weight = weight + 1 - bias = bias - - affine = False - - outvar = F.nn.layer_norm(inp_feat, normalized_shape, affine, weight, bias) - targetvar = _layer_norm_numpy(inp_feat, normalized_shape, affine, weight, bias) - - assert abs(outvar - targetvar).mean() < 1e-7 - - # no random, affine True - normalized_shape = (28, 28) - inp_feat = Tensor(np.ones((32, 64, 28, 28)), dtype="float32") - weight = Tensor(np.ones((28, 28)), dtype="float32") - bias = Tensor(np.zeros((28, 28)), dtype="float32") - - affine = True - - outvar = F.nn.layer_norm(inp_feat, normalized_shape, affine, weight, bias) - targetvar = _layer_norm(inp_feat, normalized_shape, affine, weight, bias) - assert abs((outvar - targetvar).mean()) < 1e-7 - assert abs(outvar.mean()) < 1e-7 - - def test_batchnorm2d_autocast(): """check amp's result is equal to manually converted result""" amp.enabled = True diff --git a/imperative/python/test/unit/functional/test_loss.py b/imperative/python/test/unit/functional/test_loss.py index d46f40b6..abf4b2fe 100644 --- a/imperative/python/test/unit/functional/test_loss.py +++ b/imperative/python/test/unit/functional/test_loss.py @@ -43,7 +43,7 @@ def test_cross_entropy(): x = softmax(x) l_ref = ref(x, y) l = F.nn.cross_entropy(tensor(x, "float32"), tensor(y, "int32"), with_logits=False) - np.testing.assert_allclose(l.numpy(), l_ref) + np.testing.assert_allclose(l.numpy(), l_ref, 1e-6, 1e-6) def test_cross_entropy_reduction(): diff --git a/imperative/python/test/unit/random/test_rng.py b/imperative/python/test/unit/random/test_rng.py index a33a5840..1083e947 100644 --- a/imperative/python/test/unit/random/test_rng.py +++ b/imperative/python/test/unit/random/test_rng.py @@ -226,7 +226,7 @@ def test_UniformRNG(): out2 = m2.uniform(size=(100,)) out3 = m3.uniform(size=(100,)) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() assert not (out1.numpy() == out1_.numpy()).all() @@ -254,7 +254,7 @@ def test_NormalRNG(): out2 = m2.normal(size=(100,)) out3 = m3.normal(size=(100,)) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() assert not (out1.numpy() == out1_.numpy()).all() @@ -283,7 +283,7 @@ def test_GammaRNG(): out2 = m2.gamma(2, size=(100,)) out3 = m3.gamma(2, size=(100,)) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + 
np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() assert not (out1.numpy() == out1_.numpy()).all() @@ -316,7 +316,7 @@ def test_BetaRNG(): out2 = m2.beta(2, 1, size=(100,)) out3 = m3.beta(2, 1, size=(100,)) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() assert not (out1.numpy() == out1_.numpy()).all() @@ -351,7 +351,7 @@ def test_PoissonRNG(): out2 = m2.poisson(lam.to("xpu1"), size=(100,)) out3 = m3.poisson(lam.to("xpu0"), size=(100,)) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() @@ -381,7 +381,7 @@ def test_PermutationRNG(symbolic): out2 = m2.permutation(1000) out3 = m3.permutation(1000) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() assert not (out1.numpy() == out1_.numpy()).all() @@ -443,7 +443,7 @@ def test_ShuffleRNG(): m2.shuffle(out2) m3.shuffle(out3) - np.testing.assert_equal(out1.numpy(), out2.numpy()) + np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6) assert out1.device == "xpu0" and out2.device == "xpu1" assert not (out1.numpy() == out3.numpy()).all() @@ -465,7 +465,7 @@ def test_seed(): set_global_seed(10) out3 = uniform(size=[10, 10]) - np.testing.assert_equal(out1.numpy(), out3.numpy()) + np.testing.assert_allclose(out1.numpy(), out3.numpy(), atol=1e-6) set_global_seed(11) out4 = uniform(size=[10, 10]) diff --git a/imperative/python/test/unit/traced_module/test_modification.py b/imperative/python/test/unit/traced_module/test_modification.py index 1a9c99f9..036924a7 100644 --- a/imperative/python/test/unit/traced_module/test_modification.py +++ b/imperative/python/test/unit/traced_module/test_modification.py @@ -377,6 +377,33 @@ def test_set_node_name(): rename("output") np.testing.assert_equal(str(graph.outputs[0]), "output") + def add_1(x): + x = x + 1 + x.name = "func_add_1" + return x + + class ModuleAdd_3(M.Module): + def forward(self, x): + x = x + 1 + x.name = "module_add_1" + x = x + 2 + return x + + setattr(traced_module, "add_3", ModuleAdd_3()) + + self = graph.inputs[0] + with graph.insert_exprs(): + x = output_node + 1 + x.name = "_add_1" + x = add_1(x) + x = self.add_3(x) + graph.replace_node({output_node: x}) + graph.compile() + + assert "_add_1" in graph._namespace.used_names + assert "func_add_1" in graph._namespace.used_names + assert "module_add_1" in traced_module.add_3.graph._namespace.used_names + def test_set_graph_name(): traced_module, x, expect = _init_module() diff --git a/imperative/python/test/unit/traced_module/test_qat_module.py b/imperative/python/test/unit/traced_module/test_qat_module.py index 6ef8764b..57a94693 100644 --- a/imperative/python/test/unit/traced_module/test_qat_module.py +++ b/imperative/python/test/unit/traced_module/test_qat_module.py @@ -109,6 +109,7 @@ def build_observered_net(net: M.Module, observer_cls): ) Q.enable_observer(qat_net) inp = Tensor(np.random.random(size=(5, 3, 32, 32))) + qat_net.eval() qat_net(inp) Q.disable_observer(qat_net) return qat_net @@ -116,6 
+117,7 @@ def build_observered_net(net: M.Module, observer_cls): def build_fakequanted_net(net: QATModule, fakequant_cls): qat_net = Q.reset_qconfig(net, get_lsq_config(fakequant_cls)) + qat_net.eval() return qat_net @@ -162,6 +164,7 @@ def test_load_param(): def _check_module(build_func: Callable): net = build_func() + net.eval() buffer = io.BytesIO() mge.save(net.state_dict(), buffer) buffer.seek(0) @@ -185,6 +188,7 @@ def test_load_param(): def test_qualname(): def _check_qualname(net): inp = Tensor(np.random.random(size=(5, 3, 32, 32))) + net.eval() traced_net = trace_module(net, inp) base_qualname = traced_net.graph.qualname for node in traced_net.graph.nodes(): diff --git a/imperative/python/test/unit/traced_module/test_trace_module.py b/imperative/python/test/unit/traced_module/test_trace_module.py index e4441c49..d3baf153 100644 --- a/imperative/python/test/unit/traced_module/test_trace_module.py +++ b/imperative/python/test/unit/traced_module/test_trace_module.py @@ -6,7 +6,7 @@ import megengine.functional as F import megengine.module as M from megengine import Tensor from megengine.module.module import Module -from megengine.traced_module import TracedModule, trace_module +from megengine.traced_module import TracedModule, enable_expr_checker, trace_module from megengine.traced_module.expr import CallFunction @@ -58,7 +58,7 @@ class MyModule4(M.Module): def test_trace_module(): - + enable_expr_checker() x = Tensor(1) m1 = MyModule1() tm1 = trace_module(m1, x) diff --git a/imperative/python/test/unit/utils/test_utils.py b/imperative/python/test/unit/utils/test_utils.py index 0ca81072..f32a3d93 100644 --- a/imperative/python/test/unit/utils/test_utils.py +++ b/imperative/python/test/unit/utils/test_utils.py @@ -1,15 +1,69 @@ +import os +import platform + import pytest -from megengine.utils.persistent_cache import _manager +from megengine.utils.persistent_cache import PersistentCacheOnServer + + +@pytest.mark.parametrize("with_flag", [True, False]) +@pytest.mark.skipif( + platform.system() not in {"Linux", "Darwin"}, + reason="redislite not implemented in windows", +) +def test_persistent_cache_redis(monkeypatch, with_flag): + import redislite + + server = redislite.Redis() + monkeypatch.delenv("MGE_FASTRUN_CACHE_TYPE", raising=False) + monkeypatch.setenv( + "MGE_FASTRUN_CACHE_URL", "redis+socket://{}".format(server.socket_file) + ) + if with_flag: + server.set("mgb-cache-flag", 1) + pc = PersistentCacheOnServer() + pc.put("test", "hello", "world") + if with_flag: + pc = PersistentCacheOnServer() + assert pc.get("test", "hello") == b"world" + assert pc.config.type == "redis" + else: + assert pc.config.type == "in-file" + + +def test_persistent_cache_file(monkeypatch, tmp_path): + monkeypatch.setenv("MGE_FASTRUN_CACHE_TYPE", "FILE") + monkeypatch.setenv("MGE_FASTRUN_CACHE_DIR", tmp_path) + pc = PersistentCacheOnServer() + pc.put("test", "store", "this") + assert pc.config.type == "in-file" + del pc + pc = PersistentCacheOnServer() + assert pc.get("test", "store") == b"this" + + +def test_persistent_cache_file_clear(monkeypatch, tmp_path): + monkeypatch.setenv("MGE_FASTRUN_CACHE_TYPE", "FILE") + monkeypatch.setenv("MGE_FASTRUN_CACHE_DIR", tmp_path) + pc = PersistentCacheOnServer() + pc_dummy = PersistentCacheOnServer() + pc.put("test", "drop", "this") + assert pc.config.type == "in-file" + del pc + # this dummy instance shouldn't override cache file + del pc_dummy + os.unlink(os.path.join(tmp_path, "cache.bin")) + pc = PersistentCacheOnServer() + assert pc.get("test", "drop") is None 
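The new tests above drive the cache purely through environment variables; the same knobs apply outside the test suite. A minimal sketch (the directory and URLs are placeholders):

    import os
    from megengine.utils.persistent_cache import PersistentCacheOnServer

    # Select the backend before the cache object is constructed.
    os.environ["MGE_FASTRUN_CACHE_TYPE"] = "FILE"           # or "MEMORY"
    os.environ["MGE_FASTRUN_CACHE_DIR"] = "/tmp/mge-cache"  # placeholder directory
    # Redis backends are selected via a URL instead:
    #   os.environ["MGE_FASTRUN_CACHE_URL"] = "redis://127.0.0.1:6379"
    #   os.environ["MGE_FASTRUN_CACHE_URL"] = "redis+socket:///var/run/redis.sock"

    pc = PersistentCacheOnServer()
    pc.put("test", "hello", "world")
    assert pc.get("test", "hello") == b"world"
    print(pc.config.type)   # "in-file" for the FILE backend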
-def test_persistent_cache(): - pc = _manager - k0 = b"\x00\x00" - k1 = b"\x00\x01" - cat = "test" - pc.put(cat, k0, k1) - pc.put(cat, k1, k0) - assert k1 == pc.get(cat, k0) - assert k0 == pc.get(cat, k1) - assert pc.get("test1", k0) == None +def test_persistent_cache_memory(monkeypatch): + monkeypatch.setenv("MGE_FASTRUN_CACHE_TYPE", "MEMORY") + pc = PersistentCacheOnServer() + assert pc.config is None + pc.put("test", "drop", "this") + assert pc.config.type == "in-memory" + assert pc.get("test", "drop") == b"this" + del pc + pc = PersistentCacheOnServer() + assert pc.get("test", "drop") is None diff --git a/imperative/src/impl/interpreter/interpreter_impl.cpp b/imperative/src/impl/interpreter/interpreter_impl.cpp index bf122982..ecf63e65 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.cpp +++ b/imperative/src/impl/interpreter/interpreter_impl.cpp @@ -156,6 +156,8 @@ TensorInfo* ChannelImpl::put_impl(const HostTensorND& value, bool no_cache) { if (m_async_level == 0) { sync_impl(); info->desc.comp_node.sync(); + auto err = info->desc.comp_node.check_async_error(); + mgb_assert(!err, "%s", err->what()); } return info; } @@ -336,6 +338,8 @@ void ChannelImpl::dispatch_kernel( for (auto&& oup : *outputs) { auto info = reinterpret_cast(oup); info->ptr->comp_node().sync(); + auto err = info->ptr->comp_node().check_async_error(); + mgb_assert(!err, "%s", err->what()); } } } @@ -931,7 +935,8 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) { MGB_RECORD_EVENT(TensorWaitPropEvent, info->id, m_waitee_id, prop); bool require_host = prop == TensorProp::HostValue; auto host_available = [&] { return info->ptr && info->ptr->value_fetched(); }; - if (require_host && !host_available()) { + bool wait_host = !host_available(); + if (require_host && wait_host) { // avoid dead lock lock.unlock(); m_buffer.enqueue(GetValue{info}); @@ -944,6 +949,10 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) { }); MGB_RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop); m_waitee = nullptr; + if (require_host && wait_host) { + auto err = info->ptr->comp_node().check_async_error(); + mgb_assert(!err, "%s", err->what()); + } return info->ptr; } diff --git a/imperative/src/impl/ops/collective_comm.cpp b/imperative/src/impl/ops/collective_comm.cpp index 6c969a22..141f7f62 100644 --- a/imperative/src/impl/ops/collective_comm.cpp +++ b/imperative/src/impl/ops/collective_comm.cpp @@ -27,7 +27,7 @@ namespace imperative { namespace { cg::OperatorNodeBase* apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { auto&& comm = def.cast_final_safe(); - auto group_client = std::make_shared( + auto group_client = std::make_shared( ssprintf("%s:%d", comm.addr.data(), comm.port)); SmallVector> dev_buffer_arr(1, nullptr); auto disable = std::make_shared(); diff --git a/imperative/src/impl/ops/elemwise.cpp b/imperative/src/impl/ops/elemwise.cpp index 232e85ef..809b53ca 100644 --- a/imperative/src/impl/ops/elemwise.cpp +++ b/imperative/src/impl/ops/elemwise.cpp @@ -158,70 +158,71 @@ SmallVector apply_on_physical_tensor( MGB_DEFINE_OPR_CLASS( ForceInplaceElemwise, - cg::SingleCNOperatorNodeBaseT) //{ + cg::SingleCNOperatorNodeBaseT) // { public: -struct Param { - using Mode = megdnn::Elemwise::Param::Mode; - Mode mode; - size_t inplace_index; -}; -using Mode = Param::Mode; -ForceInplaceElemwise( - const VarNodeArray& inputs, Param param, OperatorNodeConfig config = {}) - : Super(inputs[0]->owner_graph(), config, "device_add_update", inputs), - 
m_param{param} { - for (auto* input : inputs) { - add_input({input}); + struct Param { + using Mode = megdnn::Elemwise::Param::Mode; + Mode mode; + size_t inplace_index; + }; + using Mode = Param::Mode; + ForceInplaceElemwise( + const VarNodeArray& inputs, Param param, OperatorNodeConfig config = {}) + : Super(inputs[0]->owner_graph(), config, "device_add_update", inputs), + m_param{param} { + for (auto* input : inputs) { + add_input({input}); + } + add_output(None) + ->set_fwd_in2out_writable_force(input(param.inplace_index)) + .add_flag(VarNode::Flag::NO_MEM_RECLAIM); } - add_output(None) - ->set_fwd_in2out_writable_force(input(param.inplace_index)) - .add_flag(VarNode::Flag::NO_MEM_RECLAIM); -} -static SymbolVar make(const VarNodeArray& inputs, Param param) { - return SymbolVar{inputs[0]}.insert_single_output_opr( - inputs, param); -} -static cg::OperatorNodeBase* shallow_copy( - const serialization::OprShallowCopyContext& ctx, - const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs, - const OperatorNodeConfig& config); + static SymbolVar make(const VarNodeArray& inputs, Param param) { + return SymbolVar{inputs[0]}.insert_single_output_opr( + inputs, param); + } + static cg::OperatorNodeBase* shallow_copy( + const serialization::OprShallowCopyContext& ctx, + const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs, + const OperatorNodeConfig& config); protected: -NodeProp* do_make_node_prop() const override { - auto ret = Super::do_make_node_prop(); - ret->add_flag(NodeProp::Flag::FORCE_UPDATE_INPUT_VAR); - return ret; -} -void create_megdnn_opr() override { - auto opr = DnnOprCaller::create_operator(comp_node()); - opr->param().mode = m_param.mode; - set_megdnn_opr(std::move(opr)); -} -void scn_do_execute() override { - auto to_dnnnd = [&](auto* var) { return var->dev_tensor().as_megdnn(); }; - megdnn::TensorNDArray inputs_dnnnd; - for (auto* input : input()) { - inputs_dnnnd.push_back(to_dnnnd(input)); + NodeProp* do_make_node_prop() const override { + auto ret = Super::do_make_node_prop(); + ret->add_flag(NodeProp::Flag::FORCE_UPDATE_INPUT_VAR); + return ret; } - mgb_assert( - input(m_param.inplace_index)->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC), - "ForceInplaceElemwise cannot be applied in internal tensor"); - auto* out_dest = output(0); - auto* opr = static_cast(megdnn_opr()); - opr->exec(std::move(inputs_dnnnd), to_dnnnd(out_dest)); -} -void init_output_static_infer_desc() override { - using namespace cg::static_infer; + void create_megdnn_opr() override { + auto opr = DnnOprCaller::create_operator(comp_node()); + opr->param().mode = m_param.mode; + set_megdnn_opr(std::move(opr)); + } + void scn_do_execute() override { + auto to_dnnnd = [&](auto* var) { return var->dev_tensor().as_megdnn(); }; + megdnn::TensorNDArray inputs_dnnnd; + for (auto* input : input()) { + inputs_dnnnd.push_back(to_dnnnd(input)); + } + mgb_assert( + input(m_param.inplace_index) + ->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC), + "ForceInplaceElemwise cannot be applied in internal tensor"); + auto* out_dest = output(0); + auto* opr = static_cast(megdnn_opr()); + opr->exec(std::move(inputs_dnnnd), to_dnnnd(out_dest)); + } + void init_output_static_infer_desc() override { + using namespace cg::static_infer; - owner_graph()->static_infer_manager().register_shape_infer( - output(0), ShapeInferDesc::make_identity(input(m_param.inplace_index))); -} + owner_graph()->static_infer_manager().register_shape_infer( + output(0), ShapeInferDesc::make_identity(input(m_param.inplace_index))); + } private: 
-Param m_param; -void record_execute_deps(ExecDependencyArray& deps) override { - record_megdnn_opr(deps); -} + Param m_param; + void record_execute_deps(ExecDependencyArray& deps) override { + record_megdnn_opr(deps); + } }; MGB_DYN_TYPE_OBJ_FINAL_IMPL(ForceInplaceElemwise); diff --git a/imperative/src/impl/ops/io_remote.cpp b/imperative/src/impl/ops/io_remote.cpp index 29b0316e..03e4d58a 100644 --- a/imperative/src/impl/ops/io_remote.cpp +++ b/imperative/src/impl/ops/io_remote.cpp @@ -28,7 +28,7 @@ namespace { cg::OperatorNodeBase* apply_on_var_node_remote_send( const OpDef& def, const VarNodeArray& inputs) { auto&& send = def.cast_final_safe(); - auto group_client = std::make_shared( + auto group_client = std::make_shared( ssprintf("%s:%d", send.addr.data(), send.port)); auto&& graph = inputs[0]->owner_graph(); @@ -44,7 +44,7 @@ cg::OperatorNodeBase* apply_on_var_node_remote_recv( auto&& recv = def.cast_final_safe(); OperatorNodeConfig config{recv.cn}; config.name(recv.make_name()); - auto group_client = std::make_shared( + auto group_client = std::make_shared( ssprintf("%s:%d", recv.addr.data(), recv.port)); auto&& graph = inputs[0]->owner_graph(); return graph->insert_opr(std::make_unique( diff --git a/imperative/src/impl/ops/rng.cpp b/imperative/src/impl/ops/rng.cpp index 232629f8..311a780a 100644 --- a/imperative/src/impl/ops/rng.cpp +++ b/imperative/src/impl/ops/rng.cpp @@ -282,6 +282,21 @@ struct OpMeth { } }; +template <> +struct OpMeth { + using DnnOp = megdnn::Dropout; + using Param = DnnOp::Param; + using OpNode = mgb::opr::Dropout; + static Param make_param(const Dropout& opdef) { + auto handle_seed = RNGDnnOpManager::get_seed(opdef.handle); + mgb_assert( + handle_seed == opdef.seed, + "inconsistent dropout seed: dropout op: %lu handle: %lu", handle_seed, + opdef.seed); + return {opdef.drop_prob, handle_seed}; + } +}; + template struct _InferLayout; @@ -482,6 +497,26 @@ SmallVector infer_output_attrs( return dests; } +template <> +SmallVector infer_output_attrs( + const OpDef& op, const SmallVector& inputs) { + SmallVector dests(2); + auto&& cn = inputs[0]->comp_node(); + + dests[0].comp_node = cn; + dests[0].layout = TensorLayout(inputs[0]->layout()); + dests[0].layout.dtype = inputs[0]->layout().dtype; + + auto get_mask_size = [&]() -> size_t { + auto dnn_handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(cn)).handle(); + return dnn_handle->create_operator()->get_mask_size_in_bytes( + inputs[0]->layout()); + }; + dests[1].comp_node = cn; + dests[1].layout = TensorLayout(TensorShape({get_mask_size()}), dtype::Byte()); + return dests; +} + template std::tuple, SmallVector> infer_output_mem_desc( const OpDef& def, const SmallVector& inputs_tensors, @@ -559,6 +594,25 @@ std::tuple, bool> infer_output_attrs_fallible< return {dests, true}; } +template <> +std::tuple, bool> infer_output_attrs_fallible( + const OpDef& op, const SmallVector& inputs) { + SmallVector dests(2); + auto cn = inputs[0].comp_node; + dests[0].comp_node = cn; + dests[0].layout = TensorLayout(inputs[0].layout); + dests[0].layout.dtype = inputs[0].layout.dtype; + + auto get_mask_size = [&]() -> size_t { + auto dnn_handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(cn)).handle(); + return dnn_handle->create_operator()->get_mask_size_in_bytes( + inputs[0].layout); + }; + dests[1].comp_node = cn; + dests[1].layout = TensorLayout(TensorShape({get_mask_size()}), dtype::Byte()); + return {dests, true}; +} + } // anonymous namespace Handle new_handle(CompNode comp_node, uint64_t seed) { @@ -599,6 +653,7 @@ 
REG_RNG_OP(PermutationRNG, SymbolVar) REG_RNG_OP(PoissonRNG, SymbolVar) REG_RNG_OP(BetaRNG, SymbolVar) REG_RNG_OP(ShuffleRNG, SymbolVarArray) +REG_RNG_OP(Dropout, SymbolVarArray) #undef REG_RNG_OP } // namespace mgb::imperative::rng diff --git a/imperative/src/impl/ops/specializations.cpp b/imperative/src/impl/ops/specializations.cpp index d153b429..5d3562a2 100644 --- a/imperative/src/impl/ops/specializations.cpp +++ b/imperative/src/impl/ops/specializations.cpp @@ -20,6 +20,7 @@ #include "megbrain/opr/dnn/correlation.h" #include "megbrain/opr/dnn/fake_quant.h" #include "megbrain/opr/dnn/images2neibs.h" +#include "megbrain/opr/dnn/layer_norm.h" #include "megbrain/opr/dnn/local.h" #include "megbrain/opr/dnn/lrn.h" #include "megbrain/opr/dnn/lsq.h" @@ -636,4 +637,29 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { } OP_TRAIT_REG(LRN, LRN).apply_on_var_node(apply_on_var_node).fallback(); } // namespace lrn + +namespace layer_norm { + +cg::OperatorNodeBase* apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { + auto&& op = static_cast(def); + size_t nr_inp = inputs.size(); + auto p = op.param(); + mgb_assert((nr_inp == 3 && p.affine) || (nr_inp == 1 && !p.affine)); + OperatorNodeConfig config{op.make_name()}; + if (nr_inp == 3) { + return opr::LayerNorm::make( + inputs[0], inputs[1], inputs[2], op.param(), config)[0] + .node() + ->owner_opr(); + } else { + return opr::LayerNorm::make(inputs[0], op.param(), config)[0] + .node() + ->owner_opr(); + } +} + +OP_TRAIT_REG(LayerNorm, LayerNorm).apply_on_var_node(apply_on_var_node).fallback(); + +} // namespace layer_norm + } // namespace mgb::imperative diff --git a/imperative/src/impl/persistent_cache.cpp b/imperative/src/impl/persistent_cache.cpp index ba3809ab..5fc291c8 100644 --- a/imperative/src/impl/persistent_cache.cpp +++ b/imperative/src/impl/persistent_cache.cpp @@ -27,8 +27,10 @@ public: m_local = std::make_shared(); } - bool connect(std::string ip, size_t port, std::string password) { - m_client.auth(password); + void connect(std::string ip, size_t port, std::optional password) { + if (password) { + m_client.auth(*password); + } m_client.connect( ip, port, [](const std::string& host, std::size_t port, @@ -40,16 +42,32 @@ public: } }, std::uint32_t(200)); - if (!m_client.is_connected()) { - return false; - } + mgb_assert(m_client.is_connected(), "connect failed"); auto flag = m_client.get("mgb-cache-flag"); sync(); - return flag.get().ok(); + auto is_valid = [](const cpp_redis::reply& reply) { + switch (reply.get_type()) { + case cpp_redis::reply::type::error: + case cpp_redis::reply::type::null: + return false; + case cpp_redis::reply::type::integer: + return reply.as_integer() != 0; + case cpp_redis::reply::type::simple_string: + case cpp_redis::reply::type::bulk_string: + return !reply.as_string().empty(); + case cpp_redis::reply::type::array: + return !reply.as_array().empty(); + default: + mgb_assert(false, "unknown reply type %d", (int)reply.get_type()); + } + }; + mgb_assert(is_valid(flag.get()), "invalid mgb-cache-flag"); } bool valid() const override { return m_client.is_connected(); } + void flush() override {} + mgb::Maybe get(const std::string& category, const Blob& key) override { MGB_LOCK_GUARD(m_mtx); auto mem_result = m_local->get(category, key); @@ -75,7 +93,7 @@ public: MGB_LOCK_GUARD(m_mtx); std::string key_str(static_cast(key.ptr), key.size); std::string redis_key_str; - encode(category + '@' + key_str, redis_key_str); + encode(category + '@' + key_str, redis_key_str, 24); 
std::string value_str(static_cast(value.ptr), value.size); std::string redis_value_str; encode(value_str, redis_value_str); @@ -118,18 +136,16 @@ private: class ExtendedInFilePersistentCache final : public ExtendedPersistentCache { private: - std::string m_path; + std::optional m_path; std::unique_ptr m_impl; public: ExtendedInFilePersistentCache() = default; - bool open(std::string path) { + void open(std::string path) { std::fstream file; file.open(path, std::ios::in | std::ios::binary); - if (!file.is_open()) { - return false; - } + mgb_assert(file.is_open(), "can't open file in %s", path.c_str()); std::vector bytes = { std::istreambuf_iterator(file), std::istreambuf_iterator()}; if (bytes.size()) { @@ -139,14 +155,11 @@ public: m_impl = std::make_unique(); } m_path = path; - return true; } - ~ExtendedInFilePersistentCache() { - if (m_impl) { - m_impl->dump_cache(m_path.c_str()); - } - } + void open() { m_impl = std::make_unique(); } + + ~ExtendedInFilePersistentCache() { flush(); } mgb::Maybe get(const std::string& category, const Blob& key) override { return m_impl->get(category, key); @@ -157,29 +170,64 @@ public: } std::optional clear() override { - m_impl = std::make_unique(); - m_impl->dump_cache(m_path.c_str()); + if (m_impl) { + m_impl = std::make_unique(); + if (m_path) { + m_impl->dump_cache(m_path->c_str()); + } + } return {}; } bool valid() const override { return m_impl != nullptr; } -}; -std::shared_ptr make_redis( - std::string ip, size_t port, std::string password, std::string prefix) { - auto cache = std::make_shared(prefix, 100); - if (!cache->connect(ip, port, password)) { - return nullptr; + void flush() override { + if (m_impl && m_path) { + m_impl->dump_cache(m_path->c_str()); + } } - return cache; -} +}; -std::shared_ptr make_in_file(std::string path) { - auto cache = std::make_shared(); - if (!cache->open(path)) { - return nullptr; +std::shared_ptr ExtendedPersistentCache::make_from_config( + std::string type, std::unordered_map args, + std::string& err_msg) { + try { + if (type == "redis") { + std::string prefix = args.at("prefix"); + std::optional password = args.count("password") + ? args.at("password") + : std::optional(); + auto cache = std::make_shared(prefix, 100); + if (args.count("unixsocket")) { + std::string unixsocket = args.at("unixsocket"); + cache->connect(unixsocket, 0, password); + } else { + std::string ip = args.at("hostname"); + int port = atoi(args.at("port").c_str()); + std::optional password = + args.count("password") ? args.at("password") + : std::optional(); + cache->connect(ip, port, password); + } + return cache; + } else if (type == "in-file") { + std::string path = args.at("path"); + auto cache = std::make_shared(); + cache->open(path); + return cache; + } else if (type == "in-memory") { + auto cache = std::make_shared(); + cache->open(); + return cache; + } else { + mgb_assert(false, "persistent cache type %s unsupported", type.c_str()); + } + } catch (const std::exception& exc) { + err_msg = exc.what(); + } catch (...) 
{ + err_msg = "unknown exception"; } - return cache; + return nullptr; } } // namespace mgb::imperative::persistent_cache diff --git a/imperative/src/include/megbrain/imperative/persistent_cache.h b/imperative/src/include/megbrain/imperative/persistent_cache.h index 59326bbd..4d63eaae 100644 --- a/imperative/src/include/megbrain/imperative/persistent_cache.h +++ b/imperative/src/include/megbrain/imperative/persistent_cache.h @@ -20,12 +20,12 @@ class ExtendedPersistentCache : public mgb::PersistentCache { public: virtual bool valid() const = 0; virtual std::optional clear() = 0; -}; - -std::shared_ptr make_redis( - std::string ip, size_t port, std::string password, std::string prefix); + virtual void flush() = 0; -std::shared_ptr make_in_file(std::string path); + static std::shared_ptr make_from_config( + std::string type, std::unordered_map args, + std::string& err_msg); +}; } // namespace mgb::imperative::persistent_cache // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/test/collective_comm.cpp b/imperative/src/test/collective_comm.cpp index 01c0829d..4a31c54b 100644 --- a/imperative/src/test/collective_comm.cpp +++ b/imperative/src/test/collective_comm.cpp @@ -20,7 +20,7 @@ TEST(TestImperative, AllReduceBasic) { REQUIRE_GPU(2); const char* server_addr = "127.0.0.1"; uint32_t port = 3456; - mgb_assert(create_zmqrpc_server(server_addr, port) > 0); + mgb_assert(opr::create_zmqrpc_server(server_addr, port) > 0); HostTensorGenerator<> gen; CompNode cn0 = CompNode::load("gpu0"), cn1 = CompNode::load("gpu1"); diff --git a/imperative/src/test/io_remote.cpp b/imperative/src/test/io_remote.cpp index 8e32f7ab..97a7b62d 100644 --- a/imperative/src/test/io_remote.cpp +++ b/imperative/src/test/io_remote.cpp @@ -20,7 +20,7 @@ TEST(TestImperative, IORemote) { REQUIRE_GPU(2); const char* server_addr = "127.0.0.1"; uint32_t port = 4567; - mgb_assert(create_zmqrpc_server(server_addr, port) > 0); + mgb_assert(opr::create_zmqrpc_server(server_addr, port) > 0); HostTensorGenerator<> gen; CompNode cn0 = CompNode::load("gpu0"), cn1 = CompNode::load("gpu1"); diff --git a/imperative/tablegen/CMakeLists.txt b/imperative/tablegen/CMakeLists.txt index 7b4a1802..f2d3ed76 100644 --- a/imperative/tablegen/CMakeLists.txt +++ b/imperative/tablegen/CMakeLists.txt @@ -1,6 +1,7 @@ # mgb tablegen executable set(TABLE_TARGET mgb-mlir-autogen) -file(GLOB_RECURSE SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.h ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +file(GLOB_RECURSE SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) add_executable(${TABLE_TARGET} ${SRCS}) target_include_directories(${TABLE_TARGET} PRIVATE ${MLIR_LLVM_INCLUDE_DIR}) target_link_libraries(${TABLE_TARGET} PRIVATE LLVMTableGen MLIRTableGen LLVMSupport) @@ -13,5 +14,8 @@ tablegen(MGB opdef.cpp.inl ${MGE_IR_INCLUDE_DIRS} "--gen-cpp-body") tablegen(MGB opdef.py.inl ${MGE_IR_INCLUDE_DIRS} "--gen-python-binding") tablegen(MGB opdef.cpy.inl ${MGE_IR_INCLUDE_DIRS} "--gen-python-c-extension") tablegen(MGB enum_macro.h ${MGE_IR_INCLUDE_DIRS} "--gen-enum-list-macro") -add_custom_target(mgb_opdef ALL DEPENDS opdef.h.inl opdef.cpp.inl opdef.py.inl opdef.cpy.inl enum_macro.h param_defs_tblgen) -set(MGB_OPDEF_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE) +add_custom_target(mgb_opdef ALL DEPENDS opdef.h.inl opdef.cpp.inl opdef.py.inl + opdef.cpy.inl enum_macro.h param_defs_tblgen) +set(MGB_OPDEF_OUT_DIR + ${CMAKE_CURRENT_BINARY_DIR} + PARENT_SCOPE) diff --git a/imperative/test/CMakeLists.txt 
b/imperative/test/CMakeLists.txt index debaa5c9..68fc59f1 100644 --- a/imperative/test/CMakeLists.txt +++ b/imperative/test/CMakeLists.txt @@ -5,46 +5,60 @@ file(GLOB_RECURSE SOURCES ../src/test/*.cpp ../src/impl/*.cpp ${MGB_TEST_DIR}/*. # disable distributed tests if(NOT MGE_WITH_DISTRIBUTED) - list(FILTER SOURCES EXCLUDE REGEX ".*test/collective_comm.cpp") - list(FILTER SOURCES EXCLUDE REGEX ".*test/io_remote.cpp") + list(FILTER SOURCES EXCLUDE REGEX ".*test/collective_comm.cpp") + list(FILTER SOURCES EXCLUDE REGEX ".*test/io_remote.cpp") endif() # TODO: turn python binding into a static/object library add_executable(imperative_test ${SOURCES} ${SRCS}) add_dependencies(imperative_test mgb_opdef) -target_include_directories(imperative_test PRIVATE ${MGB_TEST_DIR}/include ../src/include ${MGB_OPDEF_OUT_DIR} ${CPP_REDIS_INCLUDES}) +target_include_directories( + imperative_test PRIVATE ${MGB_TEST_DIR}/include ../src/include ${MGB_OPDEF_OUT_DIR} + ${CPP_REDIS_INCLUDES}) # Python binding -target_include_directories(imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR}) +target_include_directories( + imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHON_INCLUDE_DIRS} + ${NUMPY_INCLUDE_DIR}) target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C) target_compile_options(imperative_test PRIVATE -Wno-unused-parameter) -set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3 nlohmann_json::nlohmann_json) +set(LINK_LIBS + megbrain + megdnn + ${MGE_CUDA_LIBS} + gtest + gmock + pybind11::embed + range-v3 + nlohmann_json::nlohmann_json) if(MGE_WITH_CUDA) - list(APPEND LINK_LIBS cudart) + list(APPEND LINK_LIBS cudart) endif() if(MGE_WITH_DISTRIBUTED) - list(APPEND LINK_LIBS megray) + list(APPEND LINK_LIBS megray) endif() target_link_libraries(imperative_test ${LINK_LIBS}) if(CXX_SUPPORT_WCLASS_MEMACCESS) - if(MGE_WITH_CUDA) - target_compile_options(imperative_test PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" - "$<$>:-Wno-class-memaccess>") - else() - target_compile_options(imperative_test PRIVATE "-Wno-class-memaccess") - endif() + if(MGE_WITH_CUDA) + target_compile_options( + imperative_test + PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" + "$<$>:-Wno-class-memaccess>") + else() + target_compile_options(imperative_test PRIVATE "-Wno-class-memaccess") + endif() endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(imperative_test dl) - else() - target_link_libraries(imperative_test dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(imperative_test dl) + else() + target_link_libraries(imperative_test dl rt) + endif() endif() install(TARGETS imperative_test RUNTIME DESTINATION test) diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index b0966ea8..69b54eb3 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -8,155 +8,185 @@ set(LITE_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS}) set(LITE_ASSERT_LOC ${MGB_ASSERT_LOC}) if(NOT MGB_WITH_FLATBUFFERS) - include(../cmake/flatbuffers.cmake) + include(../cmake/flatbuffers.cmake) endif() file(GLOB_RECURSE SRC_FBS src/**/*.fbs) build_flatbuffers( - "${SRC_FBS}" - "" - lite_fbs_generate - "" - "${CMAKE_CURRENT_BINARY_DIR}" - "" - "" - ) + "${SRC_FBS}" + "" + lite_fbs_generate + "" + "${CMAKE_CURRENT_BINARY_DIR}" + "" + "") file(GLOB_RECURSE SOURCES_LITE src/*.cpp src/*.cc lite-c/*.cpp) if(MGE_WITH_MINIMUM_SIZE) - set(LITE_ENABLE_LOGGING OFF) - set(LITE_ENABLE_EXCEPTION OFF) + set(LITE_ENABLE_LOGGING OFF) + set(LITE_ENABLE_EXCEPTION OFF) endif() -# Write out 
lite_build_config.h -# It defines macros needed by lite -configure_file(src/lite_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include) +# Write out lite_build_config.h It defines macros needed by lite +configure_file(src/lite_build_config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include) # begin config lite -if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) - # FXIME third_party cpp redis do not support build with clang-cl - list(APPEND SOURCES_LITE ${CPP_REDIS_SRCS}) +if(LITE_BUILD_WITH_MGE + AND LITE_WITH_CUDA + AND NOT WIN32) + # FXIME third_party cpp redis do not support build with clang-cl + list(APPEND SOURCES_LITE ${CPP_REDIS_SRCS}) endif() add_library(lite_static STATIC ${SOURCES_LITE}) add_dependencies(lite_static lite_fbs_generate) include_directories($) if(LITE_BUILD_WITH_MGE) - target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) - add_compile_definitions(LITE_BUILD_WITH_MGE=1) - message(STATUS "build lite with MegEngine.") + target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) + add_compile_definitions(LITE_BUILD_WITH_MGE=1) + message(STATUS "build lite with MegEngine.") else() - target_link_libraries(lite_static PUBLIC flatbuffers) + target_link_libraries(lite_static PUBLIC flatbuffers) endif() include_directories( - PUBLIC $ - PUBLIC $ - PUBLIC $ - PUBLIC $ - PUBLIC $ - PUBLIC $ - ) + PUBLIC + $ + PUBLIC + $ + PUBLIC + $ + PUBLIC + $ + PUBLIC + $ + PUBLIC + $) # end config lite # define a shared lib add_library(lite_shared SHARED $) if(LITE_BUILD_WITH_MGE) - target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) + target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) endif() if(ANDROID) - link_libraries(log) - target_link_libraries(lite_static PRIVATE log) - target_link_libraries(lite_shared PRIVATE log) + link_libraries(log) + target_link_libraries(lite_static PRIVATE log) + target_link_libraries(lite_shared PRIVATE log) endif() # define a shared lib for whl add_library(lite_shared_whl SHARED $) if(LITE_BUILD_WITH_MGE) - if (IOS) - target_link_libraries(lite_shared_whl PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) - else() - target_link_libraries(lite_shared_whl PRIVATE megengine_shared) - endif() + if(IOS) + target_link_libraries(lite_shared_whl PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) + else() + target_link_libraries(lite_shared_whl PRIVATE megengine_shared) + endif() endif() if(ANDROID) - target_link_libraries(lite_shared_whl PRIVATE log) + target_link_libraries(lite_shared_whl PRIVATE log) endif() -# add lite_static_all_in_one same name build by BUILD -# please do not change flatbuffers/cpuinfo/clog/lite_static order, if change!, cmake -# can not gen flatbuffers/cpuinfo/clog OBJs to lite_static_all_in_one, this may cmake issue -# NOTICE: this target always use to separate build with lite, if build lite via include +# add lite_static_all_in_one same name build by BUILD please do not change +# flatbuffers/cpuinfo/clog/lite_static order, if change!, cmake can not gen +# flatbuffers/cpuinfo/clog OBJs to lite_static_all_in_one, this may cmake issue NOTICE: +# this target always use to separate build with lite, if build lite via include # MegEngine/megbrain ROOT_DIR/CMakeLists.txt, just depends lite_static 
or lite_shared -#TODO: need refine lite_static_all_in_one depend objects, but now cmake do not support +# TODO: need refine lite_static_all_in_one depend objects, but now cmake do not support # define a add_library which OBJECTS args is a set or list or string -if (MGE_ENABLE_CPUINFO AND MGE_WITH_OPENCL) - add_library(lite_static_all_in_one STATIC $ $ $ $ $) -elseif (MGE_ENABLE_CPUINFO AND NOT MGE_WITH_OPENCL) - add_library(lite_static_all_in_one STATIC $ $ $ $) -elseif (NOT MGE_ENABLE_CPUINFO AND MGE_WITH_OPENCL) - add_library(lite_static_all_in_one STATIC $ $ $) +if(MGE_ENABLE_CPUINFO AND MGE_WITH_OPENCL) + add_library( + lite_static_all_in_one STATIC + $ $ $ + $ $) +elseif(MGE_ENABLE_CPUINFO AND NOT MGE_WITH_OPENCL) + add_library( + lite_static_all_in_one STATIC + $ $ $ + $) +elseif(NOT MGE_ENABLE_CPUINFO AND MGE_WITH_OPENCL) + add_library( + lite_static_all_in_one STATIC + $ $ + $) else() - add_library(lite_static_all_in_one STATIC $ $) + add_library(lite_static_all_in_one STATIC $ + $) endif() if(LITE_BUILD_WITH_MGE) - target_link_libraries(lite_static_all_in_one PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) + target_link_libraries(lite_static_all_in_one PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) endif() -if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) - # FXIME third_party cpp redis do not support build with clang-cl - target_include_directories(lite_static PRIVATE ${CPP_REDIS_INCLUDES}) - target_include_directories(lite_shared PRIVATE ${CPP_REDIS_INCLUDES}) - target_include_directories(lite_shared_whl PRIVATE ${CPP_REDIS_INCLUDES}) - target_include_directories(lite_static_all_in_one PRIVATE ${CPP_REDIS_INCLUDES}) +if(LITE_BUILD_WITH_MGE + AND LITE_WITH_CUDA + AND NOT WIN32) + # FXIME third_party cpp redis do not support build with clang-cl + target_include_directories(lite_static PRIVATE ${CPP_REDIS_INCLUDES}) + target_include_directories(lite_shared PRIVATE ${CPP_REDIS_INCLUDES}) + target_include_directories(lite_shared_whl PRIVATE ${CPP_REDIS_INCLUDES}) + target_include_directories(lite_static_all_in_one PRIVATE ${CPP_REDIS_INCLUDES}) endif() -set(LITE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld CACHE INTERNAL "Path to linker version script") +set(LITE_VERSION_SCRIPT + ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld + CACHE INTERNAL "Path to linker version script") add_custom_target(_lite_version_ld SOURCES ${LITE_VERSION_SCRIPT}) if(NOT MSVC AND NOT WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") endif() -#TODO: implemente version script for other OS -if (UNIX AND NOT APPLE) - target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT}) - set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) - target_link_options(lite_shared_whl PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT}) - set_target_properties(lite_shared_whl PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) +# TODO: implemente version script for other OS +if(UNIX AND NOT APPLE) + target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT}) + set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) + target_link_options(lite_shared_whl PRIVATE + -Wl,--version-script=${LITE_VERSION_SCRIPT}) + set_target_properties(lite_shared_whl PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) endif() # config 
install -install(TARGETS lite_static - LIBRARY DESTINATION lite/lib/${MGE_ARCH} - FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} - ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) - -install(TARGETS lite_shared - LIBRARY DESTINATION lite/lib/${MGE_ARCH} - FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} - ARCHIVE DESTINATION lite/lib/${MGE_ARCH} - ) - -install(TARGETS lite_static_all_in_one - LIBRARY DESTINATION lite/lib/${MGE_ARCH} - FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} - ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) +install( + TARGETS lite_static + LIBRARY DESTINATION lite/lib/${MGE_ARCH} + FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} + ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) + +install( + TARGETS lite_shared + LIBRARY DESTINATION lite/lib/${MGE_ARCH} + FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} + ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) + +install( + TARGETS lite_static_all_in_one + LIBRARY DESTINATION lite/lib/${MGE_ARCH} + FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} + ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) install(FILES ${PROJECT_SOURCE_DIR}/lite/include/lite/common_enum_c.h - DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c) + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c) -install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include - DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") +install( + DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite + FILES_MATCHING + PATTERN "*.h") -install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include - DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") +install( + DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include + DESTINATION ${CMAKE_INSTALL_PREFIX}/lite + FILES_MATCHING + PATTERN "*.h") add_subdirectory(example) if(MGE_WITH_TEST) - add_subdirectory(test) + add_subdirectory(test) endif() -#load_and_run +# load_and_run add_subdirectory(load_and_run) # tools and example @@ -164,11 +194,12 @@ add_executable(rc4_encryptor tools/rc4_encrypt.cpp) target_link_libraries(rc4_encryptor lite_static) if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(rc4_encryptor megdnn) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(rc4_encryptor megdnn) endif() -target_include_directories(rc4_encryptor PRIVATE - {PROJECT_SOURCE_DIR}/lite/src/decryption) -install (TARGETS rc4_encryptor - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/tools) +target_include_directories(rc4_encryptor + PRIVATE {PROJECT_SOURCE_DIR}/lite/src/decryption) +install( + TARGETS rc4_encryptor + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/tools) diff --git a/lite/example/c_example/CMakeLists.txt b/lite/example/c_example/CMakeLists.txt index 141725e5..b0a0d0bd 100644 --- a/lite/example/c_example/CMakeLists.txt +++ b/lite/example/c_example/CMakeLists.txt @@ -1,44 +1,46 @@ add_executable(lite_c_examples ./main.c) if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check - target_link_options(lite_c_examples PRIVATE "-fuse-ld=gold") + # rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_c_examples PRIVATE "-fuse-ld=gold") endif() target_link_libraries(lite_c_examples lite_static) if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(lite_c_examples megdnn) + # FIXME: hip obj can not find cpp obj only 
through lite_static + target_link_libraries(lite_c_examples megdnn) endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(lite_c_examples dl) - else() - target_link_libraries(lite_c_examples dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(lite_c_examples dl) + else() + target_link_libraries(lite_c_examples dl rt) + endif() endif() -install (TARGETS lite_c_examples - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/bin) +install( + TARGETS lite_c_examples + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) # add lite_examples_depends_shared for CI check symbol export valid -add_executable(lite_c_examples_depends_shared ./main.c) +add_executable(lite_c_examples_depends_shared ./main.c) if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check - target_link_options(lite_c_examples_depends_shared PRIVATE "-fuse-ld=gold") + # rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_c_examples_depends_shared PRIVATE "-fuse-ld=gold") endif() target_link_libraries(lite_c_examples_depends_shared lite_shared) if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(lite_c_examples_depends_shared dl) - else() - target_link_libraries(lite_c_examples_depends_shared dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(lite_c_examples_depends_shared dl) + else() + target_link_libraries(lite_c_examples_depends_shared dl rt) + endif() endif() -install (TARGETS lite_c_examples_depends_shared - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/bin) +install( + TARGETS lite_c_examples_depends_shared + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) diff --git a/lite/example/cpp_example/CMakeLists.txt b/lite/example/cpp_example/CMakeLists.txt index f7227b62..1649c4b1 100644 --- a/lite/example/cpp_example/CMakeLists.txt +++ b/lite/example/cpp_example/CMakeLists.txt @@ -1,49 +1,51 @@ -file (GLOB_RECURSE SOURCES ./*.cpp) -add_executable(lite_examples ${SOURCES}) +file(GLOB_RECURSE SOURCES ./*.cpp) +add_executable(lite_examples ${SOURCES}) target_include_directories(lite_examples PUBLIC ./) if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check - target_link_options(lite_examples PRIVATE "-fuse-ld=gold") + # rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_examples PRIVATE "-fuse-ld=gold") endif() target_link_libraries(lite_examples lite_static) if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(lite_examples megdnn) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(lite_examples megdnn) endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(lite_examples dl) - else() - target_link_libraries(lite_examples dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(lite_examples dl) + else() + target_link_libraries(lite_examples dl rt) + endif() endif() -install (TARGETS lite_examples - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/bin) +install( + TARGETS lite_examples + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) # add lite_examples_depends_shared for CI check symbol export valid -add_executable(lite_examples_depends_shared ${SOURCES}) +add_executable(lite_examples_depends_shared ${SOURCES}) if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold 
to remove NEEDED so symbol check - target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold") + # rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold") endif() target_link_libraries(lite_examples_depends_shared lite_shared) target_include_directories(lite_examples_depends_shared PUBLIC ./) if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(lite_examples_depends_shared dl) - else() - target_link_libraries(lite_examples_depends_shared dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(lite_examples_depends_shared dl) + else() + target_link_libraries(lite_examples_depends_shared dl rt) + endif() endif() -install (TARGETS lite_examples_depends_shared - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/bin) +install( + TARGETS lite_examples_depends_shared + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) diff --git a/lite/load_and_run/CMakeLists.txt b/lite/load_and_run/CMakeLists.txt index d7b5b9d9..6ef5279a 100644 --- a/lite/load_and_run/CMakeLists.txt +++ b/lite/load_and_run/CMakeLists.txt @@ -1,55 +1,62 @@ # BUILD the load and run for lite -include_directories(PUBLIC $) -file (GLOB_RECURSE SOURCES ./*.cpp) +include_directories(PUBLIC + $) +file(GLOB_RECURSE SOURCES ./*.cpp) -add_executable (load_and_run ${SOURCES}) +add_executable(load_and_run ${SOURCES}) target_link_libraries(load_and_run lite_static) target_link_libraries(load_and_run megbrain) target_link_libraries(load_and_run gflags) if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check - target_link_options(load_and_run PRIVATE "-fuse-ld=gold") + # rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(load_and_run PRIVATE "-fuse-ld=gold") endif() if(MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(load_and_run megdnn) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(load_and_run megdnn) endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(load_and_run dl) - else() - target_link_libraries(load_and_run dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(load_and_run dl) + else() + target_link_libraries(load_and_run dl rt) + endif() endif() -install (TARGETS load_and_run EXPORT ${LITE_EXPORT_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) +install( + TARGETS load_and_run + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) if(BUILD_SHARED_LIBS) - add_executable(load_and_run_depends_shared ${SOURCES}) - target_link_libraries(load_and_run_depends_shared lite_shared) - target_link_libraries(load_and_run_depends_shared gflags) - target_link_libraries(load_and_run_depends_shared megengine) - - if(LITE_BUILD_WITH_RKNPU) - #rknn sdk1.0.0 depend on libc++_shared, use gold to remove NEEDED so symbol check - target_link_options(load_and_run_depends_shared PRIVATE "-fuse-ld=gold") - endif() + add_executable(load_and_run_depends_shared ${SOURCES}) + target_link_libraries(load_and_run_depends_shared lite_shared) + target_link_libraries(load_and_run_depends_shared gflags) + target_link_libraries(load_and_run_depends_shared megengine) - if(MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(load_and_run_depends_shared megdnn) - endif() + if(LITE_BUILD_WITH_RKNPU) + # rknn sdk1.0.0 depend on 
libc++_shared, use gold to remove NEEDED so symbol check + target_link_options(load_and_run_depends_shared PRIVATE "-fuse-ld=gold") + endif() + + if(MGE_WITH_ROCM) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(load_and_run_depends_shared megdnn) + endif() - if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(load_and_run_depends_shared dl) - else() - target_link_libraries(load_and_run_depends_shared dl rt) - endif() + if(UNIX) + if(APPLE OR ANDROID) + target_link_libraries(load_and_run_depends_shared dl) + else() + target_link_libraries(load_and_run_depends_shared dl rt) endif() + endif() - install(TARGETS load_and_run_depends_shared EXPORT ${MGE_EXPORT_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + install( + TARGETS load_and_run_depends_shared + EXPORT ${MGE_EXPORT_TARGETS} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() diff --git a/lite/load_and_run/src/helpers/data_parser.cpp b/lite/load_and_run/src/helpers/data_parser.cpp index 0ba71626..d5b7518e 100644 --- a/lite/load_and_run/src/helpers/data_parser.cpp +++ b/lite/load_and_run/src/helpers/data_parser.cpp @@ -30,7 +30,12 @@ void DataParser::feed(const std::string& path) { } auto endWith = [blob_string](std::string suffix) -> bool { - return blob_string.rfind(suffix) == (blob_string.length() - suffix.length()); + const auto index = blob_string.rfind(suffix); + if (index != std::string::npos and + index == blob_string.length() - suffix.length()) { + return true; + } + return false; }; if (endWith(".ppm") || endWith(".pgm")) { diff --git a/lite/pylite/megenginelite/__init__.py b/lite/pylite/megenginelite/__init__.py index 95c22633..ec14aeec 100644 --- a/lite/pylite/megenginelite/__init__.py +++ b/lite/pylite/megenginelite/__init__.py @@ -8,6 +8,7 @@ # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
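The data_parser.cpp hunk above fixes a suffix check: std::string::rfind returns npos on a miss, and for a blob exactly one character shorter than the suffix the unsigned subtraction blob_string.length() - suffix.length() also wraps to npos, so the old endWith reported a false match. A minimal Python illustration of the same pitfall (str.rfind returns -1 on a miss, which lines up with len(blob) - len(suffix) in exactly that case):

def end_with_buggy(blob, suffix):
    # old logic: compare rfind() with len(blob) - len(suffix) without
    # first checking that the suffix was found at all
    return blob.rfind(suffix) == len(blob) - len(suffix)

def end_with_fixed(blob, suffix):
    idx = blob.rfind(suffix)
    return idx != -1 and idx == len(blob) - len(suffix)

# ".pp" does not end with ".ppm", but rfind gives -1 and 3 - 4 is also -1
assert end_with_buggy(".pp", ".ppm")
assert not end_with_fixed(".pp", ".ppm")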
from .base import * +from .base import version as __version__ from .global_setting import * from .network import * from .struct import * diff --git a/lite/pylite/megenginelite/network.py b/lite/pylite/megenginelite/network.py index 1106079a..c8072791 100644 --- a/lite/pylite/megenginelite/network.py +++ b/lite/pylite/megenginelite/network.py @@ -69,7 +69,9 @@ class LiteOptions(Structure): "const_shape": bool(self.const_shape), "force_dynamic_alloc": bool(self.force_dynamic_alloc), "force_output_dynamic_alloc": bool(self.force_output_dynamic_alloc), - "force_output_nocopy": bool(self.force_output_nocopy), + "force_output_use_user_specified_memory": bool( + self.force_output_use_user_specified_memory + ), "no_profiling_on_shape_change": bool(self.no_profiling_on_shape_change), "jit_level": self.jit_level, "comp_node_seq_record_level": self.comp_node_seq_record_level, @@ -99,7 +101,7 @@ class LiteConfig(Structure): ("device_id", c_int), ("device_type", c_int), ("backend", c_int), - ("bare_model_cryption_name", c_char_p), + ("_bare_model_cryption_name", c_char_p), ("options", LiteOptions), ] @@ -110,18 +112,30 @@ class LiteConfig(Structure): else: self.options = LiteOptions() - self.bare_model_cryption_name = c_char_p(b"") + self._bare_model_cryption_name = c_char_p(b"") self.use_loader_dynamic_param = 0 self.has_compression = 0 self.backend = LiteBackend.LITE_DEFAULT + @property + def bare_model_cryption_name(self): + return self._bare_model_cryption_name.decode("utf-8") + + @bare_model_cryption_name.setter + def bare_model_cryption_name(self, name): + if isinstance(name, str): + self._bare_model_cryption_name = name.encode("utf-8") + else: + assert isinstance(name, bytes), "name should be str or bytes type." + self._bare_model_cryption_name = name + def __repr__(self): data = { "has_compression": bool(self.has_compression), "device_id": LiteDeviceType(self.device_id), "device_type": LiteDeviceType(self.device_type), "backend": LiteBackend(self.backend), - "bare_model_cryption_name": self.bare_model_cryption_name.decode("utf-8"), + "bare_model_cryption_name": self.bare_model_cryption_name, "options": self.options, } return data.__repr__() @@ -149,7 +163,7 @@ class LiteIO(Structure): """ _fields_ = [ - ("name", c_char_p), + ("_name", c_char_p), ("is_host", c_int), ("io_type", c_int), ("config_layout", LiteLayout), @@ -159,9 +173,9 @@ class LiteIO(Structure): self, name, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None ): if type(name) == str: - self.name = c_char_p(name.encode("utf-8")) + self._name = c_char_p(name.encode("utf-8")) else: - self.name = c_char_p(name) + self._name = c_char_p(name) if layout: self.config_layout = layout @@ -171,6 +185,18 @@ class LiteIO(Structure): self.is_host = is_host self.io_type = io_type + @property + def name(self): + return self._name.decode("utf-8") + + @name.setter + def name(self, name): + if isinstance(name, str): + self._name = name.encode("utf-8") + else: + assert isinstance(name, bytes), "name should be str or bytes type." 
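With the properties added above, bare_model_cryption_name and LiteIO.name are plain Python str on the surface while the ctypes structures keep c_char_p fields underneath. A short usage sketch, assuming the megenginelite package built from this patch is importable:

from megenginelite import LiteConfig, LiteIO, LiteIOType

config = LiteConfig()
config.bare_model_cryption_name = "nothing"   # str is utf-8 encoded internally
assert config.bare_model_cryption_name == "nothing"

io = LiteIO("data", is_host=False, io_type=LiteIOType.LITE_IO_VALUE)
assert io.name == "data"     # decoded from the underlying c_char_p
io.name = b"data0"           # bytes are stored as-is
io.name = "data1"            # str is encoded for you
assert io.name == "data1"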
+ self._name = name + def __repr__(self): data = { "name": self.name, @@ -208,17 +234,45 @@ class LiteNetworkIO(object): the input and output information for user to construct _LiteNetWorkIO """ - def __init__(self): + def __init__(self, inputs=None, outputs=None): self.inputs = [] self.outputs = [] + if inputs: + for i in inputs: + if isinstance(i, list): + self.inputs.append(LiteIO(*i)) + else: + assert isinstance( + i, LiteIO + ), "the param to construct LiteNetworkIO must be list of the LiteIO member or the LiteIO." + self.inputs.append(i) + if outputs: + for i in outputs: + if isinstance(i, list): + self.outputs.append(LiteIO(*i)) + else: + assert isinstance( + i, LiteIO + ), "the param to construct LiteNetworkIO must be list of the LiteIO member or the LiteIO." + self.outputs.append(i) + + def add_input( + self, obj, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None + ): + if isinstance(obj, LiteIO): + self.inputs.append(obj) + else: + name = obj + self.add_input(LiteIO(name, is_host, io_type, layout)) - def add_input(self, input_io): - assert isinstance(input_io, LiteIO) - self.inputs.append(input_io) - - def add_output(self, output_io): - assert isinstance(output_io, LiteIO) - self.outputs.append(output_io) + def add_output( + self, obj, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None + ): + if isinstance(obj, LiteIO): + self.outputs.append(obj) + else: + name = obj + self.add_output(LiteIO(name, is_host, io_type, layout)) def _create_network_io(self): network_io = _LiteNetworkIO() diff --git a/lite/pylite/megenginelite/tensor.py b/lite/pylite/megenginelite/tensor.py index ef86154f..188a486e 100644 --- a/lite/pylite/megenginelite/tensor.py +++ b/lite/pylite/megenginelite/tensor.py @@ -48,6 +48,15 @@ ctype_to_lite_dtypes = { c_ushort: LiteDataType.LITE_UINT16, } +_lite_dtypes_to_ctype = { + LiteDataType.LITE_INT: c_int, + LiteDataType.LITE_FLOAT: c_float, + LiteDataType.LITE_UINT8: c_ubyte, + LiteDataType.LITE_INT8: c_byte, + LiteDataType.LITE_INT16: c_short, + LiteDataType.LITE_UINT16: c_ushort, +} + class LiteLayout(Structure): """ @@ -55,7 +64,7 @@ class LiteLayout(Structure): """ _fields_ = [ - ("shapes", c_size_t * MAX_DIM), + ("_shapes", c_size_t * MAX_DIM), ("ndim", c_size_t), ("data_type", c_int), ] @@ -64,10 +73,10 @@ class LiteLayout(Structure): if shape: shape = list(shape) assert len(shape) <= MAX_DIM, "Layout max dim is 7." - self.shapes = (c_size_t * MAX_DIM)(*shape) + self._shapes = (c_size_t * MAX_DIM)(*shape) self.ndim = len(shape) else: - self.shapes = (c_size_t * MAX_DIM)() + self._shapes = (c_size_t * MAX_DIM)() self.ndim = 0 if not dtype: self.data_type = LiteDataType.LITE_FLOAT @@ -83,9 +92,24 @@ class LiteLayout(Structure): else: raise RuntimeError("unkonw data type") + @property + def dtype(self): + return _lite_type_to_nptypes[LiteDataType(self.data_type)] + + @property + def shapes(self): + return list(self._shapes)[0 : self.ndim] + + @shapes.setter + def shapes(self, shape): + shape = list(shape) + assert len(shape) <= MAX_DIM, "Layout max dim is 7." 
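LiteNetworkIO above can now be fully described at construction time, and add_input/add_output accept either a ready LiteIO or just a name plus the usual options. The three styles below are equivalent, under the same assumption of a locally built megenginelite:

from megenginelite import LiteIO, LiteIOType, LiteLayout, LiteNetworkIO

# explicit LiteIO objects
data_io = LiteIO("data", is_host=False)
out_io = LiteIO("out", is_host=True, layout=LiteLayout([1, 1000]))
io1 = LiteNetworkIO([data_io], [out_io])

# plain lists, each expanded as LiteIO(*item)
io2 = LiteNetworkIO(inputs=[["data", False]],
                    outputs=[["out", True, LiteIOType.LITE_IO_VALUE]])

# incremental, by name
io3 = LiteNetworkIO()
io3.add_input("data", is_host=False)
io3.add_output("out")

assert len(io1.inputs) == len(io2.inputs) == len(io3.inputs) == 1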
+ self._shapes = (c_size_t * MAX_DIM)(*shape) + self.ndim = len(shape) + def __repr__(self): data = { - "shapes": list(self.shapes)[0 : self.ndim], + "shapes": self.shapes, "ndim": self.ndim, "data_type": _lite_type_to_nptypes[LiteDataType(self.data_type)], } @@ -177,15 +201,20 @@ class LiteTensor(object): device_type=LiteDeviceType.LITE_CPU, device_id=0, is_pinned_host=False, + shapes=None, + dtype=None, ): """ - create a Tensor with layout, device, is_pinned_host param + create a Tensor with layout, device, is_pinned_host or shapes, dtype, + device_type, device_id, is_pinned_host param """ self._tensor = _Ctensor() - if layout: + self._layout = LiteLayout() + if layout is not None: self._layout = layout - else: - self._layout = LiteLayout() + elif shapes is not None: + shapes = list(shapes) + self._layout = LiteLayout(shapes, dtype) self._device_type = device_type self._device_id = device_id self._is_pinned_host = is_pinned_host @@ -222,9 +251,12 @@ class LiteTensor(object): @layout.setter def layout(self, layout): - assert isinstance(layout, LiteLayout) - self._layout = layout - self._api.LITE_set_tensor_layout(self._tensor, layout) + if isinstance(layout, LiteLayout): + self._layout = layout + elif isinstance(layout, list): + self._layout.shapes = layout + + self._api.LITE_set_tensor_layout(self._tensor, self._layout) @property def is_pinned_host(self): @@ -270,7 +302,6 @@ class LiteTensor(object): """ get the length of the meomry in byte """ - self.update() length = c_size_t() self._api.LITE_get_tensor_total_size_in_byte(self._tensor, byref(length)) return length.value @@ -336,7 +367,6 @@ class LiteTensor(object): """ get the memory of the tensor, return c_void_p of the tensor memory """ - self.update() mem = c_void_p() self._api.LITE_get_tensor_memory(self._tensor, byref(mem)) return mem @@ -347,7 +377,6 @@ class LiteTensor(object): param data: the data will shared to the tensor, it should be a numpy.ndarray or ctypes data """ - self.update() if isinstance(data, np.ndarray): assert ( self.is_continue @@ -356,8 +385,7 @@ class LiteTensor(object): self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU ), "set_data_by_share can only apply in cpu tensor or pinned tensor." - np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] - c_type = np.ctypeslib.as_ctypes_type(np_type) + c_type = _lite_dtypes_to_ctype[LiteDataType(self._layout.data_type)] if self.nbytes != data.nbytes: self.layout = LiteLayout(data.shape, ctype_to_lite_dtypes[c_type]) @@ -377,7 +405,6 @@ class LiteTensor(object): param data: the data to copy to tensor, it should be list, numpy.ndarraya or ctypes with length """ - self.update() if layout is not None: self.layout = layout @@ -386,8 +413,7 @@ class LiteTensor(object): self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU ), "set_data_by_copy can only apply in cpu tensor or pinned tensor." - np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] - c_type = np.ctypeslib.as_ctypes_type(np_type) + c_type = _lite_dtypes_to_ctype[LiteDataType(self._layout.data_type)] tensor_memory = c_void_p() @@ -415,6 +441,22 @@ class LiteTensor(object): self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) memmove(tensor_memory, data, data_length) + def get_data_by_share(self): + """ + get the data in the tensor, add share the data with a new numpy, and + return the numpy arrray, be careful, the data in numpy is valid before + the tensor memory is write again, such as LiteNetwok forward next time. 
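The shapes and dtype properties above make LiteLayout readable and resizable with plain lists, LiteTensor can be built straight from shapes plus a numpy dtype, and assigning a list to tensor.layout reshapes in place. A minimal sketch, again assuming a locally built megenginelite:

import numpy as np
from megenginelite import LiteDataType, LiteLayout, LiteTensor

layout = LiteLayout([2, 16], "int8")
assert layout.shapes == [2, 16]          # trimmed to ndim entries
assert layout.dtype == np.int8           # numpy view of data_type

layout.shapes = [4, 8, 8]                # setter also updates ndim
assert layout.ndim == 3

tensor = LiteTensor(shapes=[1, 3, 224], dtype=np.int8)   # no LiteLayout needed
assert tensor.layout.data_type == LiteDataType.LITE_INT8

tensor.layout = [8, 14]                  # a list reshapes, dtype is kept
assert tensor.layout.shapes == [8, 14]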
+ """ + assert self.is_continue, "get_data_by_share can only apply in continue tensor." + assert ( + self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU + ), "get_data_by_share can only apply in CPU tensor or cpu pinned tensor." + + memory = self.get_ctypes_memory() + c_type = _lite_dtypes_to_ctype[LiteDataType(self._layout.data_type)] + pnt = cast(memory, POINTER(c_type)) + return np.ctypeslib.as_array(pnt, self._layout.shapes) + def to_numpy(self): """ get the buffer of the tensor @@ -475,3 +517,13 @@ def LiteTensorConcat( ) result_tensor.update() return result_tensor + + +def lite_dtype_2_numpy(dtype): + """ + convert lite dtype to corresponding numpy dtype + """ + assert isinstance( + dtype, LiteDataType + ), "input must be LiteDataType when using lite_dtype_2_numpy." + return _lite_type_to_nptypes[dtype] diff --git a/lite/pylite/test/test_network.py b/lite/pylite/test/test_network.py index 6bb8c979..70d4aecf 100644 --- a/lite/pylite/test/test_network.py +++ b/lite/pylite/test/test_network.py @@ -21,6 +21,12 @@ def test_version(): print("Lite verson: {}".format(version)) +def test_config(): + config = LiteConfig() + config.bare_model_cryption_name = "nothing" + print(config) + + def test_network_io(): input_io1 = LiteIO("data1", is_host=False, io_type=LiteIOType.LITE_IO_VALUE) input_io2 = LiteIO( @@ -32,6 +38,7 @@ def test_network_io(): io = LiteNetworkIO() io.add_input(input_io1) io.add_input(input_io2) + io.add_input("data3", False) output_io1 = LiteIO("out1", is_host=False) output_io2 = LiteIO("out2", is_host=True, layout=LiteLayout([1, 1000])) @@ -39,7 +46,7 @@ def test_network_io(): io.add_output(output_io1) io.add_output(output_io2) - assert len(io.inputs) == 2 + assert len(io.inputs) == 3 assert len(io.outputs) == 2 assert io.inputs[0] == input_io1 @@ -47,9 +54,25 @@ def test_network_io(): c_io = io._create_network_io() - assert c_io.input_size == 2 + assert c_io.input_size == 3 assert c_io.output_size == 2 + ins = [["data1", True], ["data2", False, LiteIOType.LITE_IO_SHAPE]] + outs = [["out1", True], ["out2", False, LiteIOType.LITE_IO_VALUE]] + + io2 = LiteNetworkIO(ins, outs) + assert len(io2.inputs) == 2 + assert len(io2.outputs) == 2 + + io3 = LiteNetworkIO([input_io1, input_io2], [output_io1, output_io2]) + assert len(io3.inputs) == 2 + assert len(io3.outputs) == 2 + + test_io = LiteIO("test") + assert test_io.name == "test" + test_io.name = "test2" + assert test_io.name == "test2" + class TestShuffleNet(unittest.TestCase): source_dir = os.getenv("LITE_TEST_RESOURCE") @@ -319,9 +342,9 @@ class TestNetwork(TestShuffleNet): data = ios[key].to_numpy().flatten() input_data = self.input_data.flatten() assert data.size == input_data.size - assert io.name.decode("utf-8") == "data" + assert io.name == "data" for i in range(data.size): - assert data[i] == input_data[i] + assert abs(data[i] - input_data[i]) < 1e-5 return 0 network.set_start_callback(start_callback) @@ -343,7 +366,7 @@ class TestNetwork(TestShuffleNet): output_data = self.correct_data.flatten() assert data.size == output_data.size for i in range(data.size): - assert data[i] == output_data[i] + assert abs(data[i] - output_data[i]) < 1e-5 return 0 network.set_finish_callback(finish_callback) @@ -404,3 +427,27 @@ class TestNetwork(TestShuffleNet): binary_equal_between_batch=True, ) self.do_forward(network) + + def test_device_tensor_no_copy(self): + # construct LiteOption + net_config = LiteConfig() + net_config.options.force_output_use_user_specified_memory = True + + network = 
LiteNetwork(config=net_config) + network.load(self.model_path) + + input_tensor = network.get_io_tensor("data") + # fill input_data with device data + input_tensor.set_data_by_share(self.input_data) + + output_tensor = network.get_io_tensor(network.get_output_name(0)) + out_array = np.zeros(output_tensor.layout.shapes, output_tensor.layout.dtype) + + output_tensor.set_data_by_share(out_array) + + # inference + for i in range(2): + network.forward() + network.wait() + + self.check_correct(out_array) diff --git a/lite/pylite/test/test_tensor.py b/lite/pylite/test/test_tensor.py index 6232d7f8..86af29c3 100644 --- a/lite/pylite/test/test_tensor.py +++ b/lite/pylite/test/test_tensor.py @@ -54,6 +54,16 @@ def test_tensor_make(): tensor = LiteTensor(layout, device_id=1) assert tensor.device_id == 1 + tensor.layout = [8, 14] + assert tensor.layout.shapes[0] == 8 + assert tensor.layout.shapes[1] == 14 + assert tensor.layout.data_type == LiteDataType.LITE_FLOAT + + tensor_new = LiteTensor(shapes=[1, 3, 224], dtype=np.int8) + assert tensor_new.layout.shapes[1] == 3 + assert tensor_new.layout.shapes[2] == 224 + assert tensor_new.layout.data_type == LiteDataType.LITE_INT8 + def test_tensor_set_data(): layout = LiteLayout([2, 16], "int8") @@ -292,3 +302,24 @@ def test_tensor_concat(): for i in range(128): index = j * 128 + i assert real_data[index // 32][index % 32] == j + + +def test_tensor_get_memory_by_share(): + layout = LiteLayout([4, 32], "int16") + tensor = LiteTensor(layout) + assert tensor.nbytes == 4 * 32 * 2 + + arr = np.ones([4, 32], "int16") + for i in range(128): + arr[i // 32][i % 32] = i + tensor.set_data_by_copy(arr) + test_data = tensor.get_data_by_share() + real_data = tensor.to_numpy() + for i in range(128): + assert real_data[i // 32][i % 32] == test_data[i // 32][i % 32] + + arr[1][18] = 5 + arr[3][7] = 345 + tensor.set_data_by_copy(arr) + assert test_data[1][18] == 5 + assert test_data[3][7] == 345 diff --git a/lite/test/CMakeLists.txt b/lite/test/CMakeLists.txt index 1bf6c836..dce10aaf 100644 --- a/lite/test/CMakeLists.txt +++ b/lite/test/CMakeLists.txt @@ -1,27 +1,28 @@ -if (MGE_WITH_TEST) - file (GLOB_RECURSE SOURCES ./*.cpp main.cpp) - add_executable (lite_test ${SOURCES}) +if(MGE_WITH_TEST) + file(GLOB_RECURSE SOURCES ./*.cpp main.cpp) + add_executable(lite_test ${SOURCES}) - target_link_libraries(lite_test gtest) - target_link_libraries(lite_test lite_static) - if(LITE_BUILD_WITH_MGE) - # lite_test will depends megbrain interface - target_link_libraries(lite_test megbrain) - if (MGE_WITH_ROCM) - # FIXME: hip obj can not find cpp obj only through lite_static - target_link_libraries(lite_test megdnn) - endif () + target_link_libraries(lite_test gtest) + target_link_libraries(lite_test lite_static) + if(LITE_BUILD_WITH_MGE) + # lite_test will depends megbrain interface + target_link_libraries(lite_test megbrain) + if(MGE_WITH_ROCM) + # FIXME: hip obj can not find cpp obj only through lite_static + target_link_libraries(lite_test megdnn) endif() + endif() - if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(lite_test dl) - else() - target_link_libraries(lite_test dl rt) - endif() + if(UNIX) + if(APPLE OR ANDROID) + target_link_libraries(lite_test dl) + else() + target_link_libraries(lite_test dl rt) endif() + endif() - install (TARGETS lite_test - EXPORT ${LITE_EXPORT_TARGETS} - RUNTIME DESTINATION lite/bin) + install( + TARGETS lite_test + EXPORT ${LITE_EXPORT_TARGETS} + RUNTIME DESTINATION lite/bin) endif() diff --git a/scripts/whl/macos/macos_build_whl.sh 
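Two related additions above: get_data_by_share returns a numpy array that aliases the tensor's CPU memory (valid only until the network or a later copy writes that memory again), and force_output_use_user_specified_memory together with set_data_by_share lets inference write its output directly into a caller-provided array, as test_device_tensor_no_copy does. A condensed sketch; the model path and the "data" input name are placeholders borrowed from the shufflenet test, and lite_dtype_2_numpy is the new helper from tensor.py:

import numpy as np
from megenginelite import (LiteConfig, LiteDataType, LiteLayout, LiteNetwork,
                           LiteTensor, lite_dtype_2_numpy)

assert lite_dtype_2_numpy(LiteDataType.LITE_FLOAT) == np.float32

# zero-copy read: the returned array shares the tensor buffer
t = LiteTensor(LiteLayout([4, 32], "int16"))
t.set_data_by_copy(np.arange(128, dtype="int16").reshape(4, 32))
view = t.get_data_by_share()
assert view[3][31] == 127
t.set_data_by_copy(np.zeros((4, 32), dtype="int16"))
assert view[3][31] == 0                  # the view observes the later write

# zero-copy output inference (mirrors test_device_tensor_no_copy)
model_path = "path/to/model.mge"         # placeholder, supply your own model
config = LiteConfig()
config.options.force_output_use_user_specified_memory = True
net = LiteNetwork(config=config)
net.load(model_path)

inp = net.get_io_tensor("data")          # input name of the test model
inp.set_data_by_share(np.random.rand(*inp.layout.shapes).astype(inp.layout.dtype))

out = net.get_io_tensor(net.get_output_name(0))
out_array = np.zeros(out.layout.shapes, out.layout.dtype)
out.set_data_by_share(out_array)         # the network writes into out_array

net.forward()
net.wait()
# out_array now holds the result without an extra device-to-host copy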
b/scripts/whl/macos/macos_build_whl.sh index c411c67d..c20559a7 100755 --- a/scripts/whl/macos/macos_build_whl.sh +++ b/scripts/whl/macos/macos_build_whl.sh @@ -171,6 +171,7 @@ function do_build() { mkdir -p staging cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ + cp -a ${SRC_DIR}/src/custom/include staging/megengine/core/include/ cd ${BUILD_DIR}/staging/megengine/core rt_file=`ls _imperative_rt.*.so` echo "rt file is: ${rt_file}" diff --git a/scripts/whl/manylinux2014/do_build_common.sh b/scripts/whl/manylinux2014/do_build_common.sh index 2df0dbff..0f149724 100755 --- a/scripts/whl/manylinux2014/do_build_common.sh +++ b/scripts/whl/manylinux2014/do_build_common.sh @@ -151,6 +151,7 @@ do rm -rf staging mkdir -p staging cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ + cp -a ${SRC_DIR}/src/custom/include/megbrain staging/megengine/core/include cd ${BUILD_DIR}/staging/megengine/core mkdir -p lib/ucx diff --git a/scripts/whl/windows/windows_build_whl.sh b/scripts/whl/windows/windows_build_whl.sh index b3824fbc..d33cb5c5 100755 --- a/scripts/whl/windows/windows_build_whl.sh +++ b/scripts/whl/windows/windows_build_whl.sh @@ -77,11 +77,13 @@ CUBLAS_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublas6 CURAND_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/curand64_10.dll" CUBLASLT_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublasLt64_10.dll" CUDART_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cudart64_101.dll" -MGE_EXPORT_LIB="${SRC_DIR}/build_dir/host/build/src/megengine_shared.dll" +MGE_EXPORT_DLL="${SRC_DIR}/build_dir/host/build/src/megengine_shared.dll" +MGE_EXPORT_LIB="${SRC_DIR}/build_dir/host/build/src/megengine_shared.lib" function depend_real_copy() { REAL_DST=$1 echo "real copy lib to $1" + cp "${MGE_EXPORT_DLL}" ${REAL_DST} cp "${MGE_EXPORT_LIB}" ${REAL_DST} if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then @@ -190,6 +192,7 @@ function do_build() { rm -rf staging mkdir -p staging cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ + cp -a ${SRC_DIR}/src/custom/include/megbrain staging/megengine/core/include/ cd ${BUILD_DIR}/staging/megengine/core rt_file=`ls _imperative_rt.*.pyd` echo "rt file is: ${rt_file}" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 57664c74..807f44f1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,251 +1,288 @@ +# force define a SHARED target for whl, caused by when build for APPLE we will force set +# BUILD_SHARED_LIBS=OFF for xcode needed +set(MGE_SHARED_LIB megengine_shared) +set(MGE_SHARED_LIB + ${MGE_SHARED_LIB} + PARENT_SCOPE) + if(MGE_WITH_JIT_MLIR) - add_subdirectory(jit/include/megbrain/jit/mlir/ir) + add_subdirectory(jit/include/megbrain/jit/mlir/ir) endif() -file(GLOB_RECURSE SOURCES core/impl/*.cpp gopt/impl/*.cpp opr/impl/*.cpp opr/impl/nvof/*.cpp plugin/impl/*.cpp serialization/impl/*.cpp core/impl/*.inl gopt/impl/*.inl opr/impl/*.inl plugin/impl/*.inl serialization/impl/*.inl) - +file( + GLOB_RECURSE + SOURCES + core/impl/*.cpp + gopt/impl/*.cpp + opr/impl/*.cpp + opr/impl/nvof/*.cpp + plugin/impl/*.cpp + serialization/impl/*.cpp + core/impl/*.inl + gopt/impl/*.inl + opr/impl/*.inl + plugin/impl/*.inl + serialization/impl/*.inl) if(MGE_WITH_JIT) - file(GLOB_RECURSE SOURCES_ jit/impl/*.cpp jit/impl/*.inl) - if(MGE_WITH_JIT_MLIR) - file(GLOB_RECURSE MLIR_SOURCES_ jit/impl/mlir/ir/*.cpp 
jit/impl/mlir/*.cpp) - list(APPEND SOURCES_ ${MLIR_SOURCES_}) - endif() - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ jit/impl/*.cpp jit/impl/*.inl) + if(MGE_WITH_JIT_MLIR) + file(GLOB_RECURSE MLIR_SOURCES_ jit/impl/mlir/ir/*.cpp jit/impl/mlir/*.cpp) + list(APPEND SOURCES_ ${MLIR_SOURCES_}) + endif() + list(APPEND SOURCES ${SOURCES_}) endif() if(MGE_WITH_DISTRIBUTED) - file(GLOB_RECURSE SOURCES_ opr-mm/impl/*.cpp opr-mm/impl/*.inl) - list(APPEND SOURCES ${SOURCES_}) - file(GLOB_RECURSE PROTO_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "../src/opr-mm/proto/*.proto") - PROTOBUF_GENERATE_CPP_WITH_ROOT(GRPC_SRCS GRPC_HDRS ${CMAKE_CURRENT_SOURCE_DIR} ${PROTO_FILES}) - add_custom_target(mgb_proto_target DEPENDS ${GRPC_SRCS} ${GRPC_HDRS} ${PROTOBUF_PROTOC_EXECUTABLE}) - list(APPEND SOURCES ${GRPC_SRCS}) + file(GLOB_RECURSE SOURCES_ opr-mm/impl/*.cpp opr-mm/impl/*.inl) + list(APPEND SOURCES ${SOURCES_}) + file( + GLOB_RECURSE PROTO_FILES + RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "../src/opr-mm/proto/*.proto") + protobuf_generate_cpp_with_root(GRPC_SRCS GRPC_HDRS ${CMAKE_CURRENT_SOURCE_DIR} + ${PROTO_FILES}) + add_custom_target(mgb_proto_target DEPENDS ${GRPC_SRCS} ${GRPC_HDRS} + ${PROTOBUF_PROTOC_EXECUTABLE}) + list(APPEND SOURCES ${GRPC_SRCS}) endif() -set(MGB_INC ${PROJECT_BINARY_DIR}/genfiles ${CMAKE_CURRENT_LIST_DIR}/core/include ${CMAKE_CURRENT_LIST_DIR}/gopt/include ${CMAKE_CURRENT_LIST_DIR}/opr/include ${CMAKE_CURRENT_LIST_DIR}/plugin/include ${CMAKE_CURRENT_LIST_DIR}/serialization/include) +set(MGB_INC + ${PROJECT_BINARY_DIR}/genfiles + ${CMAKE_CURRENT_LIST_DIR}/core/include + ${CMAKE_CURRENT_LIST_DIR}/gopt/include + ${CMAKE_CURRENT_LIST_DIR}/opr/include + ${CMAKE_CURRENT_LIST_DIR}/plugin/include + ${CMAKE_CURRENT_LIST_DIR}/serialization/include) if(MGE_WITH_JIT) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/include) - if(MGE_WITH_CUDA) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/impl/cuda) - endif() + list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/include) + if(MGE_WITH_CUDA) + list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/impl/cuda) + endif() endif() if(MGE_WITH_DISTRIBUTED) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/opr-mm/include) + list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/opr-mm/include) endif() if(MGE_WITH_CUDA AND MGE_WITH_TRT) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/tensorrt/include) - file(GLOB_RECURSE SOURCES_ tensorrt/impl/*.cpp tensorrt/impl/*.inl) - list(APPEND SOURCES ${SOURCES_}) + list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/tensorrt/include) + file(GLOB_RECURSE SOURCES_ tensorrt/impl/*.cpp tensorrt/impl/*.inl) + list(APPEND SOURCES ${SOURCES_}) endif() if(MGE_WITH_CAMBRICON) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/cambricon/include) - file(GLOB_RECURSE SOURCES_ cambricon/impl/*.cpp cambricon/impl/*.inl) - list(APPEND SOURCES ${SOURCES_}) + list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/cambricon/include) + file(GLOB_RECURSE SOURCES_ cambricon/impl/*.cpp cambricon/impl/*.inl) + list(APPEND SOURCES ${SOURCES_}) endif() set(MGB_CAMBRICON ${MGE_WITH_CAMBRICON}) set(MGB_ATLAS ${MGE_WITH_ATLAS}) if(MGE_WITH_CUDA) - file(GLOB_RECURSE SOURCES_ opr/impl/standalone/*.cu) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ opr/impl/standalone/*.cu) + list(APPEND SOURCES ${SOURCES_}) endif() if(MGE_WITH_CUSTOM_OP) - list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/custom/include) - file(GLOB_RECURSE SOURCES_ custom/impl/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + list(APPEND MGB_INC 
${CMAKE_CURRENT_LIST_DIR}/custom/include) + file(GLOB_RECURSE SOURCES_ custom/impl/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() add_library(megbrain OBJECT ${SOURCES}) target_link_libraries(megbrain PUBLIC mgb_opr_param_defs) if(MGE_WITH_CUDA) - target_include_directories(megbrain PUBLIC ${TRT_INCLUDE_DIR}) - target_include_directories(megbrain PRIVATE ${CUDNN_INCLUDE_DIR}) - find_path(NVTX3_INCLUDE - NAMES nvToolsExtCudaRt.h - HINTS $ENV{CUDA_ROOT_DIR} $ENV{CUDA_PATH} $ENV{CUDA_BIN_PATH} - PATH_SUFFIXES include/nvtx3 - DOC "NVTX3_INCLUDE" ) - if(NVTX3_INCLUDE STREQUAL "NVTX3_INCLUDE-NOTFOUND") - message(FATAL_ERROR "Can not find NVTX3 INCLUDE, please export cuda sdk path to CUDA_ROOT_DIR or CUDA_PATH or CUDA_BIN_PATH") - endif() - target_include_directories(megbrain PRIVATE ${NVTX3_INCLUDE}) -endif() -target_include_directories(megbrain - PUBLIC $ - PRIVATE ${PROJECT_SOURCE_DIR}/third_party/midout/src -) - -foreach (INCPATH IN LISTS MGB_INC) - target_include_directories(megbrain - PUBLIC $ + target_include_directories(megbrain PUBLIC ${TRT_INCLUDE_DIR}) + target_include_directories(megbrain PRIVATE ${CUDNN_INCLUDE_DIR}) + find_path( + NVTX3_INCLUDE + NAMES nvToolsExtCudaRt.h + HINTS $ENV{CUDA_ROOT_DIR} $ENV{CUDA_PATH} $ENV{CUDA_BIN_PATH} + PATH_SUFFIXES include/nvtx3 + DOC "NVTX3_INCLUDE") + if(NVTX3_INCLUDE STREQUAL "NVTX3_INCLUDE-NOTFOUND") + message( + FATAL_ERROR + "Can not find NVTX3 INCLUDE, please export cuda sdk path to CUDA_ROOT_DIR or CUDA_PATH or CUDA_BIN_PATH" ) + endif() + target_include_directories(megbrain PRIVATE ${NVTX3_INCLUDE}) +endif() +target_include_directories( + megbrain + PUBLIC $ + PRIVATE ${PROJECT_SOURCE_DIR}/third_party/midout/src) + +foreach(INCPATH IN LISTS MGB_INC) + target_include_directories(megbrain PUBLIC $) endforeach() if(MGE_WITH_CUDA) - if(NOT WIN32 AND NOT MSVC) - target_compile_options(megbrain PRIVATE "$<$:-Xcompiler=-Wno-unused-parameter>" - "$<$>:-Wno-unused-parameter>") - endif() + if(NOT WIN32 AND NOT MSVC) + target_compile_options( + megbrain PRIVATE "$<$:-Xcompiler=-Wno-unused-parameter>" + "$<$>:-Wno-unused-parameter>") + endif() else() - target_compile_options(megbrain PRIVATE "-Wno-unused-parameter") + target_compile_options(megbrain PRIVATE "-Wno-unused-parameter") endif() if(CXX_SUPPORT_WCLASS_MEMACCESS) - if(MGE_WITH_CUDA) - target_compile_options(megbrain PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" - "$<$>:-Wno-class-memaccess>") - else() - target_compile_options(megbrain PRIVATE "-Wno-class-memaccess") - endif() + if(MGE_WITH_CUDA) + target_compile_options( + megbrain PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" + "$<$>:-Wno-class-memaccess>") + else() + target_compile_options(megbrain PRIVATE "-Wno-class-memaccess") + endif() endif() target_link_libraries(megbrain PUBLIC megdnn) if(MGE_WITH_DISTRIBUTED) - add_dependencies(megbrain mgb_proto_target) - target_link_libraries (megbrain PRIVATE libprotobuf libzmq) - set(CPPZMQ_INC ${PROJECT_SOURCE_DIR}/third_party/cppzmq) - # FIXME: add CMAKE_CURRENT_BINARY_DIR for including mm_handler.pb.h - target_include_directories(megbrain PRIVATE ${CPPZMQ_INC} ${CMAKE_CURRENT_BINARY_DIR}) - target_link_libraries (megbrain PRIVATE megray) + add_dependencies(megbrain mgb_proto_target) + target_link_libraries(megbrain PRIVATE libprotobuf libzmq) + set(CPPZMQ_INC ${PROJECT_SOURCE_DIR}/third_party/cppzmq) + # FIXME: add CMAKE_CURRENT_BINARY_DIR for including mm_handler.pb.h + target_include_directories(megbrain PRIVATE ${CPPZMQ_INC} ${CMAKE_CURRENT_BINARY_DIR}) + 
target_link_libraries(megbrain PRIVATE megray) endif() target_link_libraries(megbrain PUBLIC ${MGE_CAMBRICON_LIBS}) target_link_libraries(megbrain PUBLIC ${MGE_ATLAS_LIBS}) if(MGE_WITH_JIT AND MGE_WITH_HALIDE) - target_link_libraries(megbrain PRIVATE libhalide) - target_link_libraries(megbrain PRIVATE ${HALIDE_LLVM_LIBS}) + target_link_libraries(megbrain PRIVATE libhalide) + target_link_libraries(megbrain PRIVATE ${HALIDE_LLVM_LIBS}) endif() if(MGE_WITH_JIT_MLIR) - target_include_directories(megbrain PRIVATE ${MLIR_LLVM_INCLUDE_DIR}) - target_link_libraries(megbrain PRIVATE ${MLIR_LLVM_LIBS}) - add_dependencies(megbrain mgb_dialect) - target_include_directories(megbrain PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/jit/include) -endif() -if (MGB_WITH_FLATBUFFERS) - set (GEN_FLATBUFFERS_SCHEMA_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_schema.py) - set (OPR_PARAM_DEFS_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py) - set (MGB_PARAM_DEFS_PY ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py) - file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl) - add_custom_command( - OUTPUT + target_include_directories(megbrain PRIVATE ${MLIR_LLVM_INCLUDE_DIR}) + target_link_libraries(megbrain PRIVATE ${MLIR_LLVM_LIBS}) + add_dependencies(megbrain mgb_dialect) + target_include_directories(megbrain PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/jit/include) +endif() +if(MGB_WITH_FLATBUFFERS) + set(GEN_FLATBUFFERS_SCHEMA_PY + ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_schema.py) + set(OPR_PARAM_DEFS_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py) + set(MGB_PARAM_DEFS_PY ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py) + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs + COMMAND ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs - COMMAND - ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs - DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} - VERBATIM - ) - add_custom_command( - OUTPUT + DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} + VERBATIM) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/mgb_opr_param_defs.fbs + COMMAND ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${MGB_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/mgb_opr_param_defs.fbs - COMMAND - ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${MGB_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/mgb_opr_param_defs.fbs - DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${MGB_PARAM_DEFS_PY} - VERBATIM - ) - list(APPEND FLATBUFFERS_SCHEMA_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl/dtype.fbs - ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs - ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/mgb_opr_param_defs.fbs - ${CMAKE_CURRENT_SOURCE_DIR}/opr/impl/mgb_cpp_opr.fbs - ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl/schema.fbs - ) - list(APPEND FLATBUFFERS_SCHEMA_INCLUDE_DIR - ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl - ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl - ${CMAKE_CURRENT_SOURCE_DIR}/opr/impl - ) - build_flatbuffers( - "${FLATBUFFERS_SCHEMA_FILES}" - "${FLATBUFFERS_SCHEMA_INCLUDE_DIR}" - mgb_serialization_schema_fbs - "${FLATBUFFERS_SCHEMA_FILES}" - 
"${CMAKE_CURRENT_BINARY_DIR}/serialization/include/megbrain/serialization/internal" - "" - "" - ) - add_dependencies(megbrain mgb_serialization_schema_fbs) - target_include_directories(megbrain PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/serialization/include) - target_compile_definitions(megbrain PUBLIC MGB_ENABLE_FBS_SERIALIZATION=1) - target_link_libraries(megbrain PUBLIC flatbuffers) - set (GENERATED_FLATBUFFERS_CONVERTER_PATH ${CMAKE_CURRENT_BINARY_DIR}/genfiles) - set (GEN_FLATBUFFERS_CONVERTER_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_converter.py) - file (MAKE_DIRECTORY ${GENERATED_FLATBUFFERS_CONVERTER_PATH}) - add_custom_command( - OUTPUT + DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${MGB_PARAM_DEFS_PY} + VERBATIM) + list( + APPEND + FLATBUFFERS_SCHEMA_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl/dtype.fbs + ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs + ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/mgb_opr_param_defs.fbs + ${CMAKE_CURRENT_SOURCE_DIR}/opr/impl/mgb_cpp_opr.fbs + ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl/schema.fbs) + list( + APPEND FLATBUFFERS_SCHEMA_INCLUDE_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/serialization/impl + ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl ${CMAKE_CURRENT_SOURCE_DIR}/opr/impl) + build_flatbuffers( + "${FLATBUFFERS_SCHEMA_FILES}" + "${FLATBUFFERS_SCHEMA_INCLUDE_DIR}" + mgb_serialization_schema_fbs + "${FLATBUFFERS_SCHEMA_FILES}" + "${CMAKE_CURRENT_BINARY_DIR}/serialization/include/megbrain/serialization/internal" + "" + "") + add_dependencies(megbrain mgb_serialization_schema_fbs) + target_include_directories(megbrain + PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/serialization/include) + target_compile_definitions(megbrain PUBLIC MGB_ENABLE_FBS_SERIALIZATION=1) + target_link_libraries(megbrain PUBLIC flatbuffers) + set(GENERATED_FLATBUFFERS_CONVERTER_PATH ${CMAKE_CURRENT_BINARY_DIR}/genfiles) + set(GEN_FLATBUFFERS_CONVERTER_PY + ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_converter.py) + file(MAKE_DIRECTORY ${GENERATED_FLATBUFFERS_CONVERTER_PATH}) + add_custom_command( + OUTPUT ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/opr_param_defs_converter.inl + COMMAND ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_CONVERTER_PY} ${OPR_PARAM_DEFS_PY} ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/opr_param_defs_converter.inl - COMMAND - ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_CONVERTER_PY} ${OPR_PARAM_DEFS_PY} ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/opr_param_defs_converter.inl - DEPENDS ${GEN_FLATBUFFERS_CONVERTER_PY} ${OPR_PARAM_DEFS_PY} - VERBATIM - ) - add_custom_command( - OUTPUT + DEPENDS ${GEN_FLATBUFFERS_CONVERTER_PY} ${OPR_PARAM_DEFS_PY} + VERBATIM) + add_custom_command( + OUTPUT ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/mgb_opr_param_defs_converter.inl + COMMAND ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_CONVERTER_PY} ${MGB_PARAM_DEFS_PY} ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/mgb_opr_param_defs_converter.inl - COMMAND - ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_CONVERTER_PY} ${MGB_PARAM_DEFS_PY} ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/mgb_opr_param_defs_converter.inl - DEPENDS ${GEN_FLATBUFFERS_CONVERTER_PY} ${MGB_PARAM_DEFS_PY} - VERBATIM - ) - target_sources(megbrain PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/opr_param_defs_converter.inl) - target_sources(megbrain PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/mgb_opr_param_defs_converter.inl) - target_include_directories(megbrain PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}) + DEPENDS ${GEN_FLATBUFFERS_CONVERTER_PY} ${MGB_PARAM_DEFS_PY} + VERBATIM) + target_sources( + megbrain + 
PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/opr_param_defs_converter.inl) + target_sources( + megbrain + PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}/mgb_opr_param_defs_converter.inl) + target_include_directories(megbrain PRIVATE ${GENERATED_FLATBUFFERS_CONVERTER_PATH}) endif() -if(UNIX AND NOT ANDROID AND NOT APPLE) - target_link_libraries(megbrain PUBLIC dl rt atomic) +if(UNIX + AND NOT ANDROID + AND NOT APPLE) + target_link_libraries(megbrain PUBLIC dl rt atomic) endif() if(ANDROID) - target_link_libraries(megbrain PUBLIC log) + target_link_libraries(megbrain PUBLIC log) endif() -set (_VER_FILE ${PROJECT_SOURCE_DIR}/src/version.ld) +set(_VER_FILE ${PROJECT_SOURCE_DIR}/src/version.ld) # Build as SHARED or STATIC depending on BUILD_SHARED_LIBS=ON/OFF add_library(megengine) -# force define a SHARED target for whl, caused by when build for APPLE -# we will force set BUILD_SHARED_LIBS=OFF for xcode needed -add_library(megengine_shared SHARED) +add_library(${MGE_SHARED_LIB} SHARED) target_link_libraries(megengine PRIVATE ${MGE_CUDA_LIBS}) target_link_libraries(megengine PUBLIC megbrain megdnn) -target_link_libraries(megengine_shared PUBLIC megbrain megdnn) -target_link_libraries(megengine_shared PRIVATE ${MGE_CUDA_LIBS}) -if (UNIX AND NOT APPLE) - target_link_options(megengine PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE}) - set_target_properties(megengine PROPERTIES LINK_DEPENDS ${_VER_FILE}) - target_link_options(megengine_shared PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE}) - set_target_properties(megengine_shared PROPERTIES LINK_DEPENDS ${_VER_FILE}) +target_link_libraries(${MGE_SHARED_LIB} PUBLIC megbrain megdnn) +target_link_libraries(${MGE_SHARED_LIB} PRIVATE ${MGE_CUDA_LIBS}) +if(UNIX AND NOT APPLE) + target_link_options(megengine PRIVATE -Wl,--no-undefined + -Wl,--version-script=${_VER_FILE}) + set_target_properties(megengine PROPERTIES LINK_DEPENDS ${_VER_FILE}) + target_link_options(${MGE_SHARED_LIB} PRIVATE -Wl,--no-undefined + -Wl,--version-script=${_VER_FILE}) + set_target_properties(${MGE_SHARED_LIB} PROPERTIES LINK_DEPENDS ${_VER_FILE}) endif() if(WIN32 OR MSVC) - target_compile_definitions(megbrain PRIVATE MGE_DLL_EXPORT) - target_compile_definitions(megdnn PRIVATE MGE_DLL_EXPORT) - target_compile_definitions(megengine PRIVATE MGE_DLL_EXPORT) - target_compile_definitions(megengine_shared PRIVATE MGE_DLL_EXPORT) - # please do not use WINDOWS_EXPORT_ALL_SYMBOLS, as symbols max than 65535 when build with CUDA - #set_target_properties(megengine PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) - #set_target_properties(megengine_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) -endif() -if (MGE_WITH_DISTRIBUTED) - message(VERBOSE "megengine configured to link megray") - target_link_libraries(megengine PUBLIC megray) - target_link_libraries(megengine_shared PUBLIC megray) -endif() -# Do not export targets if MGE_WITH_DISTRIBUTED is on. MegRay is not ready -# for this. 
-install(TARGETS megengine - EXPORT ${MGE_EXPORT_TARGETS} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) - -if (NOT MGE_WITH_DISTRIBUTED) - install(TARGETS megbrain - EXPORT ${MGE_EXPORT_TARGETS} - ) + target_compile_definitions(megbrain PRIVATE MGE_DLL_EXPORT) + target_compile_definitions(megdnn PRIVATE MGE_DLL_EXPORT) + target_compile_definitions(megengine PRIVATE MGE_DLL_EXPORT) + target_compile_definitions(${MGE_SHARED_LIB} PRIVATE MGE_DLL_EXPORT) + # please do not use WINDOWS_EXPORT_ALL_SYMBOLS, as symbols max than 65535 when build + # with CUDA set_target_properties(megengine PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS + # TRUE) set_target_properties(${MGE_SHARED_LIB} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS + # TRUE) +endif() +if(MGE_WITH_DISTRIBUTED) + message(VERBOSE "megengine configured to link megray") + target_link_libraries(megengine PUBLIC megray) + target_link_libraries(${MGE_SHARED_LIB} PUBLIC megray) +endif() +# Do not export targets if MGE_WITH_DISTRIBUTED is on. MegRay is not ready for this. +install( + TARGETS megengine + EXPORT ${MGE_EXPORT_TARGETS} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +if(NOT MGE_WITH_DISTRIBUTED) + install(TARGETS megbrain EXPORT ${MGE_EXPORT_TARGETS}) endif() foreach(_PATH ${MGB_INC}) - install(DIRECTORY ${_PATH}/megbrain DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h") + install( + DIRECTORY ${_PATH}/megbrain + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING + PATTERN "*.h") endforeach() diff --git a/src/core/impl/graph/cg_impl_seq.cpp b/src/core/impl/graph/cg_impl_seq.cpp index 40a6ef01..4a1c731d 100644 --- a/src/core/impl/graph/cg_impl_seq.cpp +++ b/src/core/impl/graph/cg_impl_seq.cpp @@ -539,7 +539,8 @@ void ComputingGraphImpl::ComputingSequence::do_regist() const { auto& mc = mp.chunk(); if (mp.valid() && mc.mem_alloc_status.is_from_owner_var()) { auto size = mgb::get_aligned_power2( - mc.size(), j->comp_node().get_mem_addr_alignment()); + mp.layout().span().dist_byte(), + j->comp_node().get_mem_addr_alignment()); recorder.regist_memory_chunk( {chunk_id++, size, 0, this->m_opr_seq->size(), diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h index 56a281f7..787054a9 100644 --- a/src/core/include/megbrain/comp_node.h +++ b/src/core/include/megbrain/comp_node.h @@ -577,7 +577,7 @@ protected: virtual size_t get_max_reserved_memory() { return 0; } virtual size_t get_max_used_memory() { return 0; } virtual size_t get_max_block_size_available() { return 0; } - virtual size_t get_free_mem() { return 0; } + virtual size_t get_free_mem() { return get_mem_status_bytes().second; } virtual void reset_max_reserved_memory() {} virtual void reset_max_used_memory() {} #endif diff --git a/src/core/include/megbrain/graph/operator_node.h b/src/core/include/megbrain/graph/operator_node.h index 67b9c81b..df410da8 100644 --- a/src/core/include/megbrain/graph/operator_node.h +++ b/src/core/include/megbrain/graph/operator_node.h @@ -1013,13 +1013,13 @@ using OprNodeArray = SmallVector; * * Note that opening brace is included */ -#define MGB_DEFINE_OPR_CLASS(_name, _base, ...) \ - MGB_DEFINE_CLS_WITH_SUPER(_name final, _base, ##__VA_ARGS__) \ - MGB_DYN_TYPE_OBJ_FINAL_DECL; +#define MGB_DEFINE_OPR_CLASS(_name, _base, ...) \ + MGB_DEFINE_CLS_WITH_SUPER(_name final, _base, ##__VA_ARGS__) \ + MGB_DYN_TYPE_OBJ_FINAL_DECL; -#define MGB_DEFINE_OPR_CLASS_WITH_EXPORT(_name, _base, ...) 
\ - MGB_DEFINE_CLS_WITH_SUPER(_name final, _base, ##__VA_ARGS__) \ - MGB_DYN_TYPE_OBJ_FINAL_DECL_WITH_EXPORT; +#define MGB_DEFINE_OPR_CLASS_WITH_EXPORT(_name, _base, ...) \ + MGB_DEFINE_CLS_WITH_SUPER(_name final, _base, ##__VA_ARGS__) \ + MGB_DYN_TYPE_OBJ_FINAL_DECL_WITH_EXPORT; } // namespace cg } // namespace mgb diff --git a/src/core/include/megbrain/ir/ops.td b/src/core/include/megbrain/ir/ops.td index 233c99f3..30c795ad 100644 --- a/src/core/include/megbrain/ir/ops.td +++ b/src/core/include/megbrain/ir/ops.td @@ -431,4 +431,21 @@ def Padding: MgbHashableOp<"Padding", [PaddingParam]>; def LRN: MgbHashableOp<"LRN", [LRNParam]>; +def LayerNorm: MgbHashableOp<"LayerNorm", [LayerNormParam]>; + +def Dropout: MgbHashableOp<"Dropout", [DropoutParam]> { + let extraArguments = (ins + MgbSizeTAddr:$handle + ); + let hashFunction = [{ + return mgb::hash_pair_combine( + mgb::hash($_self.dyn_typeinfo()), + mgb::hash_pair_combine( + mgb::hash($_self.drop_prob), + mgb::hash($_self.handle)) + ); + }]; + let cmpFunction = [{return $0.handle == $1.handle && $0.drop_prob == $1.drop_prob;}]; + +} #endif // MGB_OPS diff --git a/src/core/include/megbrain/utils/metahelper.h b/src/core/include/megbrain/utils/metahelper.h index ae566f02..9f886f1a 100644 --- a/src/core/include/megbrain/utils/metahelper.h +++ b/src/core/include/megbrain/utils/metahelper.h @@ -495,18 +495,18 @@ private: } // namespace mgb -#define _MGB_DEFINE_CLS_WITH_SUPER_IMPL(_tpl, _name, _base, ...) \ - class _name : public _base, ##__VA_ARGS__ { \ - public: \ - using Super = _tpl _base; \ - \ +#define MGB_DEFINE_CLS_WITH_SUPER_IMPL(_tpl, _name, _base, ...) \ + class _name : public _base, ##__VA_ARGS__ { \ + public: \ + using Super = _tpl _base; \ + \ private: /*! * \brief define a class which has Super defined to base */ #define MGB_DEFINE_CLS_WITH_SUPER(_name, _base, ...) \ - _MGB_DEFINE_CLS_WITH_SUPER_IMPL(, _name, _base, ##__VA_ARGS__) + MGB_DEFINE_CLS_WITH_SUPER_IMPL(, _name, _base, ##__VA_ARGS__) /*! * \brief define a class which has Super defined to base @@ -514,5 +514,5 @@ private: * Used when this class is a template and base class has template */ #define MGB_DEFINE_CLS_WITH_SUPER_TPL(_name, _base, ...) 
\ - _MGB_DEFINE_CLS_WITH_SUPER_IMPL(typename, _name, _base, ##__VA_ARGS__) + MGB_DEFINE_CLS_WITH_SUPER_IMPL(typename, _name, _base, ##__VA_ARGS__) // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/custom/impl/manager.cpp b/src/custom/impl/manager.cpp index 3de0986f..39419d7e 100644 --- a/src/custom/impl/manager.cpp +++ b/src/custom/impl/manager.cpp @@ -18,12 +18,31 @@ #ifndef _WIN32 #include +#else +#include #endif using namespace mgb; namespace custom { +#ifdef _WIN32 +#define RTLD_LAZY 0 + +void* dlopen(const char* file, int) { + return static_cast(LoadLibrary(file)); +} + +int dlclose(void* handle) { + return static_cast(FreeLibrary(static_cast(handle))); +} + +const char* dlerror(void) { + static char win_err_info[] = "no dlerror info in windows"; + return win_err_info; +} +#endif + CustomOpManager* CustomOpManager::inst(void) { static CustomOpManager op_manager; return &op_manager; @@ -127,7 +146,6 @@ std::vector CustomOpManager::op_id_list(void) { return ret; } -#ifndef _WIN32 CustomLib::CustomLib(const std::string& path, int mode = RTLD_LAZY) : m_handle(nullptr, [](void* handle) { dlclose(handle); }) { auto op_list_before_load = CustomOpManager::inst()->op_name_list(); @@ -146,12 +164,6 @@ CustomLib::CustomLib(const std::string& path, int mode = RTLD_LAZY) } } } -#else -CustomLib::CustomLib(const std::string& path, int mode = 0) - : m_handle(nullptr, [](void* handle) {}) { - mgb_assert(false, "custom op is only supported on Linux now"); -} -#endif const std::vector& CustomLib::ops_in_lib(void) const { return m_ops; diff --git a/src/custom/include/megbrain/custom/custom.h b/src/custom/include/megbrain/custom/custom.h index e6751f25..726076a7 100644 --- a/src/custom/include/megbrain/custom/custom.h +++ b/src/custom/include/megbrain/custom/custom.h @@ -16,7 +16,8 @@ #include "tensor.h" namespace custom { -std::shared_ptr op_insert(std::string opname, uint32_t version); +MGE_WIN_DECLSPEC_FUC std::shared_ptr op_insert( + std::string opname, uint32_t version); } #define CUSTOM_OP_REG(OpName) \ diff --git a/src/custom/include/megbrain/custom/op.h b/src/custom/include/megbrain/custom/op.h index 2646ce56..b1afc801 100644 --- a/src/custom/include/megbrain/custom/op.h +++ b/src/custom/include/megbrain/custom/op.h @@ -32,27 +32,26 @@ namespace custom { using RunTimeId = uint64_t; -class ArgInfo { +class MGE_WIN_DECLSPEC_FUC ArgInfo { CUSTOM_PIMPL_CLS_DECL(ArgInfo); - MGE_WIN_DECLSPEC_FUC ArgInfo( - const std::string& name, const std::string& desc, + ArgInfo(const std::string& name, const std::string& desc, const std::unordered_set& dtypes, const int& ndim, const std::string& mem_stgy); - MGE_WIN_DECLSPEC_FUC const std::string& name(void) const; - MGE_WIN_DECLSPEC_FUC const std::string& desc(void) const; - MGE_WIN_DECLSPEC_FUC const std::unordered_set& dtypes(void) const; - MGE_WIN_DECLSPEC_FUC int ndim(void) const; - MGE_WIN_DECLSPEC_FUC const std::string& mem_strategy(void) const; + const std::string& name(void) const; + const std::string& desc(void) const; + const std::unordered_set& dtypes(void) const; + int ndim(void) const; + const std::string& mem_strategy(void) const; - MGE_WIN_DECLSPEC_FUC std::string str() const; + std::string str() const; }; -class CustomOp { +class MGE_WIN_DECLSPEC_FUC CustomOp { std::unique_ptr m_impl; public: - MGE_WIN_DECLSPEC_FUC CustomOp(const std::string& op_type, uint32_t version); + CustomOp(const std::string& op_type, uint32_t version); PREVENT_COPY_AND_ASSIGN(CustomOp); using DeviceInferFuncPtr = @@ -71,70 +70,65 @@ public: 
void (*)(const std::vector&, const Param&, std::vector&); // write for forward - MGE_WIN_DECLSPEC_FUC CustomOp& set_device_infer(DeviceInferFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_shape_infer(ShapeInferFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_dtype_infer(DTypeInferFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_format_infer(FormatInferFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_preprocess(PreprocessFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_preprocess( - const std::string& device, PreprocessFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_postprocess(PostprocessFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_postprocess( - const std::string& device, PostprocessFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_compute(ComputeFuncPtr func); - MGE_WIN_DECLSPEC_FUC CustomOp& set_compute( - const std::string& device, ComputeFuncPtr func); - - MGE_WIN_DECLSPEC_FUC CustomOp& set_description(const std::string& op_desc); - MGE_WIN_DECLSPEC_FUC CustomOp& add_input( + CustomOp& set_device_infer(DeviceInferFuncPtr func); + CustomOp& set_shape_infer(ShapeInferFuncPtr func); + CustomOp& set_dtype_infer(DTypeInferFuncPtr func); + CustomOp& set_format_infer(FormatInferFuncPtr func); + CustomOp& set_preprocess(PreprocessFuncPtr func); + CustomOp& set_preprocess(const std::string& device, PreprocessFuncPtr func); + CustomOp& set_postprocess(PostprocessFuncPtr func); + CustomOp& set_postprocess(const std::string& device, PostprocessFuncPtr func); + CustomOp& set_compute(ComputeFuncPtr func); + CustomOp& set_compute(const std::string& device, ComputeFuncPtr func); + + CustomOp& set_description(const std::string& op_desc); + CustomOp& add_input( const std::string& name, const std::string& desc, const std::initializer_list& legal_dtypes = {"float32"}, int dims = -1, const std::string& mem_stgy = "default"); - MGE_WIN_DECLSPEC_FUC CustomOp& add_output( + CustomOp& add_output( const std::string& name, const std::string& desc, const std::initializer_list& legal_dtypes = {"float32"}, int dims = -1, const std::string& mem_stgy = "default"); - MGE_WIN_DECLSPEC_FUC CustomOp& add_input( + CustomOp& add_input( const std::string& name, const std::initializer_list& legal_dtypes = {"float32"}, int dims = -1, const std::string& mem_stgy = "default"); - MGE_WIN_DECLSPEC_FUC CustomOp& add_output( + CustomOp& add_output( const std::string& name, const std::initializer_list& legal_dtypes = {"float32"}, int dims = -1, const std::string& mem_stgy = "default"); - MGE_WIN_DECLSPEC_FUC CustomOp& add_inputs(const size_t& input_num); - MGE_WIN_DECLSPEC_FUC CustomOp& add_outputs(const size_t& output_num); - MGE_WIN_DECLSPEC_FUC CustomOp& add_param( - const std::string& name, const ParamVal& default_val); - MGE_WIN_DECLSPEC_FUC CustomOp& add_param( + CustomOp& add_inputs(const size_t& input_num); + CustomOp& add_outputs(const size_t& output_num); + CustomOp& add_param(const std::string& name, const ParamVal& default_val); + CustomOp& add_param( const std::string& name, const std::string& desc, const ParamVal& default_val); // read - MGE_WIN_DECLSPEC_FUC std::string op_type(void) const; - MGE_WIN_DECLSPEC_FUC std::string op_desc(void) const; - MGE_WIN_DECLSPEC_FUC RunTimeId runtime_id(void) const; - MGE_WIN_DECLSPEC_FUC size_t input_num(void) const; - MGE_WIN_DECLSPEC_FUC size_t output_num(void) const; - MGE_WIN_DECLSPEC_FUC std::string str(void) const; - - MGE_WIN_DECLSPEC_FUC const ParamInfo& param_info(void) const; - MGE_WIN_DECLSPEC_FUC ArgInfo input_info(size_t idx) const; - 
MGE_WIN_DECLSPEC_FUC ArgInfo output_info(size_t idx) const; - MGE_WIN_DECLSPEC_FUC const std::vector& inputs_info(void) const; - MGE_WIN_DECLSPEC_FUC const std::vector& outputs_info(void) const; + std::string op_type(void) const; + std::string op_desc(void) const; + RunTimeId runtime_id(void) const; + size_t input_num(void) const; + size_t output_num(void) const; + std::string str(void) const; + + const ParamInfo& param_info(void) const; + ArgInfo input_info(size_t idx) const; + ArgInfo output_info(size_t idx) const; + const std::vector& inputs_info(void) const; + const std::vector& outputs_info(void) const; // use - MGE_WIN_DECLSPEC_FUC std::vector infer_output_device( + std::vector infer_output_device( const std::vector&, const Param&) const; - MGE_WIN_DECLSPEC_FUC std::vector infer_output_shape( + std::vector infer_output_shape( const std::vector&, const Param&) const; - MGE_WIN_DECLSPEC_FUC std::vector infer_output_dtype( + std::vector infer_output_dtype( const std::vector&, const Param&) const; - MGE_WIN_DECLSPEC_FUC std::vector infer_output_format( + std::vector infer_output_format( const std::vector&, const Param&) const; - MGE_WIN_DECLSPEC_FUC void compute( - const std::vector&, const Param&, std::vector&) const; + void compute(const std::vector&, const Param&, std::vector&) const; }; } // namespace custom diff --git a/src/custom/include/megbrain/custom/param.h b/src/custom/include/megbrain/custom/param.h index d895d913..f90a2674 100644 --- a/src/custom/include/megbrain/custom/param.h +++ b/src/custom/include/megbrain/custom/param.h @@ -23,7 +23,7 @@ class ParamInfoImpl; class ParamImpl; // Schema of a param element -class ParamSchema { +class MGE_WIN_DECLSPEC_FUC ParamSchema { CUSTOM_PIMPL_CLS_DECL(ParamSchema); ParamSchema( const std::string& name, const ParamVal& value, @@ -36,7 +36,7 @@ class ParamSchema { std::string str(void) const; }; -class ParamInfo { +class MGE_WIN_DECLSPEC_FUC ParamInfo { CUSTOM_PIMPL_CLS_DECL(ParamInfo); void set_tag(const std::string&); @@ -46,16 +46,16 @@ class ParamInfo { const std::vector& meta(void) const; }; -class Param { +class MGE_WIN_DECLSPEC_FUC Param { CUSTOM_PIMPL_CLS_DECL(Param); - MGE_WIN_DECLSPEC_FUC Param(const ParamInfo&); - MGE_WIN_DECLSPEC_FUC ParamVal& operator[](const std::string&); - MGE_WIN_DECLSPEC_FUC const ParamVal& operator[](const std::string&) const; - MGE_WIN_DECLSPEC_FUC const std::unordered_map& raw() const; - MGE_WIN_DECLSPEC_FUC bool exist(const std::string& name) const; - MGE_WIN_DECLSPEC_FUC std::string to_bytes(void) const; - MGE_WIN_DECLSPEC_FUC void from_bytes(const std::string&); + Param(const ParamInfo&); + ParamVal& operator[](const std::string&); + const ParamVal& operator[](const std::string&) const; + const std::unordered_map& raw() const; + bool exist(const std::string& name) const; + std::string to_bytes(void) const; + void from_bytes(const std::string&); }; MGE_WIN_DECLSPEC_FUC bool operator==(const Param&, const Param&); diff --git a/src/custom/include/megbrain/custom/param_val.h b/src/custom/include/megbrain/custom/param_val.h index 31b2a4b6..d7f3b521 100644 --- a/src/custom/include/megbrain/custom/param_val.h +++ b/src/custom/include/megbrain/custom/param_val.h @@ -169,21 +169,21 @@ std::string vec2str(const std::vector& vec) { * Con1: user need to set the type explicitly when class template instantiation * Con2: ParamVal can not be assigned to ParamVal */ -class ParamVal { +class MGE_WIN_DECLSPEC_FUC ParamVal { std::unique_ptr m_ptr; ParamDynType m_type; public: template - MGE_WIN_DECLSPEC_FUC 
ParamVal(const T& val); + ParamVal(const T& val); template - MGE_WIN_DECLSPEC_FUC ParamVal(const std::initializer_list& val); + ParamVal(const std::initializer_list& val); - MGE_WIN_DECLSPEC_FUC ParamVal(); - MGE_WIN_DECLSPEC_FUC ParamVal(const char* str); - MGE_WIN_DECLSPEC_FUC ParamVal(const std::initializer_list& strs); - MGE_WIN_DECLSPEC_FUC ParamVal(const std::vector& strs); - MGE_WIN_DECLSPEC_FUC ParamVal(const ParamVal& rhs); + ParamVal(); + ParamVal(const char* str); + ParamVal(const std::initializer_list& strs); + ParamVal(const std::vector& strs); + ParamVal(const ParamVal& rhs); template ParamVal& operator=(const T& rhs); @@ -196,30 +196,39 @@ public: ParamVal& operator=(const ParamVal& rhs); template - MGE_WIN_DECLSPEC_FUC const T& as(void) const; + const T& as(void) const; template - MGE_WIN_DECLSPEC_FUC T& as(void); - - MGE_WIN_DECLSPEC_FUC const void* raw_ptr(void) const; - MGE_WIN_DECLSPEC_FUC void* raw_ptr(void); - MGE_WIN_DECLSPEC_FUC ParamDynType type(void) const; - MGE_WIN_DECLSPEC_FUC std::string str(void) const; - MGE_WIN_DECLSPEC_FUC size_t size(void) const; - - MGE_WIN_DECLSPEC_FUC static std::string to_bytes(const ParamVal& value); - MGE_WIN_DECLSPEC_FUC static ParamVal from_bytes( - const std::string& bytes, size_t& offset); - - friend ParamVal operator+(const ParamVal& lhs, const ParamVal& rhs); - friend ParamVal operator-(const ParamVal& lhs, const ParamVal& rhs); - friend ParamVal operator*(const ParamVal& lhs, const ParamVal& rhs); - friend ParamVal operator/(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator==(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator!=(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator>(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator<(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator>=(const ParamVal& lhs, const ParamVal& rhs); - friend bool operator<=(const ParamVal& lhs, const ParamVal& rhs); + T& as(void); + + const void* raw_ptr(void) const; + void* raw_ptr(void); + ParamDynType type(void) const; + std::string str(void) const; + size_t size(void) const; + + static std::string to_bytes(const ParamVal& value); + static ParamVal from_bytes(const std::string& bytes, size_t& offset); + + MGE_WIN_DECLSPEC_FUC friend ParamVal operator+( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend ParamVal operator-( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend ParamVal operator*( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend ParamVal operator/( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator!=( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator>( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator<( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator>=( + const ParamVal& lhs, const ParamVal& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator<=( + const ParamVal& lhs, const ParamVal& rhs); }; ParamVal operator+(const ParamVal& lhs, const ParamVal& rhs); diff --git a/src/custom/include/megbrain/custom/tensor.h b/src/custom/include/megbrain/custom/tensor.h index a1dd9ba5..53c54dc8 100644 --- a/src/custom/include/megbrain/custom/tensor.h +++ b/src/custom/include/megbrain/custom/tensor.h @@ -30,9 +30,9 @@ namespace custom { #define 
CUSTOM_DEVICE_TYPE_ENUM_DECL(custom_type, builtin_type, builtin_str) \ custom_type, -class Device { - MGE_WIN_DECLSPEC_FUC const void* impl() const; - MGE_WIN_DECLSPEC_FUC Device(const void* impl); +class MGE_WIN_DECLSPEC_FUC Device { + const void* impl() const; + Device(const void* impl); CUSTOM_PIMPL_CLS_DECL(Device); public: @@ -40,19 +40,19 @@ public: CUSTOM_FOR_EACH_DEVICE_TYPE(CUSTOM_DEVICE_TYPE_ENUM_DECL) }; - MGE_WIN_DECLSPEC_FUC Device(const std::string& device); - MGE_WIN_DECLSPEC_FUC Device(const char* device); - MGE_WIN_DECLSPEC_FUC Device(DeviceEnum device); + Device(const std::string& device); + Device(const char* device); + Device(DeviceEnum device); - MGE_WIN_DECLSPEC_FUC std::string str(void) const; - MGE_WIN_DECLSPEC_FUC DeviceEnum enumv(void) const; + std::string str(void) const; + DeviceEnum enumv(void) const; - MGE_WIN_DECLSPEC_FUC static bool is_legal(const std::string& device); - MGE_WIN_DECLSPEC_FUC static bool is_legal(DeviceEnum device); - MGE_WIN_DECLSPEC_FUC static std::vector legal_devices(void); + static bool is_legal(const std::string& device); + static bool is_legal(DeviceEnum device); + static std::vector legal_devices(void); friend class Tensor; - friend bool operator==(const Device& lhs, const Device& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==(const Device& lhs, const Device& rhs); CUSTOM_DATA_ADAPTOR_FRIEND_DECL; }; @@ -60,23 +60,23 @@ using DeviceEnum = Device::DeviceEnum; bool operator==(const Device& lhs, const Device& rhs); -class Shape { - MGE_WIN_DECLSPEC_FUC const void* impl() const; - MGE_WIN_DECLSPEC_FUC Shape(const void* impl); +class MGE_WIN_DECLSPEC_FUC Shape { + const void* impl() const; + Shape(const void* impl); CUSTOM_PIMPL_CLS_DECL(Shape); public: - MGE_WIN_DECLSPEC_FUC Shape(const std::vector& rhs); - MGE_WIN_DECLSPEC_FUC Shape(const std::initializer_list& rhs); + Shape(const std::vector& rhs); + Shape(const std::initializer_list& rhs); size_t& operator[](size_t idx); size_t operator[](size_t idx) const; - MGE_WIN_DECLSPEC_FUC void ndim(size_t dim); - MGE_WIN_DECLSPEC_FUC size_t ndim(void) const; + void ndim(size_t dim); + size_t ndim(void) const; friend class Tensor; - friend bool operator==(const Shape& lhs, const Shape& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==(const Shape& lhs, const Shape& rhs); CUSTOM_DATA_ADAPTOR_FRIEND_DECL; }; @@ -104,9 +104,9 @@ using bfloat16_t = uint16_t; #define CUSTOM_DTYPE_ENUM_DECL(custom_type, builtin_type, ctype) custom_type, -class DType { - MGE_WIN_DECLSPEC_FUC const void* impl() const; - MGE_WIN_DECLSPEC_FUC DType(const void* impl); +class MGE_WIN_DECLSPEC_FUC DType { + const void* impl() const; + DType(const void* impl); CUSTOM_PIMPL_CLS_DECL(DType); public: @@ -114,27 +114,33 @@ public: CUSTOM_FOR_EACH_TENSOR_DATA_TYPE(CUSTOM_DTYPE_ENUM_DECL) }; - MGE_WIN_DECLSPEC_FUC DType(const std::string& dtype); - MGE_WIN_DECLSPEC_FUC DType(const char* dtype); - MGE_WIN_DECLSPEC_FUC DType( - const std::string& dtype, float scale, uint8_t zero_point = 0); - MGE_WIN_DECLSPEC_FUC DType(const char* dtype, float scale, uint8_t zero_point = 0); - MGE_WIN_DECLSPEC_FUC DType(DTypeEnum dtype); - MGE_WIN_DECLSPEC_FUC DType(DTypeEnum dtype, float scale, uint8_t zero_point = 0); - - MGE_WIN_DECLSPEC_FUC std::string str(void) const; - MGE_WIN_DECLSPEC_FUC DTypeEnum enumv() const; - MGE_WIN_DECLSPEC_FUC float scale(void) const; - MGE_WIN_DECLSPEC_FUC uint8_t zero_point(void) const; + DType(const std::string& dtype); + DType(const char* dtype); + DType(const std::string& dtype, float scale, uint8_t 
zero_point = 0); + DType(const char* dtype, float scale, uint8_t zero_point = 0); + DType(DTypeEnum dtype); + DType(DTypeEnum dtype, float scale, uint8_t zero_point = 0); + + std::string str(void) const; + DTypeEnum enumv() const; + float scale(void) const; + uint8_t zero_point(void) const; template - MGE_WIN_DECLSPEC_FUC bool is_compatible(void) const; + bool is_compatible(void) const; - MGE_WIN_DECLSPEC_FUC static bool is_legal(const std::string& dtype); - MGE_WIN_DECLSPEC_FUC static bool is_legal(const DTypeEnum& dtype); - MGE_WIN_DECLSPEC_FUC static std::vector legal_dtypes(void); + static bool is_legal(const std::string& dtype); + static bool is_legal(const DTypeEnum& dtype); + static std::vector legal_dtypes(void); friend class Tensor; - friend bool operator==(const DType& lhs, const DType& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==(const DType& lhs, const DType& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==( + const DType& lhs, const std::string& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==(const DType& lhs, const char* rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==( + const std::string& lhs, const DType& rhs); + MGE_WIN_DECLSPEC_FUC friend bool operator==(const char* lhs, const DType& rhs); + CUSTOM_DATA_ADAPTOR_FRIEND_DECL; }; @@ -180,45 +186,45 @@ bool operator==(const DType& lhs, const char* rhs); bool operator==(const std::string& lhs, const DType& rhs); bool operator==(const char* lhs, const DType& rhs); -class Format { - MGE_WIN_DECLSPEC_FUC const void* impl() const; - MGE_WIN_DECLSPEC_FUC Format(const void* impl); +class MGE_WIN_DECLSPEC_FUC Format { + const void* impl() const; + Format(const void* impl); CUSTOM_PIMPL_CLS_DECL(Format); public: - MGE_WIN_DECLSPEC_FUC Format(const std::string& format); - MGE_WIN_DECLSPEC_FUC Format(const char* format); + Format(const std::string& format); + Format(const char* format); - MGE_WIN_DECLSPEC_FUC std::string str(void) const; - MGE_WIN_DECLSPEC_FUC bool is_default(void) const; + std::string str(void) const; + bool is_default(void) const; friend class Tensor; CUSTOM_DATA_ADAPTOR_FRIEND_DECL; }; -class Tensor { +class MGE_WIN_DECLSPEC_FUC Tensor { void* m_tensor; - MGE_WIN_DECLSPEC_FUC const void* impl(void) const; - MGE_WIN_DECLSPEC_FUC Tensor(const void* impl); + const void* impl(void) const; + Tensor(const void* impl); - MGE_WIN_DECLSPEC_FUC const size_t* shapes_raw(void) const; - MGE_WIN_DECLSPEC_FUC const ptrdiff_t* strides_raw(void) const; + const size_t* shapes_raw(void) const; + const ptrdiff_t* strides_raw(void) const; public: Tensor() = delete; - MGE_WIN_DECLSPEC_FUC Tensor(const Tensor& rhs); - MGE_WIN_DECLSPEC_FUC Tensor& operator=(const Tensor& rhs); - - MGE_WIN_DECLSPEC_FUC Shape shape(void) const; - MGE_WIN_DECLSPEC_FUC DType dtype(void) const; - MGE_WIN_DECLSPEC_FUC Format format(void) const; - MGE_WIN_DECLSPEC_FUC Device device(void) const; - - MGE_WIN_DECLSPEC_FUC size_t size(void) const; - MGE_WIN_DECLSPEC_FUC std::vector stride(void) const; - MGE_WIN_DECLSPEC_FUC float scale(void) const; - MGE_WIN_DECLSPEC_FUC uint8_t zero_point(void) const; + Tensor(const Tensor& rhs); + Tensor& operator=(const Tensor& rhs); + + Shape shape(void) const; + DType dtype(void) const; + Format format(void) const; + Device device(void) const; + + size_t size(void) const; + std::vector stride(void) const; + float scale(void) const; + uint8_t zero_point(void) const; void* data(void); const void* data(void) const; diff --git a/src/custom/include/megbrain/custom/utils.h b/src/custom/include/megbrain/custom/utils.h 
index 318bc62d..1bc64c6a 100644 --- a/src/custom/include/megbrain/custom/utils.h +++ b/src/custom/include/megbrain/custom/utils.h @@ -19,10 +19,19 @@ namespace custom { -void assert_failed_log( +#ifndef MGE_WIN_DECLSPEC_FUC +#ifdef _WIN32 +#define MGE_WIN_DECLSPEC_FUC __declspec(dllexport) +#else +#define MGE_WIN_DECLSPEC_FUC +#endif +#endif + +MGE_WIN_DECLSPEC_FUC void assert_failed_log( const char* file, int line, const char* func, const char* expr, const char* msg_fmt, ...); +#ifndef _WIN32 #define custom_expect(expr, msg...) \ if (!(expr)) { \ assert_failed_log(__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, ##msg); \ @@ -33,8 +42,22 @@ void assert_failed_log( assert_failed_log(__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, ##msg); \ } \ assert((expr)) +#else +#define custom_expect(expr, ...) \ + if (!(expr)) { \ + assert_failed_log( \ + __FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, __VA_ARGS__); \ + } + +#define custom_assert(expr, ...) \ + if (!(expr)) { \ + assert_failed_log( \ + __FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, __VA_ARGS__); \ + } \ + assert((expr)) +#endif -class UnImpleWarnLog { +class MGE_WIN_DECLSPEC_FUC UnImpleWarnLog { public: UnImpleWarnLog( const std::string& func, const std::string& attr, const std::string& val); @@ -54,9 +77,9 @@ void impl_deleter(void* ptr) { std::unique_ptr m_impl; \ \ public: \ - MGE_WIN_DECLSPEC_FUC Cls(); \ - MGE_WIN_DECLSPEC_FUC Cls(const Cls& rhs); \ - MGE_WIN_DECLSPEC_FUC Cls& operator=(const Cls& rhs) + Cls(); \ + Cls(const Cls& rhs); \ + Cls& operator=(const Cls& rhs) #define CUSTOM_PIMPL_CLS_DEFINE(Cls) \ Cls::Cls() : m_impl(new Cls##Impl(), impl_deleter) {} \ diff --git a/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp b/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp index d783a1f5..84d82e45 100644 --- a/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp +++ b/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp @@ -220,6 +220,28 @@ struct MultiAlgoOprTrait; ::megdnn::has_available_algo(megdnn_opr, args...), array_layouts); \ MIDOUT_E \ } \ + static bool has_no_naive_heuristic_algo( \ + const VarNodeArray& i, const cg::OperatorNodeBase* opr_) { \ + MIDOUT_B( \ + midout_iv(MGB_HASH_STR(#_Opr)), \ + midout_iv(MGB_HASH_STR("has_no_naive_heuristic_algo"))) \ + auto&& opr = opr_->cast_final_safe<_Opr>(); \ + auto&& megdnn_opr = reinterpret_cast(opr.megdnn_opr()); \ + FixedTensorLayouts array_layouts; \ + size_t in = i.size() - 1; \ + for (size_t idx = 0; idx < in; idx++) { \ + const auto& v = i[idx]; \ + array_layouts[idx] = \ + TensorLayout{v->shape(), v->dtype(), v->format()}; \ + } \ + const auto& v = i[in]; \ + array_layouts[arity - 1] = \ + TensorLayout{v->shape(), v->dtype(), v->format()}; \ + return APPLY( \ + ::megdnn::has_no_naive_heuristic_algo(megdnn_opr, args...), \ + array_layouts); \ + MIDOUT_E \ + } \ }; INST(Convolution) INST(ConvBiasForward) @@ -365,6 +387,23 @@ bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr) #undef cb } +bool has_no_naive_heuristic_algo( + const VarNodeArray& i, const cg::OperatorNodeBase* opr) { +#define cb(_Opr) \ + if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \ + MGB_MARK_USED_VAR(MultiAlgoOprTrait<_Opr>::has_algo); \ + VarNodeArray _ = i; \ + _.emplace_back(opr->output(0)); \ + return MultiAlgoOprTrait<_Opr>::has_no_naive_heuristic_algo(_, opr); \ + } else + cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) cb(PoolingForward) { + mgb_throw( + InternalError, "invalid multi-algo operator(got:%s)", + 
opr->dyn_typeinfo()->name); + } +#undef cb +} + bool has_opr_format(const cg::OperatorNodeBase* opr) { bool ret = false; #define cb(_Opr) ret |= opr->dyn_typeinfo() == _Opr::typeinfo(); diff --git a/src/gopt/impl/global_layout_transform/opr_format_modifier.h b/src/gopt/impl/global_layout_transform/opr_format_modifier.h index 2ab6697a..77b1d292 100644 --- a/src/gopt/impl/global_layout_transform/opr_format_modifier.h +++ b/src/gopt/impl/global_layout_transform/opr_format_modifier.h @@ -27,6 +27,9 @@ namespace intl { bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr); +bool has_no_naive_heuristic_algo( + const VarNodeArray& i, const cg::OperatorNodeBase* opr); + struct OprFormatInfo { opr::Convolution::Param::Format opr_format; struct TensorFormatsInfo { diff --git a/src/gopt/impl/global_layout_transform/profiler_impl.cpp b/src/gopt/impl/global_layout_transform/profiler_impl.cpp index 48d7188a..6257a24a 100644 --- a/src/gopt/impl/global_layout_transform/profiler_impl.cpp +++ b/src/gopt/impl/global_layout_transform/profiler_impl.cpp @@ -99,7 +99,7 @@ float GraphPartitionProfiler::duration_in_usec() const { * \brief An operator that indicates its input var node is contiguous */ // clang-format off -MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) //{ +MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) // { void scn_do_execute() override {}; void init_output_static_infer_desc() override; void add_input_layout_constraint() override { @@ -331,7 +331,8 @@ float ProfilerImpl::profile_operator( opr::PoolingForward::typeinfo(), }; if (multi_algo_oprs.count(opr->dyn_typeinfo()) && - !mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr())) + (!mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr()) || + !mgb::gopt::intl::has_no_naive_heuristic_algo(new_inps, y->owner_opr()))) return PROFILE_TIME_OUT; if (!m_opr_filter(opr, y->owner_opr())) return PROFILE_TIME_OUT; diff --git a/src/jit/test/mlir/CMakeLists.txt b/src/jit/test/mlir/CMakeLists.txt index aad1717d..8d1eff47 100644 --- a/src/jit/test/mlir/CMakeLists.txt +++ b/src/jit/test/mlir/CMakeLists.txt @@ -1,27 +1,20 @@ configure_lit_site_cfg( - ${CMAKE_CURRENT_SOURCE_DIR}/utils/lit.site.cfg.py.in - ${CMAKE_CURRENT_BINARY_DIR}/utils/lit.site.cfg.py - MAIN_CONFIG - ${CMAKE_CURRENT_SOURCE_DIR}/utils/lit.cfg.py -) + ${CMAKE_CURRENT_SOURCE_DIR}/utils/lit.site.cfg.py.in + ${CMAKE_CURRENT_BINARY_DIR}/utils/lit.site.cfg.py MAIN_CONFIG + ${CMAKE_CURRENT_SOURCE_DIR}/utils/lit.cfg.py) -set(LLVM_EXTERNAL_LIT "${PROJECT_SOURCE_DIR}/third_party/llvm-project/llvm/utils/lit/lit.py" CACHE STRING "External lit") +set(LLVM_EXTERNAL_LIT + "${PROJECT_SOURCE_DIR}/third_party/llvm-project/llvm/utils/lit/lit.py" + CACHE STRING "External lit") -set(MLIR_MGB_TEST_DEPENDS - mgb-file-check - count not - mgb-opt -) +set(MLIR_MGB_TEST_DEPENDS mgb-file-check count not mgb-opt) add_lit_testsuite(mgb-mlir-test-lit "Running the mgb regression tests" - ${CMAKE_CURRENT_BINARY_DIR}/utils - DEPENDS ${MLIR_MGB_TEST_DEPENDS} - ) + ${CMAKE_CURRENT_BINARY_DIR}/utils DEPENDS ${MLIR_MGB_TEST_DEPENDS}) set_target_properties(mgb-mlir-test-lit PROPERTIES FOLDER "Tests") -add_lit_testsuites(MLIR_TEST ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS ${MLIR_MGB_TEST_DEPENDS} -) +add_lit_testsuites(MLIR_TEST ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS + ${MLIR_MGB_TEST_DEPENDS}) add_custom_target(mlir_pass_check) add_dependencies(mlir_pass_check mgb-mlir-test-lit) diff --git a/src/opr-mm/impl/mm_handler.cpp b/src/opr-mm/impl/mm_handler.cpp index 
1f13b697..7cbb8a3e 100644 --- a/src/opr-mm/impl/mm_handler.cpp +++ b/src/opr-mm/impl/mm_handler.cpp @@ -17,6 +17,9 @@ #include "megbrain/opr/zmq_rpc.h" #include "mm_handler.pb.h" +using namespace mgb; +using namespace opr; + /* ======================== GroupServerProxy ========================== */ /*! * A proxy that receives zmqrpc call, direct call to NCCL Manager @@ -213,7 +216,7 @@ struct ServerInfo { std::unique_ptr server; }; -int create_zmqrpc_server(const std::string& server_addr, int port) { +int mgb::opr::create_zmqrpc_server(const std::string& server_addr, int port) { static std::unordered_map addr2server; static std::mutex mtx; MGB_LOCK_GUARD(mtx); diff --git a/src/opr-mm/include/megbrain/opr/mm_handler.h b/src/opr-mm/include/megbrain/opr/mm_handler.h index 7c03bf96..97b829d4 100644 --- a/src/opr-mm/include/megbrain/opr/mm_handler.h +++ b/src/opr-mm/include/megbrain/opr/mm_handler.h @@ -16,8 +16,8 @@ #include "megbrain/opr/collective_comm.h" #include "megbrain/opr/group_manager.h" -using namespace mgb; -using namespace opr; +namespace mgb { +namespace opr { /*! * Comm MM Client Proxy. @@ -56,6 +56,9 @@ private: int create_zmqrpc_server(const std::string& server_addr, int port); +} // namespace opr +} // namespace mgb + #endif // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/impl/dnn/dnn.sereg.h b/src/opr/impl/dnn/dnn.sereg.h index 4455bceb..ceb0c972 100644 --- a/src/opr/impl/dnn/dnn.sereg.h +++ b/src/opr/impl/dnn/dnn.sereg.h @@ -16,6 +16,7 @@ #include "megbrain/opr/dnn/correlation.h" #include "megbrain/opr/dnn/fake_quant.h" #include "megbrain/opr/dnn/images2neibs.h" +#include "megbrain/opr/dnn/layer_norm.h" #include "megbrain/opr/dnn/local.h" #include "megbrain/opr/dnn/lrn.h" #include "megbrain/opr/dnn/lsq.h" @@ -420,6 +421,47 @@ struct OprMaker { } }; +template <> +struct OprMaker { + using Param = opr::LayerNorm::Param; + static cg::OperatorNodeBase* make( + const Param& param, const cg::VarNodeArray& i, ComputingGraph& graph, + const OperatorNodeConfig& config) { + MGB_MARK_USED_VAR(graph); + if (i.size() == 3) { + return opr::LayerNorm::make(i[0], i[1], i[2], param, config)[0] + .node() + ->owner_opr(); + } else { + mgb_assert(i.size() == 1); + return opr::LayerNorm::make(i[0], param, config)[0].node()->owner_opr(); + } + } +}; + +// OprMaker in MGB_SEREG_OPR only support unique output opr +template <> +struct OprMaker { + using Param = opr::LayerNormBackward::Param; + static cg::OperatorNodeBase* make( + const Param& param, const cg::VarNodeArray& i, ComputingGraph& graph, + const OperatorNodeConfig& config) { + MGB_MARK_USED_VAR(graph); + if (i.size() == 5) { + return opr::LayerNormBackward::make( + i[0], i[1], i[2], i[3], i[4], param, config)[0] + .node() + ->owner_opr(); + } else { + mgb_assert(i.size() == 4); + return opr::LayerNormBackward::make( + i[0], i[1], i[2], i[3], param, config)[0] + .node() + ->owner_opr(); + } + } +}; + template struct MakeLocalShareCaller2 { template @@ -641,6 +683,8 @@ MGB_SEREG_OPR(TQT, 2); MGB_SEREG_OPR(TQTBackward, 3); MGB_SEREG_OPR(LSQ, 4); MGB_SEREG_OPR(LSQBackward, 5); +MGB_SEREG_OPR(LayerNorm, 0); +MGB_SEREG_OPR(LayerNormBackward, 0); } // namespace opr } // namespace mgb diff --git a/src/opr/impl/dnn/layer_norm.cpp b/src/opr/impl/dnn/layer_norm.cpp new file mode 100644 index 00000000..3506111a --- /dev/null +++ b/src/opr/impl/dnn/layer_norm.cpp @@ -0,0 +1,248 @@ +/** + * \file src/opr/impl/dnn/layer_norm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * 
Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "megbrain/opr/dnn/layer_norm.h" + +#include "megbrain/graph/grad_impl.h" +#include "megbrain/opr/internal/out_shape_by_sym_var.h" +#include "megbrain/opr/utility.h" + +#include "../internal/megdnn_opr_wrapper.inl" + +using namespace mgb; +using namespace opr; + +/* ==================== LayerNormForward ==================== */ +MGB_DYN_TYPE_OBJ_FINAL_IMPL(LayerNormForward); + +LayerNormForward::LayerNormForward( + VarNode* data, VarNode* weight, VarNode* bias, const Param& param, + const OperatorNodeConfig& config) + : Super{data->owner_graph(), config, "layer_norm", {data, weight, bias}} { + init_megdnn_opr(*this, param); + + add_input({data, weight, bias}); + output(0)->dtype(data->dtype()); + output(1)->dtype(dtype::Float32()); + output(2)->dtype(dtype::Float32()); +} + +LayerNormForward::LayerNormForward( + VarNode* data, const Param& param, const OperatorNodeConfig& config) + : Super{data->owner_graph(), config, "layer_norm", {data}} { + init_megdnn_opr(*this, param); + + add_input({data}); + output(0)->dtype(data->dtype()); + output(1)->dtype(dtype::Float32()); + output(2)->dtype(dtype::Float32()); +} + +SymbolVarArray LayerNormForward::make( + SymbolVar data, SymbolVar weight, SymbolVar bias, const Param& param, + const OperatorNodeConfig& config) { + auto outs = data.node() + ->owner_graph() + ->insert_opr(std::make_unique( + data.node(), weight.node(), bias.node(), param, config)) + ->output(); + SymbolVarArray ret; + for (auto&& out : outs) { + ret.emplace_back(out); + } + return ret; +} + +SymbolVarArray LayerNormForward::make( + SymbolVar data, const Param& param, const OperatorNodeConfig& config) { + auto outs = data.node() + ->owner_graph() + ->insert_opr(std::make_unique( + data.node(), param, config)) + ->output(); + SymbolVarArray ret; + for (auto&& out : outs) { + ret.emplace_back(out); + } + return ret; +} + +void LayerNormForward::get_output_var_shape( + const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const { + uint64_t normalized_dim = param().normalized_dim; + out_shape[0] = inp_shape[0]; + TensorShape unnormalized_shape; + unnormalized_shape.ndim = inp_shape[0].ndim - normalized_dim; + for (size_t i = 0; i < unnormalized_shape.ndim; ++i) { + unnormalized_shape.shape[i] = inp_shape[0].shape[i]; + } + out_shape[1] = unnormalized_shape; + out_shape[2] = unnormalized_shape; +} + +size_t LayerNormForward::get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const { + return 0; +} + +void LayerNormForward::scn_do_execute() { + if (param().affine) { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), input(1)->dev_tensor().as_megdnn(), + input(2)->dev_tensor().as_megdnn(), output(0)->dev_tensor().as_megdnn(), + output(1)->dev_tensor().as_megdnn(), + output(2)->dev_tensor().as_megdnn(), {}); + } else { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), {}, {}, + output(0)->dev_tensor().as_megdnn(), + output(1)->dev_tensor().as_megdnn(), + output(2)->dev_tensor().as_megdnn(), {}); + } +} + +#if MGB_ENABLE_GRAD +MGB_IMPL_OPR_GRAD(LayerNormForward) { + auto p = opr.param(); + SymbolVarArray grad; + VarNodeArray ret; + if (p.affine) { + mgb_assert(wrt_idx < 3, "wrt_idx %zu is out of range", wrt_idx); 
+ grad = LayerNormBackward::make( + out_grad[0], opr.input(0), opr.input(1), opr.output(1), opr.output(2), + opr.param()); + } else { + mgb_assert(wrt_idx < 1, "wrt_idx %zu is out of range", wrt_idx); + grad = LayerNormBackward::make( + out_grad[0], opr.input(0), opr.output(1), opr.output(2), opr.param()); + } + + uint32_t nr_ret = p.affine ? 3 : 1; + for (uint32_t i = 0; i < nr_ret; ++i) { + ret.push_back(grad[i].node()); + } + return ret; +} +#endif + +/* ==================== LayerNormBackward ==================== */ +MGB_DYN_TYPE_OBJ_FINAL_IMPL(LayerNormBackward); + +LayerNormBackward::LayerNormBackward( + VarNode* diff, VarNode* data, VarNode* weight, VarNode* mean, VarNode* rstd, + const Param& param, const OperatorNodeConfig& config) + : Super({diff->owner_graph(), + config, + "layer_norm_backward", + {diff, data, weight, mean, rstd}}, + 0, true) { + init_megdnn_opr(*this, param); + add_input({diff, data, weight, mean, rstd}); +} + +LayerNormBackward::LayerNormBackward( + VarNode* diff, VarNode* data, VarNode* mean, VarNode* rstd, const Param& param, + const OperatorNodeConfig& config) + : Super({diff->owner_graph(), + config, + "layer_norm_backward", + {diff, data, mean, rstd}}, + 0, true) { + init_megdnn_opr(*this, param); + add_input({diff, data, mean, rstd}); + auto mark_empty_var = [&](VarNode* var) { + var->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE) + .add_flag(VarNode::Flag::VOLATILE_CONTENT); + }; + mark_empty_var(output(1)); + mark_empty_var(output(2)); +} + +SymbolVarArray LayerNormBackward::make( + SymbolVar diff, SymbolVar data, SymbolVar weight, SymbolVar mean, + SymbolVar rstd, const Param& param, const OperatorNodeConfig& config) { + auto outs = diff.node() + ->owner_graph() + ->insert_opr(std::make_unique( + diff.node(), data.node(), weight.node(), mean.node(), + rstd.node(), param, config)) + ->output(); + SymbolVarArray ret; + for (auto&& out : outs) { + ret.emplace_back(out); + } + return ret; +} + +SymbolVarArray LayerNormBackward::make( + SymbolVar diff, SymbolVar data, SymbolVar mean, SymbolVar rstd, + const Param& param, const OperatorNodeConfig& config) { + auto outs = diff.node() + ->owner_graph() + ->insert_opr(std::make_unique( + diff.node(), data.node(), mean.node(), rstd.node(), + param, config)) + ->output(); + SymbolVarArray ret; + for (auto&& out : outs) { + ret.emplace_back(out); + } + return ret; +} + +void LayerNormBackward::init_output_static_infer_desc() { + using namespace cg::static_infer; + auto&& mgr = owner_graph()->static_infer_manager(); + mgr.register_shape_infer(output(0), ShapeInferDesc::make_identity(input(1))); + if (param().affine) { + mgr.register_shape_infer(output(1), ShapeInferDesc::make_identity(input(2))); + mgr.register_shape_infer(output(2), ShapeInferDesc::make_identity(input(2))); + } else { + TensorShape empty; + empty.ndim = 0; + mgr.register_shape_infer(output(1), ShapeInferDesc::make_const(empty)); + mgr.register_shape_infer(output(2), ShapeInferDesc::make_const(empty)); + } + this->init_output_static_infer_desc_workspace(false); +} + +void LayerNormBackward::init_output_dtype() { + output(0)->dtype(input(1)->dtype()); + output(1)->dtype(input(2)->dtype()); + output(2)->dtype(input(2)->dtype()); +} + +size_t LayerNormBackward::get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const { + return 0; +} + +void LayerNormBackward::scn_do_execute() { + if (param().affine) { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), input(1)->dev_tensor().as_megdnn(), + 
input(2)->dev_tensor().as_megdnn(), input(3)->dev_tensor().as_megdnn(), + input(4)->dev_tensor().as_megdnn(), output(0)->dev_tensor().as_megdnn(), + output(1)->dev_tensor().as_megdnn(), + output(2)->dev_tensor().as_megdnn(), {}); + } else { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), input(1)->dev_tensor().as_megdnn(), + {}, input(2)->dev_tensor().as_megdnn(), + input(3)->dev_tensor().as_megdnn(), output(0)->dev_tensor().as_megdnn(), + {}, {}, {}); + } +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/impl/rand.cpp b/src/opr/impl/rand.cpp index 6be879f1..043a15e5 100644 --- a/src/opr/impl/rand.cpp +++ b/src/opr/impl/rand.cpp @@ -201,6 +201,8 @@ template class RNGOprBase<::megdnn::BetaRNG>; template class RNGOprBase<::megdnn::PoissonRNG>; template class RNGOprBase<::megdnn::ShuffleRNGForward>; template class RNGOprBase<::megdnn::ShuffleRNGBackward>; +template class RNGOprBase<::megdnn::DropoutForward>; +template class RNGOprBase<::megdnn::DropoutBackward>; #if MGB_ENABLE_GRAD IMPL(GaussianRNG); IMPL(UniformRNG); @@ -300,4 +302,134 @@ MGB_IMPL_OPR_GRAD(ShuffleRNGForward) { MGB_DYN_TYPE_OBJ_FINAL_IMPL(ShuffleRNGBackward); MEGDNN_OPR_INIT3(ShuffleRNGBackward, "shuffle_rng_bwd", 2, true) +/* ================= DropoutForward ================= */ + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(DropoutForward); + +DropoutForward::DropoutForward( + VarNode* inp, const Param& param, const OperatorNodeConfig& config) + : Super({inp->owner_graph(), config, "dropout", {inp}}, param) { + add_input({inp}); + add_output(None)->dtype(inp->dtype()).add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE); + add_output(None)->dtype(dtype::Byte()).add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE); + cg::add_workspace_output(this); + add_equivalence_component>(this); +} + +SymbolVarArray DropoutForward::make( + SymbolVar inp, const Param& param, const OperatorNodeConfig& config) { + auto node = inp.node()->owner_graph()->insert_opr( + std::make_unique(inp.node(), param, config)); + mgb_assert(node->output().size() == 3); + return {node->output(0), node->output(1)}; +} + +void DropoutForward::init_output_static_infer_desc() { + using namespace cg::static_infer; + auto&& mgr = owner_graph()->static_infer_manager(); + mgr.register_shape_infer(output(0), ShapeInferDesc::make_identity(input(0))); + + auto infer_mask = [this](TensorShape& dest, const InpVal& iv) { + ensure_megdnn_opr(); + dest.ndim = 1; + dest.shape[0] = m_dnn_opr->get_mask_size_in_bytes( + {iv.val[0].shape(), input(0)->dtype()}); + return true; + }; + mgr.register_shape_infer( + output(1), {SourceType::DEP, {{input(0), DepType::SHAPE}}, infer_mask}); + + auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { + ensure_megdnn_opr(); + dest.ndim = 1; + dest.shape[0] = m_dnn_opr->get_workspace_in_bytes( + {inp.val[0].shape(), input(0)->dtype()}, + {output(0)->shape(), output(0)->dtype()}, + {output(1)->shape(), output(1)->dtype()}); + return true; + }; + mgr.register_shape_infer( + output(2), {SourceType::DEP, {{input(0), DepType::SHAPE}}, infer_wk}); +} + +void DropoutForward::add_input_layout_constraint() { + input(0)->add_layout_constraint_contiguous(); +}; + +void DropoutForward::scn_do_execute() { + auto&& ret = output(0); + if (ret->layout().is_empty()) { + mgb_assert(ret->dev_tensor().empty()); + return; + } + m_dnn_opr->exec( + input(0)->dev_tensor().as_megdnn(), output(0)->dev_tensor().as_megdnn(), + output(1)->dev_tensor().as_megdnn(), + get_megdnn_workspace_from_var(output(2))); +} + 
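For context, the DropoutForward opr added above exposes dropout at graph level: make() hides the workspace output and returns only the dropped-out tensor plus the byte mask that DropoutBackward later consumes. A minimal usage sketch follows (illustrative only, not part of the patch; it assumes an existing ComputingGraph `graph` and host tensor `host_x`, and that make() provides a default OperatorNodeConfig — the variable names are hypothetical):
// Sketch: building a graph that uses the new dropout opr.
// Param fields follow megdnn::param::Dropout (drop_prob, seed).
auto x = mgb::opr::Host2DeviceCopy::make(*graph, host_x);
mgb::opr::DropoutForward::Param param;
param.drop_prob = 0.5f;  // probability of zeroing each element
auto outs = mgb::opr::DropoutForward::make(x, param);
mgb::SymbolVar y = outs[0];     // dropped-out result, same dtype as x
mgb::SymbolVar mask = outs[1];  // Byte mask, reused by DropoutBackward in the grad rule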
+cg::OperatorNodeBase::NodeProp* DropoutForward::do_make_node_prop() const { + auto prop = Super::do_make_node_prop(); + prop->add_flag(NodeProp::Flag::IMPURE_FUNC); + for (auto i : input()) { + prop->add_dep_type_existing_var(i, NodeProp::DepType::VALUE_ALLOW_EMPTY); + } + return prop; +} + +#if MGB_ENABLE_GRAD +MGB_IMPL_OPR_GRAD(DropoutForward) { + SymbolVar grad = DropoutBackward::make(out_grad[0], opr.output(1), opr.param()); + VarNodeArray ret; + ret.push_back(grad.node()); + return ret; +} +#endif + +/* ==================== DropoutBackward ==================== */ + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(DropoutBackward); + +DropoutBackward::DropoutBackward( + VarNode* doup, VarNode* mask, const Param& param, + const OperatorNodeConfig& config) + : Super({doup->owner_graph(), config, "dropout_backward", {doup, mask}}, 0, + true) { + init_megdnn_opr(*this, param); + add_input({doup, mask}); +} + +SymbolVar DropoutBackward::make( + SymbolVar doup, SymbolVar mask, const Param& param, + const OperatorNodeConfig& config) { + return doup.insert_single_output_opr( + doup.node(), mask.node(), param, config); +} + +void DropoutBackward::init_output_static_infer_desc() { + using namespace cg::static_infer; + auto&& mgr = owner_graph()->static_infer_manager(); + mgr.register_shape_infer(output(0), ShapeInferDesc::make_identity(input(0))); + this->init_output_static_infer_desc_workspace(false); +} + +void DropoutBackward::init_output_dtype() { + output(0)->dtype(input(0)->dtype()); +} + +size_t DropoutBackward::get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const { + return megdnn_opr()->get_workspace_in_bytes( + {input_shapes[0], input(0)->dtype(), input(0)->format()}, + {input_shapes[1], input(1)->dtype(), input(1)->format()}, + {output_shapes[0], output(0)->dtype(), output(0)->format()}); +} + +void DropoutBackward::scn_do_execute() { + megdnn_opr()->exec( + input(0)->dev_tensor().as_megdnn(), input(1)->dev_tensor().as_megdnn(), + output(0)->dev_tensor().as_megdnn(), {}); +} + // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/impl/rand.sereg.h b/src/opr/impl/rand.sereg.h index fe3bd8b1..869fb72c 100644 --- a/src/opr/impl/rand.sereg.h +++ b/src/opr/impl/rand.sereg.h @@ -29,6 +29,19 @@ struct OprMaker { return out[0].node()->owner_opr(); } }; + +// OprMaker in MGB_SEREG_OPR only support unique output opr +template <> +struct OprMaker { + using Param = opr::DropoutForward::Param; + static cg::OperatorNodeBase* make( + const Param& param, const cg::VarNodeArray& i, ComputingGraph& graph, + const OperatorNodeConfig& config) { + MGB_MARK_USED_VAR(graph); + return opr::DropoutForward::make(i[0], param, config)[0].node()->owner_opr(); + } +}; + } // namespace serialization namespace opr { @@ -43,6 +56,8 @@ MGB_SEREG_OPR(PermutationRNG, 1); MGB_SEREG_OPR(BetaRNG, 2); MGB_SEREG_OPR(ShuffleRNG, 1); MGB_SEREG_OPR(ShuffleRNGBackward, 3); +MGB_SEREG_OPR(Dropout, 1); +MGB_SEREG_OPR(DropoutBackward, 2); } // namespace opr } // namespace mgb diff --git a/src/opr/include/megbrain/opr/dnn/layer_norm.h b/src/opr/include/megbrain/opr/dnn/layer_norm.h new file mode 100644 index 00000000..29712de0 --- /dev/null +++ b/src/opr/include/megbrain/opr/dnn/layer_norm.h @@ -0,0 +1,78 @@ +/** + * \file src/opr/include/megbrain/opr/dnn/layer_norm.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megbrain/opr/internal/megdnn_opr_wrapper.h" +#include "megdnn/oprs.h" + +namespace mgb { +namespace opr { + +MGB_DEFINE_OPR_CLASS_WITH_EXPORT( + LayerNormForward, intl::MegDNNOprWrapperFwd) // { +public: + MGE_WIN_DECLSPEC_FUC LayerNormForward( + VarNode* data, VarNode* weight, VarNode* bias, const Param& param, + const OperatorNodeConfig& config); + MGE_WIN_DECLSPEC_FUC LayerNormForward( + VarNode* data, const Param& param, const OperatorNodeConfig& config); + + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( + SymbolVar data, SymbolVar weight, SymbolVar bias, const Param& param = {}, + const OperatorNodeConfig& config = {}); + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( + SymbolVar data, const Param& param = {}, + const OperatorNodeConfig& config = {}); + +private: + void get_output_var_shape( + const TensorShapeArray& inp_shape, + TensorShapeArray& out_shape) const override; + size_t get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const override; + void scn_do_execute() override; +}; +using LayerNorm = LayerNormForward; + +MGB_DEFINE_OPR_CLASS_WITH_EXPORT( + LayerNormBackward, intl::MegDNNOprWrapperBwd) // { +public: + MGE_WIN_DECLSPEC_FUC LayerNormBackward( + VarNode* diff, VarNode* data, VarNode* weight, VarNode* mean, VarNode* rstd, + const Param& param, const OperatorNodeConfig& config); + + MGE_WIN_DECLSPEC_FUC LayerNormBackward( + VarNode* diff, VarNode* data, VarNode* mean, VarNode* rstd, + const Param& param, const OperatorNodeConfig& config); + + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( + SymbolVar diff, SymbolVar data, SymbolVar weight, SymbolVar mean, + SymbolVar rstd, const Param& param = {}, + const OperatorNodeConfig& config = {}); + MGE_WIN_DECLSPEC_FUC static SymbolVarArray make( + SymbolVar diff, SymbolVar data, SymbolVar mean, SymbolVar rstd, + const Param& param = {}, const OperatorNodeConfig& config = {}); + +private: + void init_output_static_infer_desc() override; + void init_output_dtype() override; + size_t get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const override; + void scn_do_execute() override; +}; + +} // namespace opr +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/include/megbrain/opr/dnn/pooling.h b/src/opr/include/megbrain/opr/dnn/pooling.h index 658f13f6..3b4efdde 100644 --- a/src/opr/include/megbrain/opr/dnn/pooling.h +++ b/src/opr/include/megbrain/opr/dnn/pooling.h @@ -20,38 +20,38 @@ namespace opr { MGB_DEFINE_OPR_CLASS( PoolingForward, intl::MegDNNOprWrapperFwd, - public mixin::AlgoChooserHelper) //{ + public mixin::AlgoChooserHelper) // { public: -MGE_WIN_DECLSPEC_FUC PoolingForward( - VarNode* src, const Param& param, const ExecutionPolicy& policy, - const OperatorNodeConfig& config); -MGE_WIN_DECLSPEC_FUC static SymbolVar make( - SymbolVar src, const Param& param, const ExecutionPolicy& policy = {}, - const OperatorNodeConfig& config = {}); - -void init_output_static_infer_desc() override; - -size_t get_workspace_size_bytes( - const TensorShapeArray& input_shapes, - const TensorShapeArray& output_shapes) const override; + MGE_WIN_DECLSPEC_FUC PoolingForward( + VarNode* src, const Param& param, const 
ExecutionPolicy& policy, + const OperatorNodeConfig& config); + MGE_WIN_DECLSPEC_FUC static SymbolVar make( + SymbolVar src, const Param& param, const ExecutionPolicy& policy = {}, + const OperatorNodeConfig& config = {}); + + void init_output_static_infer_desc() override; + + size_t get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const override; }; using Pooling = PoolingForward; MGB_DEFINE_OPR_CLASS( PoolingBackward, intl::MegDNNOprWrapperBwd, - public mixin::AlgoChooserHelper) //{ + public mixin::AlgoChooserHelper) // { public: -MGE_WIN_DECLSPEC_FUC PoolingBackward( - VarNode* src, VarNode* dst, VarNode* diff, const Param& param, - const ExecutionPolicy& policy, const OperatorNodeConfig& config); + MGE_WIN_DECLSPEC_FUC PoolingBackward( + VarNode* src, VarNode* dst, VarNode* diff, const Param& param, + const ExecutionPolicy& policy, const OperatorNodeConfig& config); -MGE_WIN_DECLSPEC_FUC static SymbolVar make( - SymbolVar src, SymbolVar dst, SymbolVar diff, const Param& param, - const ExecutionPolicy& policy = {}, const OperatorNodeConfig& config = {}); + MGE_WIN_DECLSPEC_FUC static SymbolVar make( + SymbolVar src, SymbolVar dst, SymbolVar diff, const Param& param, + const ExecutionPolicy& policy = {}, const OperatorNodeConfig& config = {}); -MGE_WIN_DECLSPEC_FUC size_t get_workspace_size_bytes( - const TensorShapeArray& input_shapes, - const TensorShapeArray& output_shapes) const override final; + MGE_WIN_DECLSPEC_FUC size_t get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const override final; }; } // namespace opr diff --git a/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h b/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h index c4fa8725..7491054c 100644 --- a/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h +++ b/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h @@ -86,7 +86,7 @@ MGE_WIN_DECLSPEC_FUC void add_input_layout_constraint_contig(OperatorNodeBase& o //! called in constructor to add output vars MGE_WIN_DECLSPEC_FUC void add_output_vars( OperatorNodeBase& opr, size_t nr_output, bool add_workspace); -} +} // namespace megdnn_utils /*! * \brief mixin for infer workspace size based on input and output shapes @@ -344,34 +344,34 @@ private: } // namespace mgb //! define a megdnn opr wrapper class with 1 input for forward -#define MGB_DEFINE_MEGDNN_OPR_WRAPPER_FWD1(_name) \ - MGB_DEFINE_OPR_CLASS(_name, intl::MegDNNOprWrapperFwd) \ -public: \ - _name(VarNode* p0, const Param& param, const OperatorNodeConfig& config); \ - MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ - SymbolVar p0, const Param& param = {}, \ - const OperatorNodeConfig& config = {}); \ +#define MGB_DEFINE_MEGDNN_OPR_WRAPPER_FWD1(_name) \ + MGB_DEFINE_OPR_CLASS(_name, intl::MegDNNOprWrapperFwd) \ + public: \ + _name(VarNode* p0, const Param& param, const OperatorNodeConfig& config); \ + MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ + SymbolVar p0, const Param& param = {}, \ + const OperatorNodeConfig& config = {}); \ } //! 
define a megdnn opr wrapper class with 2 inputs for forward -#define MGB_DEFINE_MEGDNN_OPR_WRAPPER_FWD2(_name) \ - MGB_DEFINE_OPR_CLASS(_name, intl::MegDNNOprWrapperFwd) \ -public: \ - _name(VarNode* p0, VarNode* p1, const Param& param, \ - const OperatorNodeConfig& config); \ - MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ - SymbolVar p0, SymbolVar p1, const Param& param = {}, \ - const OperatorNodeConfig& config = {}); \ +#define MGB_DEFINE_MEGDNN_OPR_WRAPPER_FWD2(_name) \ + MGB_DEFINE_OPR_CLASS(_name, intl::MegDNNOprWrapperFwd) \ + public: \ + _name(VarNode* p0, VarNode* p1, const Param& param, \ + const OperatorNodeConfig& config); \ + MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ + SymbolVar p0, SymbolVar p1, const Param& param = {}, \ + const OperatorNodeConfig& config = {}); \ } //! define a megdnn opr wrapper class with 3 inputs for grad #define MGB_DEFINE_MEGDNN_OPR_WRAPPER_BWD3(_name, _extra...) \ MGB_DEFINE_OPR_CLASS(_name, intl::MegDNNOprWrapperBwd) \ - _extra public : _name(VarNode* p0, VarNode* p1, VarNode* p2, const Param& param, \ - const OperatorNodeConfig& config); \ - MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ - SymbolVar p0, SymbolVar p1, SymbolVar p2, const Param& param = {}, \ - const OperatorNodeConfig& config = {}); \ + _extra public : _name(VarNode* p0, VarNode* p1, VarNode* p2, \ + const Param& param, const OperatorNodeConfig& config); \ + MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ + SymbolVar p0, SymbolVar p1, SymbolVar p2, const Param& param = {}, \ + const OperatorNodeConfig& config = {}); \ } // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/include/megbrain/opr/rand.h b/src/opr/include/megbrain/opr/rand.h index 1424dca7..e7199ccf 100644 --- a/src/opr/include/megbrain/opr/rand.h +++ b/src/opr/include/megbrain/opr/rand.h @@ -40,25 +40,25 @@ protected: }; /* ================= RNG with shape ================= */ -#define _DEFINE_RNG_OPR_WITH_SHAPE_CLASS(RNG) \ - MGB_DEFINE_OPR_CLASS_WITH_EXPORT(RNG, RNGOprBase) \ - cg::OperatorNodeBase::NodeProp* do_make_node_prop() const override; \ - \ -public: \ - RNG(VarNode* shape, const Param& param, const OperatorNodeConfig& config); \ - MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ - SymbolVar shape, const Param& param = {}, \ - const OperatorNodeConfig& config = {}); \ - static SymbolVar make( \ - ComputingGraph& graph, const TensorShape& shape, \ - const OperatorNodeConfig& config, const Param& param = {}) { \ - return make( \ - var_from_tensor_shape(graph, config, "rng", shape), param, config); \ - } \ - void init_output_static_infer_desc() override; \ - void scn_do_execute() override; \ - } \ - ; +#define _DEFINE_RNG_OPR_WITH_SHAPE_CLASS(RNG) \ + MGB_DEFINE_OPR_CLASS_WITH_EXPORT(RNG, RNGOprBase) \ + cg::OperatorNodeBase::NodeProp* do_make_node_prop() const override; \ + \ + public: \ + RNG(VarNode* shape, const Param& param, const OperatorNodeConfig& config); \ + MGE_WIN_DECLSPEC_FUC static SymbolVar make( \ + SymbolVar shape, const Param& param = {}, \ + const OperatorNodeConfig& config = {}); \ + static SymbolVar make( \ + ComputingGraph& graph, const TensorShape& shape, \ + const OperatorNodeConfig& config, const Param& param = {}) { \ + return make( \ + var_from_tensor_shape(graph, config, "rng", shape), param, \ + config); \ + } \ + void init_output_static_infer_desc() override; \ + void scn_do_execute() override; \ + }; _DEFINE_RNG_OPR_WITH_SHAPE_CLASS(UniformRNG) _DEFINE_RNG_OPR_WITH_SHAPE_CLASS(GaussianRNG) @@ -66,20 +66,19 @@ 
_DEFINE_RNG_OPR_WITH_SHAPE_CLASS(PermutationRNG) #undef _DEFINE_RNG_OPR_WITH_SHAPE_CLASS /* ================= RNG with input ================= */ -#define _DEFINE_RNG_OPR_WITH_INPUT_CLASS(RNG) \ - MGB_DEFINE_OPR_CLASS_WITH_EXPORT(RNG, RNGOprBase) \ - void add_input_layout_constraint() override; \ - cg::OperatorNodeBase::NodeProp* do_make_node_prop() const override; \ - \ -public: \ - RNG(_INPUTS(VarNode*), const Param& param, const OperatorNodeConfig& config); \ - MGE_WIN_DECLSPEC_FUC static _OUTPUTS make( \ - _INPUTS(SymbolVar), const Param& param = {}, \ - const OperatorNodeConfig& config = {}); \ - void init_output_static_infer_desc() override; \ - void scn_do_execute() override; \ - } \ - ; +#define _DEFINE_RNG_OPR_WITH_INPUT_CLASS(RNG) \ + MGB_DEFINE_OPR_CLASS_WITH_EXPORT(RNG, RNGOprBase) \ + void add_input_layout_constraint() override; \ + cg::OperatorNodeBase::NodeProp* do_make_node_prop() const override; \ + \ + public: \ + RNG(_INPUTS(VarNode*), const Param& param, const OperatorNodeConfig& config); \ + MGE_WIN_DECLSPEC_FUC static _OUTPUTS make( \ + _INPUTS(SymbolVar), const Param& param = {}, \ + const OperatorNodeConfig& config = {}); \ + void init_output_static_infer_desc() override; \ + void scn_do_execute() override; \ + }; /* ================= 1 input ================= */ #define _INPUTS(preifx) preifx i0 @@ -88,6 +87,7 @@ _DEFINE_RNG_OPR_WITH_INPUT_CLASS(PoissonRNG) #undef _OUTPUTS #define _OUTPUTS SymbolVarArray _DEFINE_RNG_OPR_WITH_INPUT_CLASS(ShuffleRNGForward) +_DEFINE_RNG_OPR_WITH_INPUT_CLASS(DropoutForward) #undef _OUTPUTS #undef _INPUTS @@ -100,7 +100,7 @@ _DEFINE_RNG_OPR_WITH_INPUT_CLASS(GammaRNG) #undef _INPUTS #undef _DEFINE_RNG_OPR_WITH_INPUT_CLASS -} // intl +} // namespace intl using UniformRNG = intl::UniformRNG; using GaussianRNG = intl::GaussianRNG; @@ -109,18 +109,39 @@ using PermutationRNG = intl::PermutationRNG; using PoissonRNG = intl::PoissonRNG; using BetaRNG = intl::BetaRNG; using ShuffleRNG = intl::ShuffleRNGForward; +using Dropout = intl::DropoutForward; +using DropoutForward = intl::DropoutForward; MGB_DEFINE_OPR_CLASS_WITH_EXPORT( - ShuffleRNGBackward, - intl::MegDNNOprWrapperBwd) //{ + ShuffleRNGBackward, intl::MegDNNOprWrapperBwd) // { public: -ShuffleRNGBackward( - VarNode* out_diff, VarNode* indices, VarNode* result_shape, const Param& param, - const OperatorNodeConfig& config); + ShuffleRNGBackward( + VarNode* out_diff, VarNode* indices, VarNode* result_shape, + const Param& param, const OperatorNodeConfig& config); -MGE_WIN_DECLSPEC_FUC static SymbolVar make( - SymbolVar out_diff, SymbolVar indices, SymbolVar result_shape, - const Param& param = {}, const OperatorNodeConfig& config = {}); + MGE_WIN_DECLSPEC_FUC static SymbolVar make( + SymbolVar out_diff, SymbolVar indices, SymbolVar result_shape, + const Param& param = {}, const OperatorNodeConfig& config = {}); +}; + +MGB_DEFINE_OPR_CLASS_WITH_EXPORT( + DropoutBackward, intl::MegDNNOprWrapperBwd) // { +public: + MGE_WIN_DECLSPEC_FUC DropoutBackward( + VarNode* doup, VarNode* mask, const Param& param, + const OperatorNodeConfig& config); + + MGE_WIN_DECLSPEC_FUC static SymbolVar make( + SymbolVar doup, SymbolVar mask, const Param& param = {}, + const OperatorNodeConfig& config = {}); + +private: + void init_output_static_infer_desc() override; + void init_output_dtype() override; + size_t get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const override; + void scn_do_execute() override; }; } // namespace opr diff --git 
a/src/opr/test/dnn/layer_norm.cpp b/src/opr/test/dnn/layer_norm.cpp new file mode 100644 index 00000000..15db672c --- /dev/null +++ b/src/opr/test/dnn/layer_norm.cpp @@ -0,0 +1,108 @@ +/** + * \file src/opr/test/dnn/layer_norm.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "megbrain/opr/dnn/layer_norm.h" +#include "megbrain/comp_node_env.h" +#include "megbrain/test/autocheck.h" +#include "megbrain/test/helper.h" +#include "megbrain/test/megdnn_helper.h" + +#include "megdnn/oprs.h" + +#include +#include +#include +#include + +using namespace mgb; + +namespace { +using Param = opr::LayerNormForward::Param; + +void run_forward(bool is_affine, size_t normalized_size) { + using Checker = AutoOprChecker<3, 3>; + + Param param; + param.eps = 1e-5; + param.affine = is_affine; + param.normalized_dim = 1; + param.normalized_size = normalized_size; + + auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray { + auto out = opr::LayerNormForward::make(inputs[0], inputs[1], inputs[2], param); + return {out[0], out[1], out[2]}; + }; + + auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { + auto opr = + MegDNNHandle::get(CompNodeEnv::from_comp_node(CompNode::default_cpu())) + ->create_operator(); + auto inp_shape = inp[0]->shape(); + auto n_slices = inp_shape[0]; + auto slice_len = inp_shape[1]; + + opr->param() = param; + + dest[0].dtype(dtype::Float32()) + .comp_node(inp[0]->comp_node()) + .resize({n_slices, slice_len}); + dest[1].dtype(dtype::Float32()) + .comp_node(inp[0]->comp_node()) + .resize({n_slices}); + dest[2].dtype(dtype::Float32()) + .comp_node(inp[0]->comp_node()) + .resize({n_slices}); + opr->exec( + inp[0]->as_megdnn(), inp[1]->as_megdnn(), inp[2]->as_megdnn(), + dest[0].as_megdnn(), dest[1].as_megdnn(), dest[2].as_megdnn(), {}); + }; + + auto gen = [&](HostTensorND& src) { + HostTensorGenerator src_gen(0.f); + src = *src_gen(src.shape(), src.comp_node()); + }; + + Checker::RunOptions option; + option.numdiff_max_err = 1e-4; + Checker checker{make_graph, fwd}; + + checker.set_input_generator(0, gen); + checker.set_input_generator(1, gen); + checker.set_input_generator(2, gen); + checker.set_input_allow_grad(0, false); + checker.set_input_allow_grad(1, false); + checker.set_input_allow_grad(2, false); + checker.set_output_allow_grad(0, false); + checker.set_output_allow_grad(1, false); + checker.set_output_allow_grad(2, false); + + checker.run({TensorShape{normalized_size, normalized_size}, + TensorShape{normalized_size}, TensorShape{normalized_size}}, + option) + .run({TensorShape{normalized_size, normalized_size}, + TensorShape{normalized_size}, TensorShape{normalized_size}}, + option) + .run({TensorShape{normalized_size, normalized_size}, + TensorShape{normalized_size}, TensorShape{normalized_size}}, + option); +} + +TEST(TestOprDNN, LayerNormForwardAffine) { + REQUIRE_GPU(1); + run_forward(true, 1); + run_forward(true, 16); + run_forward(true, 17); +} + +} // anonymous namespace + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/test/rand.cpp b/src/opr/test/rand.cpp index b584629a..a7171a19 100644 --- a/src/opr/test/rand.cpp +++ b/src/opr/test/rand.cpp @@ -446,4 
+446,42 @@ TEST(TestOprRand, PermutationReprod) {
     });
 }
 
+TEST(TestOprRand, Dropout) {
+    auto run = [&](TensorShape shape, uint64_t seed, float drop_prob) {
+        using Param = megdnn::DropoutBase::Param;
+        Param param(drop_prob, seed);
+        float scale = 1.0 / (1.0 - drop_prob);
+
+        std::shared_ptr<HostTensorND> inp_host(
+                new HostTensorND{CompNode::load("xpux"), shape, dtype::Float32()});
+        for (size_t i = 0; i < shape.total_nr_elems(); ++i) {
+            inp_host->ptr<float>()[i] = 1.0f;
+        }
+
+        auto graph = ComputingGraph::make();
+        auto inp_sym = opr::Host2DeviceCopy::make(*graph, inp_host);
+        auto outs = opr::DropoutForward::make(inp_sym, param);
+
+        HostTensorND oup_host, mask_host, ws_host;
+        auto func = graph->compile(
+                {make_callback_copy(outs[0], oup_host),
+                 make_callback_copy(outs[1], mask_host)});
+        func->execute();
+
+        size_t dropped_cnt = 0;
+        for (size_t i = 0; i < shape.total_nr_elems(); ++i) {
+            ASSERT_TRUE(
+                    oup_host.ptr<float>()[i] == 0 ||
+                    oup_host.ptr<float>()[i] == scale);
+            if (oup_host.ptr<float>()[i] == 0) {
+                dropped_cnt++;
+            }
+        }
+        float real_drop = dropped_cnt * 1.0 / shape.total_nr_elems();
+        ASSERT_LT(std::abs(drop_prob - real_drop), 1e-2);
+    };
+    run({100000}, 0, 0.2);
+    run({64, 32, 16, 16}, 1, 0.4);
+}
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/serialization/impl/schema.fbs b/src/serialization/impl/schema.fbs
index f91477e6..7b3aa4ed 100644
--- a/src/serialization/impl/schema.fbs
+++ b/src/serialization/impl/schema.fbs
@@ -116,6 +116,8 @@ union OperatorParam {
     param.Padding = 82,
     param.ShuffleRNG = 83,
     param.CheckNonFinite = 84,
+    param.LayerNorm = 85,
+    param.Dropout = 86,
 }
 
 table Operator {
diff --git a/src/tensorrt/impl/tensorrt_runtime_opr.cpp b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
index ef232d8c..822116bb 100644
--- a/src/tensorrt/impl/tensorrt_runtime_opr.cpp
+++ b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
@@ -107,6 +107,7 @@ TensorRTRuntimeOpr::TensorRTRuntimeOpr(
 void TensorRTRuntimeOpr::get_output_var_shape(
         const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
     auto batch = inp_shape.at(0)[0];
+    m_manager.clear_trt_context();
     m_manager.create_trt_context(this->comp_node(), inp_shape, m_engine.get());
     auto get_mgb_shape = [&](int binding_idx) -> TensorShape {
         auto dims = m_engine->getBindingDimensions(binding_idx);
@@ -217,6 +218,12 @@ SymbolVarArray TensorRTRuntimeOpr::make(
         std::shared_ptr engine, std::shared_ptr gpu_allocator,
         const SymbolVarArray& src, const OperatorNodeConfig& config) {
+    mgb_assert(
+            NV_TENSORRT_VERSION == getInferLibVersion(),
+            "TensorRT version mismatch: compiled with %d but detected %d at runtime; "
+            "this may be caused by a customized environment, for example "
+            "LD_LIBRARY_PATH on Linux or PATH on Windows!",
+            NV_TENSORRT_VERSION, getInferLibVersion());
     VarNodeArray var_node_array = cg::to_var_node_array(src);
     auto tensor_rt_opr = std::make_unique<TensorRTRuntimeOpr>(
             std::move(engine), std::move(gpu_allocator), var_node_array, config);
diff --git a/src/version.ld b/src/version.ld
index db71a72b..f70a5677 100644
--- a/src/version.ld
+++ b/src/version.ld
@@ -13,8 +13,6 @@ global:
     base_exceptions*;
   };
   megcore*;
-  *GroupClientProxy*;
-  *create_zmqrpc_server*;
   *custom*;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index dd4b2c7e..030e5e6e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,52 +1,63 @@
 include_directories("./src/include")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
-file(GLOB_RECURSE SOURCES ./*.cpp ../src/core/test/*.cpp ../src/gopt/test/*.cpp ../src/opr/test/*.cpp 
../src/plugin/test/*.cpp ../src/serialization/test/*.cpp) +file( + GLOB_RECURSE + SOURCES + ./*.cpp + ../src/core/test/*.cpp + ../src/gopt/test/*.cpp + ../src/opr/test/*.cpp + ../src/plugin/test/*.cpp + ../src/serialization/test/*.cpp) if(MGE_WITH_JIT) - file(GLOB_RECURSE SOURCES_ ../src/jit/test/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ ../src/jit/test/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() if(MGE_WITH_DISTRIBUTED) - file(GLOB_RECURSE SOURCES_ ../src/opr-mm/test/*.cpp) - list(APPEND SOURCES ${SOURCES_}) + file(GLOB_RECURSE SOURCES_ ../src/opr-mm/test/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() -if (MGE_WITH_CUDA AND MGE_WITH_TRT) - file(GLOB_RECURSE SOURCES_ ../src/tensorrt/test/*.cpp) - list(APPEND SOURCES ${SOURCES_}) +if(MGE_WITH_CUDA AND MGE_WITH_TRT) + file(GLOB_RECURSE SOURCES_ ../src/tensorrt/test/*.cpp) + list(APPEND SOURCES ${SOURCES_}) endif() add_executable(megbrain_test ${SOURCES}) if(WIN32 OR MSVC) - target_compile_definitions(megbrain_test PRIVATE MGE_WINDOWS_STATIC_LINK) + target_compile_definitions(megbrain_test PRIVATE MGE_WINDOWS_STATIC_LINK) endif() target_link_libraries(megbrain_test gtest gmock) target_link_libraries(megbrain_test megbrain megdnn ${MGE_CUDA_LIBS}) -if (MGE_WITH_CUDA) - target_include_directories(megbrain_test PRIVATE ${CUDNN_INCLUDE_DIR}) +if(MGE_WITH_CUDA) + target_include_directories(megbrain_test PRIVATE ${CUDNN_INCLUDE_DIR}) endif() if(CXX_SUPPORT_WCLASS_MEMACCESS) - if(MGE_WITH_CUDA) - target_compile_options(megbrain_test PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" - "$<$>:-Wno-class-memaccess>") - else() - target_compile_options(megbrain_test PRIVATE "-Wno-class-memaccess") - endif() + if(MGE_WITH_CUDA) + target_compile_options( + megbrain_test + PRIVATE "$<$:-Xcompiler=-Wno-class-memaccess>" + "$<$>:-Wno-class-memaccess>") + else() + target_compile_options(megbrain_test PRIVATE "-Wno-class-memaccess") + endif() endif() if(UNIX) - if(APPLE OR ANDROID) - target_link_libraries(megbrain_test dl) - else() - target_link_libraries(megbrain_test dl rt) - endif() + if(APPLE OR ANDROID) + target_link_libraries(megbrain_test dl) + else() + target_link_libraries(megbrain_test dl rt) + endif() endif() -if (MGE_WITH_DISTRIBUTED) - target_link_libraries(megbrain_test megray) +if(MGE_WITH_DISTRIBUTED) + target_link_libraries(megbrain_test megray) endif() if(MGE_WITH_JIT) - if(MGE_WITH_JIT_MLIR) - add_subdirectory(${PROJECT_SOURCE_DIR}/src/jit/test/mlir ${CMAKE_CURRENT_BINARY_DIR}/../src/jit/test/mlir) - endif() + if(MGE_WITH_JIT_MLIR) + add_subdirectory(${PROJECT_SOURCE_DIR}/src/jit/test/mlir + ${CMAKE_CURRENT_BINARY_DIR}/../src/jit/test/mlir) + endif() endif() diff --git a/toolchains/aarch64-linux-gnu.toolchain.cmake b/toolchains/aarch64-linux-gnu.toolchain.cmake index 525817b3..cb09256f 100644 --- a/toolchains/aarch64-linux-gnu.toolchain.cmake +++ b/toolchains/aarch64-linux-gnu.toolchain.cmake @@ -2,8 +2,8 @@ set(ARM_CROSS_BUILD_ARCH aarch64) set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc") set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++") if("$ENV{FORCE_CHECK_UNUSED_PARAMETER}" STREQUAL "true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") endif() set(CMAKE_STRIP "aarch64-linux-gnu-strip") set(CMAKE_SYSTEM_PROCESSOR aarch64) diff --git a/toolchains/aarch64-none-linux-gnu.toolchain.cmake 
b/toolchains/aarch64-none-linux-gnu.toolchain.cmake index e16d3766..637c0c36 100644 --- a/toolchains/aarch64-none-linux-gnu.toolchain.cmake +++ b/toolchains/aarch64-none-linux-gnu.toolchain.cmake @@ -4,8 +4,8 @@ set(CMAKE_CXX_COMPILER "aarch64-none-linux-gnu-g++") set(CMAKE_C_FLAGS "-Wno-psabi") set(CMAKE_CXX_FLAGS "-Wno-psabi") if("$ENV{FORCE_CHECK_UNUSED_PARAMETER}" STREQUAL "true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") endif() set(CMAKE_STRIP "aarch64-none-linux-gnu-strip") set(CMAKE_SYSTEM_PROCESSOR aarch64) diff --git a/toolchains/arm-linux-gnueabi.toolchain.cmake b/toolchains/arm-linux-gnueabi.toolchain.cmake index 471b7806..bc4bc229 100644 --- a/toolchains/arm-linux-gnueabi.toolchain.cmake +++ b/toolchains/arm-linux-gnueabi.toolchain.cmake @@ -4,8 +4,8 @@ set(CMAKE_CXX_COMPILER "arm-linux-gnueabi-g++") set(CMAKE_C_FLAGS "-mfloat-abi=softfp -mfpu=neon-vfpv4 -Wno-psabi") set(CMAKE_CXX_FLAGS "-mfloat-abi=softfp -mfpu=neon-vfpv4 -Wno-psabi") if("$ENV{FORCE_CHECK_UNUSED_PARAMETER}" STREQUAL "true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") endif() set(CMAKE_STRIP "arm-linux-gnueabi-strip") set(CMAKE_SYSTEM_PROCESSOR armv7) diff --git a/toolchains/arm-linux-gnueabihf.toolchain.cmake b/toolchains/arm-linux-gnueabihf.toolchain.cmake index b9e36412..4e29ae6d 100644 --- a/toolchains/arm-linux-gnueabihf.toolchain.cmake +++ b/toolchains/arm-linux-gnueabihf.toolchain.cmake @@ -4,8 +4,8 @@ set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++") set(CMAKE_C_FLAGS "-mfloat-abi=hard -mfpu=neon-vfpv4 -Wno-psabi") set(CMAKE_CXX_FLAGS "-mfloat-abi=hard -mfpu=neon-vfpv4 -Wno-psabi") if("$ENV{FORCE_CHECK_UNUSED_PARAMETER}" STREQUAL "true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=unused-parameter") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-parameter") endif() set(CMAKE_STRIP "arm-linux-gnueabihf-strip") set(CMAKE_SYSTEM_PROCESSOR armv7) diff --git a/toolchains/ios.toolchain.cmake b/toolchains/ios.toolchain.cmake index 26eabf51..c57174ff 100644 --- a/toolchains/ios.toolchain.cmake +++ b/toolchains/ios.toolchain.cmake @@ -1,103 +1,87 @@ # This file is part of the ios-cmake project. It was retrieved from # https://github.com/cristeab/ios-cmake.git, which is a fork of -# https://code.google.com/p/ios-cmake/. Which in turn is based off of -# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which -# are included with CMake 2.8.4 +# https://code.google.com/p/ios-cmake/. Which in turn is based off of the +# Platform/Darwin.cmake and Platform/UnixPaths.cmake files which are included with CMake +# 2.8.4 # # The ios-cmake project is licensed under the new BSD license. # -# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software, -# Kitware, Inc., Insight Software Consortium. All rights reserved. -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# 1. 
Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. +# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software, Kitware, Inc., +# Insight Software Consortium. All rights reserved. Redistribution and use in source +# and binary forms, with or without modification, are permitted provided that the +# following conditions are met: 1. Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following disclaimer. # -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. +# 1. Redistributions in binary form must reproduce the above copyright notice, this list +# of conditions and the following disclaimer in the documentation and/or other +# materials provided with the distribution. # -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. +# 1. Neither the name of the copyright holder nor the names of its contributors may be +# used to endorse or promote products derived from this software without specific +# prior written permission. # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +# DAMAGE. # -# This file is based off of the Platform/Darwin.cmake and -# Platform/UnixPaths.cmake files which are included with CMake 2.8.4 -# It has been altered for iOS development. +# This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake files +# which are included with CMake 2.8.4 It has been altered for iOS development. 
# # Updated by Alex Stewart (alexs.mac@gmail.com) # # ***************************************************************************** -# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com) -# under the BSD-3-Clause license -# https://github.com/leetal/ios-cmake +# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com) under the +# BSD-3-Clause license https://github.com/leetal/ios-cmake # ***************************************************************************** # -# INFORMATION / HELP +# INFORMATION / HELP # # The following variables control the behaviour of this toolchain: # -# IOS_PLATFORM: OS (default) or SIMULATOR or SIMULATOR64 or TVOS or SIMULATOR_TVOS or WATCHOS or SIMULATOR_WATCHOS -# OS = Build for iPhoneOS. -# OS64 = Build for arm64 arm64e iPhoneOS. -# SIMULATOR = Build for x86 i386 iPhone Simulator. -# SIMULATOR64 = Build for x86_64 iPhone Simulator. -# TVOS = Build for AppleTVOS. -# SIMULATOR_TVOS = Build for x86_64 AppleTV Simulator. -# WATCHOS = Build for armv7k arm64_32 for WatchOS. -# SIMULATOR_WATCHOS = Build for x86_64 for Watch Simulator. -# CMAKE_OSX_SYSROOT: Path to the iOS SDK to use. By default this is -# automatically determined from IOS_PLATFORM and xcodebuild, but -# can also be manually specified (although this should not be required). -# CMAKE_IOS_DEVELOPER_ROOT: Path to the Developer directory for the iOS platform -# being compiled for. By default this is automatically determined from -# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should -# not be required). -# ENABLE_BITCODE: (1|0) Enables or disables bitcode support. Default 1 (true) -# ENABLE_ARC: (1|0) Enables or disables ARC support. Default 1 (true, ARC enabled by default) -# ENABLE_VISIBILITY: (1|0) Enables or disables symbol visibility support. Default 0 (false, visibility hidden by default) -# IOS_ARCH: (armv7 armv7s armv7k arm64 arm64e arm64_32 i386 x86_64) If specified, will override the default architectures for the given IOS_PLATFORM -# OS = armv7 armv7s arm64 arm64e (if applicable) -# OS64 = arm64 arm64e (if applicable) -# SIMULATOR = i386 x86_64 -# SIMULATOR64 = x86_64 -# TVOS = arm64 -# SIMULATOR_TVOS = x86_64 (i386 has since long been deprecated) -# WATCHOS = armv7k arm64_32 (if applicable) -# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) +# IOS_PLATFORM: OS (default) or SIMULATOR or SIMULATOR64 or TVOS or SIMULATOR_TVOS or +# WATCHOS or SIMULATOR_WATCHOS OS = Build for iPhoneOS. OS64 = Build for arm64 arm64e +# iPhoneOS. SIMULATOR = Build for x86 i386 iPhone Simulator. SIMULATOR64 = Build for +# x86_64 iPhone Simulator. TVOS = Build for AppleTVOS. SIMULATOR_TVOS = Build for x86_64 +# AppleTV Simulator. WATCHOS = Build for armv7k arm64_32 for WatchOS. SIMULATOR_WATCHOS +# = Build for x86_64 for Watch Simulator. CMAKE_OSX_SYSROOT: Path to the iOS SDK to use. +# By default this is automatically determined from IOS_PLATFORM and xcodebuild, but can +# also be manually specified (although this should not be required). +# CMAKE_IOS_DEVELOPER_ROOT: Path to the Developer directory for the iOS platform being +# compiled for. By default this is automatically determined from CMAKE_OSX_SYSROOT, but +# can also be manually specified (although this should not be required). ENABLE_BITCODE: +# (1|0) Enables or disables bitcode support. Default 1 (true) ENABLE_ARC: (1|0) Enables +# or disables ARC support. Default 1 (true, ARC enabled by default) ENABLE_VISIBILITY: +# (1|0) Enables or disables symbol visibility support. 
Default 0 (false, visibility +# hidden by default) IOS_ARCH: (armv7 armv7s armv7k arm64 arm64e arm64_32 i386 x86_64) +# If specified, will override the default architectures for the given IOS_PLATFORM OS = +# armv7 armv7s arm64 arm64e (if applicable) OS64 = arm64 arm64e (if applicable) +# SIMULATOR = i386 x86_64 SIMULATOR64 = x86_64 TVOS = arm64 SIMULATOR_TVOS = x86_64 +# (i386 has since long been deprecated) WATCHOS = armv7k arm64_32 (if applicable) +# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) # # This toolchain defines the following variables for use externally: # # XCODE_VERSION: Version number (not including Build version) of Xcode detected. -# IOS_SDK_VERSION: Version of iOS SDK being used. -# CMAKE_OSX_ARCHITECTURES: Architectures being compiled for (generated from -# IOS_PLATFORM). +# IOS_SDK_VERSION: Version of iOS SDK being used. CMAKE_OSX_ARCHITECTURES: Architectures +# being compiled for (generated from IOS_PLATFORM). # # This toolchain defines the following macros for use externally: # -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) -# A convenience macro for setting xcode specific properties on targets. -# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). +# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) A convenience +# macro for setting xcode specific properties on targets. Available variants are: All, +# Release, RelWithDebInfo, Debug, MinSizeRel example: set_xcode_property (myioslib +# IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). # -# find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the -# iOS environment. Thanks to the android-cmake project for providing the -# command. +# find_host_package (PROGRAM ARGS) A macro used to find executable programs on the host +# system, not within the iOS environment. Thanks to the android-cmake project for +# providing the command. # Fix for PThread library not in path set(CMAKE_THREAD_LIBS_INIT "-lpthread") @@ -106,57 +90,58 @@ set(CMAKE_USE_WIN32_THREADS_INIT 0) set(CMAKE_USE_PTHREADS_INIT 1) # Get the Xcode version being used. -execute_process(COMMAND xcodebuild -version +execute_process( + COMMAND xcodebuild -version OUTPUT_VARIABLE XCODE_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") message(STATUS "Building with Xcode version: ${XCODE_VERSION}") -# Default to building for iPhoneOS if not specified otherwise, and we cannot -# determine the platform from the CMAKE_OSX_ARCHITECTURES variable. The use -# of CMAKE_OSX_ARCHITECTURES is such that try_compile() projects can correctly -# determine the value of IOS_PLATFORM from the root project, as -# CMAKE_OSX_ARCHITECTURES is propagated to them by CMake. -if (NOT DEFINED IOS_PLATFORM) - if (CMAKE_OSX_ARCHITECTURES) - if (CMAKE_OSX_ARCHITECTURES MATCHES ".*arm.*") +# Default to building for iPhoneOS if not specified otherwise, and we cannot determine +# the platform from the CMAKE_OSX_ARCHITECTURES variable. The use of +# CMAKE_OSX_ARCHITECTURES is such that try_compile() projects can correctly determine +# the value of IOS_PLATFORM from the root project, as CMAKE_OSX_ARCHITECTURES is +# propagated to them by CMake. 
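For reference, a minimal sketch of how the cache variables documented in the INFORMATION / HELP block above might be pre-seeded for an arm64 device build. The values are illustrative assumptions only, not recommendations; the variable names and doc strings are the ones this toolchain file itself defines:

# Illustrative cache pre-seed for an arm64 iPhoneOS build (values are examples).
set(IOS_PLATFORM "OS64" CACHE STRING "Type of iOS platform for which to build.")
set(IOS_ARCH arm64 CACHE STRING "")
set(IOS_DEPLOYMENT_TARGET "10.0" CACHE STRING "Minimum iOS version to build for.")
set(ENABLE_BITCODE FALSE CACHE BOOL "Whether or not to enable bitcode")

In practice the same settings are usually passed as -D options on the cmake command line together with -DCMAKE_TOOLCHAIN_FILE=toolchains/ios.toolchain.cmake, which has the same effect as the cache entries shown here.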
+if(NOT DEFINED IOS_PLATFORM) + if(CMAKE_OSX_ARCHITECTURES) + if(CMAKE_OSX_ARCHITECTURES MATCHES ".*arm.*") set(IOS_PLATFORM "OS") - elseif (CMAKE_OSX_ARCHITECTURES MATCHES "i386") + elseif(CMAKE_OSX_ARCHITECTURES MATCHES "i386") set(IOS_PLATFORM "SIMULATOR") - elseif (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") + elseif(CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") set(IOS_PLATFORM "SIMULATOR64") - elseif (CMAKE_OSX_ARCHITECTURES MATCHES "armv7k") + elseif(CMAKE_OSX_ARCHITECTURES MATCHES "armv7k") set(IOS_PLATFORM "WATCHOS") endif() endif() - if (NOT IOS_PLATFORM) + if(NOT IOS_PLATFORM) set(IOS_PLATFORM "OS") endif() endif() -set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING - "Type of iOS platform for which to build.") -# Determine the platform name and architectures for use in xcodebuild commands -# from the specified IOS_PLATFORM name. -if (IOS_PLATFORM STREQUAL "OS") +set(IOS_PLATFORM + ${IOS_PLATFORM} + CACHE STRING "Type of iOS platform for which to build.") +# Determine the platform name and architectures for use in xcodebuild commands from the +# specified IOS_PLATFORM name. +if(IOS_PLATFORM STREQUAL "OS") set(XCODE_IOS_PLATFORM iphoneos) if(NOT IOS_ARCH) - if (XCODE_VERSION VERSION_GREATER 10.0) + if(XCODE_VERSION VERSION_GREATER 10.0) set(IOS_ARCH armv7 armv7s arm64 arm64e) else() set(IOS_ARCH armv7 armv7s arm64) endif() endif() - elseif (IOS_PLATFORM STREQUAL "OS64") +elseif(IOS_PLATFORM STREQUAL "OS64") set(XCODE_IOS_PLATFORM iphoneos) if(NOT IOS_ARCH) - if (XCODE_VERSION VERSION_GREATER 10.0) + if(XCODE_VERSION VERSION_GREATER 10.0) set(IOS_ARCH arm64 arm64e) else() set(IOS_ARCH arm64) endif() endif() -elseif (IOS_PLATFORM STREQUAL "SIMULATOR") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR") set(XCODE_IOS_PLATFORM iphonesimulator) if(NOT IOS_ARCH) set(IOS_ARCH i386 x86_64) @@ -166,26 +151,26 @@ elseif(IOS_PLATFORM STREQUAL "SIMULATOR64") if(NOT IOS_ARCH) set(IOS_ARCH x86_64) endif() -elseif (IOS_PLATFORM STREQUAL "TVOS") +elseif(IOS_PLATFORM STREQUAL "TVOS") set(XCODE_IOS_PLATFORM appletvos) if(NOT IOS_ARCH) set(IOS_ARCH arm64) endif() -elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") set(XCODE_IOS_PLATFORM appletvsimulator) if(NOT IOS_ARCH) set(IOS_ARCH x86_64) endif() -elseif (IOS_PLATFORM STREQUAL "WATCHOS") +elseif(IOS_PLATFORM STREQUAL "WATCHOS") set(XCODE_IOS_PLATFORM watchos) if(NOT IOS_ARCH) - if (XCODE_VERSION VERSION_GREATER 10.0) + if(XCODE_VERSION VERSION_GREATER 10.0) set(IOS_ARCH armv7k arm64_32) else() set(IOS_ARCH armv7k) endif() endif() -elseif (IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") set(XCODE_IOS_PLATFORM watchsimulator) if(NOT IOS_ARCH) set(IOS_ARCH x86_64) @@ -194,130 +179,166 @@ else() message(FATAL_ERROR "Invalid IOS_PLATFORM: ${IOS_PLATFORM}") endif() message(STATUS "Configuring iOS build for platform: ${IOS_PLATFORM}, " - "architecture(s): ${IOS_ARCH}") + "architecture(s): ${IOS_ARCH}") # If user did not specify the SDK root to use, then query xcodebuild for it. -execute_process(COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path - OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT - OUTPUT_QUIET ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) +execute_process( + COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT + OUTPUT_QUIET ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) # If user did not specify the SDK root to use, then query xcodebuild for it. 
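The block below falls back to querying xcodebuild whenever CMAKE_OSX_SYSROOT was not supplied. As a hedged illustration only (the SDK path shown is a typical default install location, not something this file guarantees), a caller that wants to pin the SDK explicitly could set it up front and skip the query:

# Hypothetical explicit SDK selection; adjust the path to the locally installed SDK.
set(CMAKE_OSX_SYSROOT
    "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk"
    CACHE PATH "Path to the iOS SDK to use.")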
-if (NOT DEFINED CMAKE_OSX_SYSROOT OR (NOT CMAKE_OSX_SYSROOT STREQUAL CMAKE_OSX_SYSROOT_INT)) - execute_process(COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path +if(NOT DEFINED CMAKE_OSX_SYSROOT OR (NOT CMAKE_OSX_SYSROOT STREQUAL + CMAKE_OSX_SYSROOT_INT)) + execute_process( + COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path OUTPUT_VARIABLE CMAKE_OSX_SYSROOT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) endif() -if (NOT EXISTS ${CMAKE_OSX_SYSROOT}) - message(SEND_ERROR "Please make sure that Xcode is installed and that the toolchain" - "is pointing to the correct path. Please run:" - "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer" - "and see if that fixes the problem for you.") +if(NOT EXISTS ${CMAKE_OSX_SYSROOT}) + message( + SEND_ERROR "Please make sure that Xcode is installed and that the toolchain" + "is pointing to the correct path. Please run:" + "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer" + "and see if that fixes the problem for you.") message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} " - "does not exist.") + "does not exist.") elseif(DEFINED CMAKE_OSX_SYSROOT) - message(STATUS "Using manually set SDK path: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") + message( + STATUS + "Using manually set SDK path: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") else() - message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") + message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") endif() # Specify minimum version of deployment target. -if (NOT DEFINED IOS_DEPLOYMENT_TARGET) - if (IOS_PLATFORM STREQUAL "WATCHOS" OR IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") - # Unless specified, SDK version 2.0 is used by default as minimum target version (watchOS). - set(IOS_DEPLOYMENT_TARGET "2.0" - CACHE STRING "Minimum iOS version to build for." ) +if(NOT DEFINED IOS_DEPLOYMENT_TARGET) + if(IOS_PLATFORM STREQUAL "WATCHOS" OR IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") + # Unless specified, SDK version 2.0 is used by default as minimum target version + # (watchOS). + set(IOS_DEPLOYMENT_TARGET + "2.0" + CACHE STRING "Minimum iOS version to build for.") else() - # Unless specified, SDK version 10.0 is used by default as minimum target version (iOS, tvOS). - set(IOS_DEPLOYMENT_TARGET "10.0" - CACHE STRING "Minimum iOS version to build for." ) + # Unless specified, SDK version 10.0 is used by default as minimum target version + # (iOS, tvOS). + set(IOS_DEPLOYMENT_TARGET + "10.0" + CACHE STRING "Minimum iOS version to build for.") endif() - message(STATUS "Using the default min-version since IOS_DEPLOYMENT_TARGET not provided!") + message( + STATUS "Using the default min-version since IOS_DEPLOYMENT_TARGET not provided!") endif() # Use bitcode or not -if (NOT DEFINED ENABLE_BITCODE AND NOT IOS_ARCH MATCHES "((^|, )(i386|x86_64))+") +if(NOT DEFINED ENABLE_BITCODE AND NOT IOS_ARCH MATCHES "((^|, )(i386|x86_64))+") # Unless specified, enable bitcode support by default - set(ENABLE_BITCODE TRUE CACHE BOOL "Whether or not to enable bitcode") + set(ENABLE_BITCODE + TRUE + CACHE BOOL "Whether or not to enable bitcode") message(STATUS "Enabling bitcode support by default. ENABLE_BITCODE not provided!") endif() -if (NOT DEFINED ENABLE_BITCODE) - message(STATUS "Disabling bitcode support by default on simulators. 
ENABLE_BITCODE not provided for override!") +if(NOT DEFINED ENABLE_BITCODE) + message( + STATUS + "Disabling bitcode support by default on simulators. ENABLE_BITCODE not provided for override!" + ) endif() # Use ARC or not -if (NOT DEFINED ENABLE_ARC) +if(NOT DEFINED ENABLE_ARC) # Unless specified, enable ARC support by default - set(ENABLE_ARC TRUE CACHE BOOL "Whether or not to enable ARC") + set(ENABLE_ARC + TRUE + CACHE BOOL "Whether or not to enable ARC") message(STATUS "Enabling ARC support by default. ENABLE_ARC not provided!") endif() # Use hidden visibility or not -if (NOT DEFINED ENABLE_VISIBILITY) +if(NOT DEFINED ENABLE_VISIBILITY) # Unless specified, disable symbols visibility by default - set(ENABLE_VISIBILITY FALSE CACHE BOOL "Whether or not to hide symbols (-fvisibility=hidden)") - message(STATUS "Hiding symbols visibility by default. ENABLE_VISIBILITY not provided!") + set(ENABLE_VISIBILITY + FALSE + CACHE BOOL "Whether or not to hide symbols (-fvisibility=hidden)") + message( + STATUS "Hiding symbols visibility by default. ENABLE_VISIBILITY not provided!") endif() # Get the SDK version information. -execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion +execute_process( + COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion OUTPUT_VARIABLE IOS_SDK_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -# Find the Developer root for the specific iOS platform being compiled for -# from CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in -# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain -# this information from xcrun or xcodebuild. -if (NOT CMAKE_IOS_DEVELOPER_ROOT) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +# Find the Developer root for the specific iOS platform being compiled for from +# CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in CMAKE_OSX_SYSROOT. There +# does not appear to be a direct way to obtain this information from xcrun or +# xcodebuild. +if(NOT CMAKE_IOS_DEVELOPER_ROOT) get_filename_component(IOS_PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT} PATH) get_filename_component(CMAKE_IOS_DEVELOPER_ROOT ${IOS_PLATFORM_SDK_DIR} PATH) endif() -if (NOT EXISTS ${CMAKE_IOS_DEVELOPER_ROOT}) +if(NOT EXISTS ${CMAKE_IOS_DEVELOPER_ROOT}) message(FATAL_ERROR "Invalid CMAKE_IOS_DEVELOPER_ROOT: " - "${CMAKE_IOS_DEVELOPER_ROOT} does not exist.") + "${CMAKE_IOS_DEVELOPER_ROOT} does not exist.") endif() # Find the C & C++ compilers for the specified SDK. -if (NOT CMAKE_C_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang +if(NOT CMAKE_C_COMPILER) + execute_process( + COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang OUTPUT_VARIABLE CMAKE_C_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}") endif() -if (NOT CMAKE_CXX_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ +if(NOT CMAKE_CXX_COMPILER) + execute_process( + COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ OUTPUT_VARIABLE CMAKE_CXX_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}") endif() # Find (Apple's) libtool. 
-execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find libtool +execute_process( + COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find libtool OUTPUT_VARIABLE IOS_LIBTOOL - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) message(STATUS "Using libtool: ${IOS_LIBTOOL}") -# Configure libtool to be used instead of ar + ranlib to build static libraries. -# This is required on Xcode 7+, but should also work on previous versions of -# Xcode. +# Configure libtool to be used instead of ar + ranlib to build static libraries. This is +# required on Xcode 7+, but should also work on previous versions of Xcode. set(CMAKE_C_CREATE_STATIC_LIBRARY - "${IOS_LIBTOOL} -static -o ") + "${IOS_LIBTOOL} -static -o ") set(CMAKE_CXX_CREATE_STATIC_LIBRARY - "${IOS_LIBTOOL} -static -o ") + "${IOS_LIBTOOL} -static -o ") # Get the version of Darwin (OS X) of the host. -execute_process(COMMAND uname -r +execute_process( + COMMAND uname -r OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) # Standard settings. -set(CMAKE_SYSTEM_NAME Darwin CACHE INTERNAL "") -set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION} CACHE INTERNAL "") -set(UNIX TRUE CACHE BOOL "") -set(APPLE TRUE CACHE BOOL "") -set(IOS TRUE CACHE BOOL "") -set(CMAKE_AR ar CACHE FILEPATH "" FORCE) -set(CMAKE_RANLIB ranlib CACHE FILEPATH "" FORCE) -# Force unset of OS X-specific deployment target (otherwise autopopulated), -# required as of cmake 2.8.10. -set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING - "Must be empty for iOS builds." FORCE) +set(CMAKE_SYSTEM_NAME + Darwin + CACHE INTERNAL "") +set(CMAKE_SYSTEM_VERSION + ${IOS_SDK_VERSION} + CACHE INTERNAL "") +set(UNIX + TRUE + CACHE BOOL "") +set(APPLE + TRUE + CACHE BOOL "") +set(IOS + TRUE + CACHE BOOL "") +set(CMAKE_AR + ar + CACHE FILEPATH "" FORCE) +set(CMAKE_RANLIB + ranlib + CACHE FILEPATH "" FORCE) +# Force unset of OS X-specific deployment target (otherwise autopopulated), required as +# of cmake 2.8.10. +set(CMAKE_OSX_DEPLOYMENT_TARGET + "" + CACHE STRING "Must be empty for iOS builds." FORCE) # Set the architectures for which to build. -set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS") -# Change the type of target generated for try_compile() so it'll work when cross-compiling +set(CMAKE_OSX_ARCHITECTURES + ${IOS_ARCH} + CACHE STRING "Build architecture for iOS") +# Change the type of target generated for try_compile() so it'll work when +# cross-compiling set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) # All iOS/Darwin specific settings - some may be redundant. set(CMAKE_SHARED_LIBRARY_PREFIX "lib") @@ -332,7 +353,8 @@ set(CMAKE_MODULE_EXISTS 1) set(CMAKE_DL_LIBS "") set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") -set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") +set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG + "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") if(IOS_ARCH MATCHES "((^|, )(arm64|arm64e|x86_64))+") @@ -350,35 +372,32 @@ message(STATUS "Building for minimum iOS version: ${IOS_DEPLOYMENT_TARGET}" # Note that only Xcode 7+ supports the newer more specific: # -m${XCODE_IOS_PLATFORM}-version-min flags, older versions of Xcode use: # -m(ios/ios-simulator)-version-min instead. 
-if (IOS_PLATFORM STREQUAL "OS" OR IOS_PLATFORM STREQUAL "OS64") - if (XCODE_VERSION VERSION_LESS 7.0) - set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") +if(IOS_PLATFORM STREQUAL "OS" OR IOS_PLATFORM STREQUAL "OS64") + if(XCODE_VERSION VERSION_LESS 7.0) + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") else() # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") + "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") endif() -elseif (IOS_PLATFORM STREQUAL "TVOS") - set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mtvos-version-min=${IOS_DEPLOYMENT_TARGET}") -elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") - set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mtvos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") -elseif (IOS_PLATFORM STREQUAL "WATCHOS") +elseif(IOS_PLATFORM STREQUAL "TVOS") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mtvos-version-min=${IOS_DEPLOYMENT_TARGET}") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mwatchos-version-min=${IOS_DEPLOYMENT_TARGET}") -elseif (IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") + "-mtvos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") +elseif(IOS_PLATFORM STREQUAL "WATCHOS") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mwatchos-version-min=${IOS_DEPLOYMENT_TARGET}") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mwatchos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") + "-mwatchos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") else() # SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min. set(XCODE_IOS_PLATFORM_VERSION_FLAGS - "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") + "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") endif() message(STATUS "Version flags set to: ${XCODE_IOS_PLATFORM_VERSION_FLAGS}") -if (ENABLE_BITCODE) +if(ENABLE_BITCODE) set(BITCODE "-fembed-bitcode") set(HEADER_PAD "") message(STATUS "Enabling bitcode support.") @@ -388,7 +407,7 @@ else() message(STATUS "Disabling bitcode support.") endif() -if (ENABLE_ARC) +if(ENABLE_ARC) set(FOBJC_ARC "-fobjc-arc") message(STATUS "Enabling ARC support.") else() @@ -396,7 +415,7 @@ else() message(STATUS "Disabling ARC support.") endif() -if (NOT ENABLE_VISIBILITY) +if(NOT ENABLE_VISIBILITY) set(VISIBILITY "-fvisibility=hidden") message(STATUS "Hiding symbols (-fvisibility=hidden).") else() @@ -404,20 +423,31 @@ else() endif() set(CMAKE_C_FLAGS -"${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_C_FLAGS}") + "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_C_FLAGS}" +) # Hidden visibilty is required for C++ on iOS. 
set(CMAKE_CXX_FLAGS -"${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} -fvisibility-inlines-hidden -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_CXX_FLAGS}") -set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS} -DNDEBUG -Os -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_MINSIZEREL}") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -DNDEBUG -O2 -g -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_RELEASE}") -set(CMAKE_C_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") -set(CMAKE_CXX_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}") + "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} -fvisibility-inlines-hidden -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_CXX_FLAGS}" +) +set(CMAKE_CXX_FLAGS_MINSIZEREL + "${CMAKE_CXX_FLAGS} -DNDEBUG -Os -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_MINSIZEREL}" +) +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO + "${CMAKE_CXX_FLAGS} -DNDEBUG -O2 -g -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" +) +set(CMAKE_CXX_FLAGS_RELEASE + "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_RELEASE}") +set(CMAKE_C_LINK_FLAGS + "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") +set(CMAKE_CXX_LINK_FLAGS + "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}" +) -# In order to ensure that the updated compiler flags are used in try_compile() -# tests, we have to forcibly set them in the CMake cache, not merely set them -# in the local scope. -list(APPEND VARS_TO_FORCE_IN_CACHE +# In order to ensure that the updated compiler flags are used in try_compile() tests, we +# have to forcibly set them in the CMake cache, not merely set them in the local scope. 
+list( + APPEND + VARS_TO_FORCE_IN_CACHE CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELWITHDEBINFO @@ -426,37 +456,40 @@ list(APPEND VARS_TO_FORCE_IN_CACHE CMAKE_C_LINK_FLAGS CMAKE_CXX_LINK_FLAGS) foreach(VAR_TO_FORCE ${VARS_TO_FORCE_IN_CACHE}) - set(${VAR_TO_FORCE} "${${VAR_TO_FORCE}}" CACHE STRING "") + set(${VAR_TO_FORCE} + "${${VAR_TO_FORCE}}" + CACHE STRING "") endforeach() set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) -set (CMAKE_SHARED_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") +set(CMAKE_SHARED_LINKER_FLAGS + "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib ${HEADER_PAD}") set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle ${HEADER_PAD}") set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a") -# Hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old -# build tree (where install_name_tool was hardcoded) and where -# CMAKE_INSTALL_NAME_TOOL isn't in the cache and still cmake didn't fail in -# CMakeFindBinUtils.cmake (because it isn't rerun) hardcode -# CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did +# Hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree +# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the +# cache and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun) +# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did # before, Alex. -if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) +if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) -endif (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) +endif(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) # Set the find root to the iOS developer roots and to user defined paths. -set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_OSX_SYSROOT} - ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root" FORCE) +set(CMAKE_FIND_ROOT_PATH + ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_OSX_SYSROOT} ${CMAKE_PREFIX_PATH} + CACHE STRING "iOS find search path root" FORCE) # Default to searching for frameworks first. set(CMAKE_FIND_FRAMEWORK FIRST) # Set up the default search directories for frameworks. set(CMAKE_SYSTEM_FRAMEWORK_PATH - ${CMAKE_OSX_SYSROOT}/System/Library/Frameworks - ${CMAKE_OSX_SYSROOT}/System/Library/PrivateFrameworks - ${CMAKE_OSX_SYSROOT}/Developer/Library/Frameworks) + ${CMAKE_OSX_SYSROOT}/System/Library/Frameworks + ${CMAKE_OSX_SYSROOT}/System/Library/PrivateFrameworks + ${CMAKE_OSX_SYSROOT}/Developer/Library/Frameworks) # Only search the specified iOS SDK, not the remainder of the host filesystem. set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) @@ -464,12 +497,14 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) # This little macro lets you set any XCode specific property. 
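A short usage sketch for the macro defined just below. The target name is hypothetical; the variant argument follows the list documented at the top of this file (All, Release, RelWithDebInfo, Debug, MinSizeRel):

# Apply a build setting to every variant of a (hypothetical) target:
set_xcode_property(my_ios_lib IPHONEOS_DEPLOYMENT_TARGET "10.0" "All")
# Apply a build setting to the Release variant only:
set_xcode_property(my_ios_lib GCC_GENERATE_DEBUGGING_SYMBOLS "NO" "Release")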
macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION) set(XCODE_RELVERSION_I "${XCODE_RELVERSION}") - if (XCODE_RELVERSION_I STREQUAL "All") - set_property(TARGET ${TARGET} PROPERTY - XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}") + if(XCODE_RELVERSION_I STREQUAL "All") + set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} + "${XCODE_VALUE}") else() - set_property(TARGET ${TARGET} PROPERTY - XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}") + set_property( + TARGET ${TARGET} + PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] + "${XCODE_VALUE}") endif() endmacro(set_xcode_property) # This macro lets you find executable programs on the host system. diff --git a/toolchains/riscv64-linux-gnu.toolchain.cmake b/toolchains/riscv64-linux-gnu.toolchain.cmake index d90ad0c4..3100b0de 100644 --- a/toolchains/riscv64-linux-gnu.toolchain.cmake +++ b/toolchains/riscv64-linux-gnu.toolchain.cmake @@ -3,16 +3,18 @@ set(CMAKE_SYSTEM_PROCESSOR riscv64) set(RISCV_CROSS_BUILD_ARCH riscv64) if(DEFINED ENV{RISCV_TOOLCHAIN_ROOT}) - file(TO_CMAKE_PATH $ENV{RISCV_TOOLCHAIN_ROOT} RISCV_TOOLCHAIN_ROOT) + file(TO_CMAKE_PATH $ENV{RISCV_TOOLCHAIN_ROOT} RISCV_TOOLCHAIN_ROOT) else() - message(FATAL_ERROR "RISCV_TOOLCHAIN_ROOT env must be defined") + message(FATAL_ERROR "RISCV_TOOLCHAIN_ROOT env must be defined") endif() -set(RISCV_TOOLCHAIN_ROOT ${RISCV_TOOLCHAIN_ROOT} CACHE STRING "root path to riscv toolchain") +set(RISCV_TOOLCHAIN_ROOT + ${RISCV_TOOLCHAIN_ROOT} + CACHE STRING "root path to riscv toolchain") set(CMAKE_C_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-gcc") set(CMAKE_CXX_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-g++") set(CMAKE_FIND_ROOT_PATH "${RISCV_TOOLCHAIN_ROOT}/riscv64-unknown-linux-gnu") set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) \ No newline at end of file +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) diff --git a/tools/cmake_format_config.json b/tools/cmake_format_config.json new file mode 100644 index 00000000..a7fd7394 --- /dev/null +++ b/tools/cmake_format_config.json @@ -0,0 +1,311 @@ +{ + "_help_parse": "Options affecting listfile parsing", + "parse": { + "_help_additional_commands": [ + "Specify structure for custom cmake functions" + ], + "additional_commands": { + "foo": { + "flags": [ + "BAR", + "BAZ" + ], + "kwargs": { + "HEADERS": "*", + "SOURCES": "*", + "DEPENDS": "*" + } + } + }, + "_help_override_spec": [ + "Override configurations per-command where available" + ], + "override_spec": {}, + "_help_vartags": [ + "Specify variable tags." + ], + "vartags": [], + "_help_proptags": [ + "Specify property tags." 
+ ], + "proptags": [] + }, + "_help_format": "Options affecting formatting.", + "format": { + "_help_disable": [ + "Disable formatting entirely, making cmake-format a no-op" + ], + "disable": false, + "_help_line_width": [ + "How wide to allow formatted cmake files" + ], + "line_width": 88, + "_help_tab_size": [ + "How many spaces to tab for indent" + ], + "tab_size": 2, + "_help_use_tabchars": [ + "If true, lines are indented using tab characters (utf-8", + "0x09) instead of space characters (utf-8 0x20).", + "In cases where the layout would require a fractional tab", + "character, the behavior of the fractional indentation is", + "governed by " + ], + "use_tabchars": false, + "_help_fractional_tab_policy": [ + "If is True, then the value of this variable", + "indicates how fractional indentions are handled during", + "whitespace replacement. If set to 'use-space', fractional", + "indentation is left as spaces (utf-8 0x20). If set to", + "`round-up` fractional indentation is replaced with a single", + "tab character (utf-8 0x09) effectively shifting the column", + "to the next tabstop" + ], + "fractional_tab_policy": "use-space", + "_help_max_subgroups_hwrap": [ + "If an argument group contains more than this many sub-groups", + "(parg or kwarg groups) then force it to a vertical layout." + ], + "max_subgroups_hwrap": 2, + "_help_max_pargs_hwrap": [ + "If a positional argument group contains more than this many", + "arguments, then force it to a vertical layout." + ], + "max_pargs_hwrap": 6, + "_help_max_rows_cmdline": [ + "If a cmdline positional group consumes more than this many", + "lines without nesting, then invalidate the layout (and nest)" + ], + "max_rows_cmdline": 2, + "_help_separate_ctrl_name_with_space": [ + "If true, separate flow control names from their parentheses", + "with a space" + ], + "separate_ctrl_name_with_space": false, + "_help_separate_fn_name_with_space": [ + "If true, separate function names from parentheses with a", + "space" + ], + "separate_fn_name_with_space": false, + "_help_dangle_parens": [ + "If a statement is wrapped to more than one line, than dangle", + "the closing parenthesis on its own line." + ], + "dangle_parens": false, + "_help_dangle_align": [ + "If the trailing parenthesis must be 'dangled' on its on", + "line, then align it to this reference: `prefix`: the start", + "of the statement, `prefix-indent`: the start of the", + "statement, plus one indentation level, `child`: align to", + "the column of the arguments" + ], + "dangle_align": "prefix", + "_help_min_prefix_chars": [ + "If the statement spelling length (including space and", + "parenthesis) is smaller than this amount, then force reject", + "nested layouts." + ], + "min_prefix_chars": 4, + "_help_max_prefix_chars": [ + "If the statement spelling length (including space and", + "parenthesis) is larger than the tab width by more than this", + "amount, then force reject un-nested layouts." + ], + "max_prefix_chars": 10, + "_help_max_lines_hwrap": [ + "If a candidate layout is wrapped horizontally but it exceeds", + "this many lines, then reject the layout." + ], + "max_lines_hwrap": 2, + "_help_line_ending": [ + "What style line endings to use in the output." 
+ ], + "line_ending": "unix", + "_help_command_case": [ + "Format command names consistently as 'lower' or 'upper' case" + ], + "command_case": "canonical", + "_help_keyword_case": [ + "Format keywords consistently as 'lower' or 'upper' case" + ], + "keyword_case": "unchanged", + "_help_always_wrap": [ + "A list of command names which should always be wrapped" + ], + "always_wrap": [], + "_help_enable_sort": [ + "If true, the argument lists which are known to be sortable", + "will be sorted lexicographicall" + ], + "enable_sort": true, + "_help_autosort": [ + "If true, the parsers may infer whether or not an argument", + "list is sortable (without annotation)." + ], + "autosort": false, + "_help_require_valid_layout": [ + "By default, if cmake-format cannot successfully fit", + "everything into the desired linewidth it will apply the", + "last, most agressive attempt that it made. If this flag is", + "True, however, cmake-format will print error, exit with non-", + "zero status code, and write-out nothing" + ], + "require_valid_layout": false, + "_help_layout_passes": [ + "A dictionary mapping layout nodes to a list of wrap", + "decisions. See the documentation for more information." + ], + "layout_passes": {} + }, + "_help_markup": "Options affecting comment reflow and formatting.", + "markup": { + "_help_bullet_char": [ + "What character to use for bulleted lists" + ], + "bullet_char": "*", + "_help_enum_char": [ + "What character to use as punctuation after numerals in an", + "enumerated list" + ], + "enum_char": ".", + "_help_first_comment_is_literal": [ + "If comment markup is enabled, don't reflow the first comment", + "block in each listfile. Use this to preserve formatting of", + "your copyright/license statements." + ], + "first_comment_is_literal": false, + "_help_literal_comment_pattern": [ + "If comment markup is enabled, don't reflow any comment block", + "which matches this (regex) pattern. Default is `None`", + "(disabled)." + ], + "literal_comment_pattern": ".*INTERNAL.*", + "_help_fence_pattern": [ + "Regular expression to match preformat fences in comments", + "default= ``r'^\\s*([`~]{3}[`~]*)(.*)$'``" + ], + "fence_pattern": "^\\s*([`~]{3}[`~]*)(.*)$", + "_help_ruler_pattern": [ + "Regular expression to match rulers in comments default=", + "``r'^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'``" + ], + "ruler_pattern": "^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$", + "_help_explicit_trailing_pattern": [ + "If a comment line matches starts with this pattern then it", + "is explicitly a trailing comment for the preceeding", + "argument. Default is '#<'" + ], + "explicit_trailing_pattern": "#<", + "_help_hashruler_min_length": [ + "If a comment line starts with at least this many consecutive", + "hash characters, then don't lstrip() them off. 
This allows", + "for lazy hash rulers where the first hash char is not", + "separated by space" + ], + "hashruler_min_length": 10, + "_help_canonicalize_hashrulers": [ + "If true, then insert a space between the first hash char and", + "remaining hash chars in a hash ruler, and normalize its", + "length to fill the column" + ], + "canonicalize_hashrulers": true, + "_help_enable_markup": [ + "enable comment markup parsing and reflow" + ], + "enable_markup": true + }, + "_help_lint": "Options affecting the linter", + "lint": { + "_help_disabled_codes": [ + "a list of lint codes to disable" + ], + "disabled_codes": [], + "_help_function_pattern": [ + "regular expression pattern describing valid function names" + ], + "function_pattern": "[0-9a-z_]+", + "_help_macro_pattern": [ + "regular expression pattern describing valid macro names" + ], + "macro_pattern": "[0-9A-Z_]+", + "_help_global_var_pattern": [ + "regular expression pattern describing valid names for", + "variables with global (cache) scope" + ], + "global_var_pattern": "[A-Z][0-9A-Z_]+", + "_help_internal_var_pattern": [ + "regular expression pattern describing valid names for", + "variables with global scope (but internal semantic)" + ], + "internal_var_pattern": "_[A-Z][0-9A-Z_]+", + "_help_local_var_pattern": [ + "regular expression pattern describing valid names for", + "variables with local scope" + ], + "local_var_pattern": "[a-z][a-z0-9_]+", + "_help_private_var_pattern": [ + "regular expression pattern describing valid names for", + "privatedirectory variables" + ], + "private_var_pattern": "_[0-9a-z_]+", + "_help_public_var_pattern": [ + "regular expression pattern describing valid names for public", + "directory variables" + ], + "public_var_pattern": "[A-Z][0-9A-Z_]+", + "_help_argument_var_pattern": [ + "regular expression pattern describing valid names for", + "function/macro arguments and loop variables." + ], + "argument_var_pattern": "[a-z][a-z0-9_]+", + "_help_keyword_pattern": [ + "regular expression pattern describing valid names for", + "keywords used in functions or macros" + ], + "keyword_pattern": "[A-Z][0-9A-Z_]+", + "_help_max_conditionals_custom_parser": [ + "In the heuristic for C0201, how many conditionals to match", + "within a loop in before considering the loop a parser." + ], + "max_conditionals_custom_parser": 2, + "_help_min_statement_spacing": [ + "Require at least this many newlines between statements" + ], + "min_statement_spacing": 1, + "_help_max_statement_spacing": [ + "Require no more than this many newlines between statements" + ], + "max_statement_spacing": 2, + "max_returns": 6, + "max_branches": 12, + "max_arguments": 5, + "max_localvars": 15, + "max_statements": 50 + }, + "_help_encode": "Options affecting file encoding", + "encode": { + "_help_emit_byteorder_mark": [ + "If true, emit the unicode byte-order mark (BOM) at the start", + "of the file" + ], + "emit_byteorder_mark": false, + "_help_input_encoding": [ + "Specify the encoding of the input file. Defaults to utf-8" + ], + "input_encoding": "utf-8", + "_help_output_encoding": [ + "Specify the encoding of the output file. Defaults to utf-8.", + "Note that cmake only claims to support utf-8 so be careful", + "when using anything else" + ], + "output_encoding": "utf-8" + }, + "_help_misc": "Miscellaneous configurations options.", + "misc": { + "_help_per_command": [ + "A dictionary containing any per-command configuration", + "overrides. Currently only `command_case` is supported." 
+    ],
+    "per_command": {}
+  }
+}
diff --git a/tools/cmakeformat.py b/tools/cmakeformat.py
new file mode 100755
index 00000000..7c06f6e6
--- /dev/null
+++ b/tools/cmakeformat.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+import argparse
+import os
+import subprocess
+from pathlib import Path
+
+CMAKE_FILS_DIRS = [
+    "test",
+    "dnn",
+    "tools",
+    "sdk",
+    "src",
+    "imperative",
+    "lite",
+    "cmake",
+    "toolchains",
+]
+
+
+def main():
+    os.chdir(str(Path(__file__).resolve().parent.parent))
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("--check", action="store_true", help="check mode")
+    parser.add_argument(
+        "--cmake_files",
+        nargs="+",
+        default=None,
+        dest="cmake_files",
+        help="cmake files to format, please split with space",
+    )
+    args = parser.parse_args()
+
+    handle_files = []
+    if args.cmake_files:
+        handle_files = args.cmake_files
+        for cmake_file in handle_files:
+            assert os.path.isfile(
+                cmake_file
+            ), "error input --cmake_files, can not find file: {}".format(cmake_file)
+    else:
+        handle_files.append("CMakeLists.txt")
+        for cmake_file_dir in CMAKE_FILS_DIRS:
+            assert os.path.isdir(
+                cmake_file_dir
+            ), "{} is not a directory, may config error for CMAKE_FILS_DIRS".format(
+                cmake_file_dir
+            )
+            for cmake_file in [
+                os.path.join(root, file)
+                for root, dirs, files in os.walk(cmake_file_dir)
+                for file in files
+                if file.endswith("CMakeLists.txt") or file.endswith(".cmake")
+            ]:
+                print("find cmake_file: {}".format(cmake_file))
+                assert os.path.isfile(cmake_file), "code issue happened!!"
+                handle_files.append(cmake_file)
+
+    for cmake_file in handle_files:
+        handle_type = ["format", "--in-place"]
+        if args.check:
+            handle_type = ["check", "--check"]
+        cmd = "cmake-format -c tools/cmake_format_config.json {} {}".format(
+            handle_type[1], cmake_file
+        )
+        print("try {}: {} with command: {}".format(handle_type[0], cmake_file, cmd))
+        try:
+            subprocess.check_call(cmd, shell=True)
+        except Exception as exc:
+            print("run cmd {} failed".format(cmd))
+            if args.check:
+                print(
+                    'please run: "python3 tools/cmakeformat.py" to format cmake files'
+                )
+            else:
+                print("code issue happened!!, please FIXME!!")
+            raise exc
+
+
+if __name__ == "__main__":
+    subprocess.check_call("python3 -m pip install cmakelang==0.6.13 --user", shell=True)
+    main()
diff --git a/tools/format.py b/tools/format.py
index af0c0afd..e2d6921f 100755
--- a/tools/format.py
+++ b/tools/format.py
@@ -19,7 +19,8 @@ failed_files = Manager().list()
 
 def process_file(file, clang_format, write):
     source = open(file, "r").read()
-    source = re.sub(r"MGB_DEFINE(?P(.|\n)*?)// +{", "class MGB_DEFINE\g{", source)
+    source = re.sub(r"MGB_DEFINE(?P([^\\]|\n)*?)// *{", r"class MGB_DEFINE\g{", source)
+    source, count = re.subn(r"(?
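
For a sense of what the configuration above produces, the following snippet is a purely illustrative sketch (the DEMO_SRCS variable and the source file names are hypothetical and not part of this change). With line_width 88, tab_size 2, dangle_parens false and max_pargs_hwrap 6, cmake-format leaves short calls untouched but switches a call with more than six positional arguments (or one that overflows 88 columns) to a vertical layout, keeping the closing parenthesis on the last argument line, roughly:

    # before (hypothetical input, a single line wider than 88 columns)
    set(DEMO_SRCS main.cpp utils.cpp io.cpp net.cpp cpu.cpp gpu.cpp jit.cpp CACHE INTERNAL "demo sources")

    # after "cmake-format -c tools/cmake_format_config.json --in-place" (expected shape)
    set(DEMO_SRCS
        main.cpp
        utils.cpp
        io.cpp
        net.cpp
        cpu.cpp
        gpu.cpp
        jit.cpp
        CACHE INTERNAL "demo sources")

The new tools/cmakeformat.py wraps that invocation for the whole tree: running "python3 tools/cmakeformat.py" rewrites CMakeLists.txt and every CMake file under the listed directories in place, while "python3 tools/cmakeformat.py --check" only verifies formatting and re-raises the cmake-format failure (non-zero exit) for any file that would be rewritten.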