
feat(imperative,src): add jit builder for custom op

GitOrigin-RevId: 3bb0b46311
tags/v1.8.0
Megvii Engine Team, 3 years ago
parent commit: a72e0cb568
21 changed files with 1611 additions and 183 deletions

  1. CMakeLists.txt (+8 -0)
  2. imperative/CMakeLists.txt (+3 -0)
  3. imperative/python/megengine/core/ops/custom.py (+12 -1)
  4. imperative/python/megengine/utils/custom_op_tools.py (+909 -0)
  5. imperative/python/src/ops.cpp (+7 -0)
  6. imperative/python/test/unit/core/custom_opsrc/elem_add.cpp (+140 -0)
  7. imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp (+65 -0)
  8. imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu (+97 -0)
  9. imperative/python/test/unit/core/custom_opsrc/matmul_scale.h (+24 -0)
  10. imperative/python/test/unit/core/test_custom_op.py (+111 -0)
  11. scripts/whl/macos/macos_build_whl.sh (+1 -0)
  12. scripts/whl/manylinux2014/do_build_common.sh (+1 -0)
  13. scripts/whl/windows/windows_build_whl.sh (+4 -1)
  14. src/CMakeLists.txt (+13 -10)
  15. src/custom/impl/manager.cpp (+19 -7)
  16. src/custom/include/megbrain/custom/custom.h (+2 -1)
  17. src/custom/include/megbrain/custom/op.h (+47 -53)
  18. src/custom/include/megbrain/custom/param.h (+10 -10)
  19. src/custom/include/megbrain/custom/param_val.h (+40 -31)
  20. src/custom/include/megbrain/custom/tensor.h (+70 -64)
  21. src/custom/include/megbrain/custom/utils.h (+28 -5)

CMakeLists.txt (+8 -0)

@@ -1145,6 +1145,14 @@ if(TARGET _imperative_rt)
COMMAND ${CMAKE_COMMAND} -E create_symlink
${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py
COMMAND ${CMAKE_COMMAND} -E create_symlink
${CMAKE_CURRENT_SOURCE_DIR}/src/custom/include
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/include
COMMAND ${CMAKE_COMMAND} -E make_directory
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/lib
COMMAND ${CMAKE_COMMAND} -E create_symlink
${CMAKE_CURRENT_BINARY_DIR}/src/$<TARGET_FILE_NAME:${MGE_SHARED_LIB}>
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/lib/$<TARGET_FILE_NAME:${MGE_SHARED_LIB}>
DEPENDS _imperative_rt
VERBATIM
)


imperative/CMakeLists.txt (+3 -0)

@@ -67,8 +67,11 @@ add_custom_command(
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/LICENSE ${PROJECT_SOURCE_DIR}/ACKNOWLEDGMENTS ${PROJECT_BINARY_DIR}
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/$<TARGET_FILE_NAME:${MODULE_NAME}> # clean develop
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/version.py # clean develop
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/include # clean develop
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/lib # clean develop
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine ${CMAKE_CURRENT_BINARY_DIR}/python/megengine
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/test ${CMAKE_CURRENT_BINARY_DIR}/python/test
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/src/custom/include ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/core/include
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py ${CMAKE_CURRENT_BINARY_DIR}/python/setup.py
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires.txt
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt


imperative/python/megengine/core/ops/custom.py (+12 -1)

@@ -7,11 +7,14 @@
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

import os

from .._imperative_rt.ops._custom import (
_get_custom_op_list,
_install,
_make_custom_op,
_uninstall,
get_custom_op_abi_tag,
)

__all__ = ["load"]
@@ -25,8 +28,16 @@ def _gen_custom_op_maker(custom_op_name):


def load(lib_path):
op_in_this_lib = _install(lib_path[0:-3], lib_path)
lib_path = os.path.abspath(lib_path)
lib_name = os.path.splitext(lib_path)[0]
op_in_this_lib = _install(lib_name, lib_path)
for op in op_in_this_lib:
op_maker = _gen_custom_op_maker(op)
globals()[op] = op_maker
__all__.append(op)


def unload(lib_path):
lib_path = os.path.abspath(lib_path)
lib_name = os.path.splitext(lib_path)[0]
_uninstall(lib_name)
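
A minimal usage sketch of the new load/unload pair (the library path below is hypothetical):

from megengine.core.ops import custom

# load() registers every op found in the library and exposes each op
# as an attribute of the `custom` module
custom.load("./libmy_op.so")  # hypothetical path
print(custom._get_custom_op_list())

# unload() accepts the same path; both calls resolve it to an absolute
# path and strip the extension to form the library key
custom.unload("./libmy_op.so")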

imperative/python/megengine/utils/custom_op_tools.py (+909 -0)

@@ -0,0 +1,909 @@
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

import collections
import ctypes
import glob
import os
import re
import subprocess
import sys
import time
from typing import List, Optional, Union

from ..core.ops.custom import load
from ..logger import get_logger


def _get_win_folder_with_ctypes(csidl_name):
csidl_const = {
"CSIDL_APPDATA": 26,
"CSIDL_COMMON_APPDATA": 35,
"CSIDL_LOCAL_APPDATA": 28,
}[csidl_name]

buf = ctypes.create_unicode_buffer(1024)
ctypes.windll.shell32.SHGetFolderPathW(None, csidl_const, None, 0, buf)

# Downgrade to a short path name if it has high-bit chars. See
# <http://bugs.activestate.com/show_bug.cgi?id=85099>.
has_high_char = False
for c in buf:
if ord(c) > 255:
has_high_char = True
break
if has_high_char:
buf2 = ctypes.create_unicode_buffer(1024)
if ctypes.windll.kernel32.GetShortPathNameW(buf.value, buf2, 1024):
buf = buf2

return buf.value


system = sys.platform
if system == "win32":
_get_win_folder = _get_win_folder_with_ctypes

PLAT_TO_VCVARS = {
"win-amd64": "x86_amd64",
}

logger = get_logger()

# environment variable
ev_custom_op_root_dir = "MGE_CUSTOM_OP_DIR"
ev_cuda_root_dir = "CUDA_ROOT_DIR"
ev_cudnn_root_dir = "CUDNN_ROOT_DIR"

# operating system
IS_WINDOWS = system == "win32"
IS_LINUX = system == "linux"
IS_MACOS = system == "darwin"

MGE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MGE_INC_PATH = os.path.join(MGE_PATH, "core", "include")
MGE_LIB_PATH = os.path.join(MGE_PATH, "core", "lib")
MGE_ABI_VER = 0


# compile version
MINIMUM_GCC_VERSION = (5, 0, 0)
MINIMUM_CLANG_CL_VERSION = (12, 0, 1)

# compile flags
COMMON_MSVC_FLAGS = [
"/MD",
"/wd4002",
"/wd4819",
"/EHsc",
]

MSVC_IGNORE_CUDAFE_WARNINGS = [
"field_without_dll_interface",
]

COMMON_NVCC_FLAGS = []

# Finds the CUDA install path
def _find_cuda_root_dir() -> Optional[str]:
cuda_root_dir = os.environ.get(ev_cuda_root_dir)
if cuda_root_dir is None:
try:
which = "where" if IS_WINDOWS else "which"
with open(os.devnull, "w") as devnull:
nvcc = (
subprocess.check_output([which, "nvcc"], stderr=devnull)
.decode()
.rstrip("\r\n")
)
cuda_root_dir = os.path.dirname(os.path.dirname(nvcc))
except Exception:
if IS_WINDOWS:
cuda_root_dir = os.environ.get("CUDA_PATH", None)
if cuda_root_dir is None:
cuda_root_dirs = glob.glob(
"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*"
)
if len(cuda_root_dirs) == 0:
cuda_root_dir = ""
else:
cuda_root_dir = cuda_root_dirs[0]
else:
cuda_root_dir = "/usr/local/cuda"
if not os.path.exists(cuda_root_dir):
cuda_root_dir = None
return cuda_root_dir


def _find_cudnn_root_dir() -> Optional[str]:
cudnn_root_dir = os.environ.get(ev_cudnn_root_dir)
return cudnn_root_dir


CUDA_ROOT_DIR = _find_cuda_root_dir()
CUDNN_ROOT_DIR = _find_cudnn_root_dir()

#####################################################################
# Phase 1
#####################################################################


def _is_cuda_file(path: str) -> bool:
valid_ext = [".cu", ".cuh"]
return os.path.splitext(path)[1] in valid_ext


# Return full path to the user-specific cache dir for this application.
# Typical user cache directories are:
# Mac OS X: ~/Library/Caches/<AppName>
# Unix: ~/.cache/<AppName> (XDG default)
# Windows: C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>\Cache
def _get_user_cache_dir(appname=None, appauthor=None, version=None, opinion=True):
if system == "win32":
appauthor = appname if appauthor is None else appauthor
path = os.path.normpath(_get_win_folder("CSIDL_LOCAL_APPDATA"))
if appname:
if appauthor is not False:
path = os.path.join(path, appauthor)
else:
path = os.path.join(path, appname)
if opinion:
path = os.path.join(path, "Cache")
elif system == "darwin":
path = os.path.expanduser("~/Library/Caches")
if appname:
path = os.path.join(path, appname)
else:
path = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
if appname:
path = os.path.join(path, appname)
if appname and version:
path = os.path.join(path, version)
return path
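
# e.g. _get_user_cache_dir(appname="mge_custom_op") resolves to
# (paths illustrative):
#   Linux:   ~/.cache/mge_custom_op  (or under $XDG_CACHE_HOME)
#   macOS:   ~/Library/Caches/mge_custom_op
#   Windows: a "Cache" folder under %LOCALAPPDATA%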


# Returns the path to the root folder under which custom ops will be built.
def _get_default_build_root() -> str:
return os.path.realpath(_get_user_cache_dir(appname="mge_custom_op"))


def _get_build_dir(name: str) -> str:
custom_op_root_dir = os.environ.get(ev_custom_op_root_dir)
if custom_op_root_dir is None:
custom_op_root_dir = _get_default_build_root()

build_dir = os.path.join(custom_op_root_dir, name)
return build_dir


#####################################################################
# Phase 2
#####################################################################


def update_hash(seed, value):
# using boost::hash_combine
# https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html
return seed ^ (hash(value) + 0x9E3779B9 + (seed << 6) + (seed >> 2))
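
# NOTE: Python string hashes are salted per interpreter (PYTHONHASHSEED),
# so the combined hash is stable within one process but may differ on the
# next run; that is fine here because the Versioner below is in-memory
# only. A tiny illustration of how the hash is folded:
#   seed = 0
#   seed = update_hash(seed, "int x;")  # fold in a source string
#   seed = update_hash(seed, True)      # fold in a build option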


def hash_source_files(hash_value, source_files):
for filename in source_files:
with open(filename) as file:
hash_value = update_hash(hash_value, file.read())
return hash_value


def hash_build_args(hash_value, build_args):
for group in build_args:
for arg in group:
hash_value = update_hash(hash_value, arg)
return hash_value


Entry = collections.namedtuple("Entry", "version, hash")


class Versioner(object):
def __init__(self):
self.entries = {}

def get_version(self, name):
entry = self.entries.get(name)
return None if entry is None else entry.version

def bump_version_if_changed(
self, name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag
):
hash_value = 0
hash_value = hash_source_files(hash_value, sources)
hash_value = hash_build_args(hash_value, build_args)
hash_value = update_hash(hash_value, build_dir)
hash_value = update_hash(hash_value, with_cuda)
hash_value = update_hash(hash_value, with_cudnn)
hash_value = update_hash(hash_value, abi_tag)

entry = self.entries.get(name)
if entry is None:
self.entries[name] = entry = Entry(0, hash_value)
elif hash_value != entry.hash:
self.entries[name] = entry = Entry(entry.version + 1, hash_value)

return entry.version


custom_op_versioner = Versioner()


def version_check(
name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag,
):
old_version = custom_op_versioner.get_version(name)
version = custom_op_versioner.bump_version_if_changed(
name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag,
)
return version, old_version
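
# The versioning contract, illustrated (a sketch; "demo", src, and the
# empty flag groups are made up):
#   args = [[], [], [], []]  # cflags, cuda cflags, ldflags, include paths
#   version_check("demo", [src], args, "/tmp/demo", False, False, 0)
#   -> (0, None)   first sighting: starts at version 0
#   version_check("demo", [src], args, "/tmp/demo", False, False, 0)
#   -> (0, 0)      inputs unchanged: no bump
#   ... after editing the file at src ...
#   version_check("demo", [src], args, "/tmp/demo", False, False, 0)
#   -> (1, 0)      source changed: version bumped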


#####################################################################
# Phase 3
#####################################################################


def _check_ninja_availability():
try:
subprocess.check_output("ninja --version".split())
except Exception:
raise RuntimeError(
"Ninja is required to build custom op, please install ninja and update your PATH"
)


def _mge_is_built_from_src():
file_path = os.path.abspath(__file__)
if "site-packages" in file_path:
return False
else:
return True


def _accepted_compilers_for_platform():
if IS_WINDOWS:
return ["clang-cl"]
if IS_MACOS:
return ["clang++", "clang"]
if IS_LINUX:
return ["g++", "gcc", "gnu-c++", "gnu-cc"]


# Verifies that the compiler is the expected one for the current platform.
def _check_compiler_existed_for_platform(compiler: str) -> bool:
# there is no suitable cmd like `which` on Windows, so we just try to invoke clang-cl and treat success as existence
if IS_WINDOWS:
try:
version_string = subprocess.check_output(
["clang-cl", "--version"], stderr=subprocess.STDOUT
).decode()
return True
except Exception:
return False

# use os.path.realpath to resolve any symlinks, in particular from "c++" to e.g. "g++".
which = subprocess.check_output(["which", compiler], stderr=subprocess.STDOUT)
compiler_path = os.path.realpath(which.decode().strip())
if any(name in compiler_path for name in _accepted_compilers_for_platform()):
return True

version_string = subprocess.check_output(
[compiler, "-v"], stderr=subprocess.STDOUT
).decode()
if sys.platform.startswith("linux"):
pattern = re.compile("^COLLECT_GCC=(.*)$", re.MULTILINE)
results = re.findall(pattern, version_string)
if len(results) != 1:
return False
compiler_path = os.path.realpath(results[0].strip())
return any(name in compiler_path for name in _accepted_compilers_for_platform())

if sys.platform.startswith("darwin"):
return version_string.startswith("Apple clang")

return False


# Verifies that the given compiler is ABI-compatible with MegEngine.
def _check_compiler_abi_compatibility(compiler: str):
# we assume that if MegEngine was built from source, the user will use the same compiler to compile the custom op
if _mge_is_built_from_src() or os.environ.get("MGE_CHECK_ABI", "1") == "0":
return True

# [TODO] There is no particular minimum version we need for clang, so we're good here.
if sys.platform.startswith("darwin"):
return True

try:
if sys.platform.startswith("linux"):
minimum_required_version = MINIMUM_GCC_VERSION
versionstr = subprocess.check_output(
[compiler, "-dumpfullversion", "-dumpversion"]
)
version = versionstr.decode().strip().split(".")
else:
minimum_required_version = MINIMUM_CLANG_CL_VERSION
compiler_info = subprocess.check_output(
[compiler, "--version"], stderr=subprocess.STDOUT
)
match = re.search(r"(\d+)\.(\d+)\.(\d+)", compiler_info.decode().strip())
version = (0, 0, 0) if match is None else match.groups()
except Exception:
_, error, _ = sys.exc_info()
logger.warning(
"Error checking compiler version for {}: {}".format(compiler, error)
)
return False

if tuple(map(int, version)) >= minimum_required_version:
return True

return False


def _check_compiler_compatibility():
# we use clang-cl on windows, refer: https://clang.llvm.org/docs/UsersManual.html#clang-cl
compiler = (
os.environ.get("CXX", "clang-cl")
if IS_WINDOWS
else os.environ.get("CXX", "c++")
)

existed = _check_compiler_existed_for_platform(compiler)
if not existed:
log_str = (
"Cannot find compiler which is compatible with the compiler "
"MegEngine was built with for this platform, which is {mge_compiler} on "
"{platform}. Please use {mge_compiler} to to compile your extension. "
"Alternatively, you may compile MegEngine from source using "
"{user_compiler}, and then you can also use {user_compiler} to compile "
"your extension."
).format(
user_compiler=compiler,
mge_compiler=_accepted_compilers_for_platform()[0],
platform=sys.platform,
)

logger.warning(log_str)
return False

compatible = _check_compiler_abi_compatibility(compiler)
if not compatible:
log_str = (
"Your compiler version may be ABI-incompatible with MegEngine! "
"Please use a compiler that is ABI-compatible with GCC 5.0 on Linux "
"and LLVM/Clang 12.0 on Windows ."
)
logger.warning(log_str)
return True


#####################################################################
# Phase 4
#####################################################################


# Quote command-line arguments for DOS/Windows conventions.
def _nt_quote_args(args: Optional[List[str]]) -> List[str]:
# Cover None-type
if not args:
return []
return ['"{}"'.format(arg) if " " in arg else arg for arg in args]


# For now, users need to specify the GPU arch flags themselves (e.g. via extra_cuda_cflags)
def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
return []


def _setup_sys_includes(with_cuda: bool, with_cudnn: bool):
includes = [os.path.join(MGE_INC_PATH)]
if with_cuda:
includes.append(os.path.join(CUDA_ROOT_DIR, "include"))
if with_cudnn:
includes.append(os.path.join(CUDNN_ROOT_DIR, "include"))
return includes


def _setup_includes(extra_include_paths: List[str], with_cuda: bool, with_cudnn: bool):
user_includes = [os.path.abspath(path) for path in extra_include_paths]
system_includes = _setup_sys_includes(with_cuda, with_cudnn)
if IS_WINDOWS:
user_includes += system_includes
system_includes.clear()
return user_includes, system_includes


def _setup_common_cflags(user_includes: List[str], system_includes: List[str]):
common_cflags = []
common_cflags += ["-I{}".format(include) for include in user_includes]
common_cflags += ["-isystem {}".format(include) for include in system_includes]
if not IS_WINDOWS:
common_cflags += ["-D_GLIBCXX_USE_CXX11_ABI={}".format(MGE_ABI_VER)]
return common_cflags


def _setup_cuda_cflags(cflags: List[str], extra_cuda_cflags: List[str]):
cuda_flags = cflags + COMMON_NVCC_FLAGS + _get_cuda_arch_flags()
if IS_WINDOWS:
for flag in COMMON_MSVC_FLAGS:
cuda_flags = ["-Xcompiler", flag] + cuda_flags
for ignore_warning in MSVC_IGNORE_CUDAFE_WARNINGS:
cuda_flags = ["-Xcudafe", "--diag_suppress=" + ignore_warning] + cuda_flags
cuda_flags = _nt_quote_args(cuda_flags)
cuda_flags += _nt_quote_args(extra_cuda_cflags)
else:
cuda_flags += ["--compiler-options", '"-fPIC"']
cuda_flags += extra_cuda_cflags
if not any(flag.startswith("-std=") for flag in cuda_flags):
cuda_flags.append("-std=c++14")
if os.getenv("CC") is not None:
cuda_flags = ["-ccbin", os.getenv("CC")] + cuda_flags
return cuda_flags


def _setup_ldflags(
extra_ldflags: List[str], with_cuda: bool, with_cudnn: bool
) -> List[str]:
ldflags = list(extra_ldflags)  # copy, so the caller's (default) list is not mutated
if IS_WINDOWS:
ldflags.append(os.path.join(MGE_LIB_PATH, "megengine_shared.lib"))
if with_cuda:
ldflags.append(os.path.join(CUDA_ROOT_DIR, "lib", "x64", "cudart.lib"))
if with_cudnn:
ldflags.append(os.path.join(CUDNN_ROOT_DIR, "lib", "x64", "cudnn.lib"))

else:
ldflags.append("-lmegengine_shared -L{}".format(MGE_LIB_PATH))
ldflags.append("-Wl,-rpath,{}".format(MGE_LIB_PATH))
if with_cuda:
ldflags.append("-lcudart")
ldflags.append("-L{}".format(os.path.join(CUDA_ROOT_DIR, "lib64")))
ldflags.append("-Wl,-rpath,{}".format(os.path.join(CUDA_ROOT_DIR, "lib64")))
if with_cudnn:
ldflags.append("-L{}".format(os.path.join(CUDNN_ROOT_DIR, "lib64")))
ldflags.append(
"-Wl,-rpath,{}".format(os.path.join(CUDNN_ROOT_DIR, "lib64"))
)

return ldflags


def _add_shared_flag(ldflags: List[str]):
ldflags += ["/LD" if IS_WINDOWS else "-shared"]
return ldflags


#####################################################################
# Phase 5
#####################################################################


def _obj_file_path(src_file_path: str):
file_name = os.path.splitext(os.path.basename(src_file_path))[0]
if _is_cuda_file(src_file_path):
target = "{}.cuda.o".format(file_name)
else:
target = "{}.o".format(file_name)
return target


def _dump_ninja_file(
path,
cflags,
post_cflags,
cuda_cflags,
cuda_post_cflags,
sources,
objects,
ldflags,
library_target,
with_cuda,
):
def sanitize_flags(flags):
return [] if flags is None else [flag.strip() for flag in flags]

cflags = sanitize_flags(cflags)
post_cflags = sanitize_flags(post_cflags)
cuda_cflags = sanitize_flags(cuda_cflags)
cuda_post_cflags = sanitize_flags(cuda_post_cflags)
ldflags = sanitize_flags(ldflags)

assert len(sources) == len(objects)
assert len(sources) > 0

if IS_WINDOWS:
compiler = os.environ.get("CXX", "clang-cl")
else:
compiler = os.environ.get("CXX", "c++")

# Version 1.3 is required for the `deps` directive.
config = ["ninja_required_version = 1.3"]
config.append("cxx = {}".format(compiler))
if with_cuda:
nvcc = os.path.join(CUDA_ROOT_DIR, "bin", "nvcc")
config.append("nvcc = {}".format(nvcc))

flags = ["cflags = {}".format(" ".join(cflags))]
flags.append("post_cflags = {}".format(" ".join(post_cflags)))
if with_cuda:
flags.append("cuda_cflags = {}".format(" ".join(cuda_cflags)))
flags.append("cuda_post_cflags = {}".format(" ".join(cuda_post_cflags)))
flags.append("ldflags = {}".format(" ".join(ldflags)))

# Turn into absolute paths so we can emit them into the ninja build
# file wherever it is.
sources = [os.path.abspath(file) for file in sources]

# See https://ninja-build.org/build.ninja.html for reference.
compile_rule = ["rule compile"]
if IS_WINDOWS:
compile_rule.append(
" command = clang-cl /showIncludes $cflags -c $in /Fo$out $post_cflags"
)
compile_rule.append(" deps = msvc")
else:
compile_rule.append(
" command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags"
)
compile_rule.append(" depfile = $out.d")
compile_rule.append(" deps = gcc")

if with_cuda:
cuda_compile_rule = ["rule cuda_compile"]
nvcc_gendeps = ""
cuda_compile_rule.append(
" command = $nvcc {} $cuda_cflags -c $in -o $out $cuda_post_cflags".format(
nvcc_gendeps
)
)

# Emit one build rule per source to enable incremental build.
build = []
for source_file, object_file in zip(sources, objects):
is_cuda_source = _is_cuda_file(source_file) and with_cuda
rule = "cuda_compile" if is_cuda_source else "compile"
if IS_WINDOWS:
source_file = source_file.replace(":", "$:")
object_file = object_file.replace(":", "$:")
source_file = source_file.replace(" ", "$ ")
object_file = object_file.replace(" ", "$ ")
build.append("build {}: {} {}".format(object_file, rule, source_file))

if library_target is not None:
link_rule = ["rule link"]
if IS_WINDOWS:
link_rule.append(" command = clang-cl $in /nologo $ldflags /out:$out")
else:
link_rule.append(" command = $cxx $in $ldflags -o $out")

link = ["build {}: link {}".format(library_target, " ".join(objects))]
default = ["default {}".format(library_target)]
else:
link_rule, link, default = [], [], []

# 'Blocks' should be separated by newlines, for visual benefit.
blocks = [config, flags, compile_rule]
if with_cuda:
blocks.append(cuda_compile_rule)
blocks += [link_rule, build, link, default]
with open(path, "w") as build_file:
for block in blocks:
lines = "\n".join(block)
build_file.write("{}\n\n".format(lines))


class FileBaton:
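    # A tiny file-based lock: the process that creates the lock file
    # exclusively (O_CREAT | O_EXCL) runs the build; everyone else spins
    # in wait() until the file is removed by release().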
def __init__(self, lock_file_path, wait_seconds=0.1):
self.lock_file_path = lock_file_path
self.wait_seconds = wait_seconds
self.fd = None

def try_acquire(self):
try:
self.fd = os.open(self.lock_file_path, os.O_CREAT | os.O_EXCL)
return True
except FileExistsError:
return False

def wait(self):
while os.path.exists(self.lock_file_path):
time.sleep(self.wait_seconds)

def release(self):
if self.fd is not None:
os.close(self.fd)

os.remove(self.lock_file_path)


#####################################################################
# Phase 6
#####################################################################


def _build_with_ninja(build_dir: str, verbose: bool, error_prefix: str):
command = ["ninja", "-v"]
env = os.environ.copy()
try:
sys.stdout.flush()
sys.stderr.flush()
stdout_fileno = 1
subprocess.run(
command,
stdout=stdout_fileno if verbose else subprocess.PIPE,
stderr=subprocess.STDOUT,
cwd=build_dir,
check=True,
env=env,
)
except subprocess.CalledProcessError as e:
with open(os.path.join(build_dir, "build.ninja")) as f:
lines = f.readlines()
print(lines)
_, error, _ = sys.exc_info()
message = error_prefix
if hasattr(error, "output") and error.output:
message += ": {}".format(error.output.decode())
raise RuntimeError(message) from e


def build(
name: str,
sources: Union[str, List[str]],
extra_cflags: Union[str, List[str]] = [],
extra_cuda_cflags: Union[str, List[str]] = [],
extra_ldflags: Union[str, List[str]] = [],
extra_include_paths: Union[str, List[str]] = [],
with_cuda: Optional[bool] = None,
build_dir: Optional[str] = None,
verbose: bool = False,
abi_tag: Optional[int] = None,
) -> str:
r"""Build a Custom Op with ninja in the way of just-in-time (JIT).

To build the custom op, a Ninja build file is emitted, which is used to
compile the given sources into a dynamic library.

By default, the directory to which the build file is emitted and the
resulting library compiled to is ``<tmp>/mge_custom_op/<name>``, where
``<tmp>`` is the temporary folder on the current platform and ``<name>``
the name of the custom op. This location can be overridden in two ways.
First, if the ``MGE_CUSTOM_OP_DIR`` environment variable is set, it
replaces ``<tmp>/mge_custom_op`` and all custom ops will be compiled
into subfolders of this directory. Second, if the ``build_dir``
argument to this function is supplied, it overrides the entire path, i.e.
the library will be compiled into that folder directly.

To compile the sources, the default system compiler (``c++``) is used,
which can be overridden by setting the ``CXX`` environment variable. To pass
additional arguments to the compilation process, ``extra_cflags`` or
``extra_ldflags`` can be provided. For example, to compile your custom op
with optimizations, pass ``extra_cflags=['-O3']``. You can also use
``extra_cflags`` to pass further include directories.

CUDA support with mixed compilation is provided. Simply pass CUDA source
files (``.cu`` or ``.cuh``) along with other sources. Such files will be
detected and compiled with nvcc rather than the C++ compiler. This includes
passing the CUDA lib64 directory as a library directory, and linking
``cudart``. You can pass additional flags to nvcc via
``extra_cuda_cflags``, just like with ``extra_cflags`` for C++. Various
heuristics for finding the CUDA install directory are used, which usually
work fine. If not, setting the ``CUDA_ROOT_DIR`` environment variable is the
safest option. If you use cuDNN, please also set the ``CUDNN_ROOT_DIR``
environment variable.

Args:
name: The name of the custom op to build.
sources: A list of relative or absolute paths to C++ source files.
extra_cflags: optional list of compiler flags to forward to the build.
extra_cuda_cflags: optional list of compiler flags to forward to nvcc
when building CUDA sources.
extra_ldflags: optional list of linker flags to forward to the build.
extra_include_paths: optional list of include directories to forward
to the build.
with_cuda: Determines whether CUDA headers and libraries are added to
the build. If set to ``None`` (default), this value is
automatically determined based on the existence of ``.cu`` or
``.cuh`` in ``sources``. Set it to ``True`` to force CUDA headers
and libraries to be included.
build_dir: optional path to use as build workspace.
verbose: If ``True``, turns on verbose logging of load steps.
abi_tag: Determines the value of the macro ``_GLIBCXX_USE_CXX11_ABI``
used by gcc; should be ``0`` or ``1``.

Returns:
the compiled dynamic library path

"""

# phase 1: prepare config
if abi_tag is not None:
global MGE_ABI_VER
MGE_ABI_VER = abi_tag

def strlist(args, name):
assert isinstance(args, str) or isinstance(
args, list
), "{} must be str or list[str]".format(name)
if isinstance(args, str):
return [args]
for arg in args:
assert isinstance(arg, str)
args = [arg.strip() for arg in args]
return args

sources = strlist(sources, "sources")
extra_cflags = strlist(extra_cflags, "extra_cflags")
extra_cuda_cflags = strlist(extra_cuda_cflags, "extra_cuda_cflags")
extra_ldflags = strlist(extra_ldflags, "extra_ldflags")
extra_include_paths = strlist(extra_include_paths, "extra_include_paths")

with_cuda = any(map(_is_cuda_file, sources)) if with_cuda is None else with_cuda
with_cudnn = any(["cudnn" in f for f in extra_ldflags])

if CUDA_ROOT_DIR is None and with_cuda:
print(
"No CUDA runtime found, please set {}=/path/to/your/cuda_root_dir".format(
ev_cuda_root_dir
)
)
if CUDNN_ROOT_DIR is None and with_cudnn:
print(
"Cannot find the cuDNN root directory, please set {}=/path/to/your/cudnn_root_dir".format(
ev_cudnn_root_dir
)
)

build_dir = os.path.abspath(
_get_build_dir(name) if build_dir is None else build_dir
)
if not os.path.exists(build_dir):
os.makedirs(build_dir, exist_ok=True)

if verbose:
print("Using {} to build megengine custom op".format(build_dir))

# phase 2: version check
version, old_version = version_check(
name,
sources,
[extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths],
build_dir,
with_cuda,
with_cudnn,
abi_tag,
)
if verbose:
if version != old_version and old_version is not None:
print(
"Input conditions of custom op {} have changed, bumping to version {}".format(
name, version
)
)
print("Building custom op {} with version {}".format(name, version))
if version == old_version:
if verbose:
print(
"No modifications detected for {}, skipping build step...".format(name)
)
# still return the path of the already-built library (instead of
# None), so that build_and_load() can load it
name = "{}_v{}".format(name, version)
name += ".dll" if IS_WINDOWS else ".so"
return os.path.join(build_dir, name)
name = "{}_v{}".format(name, version)

# phase 3: compiler and ninja check
_check_ninja_availability()
_check_compiler_compatibility()

# phase 4: setup the compile flags
user_includes, system_includes = _setup_includes(
extra_include_paths, with_cuda, with_cudnn
)
common_cflags = _setup_common_cflags(user_includes, system_includes)
cuda_cflags = (
_setup_cuda_cflags(common_cflags, extra_cuda_cflags) if with_cuda else None
)
ldflags = _setup_ldflags(extra_ldflags, with_cuda, with_cudnn)

if IS_WINDOWS:
cflags = common_cflags + COMMON_MSVC_FLAGS + extra_cflags
cflags = _nt_quote_args(cflags)
else:
cflags = common_cflags + ["-fPIC", "-std=c++14"] + extra_cflags

ldflags = _add_shared_flag(ldflags)
if sys.platform.startswith("darwin"):
ldflags.append("-undefined dynamic_lookup")
elif IS_WINDOWS:
ldflags += ["/link"]
ldflags = _nt_quote_args(ldflags)

baton = FileBaton(os.path.join(build_dir, "lock"))
if baton.try_acquire():
try:
# phase 5: generate ninja build file
objs = [_obj_file_path(src) for src in sources]
name += ".dll" if IS_WINDOWS else ".so"

build_file_path = os.path.join(build_dir, "build.ninja")
if verbose:
print("Emitting ninja build file {}".format(build_file_path))
_dump_ninja_file(
path=build_file_path,
cflags=cflags,
post_cflags=None,
cuda_cflags=cuda_cflags,
cuda_post_cflags=None,
sources=sources,
objects=objs,
ldflags=ldflags,
library_target=name,
with_cuda=with_cuda,
)

# phase 6: build with ninja
if verbose:
print(
"Compiling and linking your custom op {}".format(
os.path.join(build_dir, name)
)
)
_build_with_ninja(build_dir, verbose, "compiling error")
finally:
baton.release()
else:
baton.wait()

return os.path.join(build_dir, name)


def build_and_load(
name: str,
sources: Union[str, List[str]],
extra_cflags: Union[str, List[str]] = [],
extra_cuda_cflags: Union[str, List[str]] = [],
extra_ldflags: Union[str, List[str]] = [],
extra_include_paths: Union[str, List[str]] = [],
with_cuda: Optional[bool] = None,
build_dir: Optional[str] = None,
verbose: bool = False,
abi_tag: Optional[int] = None,
) -> str:
r"""Build and Load a Custom Op with ninja in the way of just-in-time (JIT).
Same as the function ``build()`` but load the built dynamic library.

Args:
same as ``build()``

Returns:
the compiled dynamic library path

"""

lib_path = build(
name,
sources,
extra_cflags,
extra_cuda_cflags,
extra_ldflags,
extra_include_paths,
with_cuda,
build_dir,
verbose,
abi_tag,
)
if verbose:
print("Load the compiled custom op {}".format(lib_path))
load(lib_path)
return lib_path
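
An end-to-end sketch of the intended workflow (the op name, source file, and flags are hypothetical):

from megengine.core.ops import custom
from megengine.utils import custom_op_tools

lib_path = custom_op_tools.build_and_load(
    "my_elem_op",                # hypothetical op library name
    sources=["my_elem_op.cpp"],  # hypothetical source file
    extra_cflags=["-O3"],
    verbose=True,
    abi_tag=custom.get_custom_op_abi_tag(),
)
# ops registered by the library are now attributes of `custom`
print(custom._get_custom_op_list())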

imperative/python/src/ops.cpp (+7 -0)

@@ -766,6 +766,13 @@ void init_custom(pybind11::module m) {
m.def("_install", &install_custom);
m.def("_uninstall", &uninstall_custom);
m.def("_get_custom_op_list", &get_custom_op_list);
m.def("get_custom_op_abi_tag", [](void) -> int {
int ret = 0;
#ifdef _GLIBCXX_USE_CXX11_ABI
ret = _GLIBCXX_USE_CXX11_ABI;
#endif
return ret;
});

static PyMethodDef method_def = {
#ifdef METH_FASTCALL


imperative/python/test/unit/core/custom_opsrc/elem_add.cpp (+140 -0)

@@ -0,0 +1,140 @@
/**
* \file imperative/python/test/unit/core/custom_opsrc/elem_add.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "megbrain/custom/custom.h"

CUSTOM_OP_REG_BEGIN(ElemAddSmooth)

void forward_device_infer(
const std::vector<Device>& inputs, const Param& params,
std::vector<Device>& outputs) {
outputs[0] = inputs[0];
}

void forward_shape_infer(
const std::vector<Shape>& inputs, const Param& params,
std::vector<Shape>& outputs) {
outputs[0] = inputs[0];
}

void forward_dtype_infer(
const std::vector<DType>& inputs, const Param& params,
std::vector<DType>& outputs) {
outputs[0] = inputs[0];
}

void forward_format_infer(
const std::vector<Format>& inputs, const Param& params,
std::vector<Format>& outputs) {
outputs[0] = inputs[0];
}

template <typename scalar_t>
void forward_kernel(
const scalar_t* input0, const scalar_t* input1, scalar_t* output, size_t len,
float smooth) {
for (size_t i = 0; i < len; ++i) {
output[i] = input0[i] + input1[i];
if (output[i] < 0)
output[i] += smooth;
else
output[i] -= smooth;
}
}

void forward_compute(
const std::vector<Tensor>& inputs, const Param& params,
std::vector<Tensor>& outputs) {
DISPATCH_SIGN_INT_AND_FLOAT_TYPES(
outputs[0].dtype(), "forward_compute", ([&]() {
forward_kernel<scalar_t>(
inputs[0].data<scalar_t>(), inputs[1].data<scalar_t>(),
outputs[0].data<scalar_t>(), outputs[0].size(),
params["smooth"].as<float>());
}));
}

CUSTOM_OP_REG(ElemAddSmoothForward)
.set_description(
"Custom ElemAdd Operator With a Smooth Parameter, "
"which is used to verify the CPU kernel")
.add_input("lhs")
.add_input("rhs")
.add_output("output")
.add_param("smooth", 0.f)
.set_device_infer(forward_device_infer)
.set_shape_infer(forward_shape_infer)
.set_dtype_infer(forward_dtype_infer)
.set_format_infer(forward_format_infer)
.set_compute(forward_compute);

void backward_device_infer(
const std::vector<Device>& ograds, const Param& params,
std::vector<Device>& igrads) {
igrads[0] = ograds[0];
igrads[1] = ograds[0];
}

void backward_shape_infer(
const std::vector<Shape>& ograds, const Param& params,
std::vector<Shape>& igrads) {
igrads[0] = ograds[0];
igrads[1] = ograds[0];
}

void backward_dtype_infer(
const std::vector<DType>& ograds, const Param& params,
std::vector<DType>& igrads) {
igrads[0] = ograds[0];
igrads[1] = ograds[0];
}

void backward_format_infer(
const std::vector<Format>& ograds, const Param& params,
std::vector<Format>& igrads) {
igrads[0] = ograds[0];
igrads[1] = ograds[0];
}

template <typename scalar_t>
void backward_kernel(
const scalar_t* ograd, scalar_t* igrad0, scalar_t* igrad1, size_t len) {
for (size_t i = 0; i < len; ++i) {
igrad0[i] = ograd[i];
igrad1[i] = ograd[i];
}
}

void backward_compute(
const std::vector<Tensor>& ograds, const Param& params,
std::vector<Tensor>& igrads) {
DISPATCH_SIGN_INT_AND_FLOAT_TYPES(
igrads[0].dtype(), "backward_compute", ([&]() {
backward_kernel<scalar_t>(
ograds[0].data<scalar_t>(), igrads[0].data<scalar_t>(),
igrads[1].data<scalar_t>(), igrads[0].size());
}));
}

CUSTOM_OP_REG(ElemAddSmoothBackward)
.set_description(
"Custom ElemAdd Operator With a Smooth Parameter, "
"which is used to verify the CPU kernel")
.add_input("ograd")
.add_output("igrad_lhs")
.add_output("igrad_rhs")
.set_device_infer(backward_device_infer)
.set_shape_infer(backward_shape_infer)
.set_dtype_infer(backward_dtype_infer)
.set_format_infer(backward_format_infer)
.set_compute(backward_compute);

CUSTOM_OP_REG_END(ElemAddSmooth)
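
Once built and loaded, the forward op can be applied from Python roughly as below. This is a sketch: it assumes the generated op maker accepts its params as keyword arguments and that ``apply`` returns a tuple of output tensors, neither of which is shown in this diff.

import numpy as np

from megengine.core._imperative_rt.core2 import apply
from megengine.core.ops import custom
from megengine.tensor import Tensor

lhs = Tensor(np.random.randn(4, 8).astype("float32"))
rhs = Tensor(np.random.randn(4, 8).astype("float32"))
op = custom.ElemAddSmoothForward(smooth=0.1)  # assumed kwarg mapping
(out,) = apply(op, lhs, rhs)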

imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp (+65 -0)

@@ -0,0 +1,65 @@
/**
* \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "./matmul_scale.h"
#include "megbrain/custom/custom.h"

CUSTOM_OP_REG_BEGIN(MatMulScale)

void forward_shape_infer(
const std::vector<Shape>& inputs, const Param& params,
std::vector<Shape>& outputs) {
outputs[0] = {inputs[0][0], inputs[1][1]};
}

void forward_compute(
const std::vector<Tensor>& inputs, const Param& params,
std::vector<Tensor>& outputs) {
matmul_forward_helper(
inputs[0], inputs[1], outputs[0], inputs[0].shape()[0],
inputs[0].shape()[1], inputs[1].shape()[1], params["scale"].as<float>());
}

CUSTOM_OP_REG(MatMulScaleForward)
.add_inputs(2)
.add_outputs(1)
.add_param("scale", 1.0f)
.set_shape_infer(forward_shape_infer)
.set_compute("cuda", forward_compute);

void backward_shape_infer(
const std::vector<Shape>& ograd_and_inputs, const Param& params,
std::vector<Shape>& outputs) {
outputs[0] = ograd_and_inputs[1];
outputs[1] = ograd_and_inputs[2];
}

void backward_compute(
const std::vector<Tensor>& ograd_and_inputs, const Param& params,
std::vector<Tensor>& igrads) {
matmul_backward_lhs_helper(
ograd_and_inputs[2], ograd_and_inputs[0], igrads[0],
ograd_and_inputs[1].shape()[0], ograd_and_inputs[1].shape()[1],
ograd_and_inputs[2].shape()[1], params["scale"].as<float>());
matmul_backward_rhs_helper(
ograd_and_inputs[1], ograd_and_inputs[0], igrads[1],
ograd_and_inputs[1].shape()[0], ograd_and_inputs[1].shape()[1],
ograd_and_inputs[2].shape()[1], params["scale"].as<float>());
}

CUSTOM_OP_REG(MatMulScaleBackward)
.add_inputs(3)
.add_outputs(2)
.add_param("scale", 1.0f)
.set_shape_infer(backward_shape_infer)
.set_compute("cuda", backward_compute);

CUSTOM_OP_REG_END(MatMulScale)

imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu (+97 -0)

@@ -0,0 +1,97 @@
/**
* \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include "./matmul_scale.h"

using namespace custom;

// matmul_forward for Mat_mxk * Mat_kxn
template <typename T>
__global__ void matmul_forward_naive(
const T* lhs, const T* rhs, T* res, size_t M, size_t K, size_t N, float scale) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;

T acc = 0;
for (int i = 0; i < K; ++i)
acc += lhs[row * K + i] * rhs[i * N + col];
res[row * N + col] = acc * scale;
}

// matmul_backward_lhs for Mat_mxk * Mat_kxn = Mat_mxn
// that is Mat_mxn * Mat_nxk
template <typename T>
__global__ void matmul_backward_lhs_naive(
const T* rhs, const T* ograd, T* lhs_grad, size_t M, size_t K, size_t N,
float scale) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
T acc = 0;
for (int i = 0; i < N; ++i)
acc += ograd[row * N + i] * rhs[col * N + i];
lhs_grad[row * K + col] = acc / scale;
}

// matmul_backward_rhs for Mat_mxk * Mat_kxn = Mat_mxn
// that is Mat_kxm * Mat_mxn
template <typename T>
__global__ void matmul_backward_rhs_naive(
const T* lhs, const T* ograd, T* rhs_grad, size_t M, size_t K, size_t N,
float scale) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
T acc = 0;
for (int i = 0; i < M; ++i)
acc += lhs[i * K + row] * ograd[i * N + col];
rhs_grad[row * N + col] = acc / scale;
}

void matmul_forward_helper(
const Tensor& lhs, const Tensor& rhs, Tensor& res, size_t M, size_t K, size_t N,
float scale) {
dim3 block(1, 1);
dim3 grid(N / block.x, M / block.y);
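    // with block(1, 1) the grid launches exactly one thread per output
    // element, so this naive reference kernel needs no bounds checks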

DISPATCH_INT_AND_FLOAT_TYPES(res.dtype(), "matmul_forward", ([&]() {
matmul_forward_naive<scalar_t><<<grid, block>>>(
lhs.data<scalar_t>(), rhs.data<scalar_t>(),
res.data<scalar_t>(), M, K, N, scale);
}));
}

void matmul_backward_lhs_helper(
const Tensor& rhs, const Tensor& ograd, Tensor& lhs_grad, size_t M, size_t K,
size_t N, float scale) {
dim3 block(1, 1);
dim3 grid(K / block.x, M / block.y);
DISPATCH_INT_AND_FLOAT_TYPES(
lhs_grad.dtype(), "matmul_backward_lhs", ([&]() {
matmul_backward_lhs_naive<scalar_t><<<grid, block>>>(
rhs.data<scalar_t>(), ograd.data<scalar_t>(),
lhs_grad.data<scalar_t>(), M, K, N, scale);
}));
}

void matmul_backward_rhs_helper(
const Tensor& lhs, const Tensor& ograd, Tensor& rhs_grad, size_t M, size_t K,
size_t N, float scale) {
dim3 block(1, 1);
dim3 grid(N / block.x, K / block.y);
DISPATCH_INT_AND_FLOAT_TYPES(
rhs_grad.dtype(), "matmul_backward_rhs", ([&]() {
matmul_backward_rhs_naive<scalar_t><<<grid, block>>>(
lhs.data<scalar_t>(), ograd.data<scalar_t>(),
rhs_grad.data<scalar_t>(), M, K, N, scale);
}));
}

imperative/python/test/unit/core/custom_opsrc/matmul_scale.h (+24 -0)

@@ -0,0 +1,24 @@
/**
* \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "megbrain/custom/custom.h"

using Tensor = custom::Tensor;

void matmul_forward_helper(
const Tensor& lhs, const Tensor& rhs, Tensor& res, size_t M, size_t K, size_t N,
float scale);
void matmul_backward_lhs_helper(
const Tensor& rhs, const Tensor& ograd, Tensor& lhs_grad, size_t M, size_t K,
size_t N, float scale);
void matmul_backward_rhs_helper(
const Tensor& lhs, const Tensor& ograd, Tensor& rhs_grad, size_t M, size_t K,
size_t N, float scale);

imperative/python/test/unit/core/test_custom_op.py (+111 -0)

@@ -0,0 +1,111 @@
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

import os
import platform
import shutil
import sys

import numpy as np
import pytest

import megengine
import megengine.functional as F
import megengine.optimizer as optim
from megengine import jit
from megengine.autodiff import Function, GradManager
from megengine.core._imperative_rt.core2 import apply
from megengine.core.ops import custom
from megengine.device import get_device_count
from megengine.module import Conv2d, Linear, Module
from megengine.random import normal
from megengine.tensor import Parameter, Tensor
from megengine.utils import custom_op_tools


def compare(ref, real):
if ref.shape != real.shape:
real = real.T
np.testing.assert_allclose(ref, real, rtol=1e-3, atol=1e-5)


def build_and_clean(test_func):
def wrapper():
cur_dir_path = os.path.dirname(os.path.abspath(__file__))
build_path = os.path.join(cur_dir_path, "custom_opsrc", "build")
mgb_root_path = os.path.dirname(
os.path.dirname(
os.path.dirname(os.path.dirname(os.path.dirname(cur_dir_path)))
)
)
extra_include_paths = [os.path.join(mgb_root_path, "src", "custom", "include")]
extra_ld_flags = []

if sys.platform != "win32":
ld_path = os.environ.get("LD_LIBRARY_PATH")
if ld_path is not None:
ld_dirs = ld_path.split(":")
for ld_dir in ld_dirs:
if os.path.exists(ld_dir) and os.path.isdir(ld_dir):
for lib in os.listdir(ld_dir):
if "megengine_shared" in lib:
extra_ld_flags += [
"-L{} -Wl,-rpath,{}".format(ld_dir, ld_dir)
]
break

if get_device_count("gpu") > 0:
custom_opsrc = [
os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cpp"),
os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cu"),
]
else:
custom_opsrc = [os.path.join(cur_dir_path, "custom_opsrc", "elem_add.cpp")]

lib_path = custom_op_tools.build_and_load(
"test_op",
custom_opsrc,
extra_include_paths=extra_include_paths,
extra_ldflags=extra_ld_flags,
build_dir=build_path,
verbose=False,
abi_tag=custom.get_custom_op_abi_tag(),
)
test_func()

custom.unload(lib_path)
if os.path.exists(build_path):
shutil.rmtree(build_path)

return wrapper


@pytest.mark.skipif(
get_device_count("gpu") > 0, reason="elem_add operator is only supported on CPU"
)
@build_and_clean
def test_custom_op_cpu_build():
assert "ElemAddSmoothForward" in custom._get_custom_op_list()
assert "ElemAddSmoothBackward" in custom._get_custom_op_list()
assert hasattr(custom, "ElemAddSmoothForward")
assert hasattr(custom, "ElemAddSmoothBackward")


@pytest.mark.skipif(
platform.system() == "Darwin",
reason="GPU kernel is only support on Linux and Windows",
)
@pytest.mark.skipif(
get_device_count("gpu") < 1, reason="matmul scale operator is only supported on GPU"
)
@build_and_clean
def test_custom_op_gpu_build():
assert "MatMulScaleForward" in custom._get_custom_op_list()
assert "MatMulScaleBackward" in custom._get_custom_op_list()
assert hasattr(custom, "MatMulScaleForward")
assert hasattr(custom, "MatMulScaleBackward")

scripts/whl/macos/macos_build_whl.sh (+1 -0)

@@ -171,6 +171,7 @@ function do_build() {
mkdir -p staging

cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
cp -a ${SRC_DIR}/src/custom/include staging/megengine/core/include/
cd ${BUILD_DIR}/staging/megengine/core
rt_file=`ls _imperative_rt.*.so`
echo "rt file is: ${rt_file}"


scripts/whl/manylinux2014/do_build_common.sh (+1 -0)

@@ -151,6 +151,7 @@ do
rm -rf staging
mkdir -p staging
cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
cp -a ${SRC_DIR}/src/custom/include/megbrain staging/megengine/core/include

cd ${BUILD_DIR}/staging/megengine/core
mkdir -p lib/ucx


scripts/whl/windows/windows_build_whl.sh (+4 -1)

@@ -77,11 +77,13 @@ CUBLAS_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublas6
CURAND_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/curand64_10.dll"
CUBLASLT_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublasLt64_10.dll"
CUDART_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cudart64_101.dll"
MGE_EXPORT_LIB="${SRC_DIR}/build_dir/host/build/src/megengine_shared.dll"
MGE_EXPORT_DLL="${SRC_DIR}/build_dir/host/build/src/megengine_shared.dll"
MGE_EXPORT_LIB="${SRC_DIR}/build_dir/host/build/src/megengine_shared.lib"

function depend_real_copy() {
REAL_DST=$1
echo "real copy lib to $1"
cp "${MGE_EXPORT_DLL}" ${REAL_DST}
cp "${MGE_EXPORT_LIB}" ${REAL_DST}

if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
@@ -190,6 +192,7 @@ function do_build() {
rm -rf staging
mkdir -p staging
cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
cp -a ${SRC_DIR}/src/custom/include/megbrain staging/megengine/core/include/
cd ${BUILD_DIR}/staging/megengine/core
rt_file=`ls _imperative_rt.*.pyd`
echo "rt file is: ${rt_file}"


src/CMakeLists.txt (+13 -10)

@@ -1,3 +1,8 @@
# force define a SHARED target for the whl build: when building for APPLE
# we force BUILD_SHARED_LIBS=OFF, as required by Xcode
set(MGE_SHARED_LIB megengine_shared)
set(MGE_SHARED_LIB ${MGE_SHARED_LIB} PARENT_SCOPE)

if(MGE_WITH_JIT_MLIR)
add_subdirectory(jit/include/megbrain/jit/mlir/ir)
endif()
@@ -206,32 +211,30 @@ set (_VER_FILE ${PROJECT_SOURCE_DIR}/src/version.ld)

# Build as SHARED or STATIC depending on BUILD_SHARED_LIBS=ON/OFF
add_library(megengine)
# force define a SHARED target for the whl build: when building for APPLE
# we force BUILD_SHARED_LIBS=OFF, as required by Xcode
add_library(megengine_shared SHARED)
add_library(${MGE_SHARED_LIB} SHARED)
target_link_libraries(megengine PRIVATE ${MGE_CUDA_LIBS})
target_link_libraries(megengine PUBLIC megbrain megdnn)
target_link_libraries(megengine_shared PUBLIC megbrain megdnn)
target_link_libraries(megengine_shared PRIVATE ${MGE_CUDA_LIBS})
target_link_libraries(${MGE_SHARED_LIB} PUBLIC megbrain megdnn)
target_link_libraries(${MGE_SHARED_LIB} PRIVATE ${MGE_CUDA_LIBS})
if (UNIX AND NOT APPLE)
target_link_options(megengine PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE})
set_target_properties(megengine PROPERTIES LINK_DEPENDS ${_VER_FILE})
target_link_options(megengine_shared PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE})
set_target_properties(megengine_shared PROPERTIES LINK_DEPENDS ${_VER_FILE})
target_link_options(${MGE_SHARED_LIB} PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE})
set_target_properties(${MGE_SHARED_LIB} PROPERTIES LINK_DEPENDS ${_VER_FILE})
endif()
if(WIN32 OR MSVC)
target_compile_definitions(megbrain PRIVATE MGE_DLL_EXPORT)
target_compile_definitions(megdnn PRIVATE MGE_DLL_EXPORT)
target_compile_definitions(megengine PRIVATE MGE_DLL_EXPORT)
target_compile_definitions(megengine_shared PRIVATE MGE_DLL_EXPORT)
target_compile_definitions(${MGE_SHARED_LIB} PRIVATE MGE_DLL_EXPORT)
# please do not use WINDOWS_EXPORT_ALL_SYMBOLS, as the symbol count exceeds 65535 when built with CUDA
#set_target_properties(megengine PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
#set_target_properties(megengine_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
#set_target_properties(${MGE_SHARED_LIB} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
endif()
if (MGE_WITH_DISTRIBUTED)
message(VERBOSE "megengine configured to link megray")
target_link_libraries(megengine PUBLIC megray)
target_link_libraries(megengine_shared PUBLIC megray)
target_link_libraries(${MGE_SHARED_LIB} PUBLIC megray)
endif()
# Do not export targets if MGE_WITH_DISTRIBUTED is on. MegRay is not ready
# for this.


src/custom/impl/manager.cpp (+19 -7)

@@ -18,12 +18,31 @@

#ifndef _WIN32
#include <dlfcn.h>
#else
#include <windows.h>
#endif

using namespace mgb;

namespace custom {

#ifdef _WIN32
#define RTLD_LAZY 0

void* dlopen(const char* file, int) {
return static_cast<void*>(LoadLibrary(file));
}

int dlclose(void* handle) {
return static_cast<int>(FreeLibrary(static_cast<HMODULE>(handle)));
}

const char* dlerror(void) {
static char win_err_info[] = "no dlerror info in windows";
return win_err_info;
}
#endif

CustomOpManager* CustomOpManager::inst(void) {
static CustomOpManager op_manager;
return &op_manager;
@@ -127,7 +146,6 @@ std::vector<RunTimeId> CustomOpManager::op_id_list(void) {
return ret;
}

#ifndef _WIN32
CustomLib::CustomLib(const std::string& path, int mode = RTLD_LAZY)
: m_handle(nullptr, [](void* handle) { dlclose(handle); }) {
auto op_list_before_load = CustomOpManager::inst()->op_name_list();
@@ -146,12 +164,6 @@ CustomLib::CustomLib(const std::string& path, int mode = RTLD_LAZY)
}
}
}
#else
CustomLib::CustomLib(const std::string& path, int mode = 0)
: m_handle(nullptr, [](void* handle) {}) {
mgb_assert(false, "custom op is only supported on Linux now");
}
#endif

const std::vector<std::string>& CustomLib::ops_in_lib(void) const {
return m_ops;


src/custom/include/megbrain/custom/custom.h (+2 -1)

@@ -16,7 +16,8 @@
#include "tensor.h"

namespace custom {
std::shared_ptr<CustomOp> op_insert(std::string opname, uint32_t version);
MGE_WIN_DECLSPEC_FUC std::shared_ptr<CustomOp> op_insert(
std::string opname, uint32_t version);
}

#define CUSTOM_OP_REG(OpName) \


src/custom/include/megbrain/custom/op.h (+47 -53)

@@ -32,27 +32,26 @@ namespace custom {

using RunTimeId = uint64_t;

class ArgInfo {
class MGE_WIN_DECLSPEC_FUC ArgInfo {
CUSTOM_PIMPL_CLS_DECL(ArgInfo);
MGE_WIN_DECLSPEC_FUC ArgInfo(
const std::string& name, const std::string& desc,
ArgInfo(const std::string& name, const std::string& desc,
const std::unordered_set<std::string>& dtypes, const int& ndim,
const std::string& mem_stgy);

MGE_WIN_DECLSPEC_FUC const std::string& name(void) const;
MGE_WIN_DECLSPEC_FUC const std::string& desc(void) const;
MGE_WIN_DECLSPEC_FUC const std::unordered_set<std::string>& dtypes(void) const;
MGE_WIN_DECLSPEC_FUC int ndim(void) const;
MGE_WIN_DECLSPEC_FUC const std::string& mem_strategy(void) const;
const std::string& name(void) const;
const std::string& desc(void) const;
const std::unordered_set<std::string>& dtypes(void) const;
int ndim(void) const;
const std::string& mem_strategy(void) const;

MGE_WIN_DECLSPEC_FUC std::string str() const;
std::string str() const;
};

class CustomOp {
class MGE_WIN_DECLSPEC_FUC CustomOp {
std::unique_ptr<void, void_deleter> m_impl;

public:
MGE_WIN_DECLSPEC_FUC CustomOp(const std::string& op_type, uint32_t version);
CustomOp(const std::string& op_type, uint32_t version);
PREVENT_COPY_AND_ASSIGN(CustomOp);

using DeviceInferFuncPtr =
@@ -71,70 +70,65 @@ public:
void (*)(const std::vector<Tensor>&, const Param&, std::vector<Tensor>&);

// write for forward
MGE_WIN_DECLSPEC_FUC CustomOp& set_device_infer(DeviceInferFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_shape_infer(ShapeInferFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_dtype_infer(DTypeInferFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_format_infer(FormatInferFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_preprocess(PreprocessFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_preprocess(
const std::string& device, PreprocessFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_postprocess(PostprocessFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_postprocess(
const std::string& device, PostprocessFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_compute(ComputeFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_compute(
const std::string& device, ComputeFuncPtr func);

MGE_WIN_DECLSPEC_FUC CustomOp& set_description(const std::string& op_desc);
MGE_WIN_DECLSPEC_FUC CustomOp& add_input(
CustomOp& set_device_infer(DeviceInferFuncPtr func);
CustomOp& set_shape_infer(ShapeInferFuncPtr func);
CustomOp& set_dtype_infer(DTypeInferFuncPtr func);
CustomOp& set_format_infer(FormatInferFuncPtr func);
CustomOp& set_preprocess(PreprocessFuncPtr func);
CustomOp& set_preprocess(const std::string& device, PreprocessFuncPtr func);
CustomOp& set_postprocess(PostprocessFuncPtr func);
CustomOp& set_postprocess(const std::string& device, PostprocessFuncPtr func);
CustomOp& set_compute(ComputeFuncPtr func);
CustomOp& set_compute(const std::string& device, ComputeFuncPtr func);

CustomOp& set_description(const std::string& op_desc);
CustomOp& add_input(
const std::string& name, const std::string& desc,
const std::initializer_list<std::string>& legal_dtypes = {"float32"},
int dims = -1, const std::string& mem_stgy = "default");
MGE_WIN_DECLSPEC_FUC CustomOp& add_output(
CustomOp& add_output(
const std::string& name, const std::string& desc,
const std::initializer_list<std::string>& legal_dtypes = {"float32"},
int dims = -1, const std::string& mem_stgy = "default");
MGE_WIN_DECLSPEC_FUC CustomOp& add_input(
CustomOp& add_input(
const std::string& name,
const std::initializer_list<std::string>& legal_dtypes = {"float32"},
int dims = -1, const std::string& mem_stgy = "default");
MGE_WIN_DECLSPEC_FUC CustomOp& add_output(
CustomOp& add_output(
const std::string& name,
const std::initializer_list<std::string>& legal_dtypes = {"float32"},
int dims = -1, const std::string& mem_stgy = "default");
MGE_WIN_DECLSPEC_FUC CustomOp& add_inputs(const size_t& input_num);
MGE_WIN_DECLSPEC_FUC CustomOp& add_outputs(const size_t& output_num);
MGE_WIN_DECLSPEC_FUC CustomOp& add_param(
const std::string& name, const ParamVal& default_val);
MGE_WIN_DECLSPEC_FUC CustomOp& add_param(
CustomOp& add_inputs(const size_t& input_num);
CustomOp& add_outputs(const size_t& output_num);
CustomOp& add_param(const std::string& name, const ParamVal& default_val);
CustomOp& add_param(
const std::string& name, const std::string& desc,
const ParamVal& default_val);

// read
MGE_WIN_DECLSPEC_FUC std::string op_type(void) const;
MGE_WIN_DECLSPEC_FUC std::string op_desc(void) const;
MGE_WIN_DECLSPEC_FUC RunTimeId runtime_id(void) const;
MGE_WIN_DECLSPEC_FUC size_t input_num(void) const;
MGE_WIN_DECLSPEC_FUC size_t output_num(void) const;
MGE_WIN_DECLSPEC_FUC std::string str(void) const;
MGE_WIN_DECLSPEC_FUC const ParamInfo& param_info(void) const;
MGE_WIN_DECLSPEC_FUC ArgInfo input_info(size_t idx) const;
MGE_WIN_DECLSPEC_FUC ArgInfo output_info(size_t idx) const;
MGE_WIN_DECLSPEC_FUC const std::vector<ArgInfo>& inputs_info(void) const;
MGE_WIN_DECLSPEC_FUC const std::vector<ArgInfo>& outputs_info(void) const;
std::string op_type(void) const;
std::string op_desc(void) const;
RunTimeId runtime_id(void) const;
size_t input_num(void) const;
size_t output_num(void) const;
std::string str(void) const;
const ParamInfo& param_info(void) const;
ArgInfo input_info(size_t idx) const;
ArgInfo output_info(size_t idx) const;
const std::vector<ArgInfo>& inputs_info(void) const;
const std::vector<ArgInfo>& outputs_info(void) const;

// use
MGE_WIN_DECLSPEC_FUC std::vector<Device> infer_output_device(
std::vector<Device> infer_output_device(
const std::vector<Device>&, const Param&) const;
MGE_WIN_DECLSPEC_FUC std::vector<Shape> infer_output_shape(
std::vector<Shape> infer_output_shape(
const std::vector<Shape>&, const Param&) const;
MGE_WIN_DECLSPEC_FUC std::vector<DType> infer_output_dtype(
std::vector<DType> infer_output_dtype(
const std::vector<DType>&, const Param&) const;
MGE_WIN_DECLSPEC_FUC std::vector<Format> infer_output_format(
std::vector<Format> infer_output_format(
const std::vector<Format>&, const Param&) const;
MGE_WIN_DECLSPEC_FUC void compute(
const std::vector<Tensor>&, const Param&, std::vector<Tensor>&) const;
void compute(const std::vector<Tensor>&, const Param&, std::vector<Tensor>&) const;
};

} // namespace custom
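
For reference, a minimal sketch of how the builder interface above chains together when registering an op. It follows the pattern of the new tests under imperative/python/test/unit/core/custom_opsrc; the CUSTOM_OP_REG* macros live in custom.h (not shown in this hunk), and the "Scale" op, its "factor" param, and the "x86" device string are illustrative assumptions.

#include "megbrain/custom/custom.h"

CUSTOM_OP_REG_BEGIN(Scale)

using namespace custom;

// matches the infer_output_shape signature above
std::vector<Shape> scale_shape_infer(
        const std::vector<Shape>& inputs, const Param& /*params*/) {
    return {inputs[0]};  // the output keeps the input shape
}

// matches the compute signature above
void scale_compute(
        const std::vector<Tensor>& inputs, const Param& params,
        std::vector<Tensor>& outputs) {
    float factor = params["factor"].as<float>();
    // ... write inputs[0] * factor into outputs[0] ...
    (void)factor;
}

CUSTOM_OP_REG(Scale)
        .set_description("multiply the input tensor by a scalar factor")
        .add_input("lhs", "the tensor to scale", {"float32"}, 2)
        .add_output("res", "the scaled tensor", {"float32"}, 2)
        .add_param("factor", 1.0f)
        .set_shape_infer(scale_shape_infer)
        .set_compute("x86", scale_compute);

CUSTOM_OP_REG_END(Scale)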

+ 10
- 10
src/custom/include/megbrain/custom/param.h View File

@@ -23,7 +23,7 @@ class ParamInfoImpl;
class ParamImpl;

// Schema of a param element
class ParamSchema {
class MGE_WIN_DECLSPEC_FUC ParamSchema {
CUSTOM_PIMPL_CLS_DECL(ParamSchema);
ParamSchema(
const std::string& name, const ParamVal& value,
@@ -36,7 +36,7 @@ class ParamSchema {
std::string str(void) const;
};

class ParamInfo {
class MGE_WIN_DECLSPEC_FUC ParamInfo {
CUSTOM_PIMPL_CLS_DECL(ParamInfo);

void set_tag(const std::string&);
@@ -46,16 +46,16 @@ class ParamInfo {
const std::vector<ParamSchema>& meta(void) const;
};

class Param {
class MGE_WIN_DECLSPEC_FUC Param {
CUSTOM_PIMPL_CLS_DECL(Param);

MGE_WIN_DECLSPEC_FUC Param(const ParamInfo&);
MGE_WIN_DECLSPEC_FUC ParamVal& operator[](const std::string&);
MGE_WIN_DECLSPEC_FUC const ParamVal& operator[](const std::string&) const;
MGE_WIN_DECLSPEC_FUC const std::unordered_map<std::string, ParamVal>& raw() const;
MGE_WIN_DECLSPEC_FUC bool exist(const std::string& name) const;
MGE_WIN_DECLSPEC_FUC std::string to_bytes(void) const;
MGE_WIN_DECLSPEC_FUC void from_bytes(const std::string&);
Param(const ParamInfo&);
ParamVal& operator[](const std::string&);
const ParamVal& operator[](const std::string&) const;
const std::unordered_map<std::string, ParamVal>& raw() const;
bool exist(const std::string& name) const;
std::string to_bytes(void) const;
void from_bytes(const std::string&);
};

MGE_WIN_DECLSPEC_FUC bool operator==(const Param&, const Param&);
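
A short sketch of the Param surface above: dictionary-style access, an existence check, and the to_bytes/from_bytes round trip. The "factor" and "mode" keys are hypothetical.

#include <cassert>
#include <string>
#include "megbrain/custom/param.h"

void inspect_params(const custom::Param& params) {
    float factor = params["factor"].as<float>();  // typed read (hypothetical key)
    if (params.exist("mode")) {                   // optional entry
        const std::string& mode = params["mode"].as<std::string>();
        (void)mode;
    }
    custom::Param copy = params;                  // pimpl-backed copy
    copy.from_bytes(params.to_bytes());           // serialize, then restore
    assert(copy == params);
    (void)factor;
}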


+ 40
- 31
src/custom/include/megbrain/custom/param_val.h View File

@@ -169,21 +169,21 @@ std::string vec2str(const std::vector<T>& vec) {
* Con1: user need to set the type explicitly when class template instantiation
* Con2: ParamVal<int> can not be assigned to ParamVal<double>
*/
class ParamVal {
class MGE_WIN_DECLSPEC_FUC ParamVal {
std::unique_ptr<void, void_deleter> m_ptr;
ParamDynType m_type;

public:
template <typename T>
MGE_WIN_DECLSPEC_FUC ParamVal(const T& val);
ParamVal(const T& val);
template <typename T>
MGE_WIN_DECLSPEC_FUC ParamVal(const std::initializer_list<T>& val);
ParamVal(const std::initializer_list<T>& val);

MGE_WIN_DECLSPEC_FUC ParamVal();
MGE_WIN_DECLSPEC_FUC ParamVal(const char* str);
MGE_WIN_DECLSPEC_FUC ParamVal(const std::initializer_list<const char*>& strs);
MGE_WIN_DECLSPEC_FUC ParamVal(const std::vector<const char*>& strs);
MGE_WIN_DECLSPEC_FUC ParamVal(const ParamVal& rhs);
ParamVal();
ParamVal(const char* str);
ParamVal(const std::initializer_list<const char*>& strs);
ParamVal(const std::vector<const char*>& strs);
ParamVal(const ParamVal& rhs);

template <typename T>
ParamVal& operator=(const T& rhs);
@@ -196,30 +196,39 @@ public:
ParamVal& operator=(const ParamVal& rhs);

template <typename T>
MGE_WIN_DECLSPEC_FUC const T& as(void) const;
const T& as(void) const;
template <typename T>
MGE_WIN_DECLSPEC_FUC T& as(void);

MGE_WIN_DECLSPEC_FUC const void* raw_ptr(void) const;
MGE_WIN_DECLSPEC_FUC void* raw_ptr(void);
MGE_WIN_DECLSPEC_FUC ParamDynType type(void) const;
MGE_WIN_DECLSPEC_FUC std::string str(void) const;
MGE_WIN_DECLSPEC_FUC size_t size(void) const;

MGE_WIN_DECLSPEC_FUC static std::string to_bytes(const ParamVal& value);
MGE_WIN_DECLSPEC_FUC static ParamVal from_bytes(
const std::string& bytes, size_t& offset);

friend ParamVal operator+(const ParamVal& lhs, const ParamVal& rhs);
friend ParamVal operator-(const ParamVal& lhs, const ParamVal& rhs);
friend ParamVal operator*(const ParamVal& lhs, const ParamVal& rhs);
friend ParamVal operator/(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator==(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator!=(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator>(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator<(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator>=(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator<=(const ParamVal& lhs, const ParamVal& rhs);
T& as(void);

const void* raw_ptr(void) const;
void* raw_ptr(void);
ParamDynType type(void) const;
std::string str(void) const;
size_t size(void) const;

static std::string to_bytes(const ParamVal& value);
static ParamVal from_bytes(const std::string& bytes, size_t& offset);

MGE_WIN_DECLSPEC_FUC friend ParamVal operator+(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend ParamVal operator-(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend ParamVal operator*(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend ParamVal operator/(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator!=(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator>(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator<(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator>=(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator<=(
const ParamVal& lhs, const ParamVal& rhs);
};

ParamVal operator+(const ParamVal& lhs, const ParamVal& rhs);
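
ParamVal erases its element type at runtime, which is what the pro/con comment above trades against a class template. A sketch of the resulting dynamic behavior, assuming mixed arithmetic promotes to the wider numeric type:

#include <string>
#include "megbrain/custom/param_val.h"

void param_val_demo() {
    custom::ParamVal i = 3;              // holds an int
    custom::ParamVal d = 2.5;            // holds a double
    custom::ParamVal s = "nchw";         // const char* is stored as a string
    custom::ParamVal v = {1, 2, 3};      // list value

    custom::ParamVal sum = i + d;        // mixed arithmetic (promotion assumed)
    std::string repr = sum.str();        // printable form
    d = i;                               // assignment may retype the value

    int& raw = i.as<int>();              // typed view of the erased storage
    raw += 1;
    (void)repr; (void)s; (void)v;
}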


+ 70
- 64
src/custom/include/megbrain/custom/tensor.h View File

@@ -30,9 +30,9 @@ namespace custom {
#define CUSTOM_DEVICE_TYPE_ENUM_DECL(custom_type, builtin_type, builtin_str) \
custom_type,

class Device {
MGE_WIN_DECLSPEC_FUC const void* impl() const;
MGE_WIN_DECLSPEC_FUC Device(const void* impl);
class MGE_WIN_DECLSPEC_FUC Device {
const void* impl() const;
Device(const void* impl);
CUSTOM_PIMPL_CLS_DECL(Device);

public:
@@ -40,19 +40,19 @@ public:
CUSTOM_FOR_EACH_DEVICE_TYPE(CUSTOM_DEVICE_TYPE_ENUM_DECL)
};

MGE_WIN_DECLSPEC_FUC Device(const std::string& device);
MGE_WIN_DECLSPEC_FUC Device(const char* device);
MGE_WIN_DECLSPEC_FUC Device(DeviceEnum device);
Device(const std::string& device);
Device(const char* device);
Device(DeviceEnum device);

MGE_WIN_DECLSPEC_FUC std::string str(void) const;
MGE_WIN_DECLSPEC_FUC DeviceEnum enumv(void) const;
std::string str(void) const;
DeviceEnum enumv(void) const;

MGE_WIN_DECLSPEC_FUC static bool is_legal(const std::string& device);
MGE_WIN_DECLSPEC_FUC static bool is_legal(DeviceEnum device);
MGE_WIN_DECLSPEC_FUC static std::vector<std::string> legal_devices(void);
static bool is_legal(const std::string& device);
static bool is_legal(DeviceEnum device);
static std::vector<std::string> legal_devices(void);

friend class Tensor;
friend bool operator==(const Device& lhs, const Device& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(const Device& lhs, const Device& rhs);
CUSTOM_DATA_ADAPTOR_FRIEND_DECL;
};

@@ -60,23 +60,23 @@ using DeviceEnum = Device::DeviceEnum;

bool operator==(const Device& lhs, const Device& rhs);

class Shape {
MGE_WIN_DECLSPEC_FUC const void* impl() const;
MGE_WIN_DECLSPEC_FUC Shape(const void* impl);
class MGE_WIN_DECLSPEC_FUC Shape {
const void* impl() const;
Shape(const void* impl);
CUSTOM_PIMPL_CLS_DECL(Shape);

public:
MGE_WIN_DECLSPEC_FUC Shape(const std::vector<size_t>& rhs);
MGE_WIN_DECLSPEC_FUC Shape(const std::initializer_list<size_t>& rhs);
Shape(const std::vector<size_t>& rhs);
Shape(const std::initializer_list<size_t>& rhs);

size_t& operator[](size_t idx);
size_t operator[](size_t idx) const;

MGE_WIN_DECLSPEC_FUC void ndim(size_t dim);
MGE_WIN_DECLSPEC_FUC size_t ndim(void) const;
void ndim(size_t dim);
size_t ndim(void) const;

friend class Tensor;
friend bool operator==(const Shape& lhs, const Shape& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(const Shape& lhs, const Shape& rhs);
CUSTOM_DATA_ADAPTOR_FRIEND_DECL;
};

@@ -104,9 +104,9 @@ using bfloat16_t = uint16_t;

#define CUSTOM_DTYPE_ENUM_DECL(custom_type, builtin_type, ctype) custom_type,

class DType {
MGE_WIN_DECLSPEC_FUC const void* impl() const;
MGE_WIN_DECLSPEC_FUC DType(const void* impl);
class MGE_WIN_DECLSPEC_FUC DType {
const void* impl() const;
DType(const void* impl);
CUSTOM_PIMPL_CLS_DECL(DType);

public:
@@ -114,27 +114,33 @@ public:
CUSTOM_FOR_EACH_TENSOR_DATA_TYPE(CUSTOM_DTYPE_ENUM_DECL)
};

MGE_WIN_DECLSPEC_FUC DType(const std::string& dtype);
MGE_WIN_DECLSPEC_FUC DType(const char* dtype);
MGE_WIN_DECLSPEC_FUC DType(
const std::string& dtype, float scale, uint8_t zero_point = 0);
MGE_WIN_DECLSPEC_FUC DType(const char* dtype, float scale, uint8_t zero_point = 0);
MGE_WIN_DECLSPEC_FUC DType(DTypeEnum dtype);
MGE_WIN_DECLSPEC_FUC DType(DTypeEnum dtype, float scale, uint8_t zero_point = 0);

MGE_WIN_DECLSPEC_FUC std::string str(void) const;
MGE_WIN_DECLSPEC_FUC DTypeEnum enumv() const;
MGE_WIN_DECLSPEC_FUC float scale(void) const;
MGE_WIN_DECLSPEC_FUC uint8_t zero_point(void) const;
DType(const std::string& dtype);
DType(const char* dtype);
DType(const std::string& dtype, float scale, uint8_t zero_point = 0);
DType(const char* dtype, float scale, uint8_t zero_point = 0);
DType(DTypeEnum dtype);
DType(DTypeEnum dtype, float scale, uint8_t zero_point = 0);

std::string str(void) const;
DTypeEnum enumv() const;
float scale(void) const;
uint8_t zero_point(void) const;
template <typename T>
MGE_WIN_DECLSPEC_FUC bool is_compatible(void) const;
bool is_compatible(void) const;

MGE_WIN_DECLSPEC_FUC static bool is_legal(const std::string& dtype);
MGE_WIN_DECLSPEC_FUC static bool is_legal(const DTypeEnum& dtype);
MGE_WIN_DECLSPEC_FUC static std::vector<std::string> legal_dtypes(void);
static bool is_legal(const std::string& dtype);
static bool is_legal(const DTypeEnum& dtype);
static std::vector<std::string> legal_dtypes(void);

friend class Tensor;
friend bool operator==(const DType& lhs, const DType& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(const DType& lhs, const DType& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(
const DType& lhs, const std::string& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(const DType& lhs, const char* rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(
const std::string& lhs, const DType& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(const char* lhs, const DType& rhs);

CUSTOM_DATA_ADAPTOR_FRIEND_DECL;
};

@@ -180,45 +186,45 @@ bool operator==(const DType& lhs, const char* rhs);
bool operator==(const std::string& lhs, const DType& rhs);
bool operator==(const char* lhs, const DType& rhs);

class Format {
MGE_WIN_DECLSPEC_FUC const void* impl() const;
MGE_WIN_DECLSPEC_FUC Format(const void* impl);
class MGE_WIN_DECLSPEC_FUC Format {
const void* impl() const;
Format(const void* impl);
CUSTOM_PIMPL_CLS_DECL(Format);

public:
MGE_WIN_DECLSPEC_FUC Format(const std::string& format);
MGE_WIN_DECLSPEC_FUC Format(const char* format);
Format(const std::string& format);
Format(const char* format);

MGE_WIN_DECLSPEC_FUC std::string str(void) const;
MGE_WIN_DECLSPEC_FUC bool is_default(void) const;
std::string str(void) const;
bool is_default(void) const;

friend class Tensor;
CUSTOM_DATA_ADAPTOR_FRIEND_DECL;
};

class Tensor {
class MGE_WIN_DECLSPEC_FUC Tensor {
void* m_tensor;

MGE_WIN_DECLSPEC_FUC const void* impl(void) const;
MGE_WIN_DECLSPEC_FUC Tensor(const void* impl);
const void* impl(void) const;
Tensor(const void* impl);

MGE_WIN_DECLSPEC_FUC const size_t* shapes_raw(void) const;
MGE_WIN_DECLSPEC_FUC const ptrdiff_t* strides_raw(void) const;
const size_t* shapes_raw(void) const;
const ptrdiff_t* strides_raw(void) const;

public:
Tensor() = delete;
MGE_WIN_DECLSPEC_FUC Tensor(const Tensor& rhs);
MGE_WIN_DECLSPEC_FUC Tensor& operator=(const Tensor& rhs);
MGE_WIN_DECLSPEC_FUC Shape shape(void) const;
MGE_WIN_DECLSPEC_FUC DType dtype(void) const;
MGE_WIN_DECLSPEC_FUC Format format(void) const;
MGE_WIN_DECLSPEC_FUC Device device(void) const;
MGE_WIN_DECLSPEC_FUC size_t size(void) const;
MGE_WIN_DECLSPEC_FUC std::vector<ptrdiff_t> stride(void) const;
MGE_WIN_DECLSPEC_FUC float scale(void) const;
MGE_WIN_DECLSPEC_FUC uint8_t zero_point(void) const;
Tensor(const Tensor& rhs);
Tensor& operator=(const Tensor& rhs);
Shape shape(void) const;
DType dtype(void) const;
Format format(void) const;
Device device(void) const;
size_t size(void) const;
std::vector<ptrdiff_t> stride(void) const;
float scale(void) const;
uint8_t zero_point(void) const;

void* data(void);
const void* data(void) const;
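
Putting the Tensor accessors together: a sketch of a CPU kernel body over contiguous float32 data. It assumes size() counts elements and that data() on a "default"-format tensor may be cast to the element type.

#include "megbrain/custom/tensor.h"
#include "megbrain/custom/utils.h"

void relu_kernel(const custom::Tensor& in, custom::Tensor& out) {
    custom_assert(
            in.dtype() == "float32", "unsupported dtype: %s",
            in.dtype().str().c_str());

    const float* src = static_cast<const float*>(in.data());
    float* dst = static_cast<float*>(out.data());
    for (size_t i = 0; i < in.size(); ++i)  // size(): element count (assumed)
        dst[i] = src[i] > 0.f ? src[i] : 0.f;
}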


+ 28
- 5
src/custom/include/megbrain/custom/utils.h View File

@@ -19,10 +19,19 @@

namespace custom {

void assert_failed_log(
#ifndef MGE_WIN_DECLSPEC_FUC
#ifdef _WIN32
#define MGE_WIN_DECLSPEC_FUC __declspec(dllexport)
#else
#define MGE_WIN_DECLSPEC_FUC
#endif
#endif

MGE_WIN_DECLSPEC_FUC void assert_failed_log(
const char* file, int line, const char* func, const char* expr,
const char* msg_fmt, ...);

#ifndef _WIN32
#define custom_expect(expr, msg...) \
if (!(expr)) { \
assert_failed_log(__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, ##msg); \
@@ -33,8 +42,22 @@ void assert_failed_log(
assert_failed_log(__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, ##msg); \
} \
assert((expr))
#else
#define custom_expect(expr, ...) \
if (!(expr)) { \
assert_failed_log( \
__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, __VA_ARGS__); \
}

#define custom_assert(expr, ...) \
if (!(expr)) { \
assert_failed_log( \
__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, __VA_ARGS__); \
} \
assert((expr))
#endif

class UnImpleWarnLog {
class MGE_WIN_DECLSPEC_FUC UnImpleWarnLog {
public:
UnImpleWarnLog(
const std::string& func, const std::string& attr, const std::string& val);
@@ -54,9 +77,9 @@ void impl_deleter(void* ptr) {
std::unique_ptr<void, void_deleter> m_impl; \
\
public: \
MGE_WIN_DECLSPEC_FUC Cls(); \
MGE_WIN_DECLSPEC_FUC Cls(const Cls& rhs); \
MGE_WIN_DECLSPEC_FUC Cls& operator=(const Cls& rhs)
Cls(); \
Cls(const Cls& rhs); \
Cls& operator=(const Cls& rhs)

#define CUSTOM_PIMPL_CLS_DEFINE(Cls) \
Cls::Cls() : m_impl(new Cls##Impl(), impl_deleter<Cls##Impl>) {} \
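
Call sites are unaffected by the new _WIN32 branch: the GNU msg... form and the standard __VA_ARGS__ form expand the same way, with one caveat — without ## pasting, the Windows variants need at least one variadic argument. A usage sketch:

#include <vector>
#include "megbrain/custom/tensor.h"
#include "megbrain/custom/utils.h"

void check_inputs(const std::vector<custom::Tensor>& inputs) {
    custom_expect(
            !inputs.empty(), "expected at least one input, got %zu",
            inputs.size());
    custom_assert(
            inputs[0].shape().ndim() == 2, "expect a 2-D input, got %zu dims",
            inputs[0].shape().ndim());
}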

