
feat(imperative,src): add jit builder for custom op

GitOrigin-RevId: 3bb0b46311
tags/v1.8.0
Megvii Engine Team, 3 years ago
parent commit: a72e0cb568
21 changed files with 1611 additions and 183 deletions

  1. CMakeLists.txt (+8 -0)
  2. imperative/CMakeLists.txt (+3 -0)
  3. imperative/python/megengine/core/ops/custom.py (+12 -1)
  4. imperative/python/megengine/utils/custom_op_tools.py (+909 -0)
  5. imperative/python/src/ops.cpp (+7 -0)
  6. imperative/python/test/unit/core/custom_opsrc/elem_add.cpp (+140 -0)
  7. imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp (+65 -0)
  8. imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu (+97 -0)
  9. imperative/python/test/unit/core/custom_opsrc/matmul_scale.h (+24 -0)
  10. imperative/python/test/unit/core/test_custom_op.py (+111 -0)
  11. scripts/whl/macos/macos_build_whl.sh (+1 -0)
  12. scripts/whl/manylinux2014/do_build_common.sh (+1 -0)
  13. scripts/whl/windows/windows_build_whl.sh (+4 -1)
  14. src/CMakeLists.txt (+13 -10)
  15. src/custom/impl/manager.cpp (+19 -7)
  16. src/custom/include/megbrain/custom/custom.h (+2 -1)
  17. src/custom/include/megbrain/custom/op.h (+47 -53)
  18. src/custom/include/megbrain/custom/param.h (+10 -10)
  19. src/custom/include/megbrain/custom/param_val.h (+40 -31)
  20. src/custom/include/megbrain/custom/tensor.h (+70 -64)
  21. src/custom/include/megbrain/custom/utils.h (+28 -5)

CMakeLists.txt (+8 -0)

@@ -1145,6 +1145,14 @@ if(TARGET _imperative_rt)
COMMAND ${CMAKE_COMMAND} -E create_symlink
${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py
COMMAND ${CMAKE_COMMAND} -E create_symlink
${CMAKE_CURRENT_SOURCE_DIR}/src/custom/include
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/include
COMMAND ${CMAKE_COMMAND} -E make_directory
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/lib
COMMAND ${CMAKE_COMMAND} -E create_symlink
${CMAKE_CURRENT_BINARY_DIR}/src/$<TARGET_FILE_NAME:${MGE_SHARED_LIB}>
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/lib/$<TARGET_FILE_NAME:${MGE_SHARED_LIB}>
DEPENDS _imperative_rt
VERBATIM
)


imperative/CMakeLists.txt (+3 -0)

@@ -67,8 +67,11 @@ add_custom_command(
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/LICENSE ${PROJECT_SOURCE_DIR}/ACKNOWLEDGMENTS ${PROJECT_BINARY_DIR}
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/$<TARGET_FILE_NAME:${MODULE_NAME}> # clean develop
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/version.py # clean develop
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/include # clean develop
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/lib # clean develop
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine ${CMAKE_CURRENT_BINARY_DIR}/python/megengine
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/test ${CMAKE_CURRENT_BINARY_DIR}/python/test
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/src/custom/include ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/core/include
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py ${CMAKE_CURRENT_BINARY_DIR}/python/setup.py
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires.txt
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt


imperative/python/megengine/core/ops/custom.py (+12 -1)

@@ -7,11 +7,14 @@
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

import os

from .._imperative_rt.ops._custom import (
_get_custom_op_list,
_install,
_make_custom_op,
_uninstall,
get_custom_op_abi_tag,
)

__all__ = ["load"]
@@ -25,8 +28,16 @@ def _gen_custom_op_maker(custom_op_name):


def load(lib_path):
op_in_this_lib = _install(lib_path[0:-3], lib_path)
lib_path = os.path.abspath(lib_path)
lib_name = os.path.splitext(lib_path)[0]
op_in_this_lib = _install(lib_name, lib_path)
for op in op_in_this_lib:
op_maker = _gen_custom_op_maker(op)
globals()[op] = op_maker
__all__.append(op)


def unload(lib_path):
lib_path = os.path.abspath(lib_path)
lib_name = os.path.splitext(lib_path)[0]
_uninstall(lib_name)
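
A minimal usage sketch of the new load/unload pair (the library path below is hypothetical):

from megengine.core.ops import custom

# load() registers every op found in the library and exposes each op
# as an attribute of the `custom` module
custom.load("./libmy_op.so")  # hypothetical path
print(custom._get_custom_op_list())

# unload() accepts the same path; both calls resolve it to an absolute
# path and strip the extension to form the library key
custom.unload("./libmy_op.so")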

imperative/python/megengine/utils/custom_op_tools.py (+909 -0)

@@ -0,0 +1,909 @@
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

import collections
import ctypes
import glob
import os
import re
import subprocess
import sys
import time
from typing import List, Optional, Union

from ..core.ops.custom import load
from ..logger import get_logger


def _get_win_folder_with_ctypes(csidl_name):
csidl_const = {
"CSIDL_APPDATA": 26,
"CSIDL_COMMON_APPDATA": 35,
"CSIDL_LOCAL_APPDATA": 28,
}[csidl_name]

buf = ctypes.create_unicode_buffer(1024)
ctypes.windll.shell32.SHGetFolderPathW(None, csidl_const, None, 0, buf)

# Downgrade to a short path name if it has high-bit chars. See
# <http://bugs.activestate.com/show_bug.cgi?id=85099>.
has_high_char = False
for c in buf:
if ord(c) > 255:
has_high_char = True
break
if has_high_char:
buf2 = ctypes.create_unicode_buffer(1024)
if ctypes.windll.kernel32.GetShortPathNameW(buf.value, buf2, 1024):
buf = buf2

return buf.value


system = sys.platform
if system == "win32":
_get_win_folder = _get_win_folder_with_ctypes

PLAT_TO_VCVARS = {
"win-amd64": "x86_amd64",
}

logger = get_logger()

# environment variable
ev_custom_op_root_dir = "MGE_CUSTOM_OP_DIR"
ev_cuda_root_dir = "CUDA_ROOT_DIR"
ev_cudnn_root_dir = "CUDNN_ROOT_DIR"

# operating system
IS_WINDOWS = system == "win32"
IS_LINUX = system == "linux"
IS_MACOS = system == "darwin"

MGE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MGE_INC_PATH = os.path.join(MGE_PATH, "core", "include")
MGE_LIB_PATH = os.path.join(MGE_PATH, "core", "lib")
MGE_ABI_VER = 0


# compile version
MINIMUM_GCC_VERSION = (5, 0, 0)
MINIMUM_CLANG_CL_VERSION = (12, 0, 1)

# compile flags
COMMON_MSVC_FLAGS = [
"/MD",
"/wd4002",
"/wd4819",
"/EHsc",
]

MSVC_IGNORE_CUDAFE_WARNINGS = [
"field_without_dll_interface",
]

COMMON_NVCC_FLAGS = []

# Finds the CUDA install path
def _find_cuda_root_dir() -> Optional[str]:
cuda_root_dir = os.environ.get(ev_cuda_root_dir)
if cuda_root_dir is None:
try:
which = "where" if IS_WINDOWS else "which"
with open(os.devnull, "w") as devnull:
nvcc = (
subprocess.check_output([which, "nvcc"], stderr=devnull)
.decode()
.rstrip("\r\n")
)
cuda_root_dir = os.path.dirname(os.path.dirname(nvcc))
except Exception:
if IS_WINDOWS:
cuda_root_dir = os.environ.get("CUDA_PATH", None)
if cuda_root_dir is None:
cuda_root_dirs = glob.glob(
"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*"
)
if len(cuda_root_dirs) == 0:
cuda_root_dir = ""
else:
cuda_root_dir = cuda_root_dirs[0]
else:
cuda_root_dir = "/usr/local/cuda"
if not os.path.exists(cuda_root_dir):
cuda_root_dir = None
return cuda_root_dir


def _find_cudnn_root_dir() -> Optional[str]:
cudnn_root_dir = os.environ.get(ev_cudnn_root_dir)
return cudnn_root_dir


CUDA_ROOT_DIR = _find_cuda_root_dir()
CUDNN_ROOT_DIR = _find_cudnn_root_dir()

#####################################################################
# Phase 1
#####################################################################


def _is_cuda_file(path: str) -> bool:
valid_ext = [".cu", ".cuh"]
return os.path.splitext(path)[1] in valid_ext


# Return full path to the user-specific cache dir for this application.
# Typical user cache directories are:
# Mac OS X: ~/Library/Caches/<AppName>
# Unix: ~/.cache/<AppName> (XDG default)
# Windows: C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>\Cache
def _get_user_cache_dir(appname=None, appauthor=None, version=None, opinion=True):
if system == "win32":
appauthor = appname if appauthor is None else appauthor
path = os.path.normpath(_get_win_folder("CSIDL_LOCAL_APPDATA"))
if appname:
if appauthor is not False:
path = os.path.join(path, appauthor)
else:
path = os.path.join(path, appname)
if opinion:
path = os.path.join(path, "Cache")
elif system == "darwin":
path = os.path.expanduser("~/Library/Caches")
if appname:
path = os.path.join(path, appname)
else:
path = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
if appname:
path = os.path.join(path, appname)
if appname and version:
path = os.path.join(path, version)
return path
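
# e.g. _get_user_cache_dir(appname="mge_custom_op") resolves to
# (paths illustrative):
#   Linux:   ~/.cache/mge_custom_op  (or under $XDG_CACHE_HOME)
#   macOS:   ~/Library/Caches/mge_custom_op
#   Windows: a "Cache" folder under %LOCALAPPDATA%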


# Returns the path to the root folder under which custom ops will be built.
def _get_default_build_root() -> str:
return os.path.realpath(_get_user_cache_dir(appname="mge_custom_op"))


def _get_build_dir(name: str) -> str:
custom_op_root_dir = os.environ.get(ev_custom_op_root_dir)
if custom_op_root_dir is None:
custom_op_root_dir = _get_default_build_root()

build_dir = os.path.join(custom_op_root_dir, name)
return build_dir


#####################################################################
# Phase 2
#####################################################################


def update_hash(seed, value):
# using boost::hash_combine
# https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html
return seed ^ (hash(value) + 0x9E3779B9 + (seed << 6) + (seed >> 2))
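
# NOTE: Python string hashes are salted per interpreter (PYTHONHASHSEED),
# so the combined hash is stable within one process but may differ on the
# next run; that is fine here because the Versioner below is in-memory
# only. A tiny illustration of how the hash is folded:
#   seed = 0
#   seed = update_hash(seed, "int x;")  # fold in a source string
#   seed = update_hash(seed, True)      # fold in a build option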


def hash_source_files(hash_value, source_files):
for filename in source_files:
with open(filename) as file:
hash_value = update_hash(hash_value, file.read())
return hash_value


def hash_build_args(hash_value, build_args):
for group in build_args:
for arg in group:
hash_value = update_hash(hash_value, arg)
return hash_value


Entry = collections.namedtuple("Entry", "version, hash")


class Versioner(object):
def __init__(self):
self.entries = {}

def get_version(self, name):
entry = self.entries.get(name)
return None if entry is None else entry.version

def bump_version_if_changed(
self, name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag
):
hash_value = 0
hash_value = hash_source_files(hash_value, sources)
hash_value = hash_build_args(hash_value, build_args)
hash_value = update_hash(hash_value, build_dir)
hash_value = update_hash(hash_value, with_cuda)
hash_value = update_hash(hash_value, with_cudnn)
hash_value = update_hash(hash_value, abi_tag)

entry = self.entries.get(name)
if entry is None:
self.entries[name] = entry = Entry(0, hash_value)
elif hash_value != entry.hash:
self.entries[name] = entry = Entry(entry.version + 1, hash_value)

return entry.version


custom_op_versioner = Versioner()


def version_check(
name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag,
):
old_version = custom_op_versioner.get_version(name)
version = custom_op_versioner.bump_version_if_changed(
name, sources, build_args, build_dir, with_cuda, with_cudnn, abi_tag,
)
return version, old_version
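
# The versioning contract, illustrated (a sketch; "demo", src, and the
# empty flag groups are made up):
#   args = [[], [], [], []]  # cflags, cuda cflags, ldflags, include paths
#   version_check("demo", [src], args, "/tmp/demo", False, False, 0)
#   -> (0, None)   first sighting: starts at version 0
#   version_check("demo", [src], args, "/tmp/demo", False, False, 0)
#   -> (0, 0)      inputs unchanged: no bump
#   ... after editing the file at src ...
#   version_check("demo", [src], args, "/tmp/demo", False, False, 0)
#   -> (1, 0)      source changed: version bumped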


#####################################################################
# Phase 3
#####################################################################


def _check_ninja_availability():
try:
subprocess.check_output("ninja --version".split())
except Exception:
raise RuntimeError(
"Ninja is required to build custom op, please install ninja and update your PATH"
)


def _mge_is_built_from_src():
file_path = os.path.abspath(__file__)
if "site-packages" in file_path:
return False
else:
return True


def _accepted_compilers_for_platform():
if IS_WINDOWS:
return ["clang-cl"]
if IS_MACOS:
return ["clang++", "clang"]
if IS_LINUX:
return ["g++", "gcc", "gnu-c++", "gnu-cc"]


# Verifies that the compiler is the expected one for the current platform.
def _check_compiler_existed_for_platform(compiler: str) -> bool:
# there is no suitable cmd like `which` on Windows, so we just try to invoke clang-cl and treat success as existence
if IS_WINDOWS:
try:
version_string = subprocess.check_output(
["clang-cl", "--version"], stderr=subprocess.STDOUT
).decode()
return True
except Exception:
return False

# use os.path.realpath to resolve any symlinks, in particular from "c++" to e.g. "g++".
which = subprocess.check_output(["which", compiler], stderr=subprocess.STDOUT)
compiler_path = os.path.realpath(which.decode().strip())
if any(name in compiler_path for name in _accepted_compilers_for_platform()):
return True

version_string = subprocess.check_output(
[compiler, "-v"], stderr=subprocess.STDOUT
).decode()
if sys.platform.startswith("linux"):
pattern = re.compile("^COLLECT_GCC=(.*)$", re.MULTILINE)
results = re.findall(pattern, version_string)
if len(results) != 1:
return False
compiler_path = os.path.realpath(results[0].strip())
return any(name in compiler_path for name in _accepted_compilers_for_platform())

if sys.platform.startswith("darwin"):
return version_string.startswith("Apple clang")

return False


# Verifies that the given compiler is ABI-compatible with MegEngine.
def _check_compiler_abi_compatibility(compiler: str):
# we assume that if MegEngine was built from source, the user will use the same compiler to compile the custom op
if _mge_is_built_from_src() or os.environ.get("MGE_CHECK_ABI", "1") == "0":
return True

# [TODO] There is no particular minimum version we need for clang, so we're good here.
if sys.platform.startswith("darwin"):
return True

try:
if sys.platform.startswith("linux"):
minimum_required_version = MINIMUM_GCC_VERSION
versionstr = subprocess.check_output(
[compiler, "-dumpfullversion", "-dumpversion"]
)
version = versionstr.decode().strip().split(".")
else:
minimum_required_version = MINIMUM_CLANG_CL_VERSION
compiler_info = subprocess.check_output(
[compiler, "--version"], stderr=subprocess.STDOUT
)
match = re.search(r"(\d+)\.(\d+)\.(\d+)", compiler_info.decode().strip())
version = (0, 0, 0) if match is None else match.groups()
except Exception:
_, error, _ = sys.exc_info()
logger.warning(
"Error checking compiler version for {}: {}".format(compiler, error)
)
return False

if tuple(map(int, version)) >= minimum_required_version:
return True

return False


def _check_compiler_compatibility():
# we use clang-cl on windows, refer: https://clang.llvm.org/docs/UsersManual.html#clang-cl
compiler = (
os.environ.get("CXX", "clang-cl")
if IS_WINDOWS
else os.environ.get("CXX", "c++")
)

existed = _check_compiler_existed_for_platform(compiler)
if not existed:
log_str = (
"Cannot find compiler which is compatible with the compiler "
"MegEngine was built with for this platform, which is {mge_compiler} on "
"{platform}. Please use {mge_compiler} to to compile your extension. "
"Alternatively, you may compile MegEngine from source using "
"{user_compiler}, and then you can also use {user_compiler} to compile "
"your extension."
).format(
user_compiler=compiler,
mge_compiler=_accepted_compilers_for_platform()[0],
platform=sys.platform,
)

logger.warning(log_str)
return False

compatible = _check_compiler_abi_compatibility(compiler)
if not compatible:
log_str = (
"Your compiler version may be ABI-incompatible with MegEngine! "
"Please use a compiler that is ABI-compatible with GCC 5.0 on Linux "
"and LLVM/Clang 12.0 on Windows ."
)
logger.warning(log_str)
return True


#####################################################################
# Phase 4
#####################################################################


# Quote command-line arguments for DOS/Windows conventions.
def _nt_quote_args(args: Optional[List[str]]) -> List[str]:
# Cover None-type
if not args:
return []
return ['"{}"'.format(arg) if " " in arg else arg for arg in args]


# For now, users need to specify the GPU arch flags themselves (e.g. via extra_cuda_cflags)
def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
return []


def _setup_sys_includes(with_cuda: bool, with_cudnn: bool):
includes = [os.path.join(MGE_INC_PATH)]
if with_cuda:
includes.append(os.path.join(CUDA_ROOT_DIR, "include"))
if with_cudnn:
includes.append(os.path.join(CUDNN_ROOT_DIR, "include"))
return includes


def _setup_includes(extra_include_paths: List[str], with_cuda: bool, with_cudnn: bool):
user_includes = [os.path.abspath(path) for path in extra_include_paths]
system_includes = _setup_sys_includes(with_cuda, with_cudnn)
if IS_WINDOWS:
user_includes += system_includes
system_includes.clear()
return user_includes, system_includes


def _setup_common_cflags(user_includes: List[str], system_includes: List[str]):
common_cflags = []
common_cflags += ["-I{}".format(include) for include in user_includes]
common_cflags += ["-isystem {}".format(include) for include in system_includes]
if not IS_WINDOWS:
common_cflags += ["-D_GLIBCXX_USE_CXX11_ABI={}".format(MGE_ABI_VER)]
return common_cflags


def _setup_cuda_cflags(cflags: List[str], extra_cuda_cflags: List[str]):
cuda_flags = cflags + COMMON_NVCC_FLAGS + _get_cuda_arch_flags()
if IS_WINDOWS:
for flag in COMMON_MSVC_FLAGS:
cuda_flags = ["-Xcompiler", flag] + cuda_flags
for ignore_warning in MSVC_IGNORE_CUDAFE_WARNINGS:
cuda_flags = ["-Xcudafe", "--diag_suppress=" + ignore_warning] + cuda_flags
cuda_flags = _nt_quote_args(cuda_flags)
cuda_flags += _nt_quote_args(extra_cuda_cflags)
else:
cuda_flags += ["--compiler-options", '"-fPIC"']
cuda_flags += extra_cuda_cflags
if not any(flag.startswith("-std=") for flag in cuda_flags):
cuda_flags.append("-std=c++14")
if os.getenv("CC") is not None:
cuda_flags = ["-ccbin", os.getenv("CC")] + cuda_flags
return cuda_flags


def _setup_ldflags(
extra_ldflags: List[str], with_cuda: bool, with_cudnn: bool
) -> List[str]:
ldflags = list(extra_ldflags)  # copy, so the caller's (default) list is not mutated
if IS_WINDOWS:
ldflags.append(os.path.join(MGE_LIB_PATH, "megengine_shared.lib"))
if with_cuda:
ldflags.append(os.path.join(CUDA_ROOT_DIR, "lib", "x64", "cudart.lib"))
if with_cudnn:
ldflags.append(os.path.join(CUDNN_ROOT_DIR, "lib", "x64", "cudnn.lib"))

else:
ldflags.append("-lmegengine_shared -L{}".format(MGE_LIB_PATH))
ldflags.append("-Wl,-rpath,{}".format(MGE_LIB_PATH))
if with_cuda:
ldflags.append("-lcudart")
ldflags.append("-L{}".format(os.path.join(CUDA_ROOT_DIR, "lib64")))
ldflags.append("-Wl,-rpath,{}".format(os.path.join(CUDA_ROOT_DIR, "lib64")))
if with_cudnn:
ldflags.append("-L{}".format(os.path.join(CUDNN_ROOT_DIR, "lib64")))
ldflags.append(
"-Wl,-rpath,{}".format(os.path.join(CUDNN_ROOT_DIR, "lib64"))
)

return ldflags


def _add_shared_flag(ldflags: List[str]):
ldflags += ["/LD" if IS_WINDOWS else "-shared"]
return ldflags


#####################################################################
# Phase 5
#####################################################################


def _obj_file_path(src_file_path: str):
file_name = os.path.splitext(os.path.basename(src_file_path))[0]
if _is_cuda_file(src_file_path):
target = "{}.cuda.o".format(file_name)
else:
target = "{}.o".format(file_name)
return target


def _dump_ninja_file(
path,
cflags,
post_cflags,
cuda_cflags,
cuda_post_cflags,
sources,
objects,
ldflags,
library_target,
with_cuda,
):
def sanitize_flags(flags):
return [] if flags is None else [flag.strip() for flag in flags]

cflags = sanitize_flags(cflags)
post_cflags = sanitize_flags(post_cflags)
cuda_cflags = sanitize_flags(cuda_cflags)
cuda_post_cflags = sanitize_flags(cuda_post_cflags)
ldflags = sanitize_flags(ldflags)

assert len(sources) == len(objects)
assert len(sources) > 0

if IS_WINDOWS:
compiler = os.environ.get("CXX", "clang-cl")
else:
compiler = os.environ.get("CXX", "c++")

# Version 1.3 is required for the `deps` directive.
config = ["ninja_required_version = 1.3"]
config.append("cxx = {}".format(compiler))
if with_cuda:
nvcc = os.path.join(CUDA_ROOT_DIR, "bin", "nvcc")
config.append("nvcc = {}".format(nvcc))

flags = ["cflags = {}".format(" ".join(cflags))]
flags.append("post_cflags = {}".format(" ".join(post_cflags)))
if with_cuda:
flags.append("cuda_cflags = {}".format(" ".join(cuda_cflags)))
flags.append("cuda_post_cflags = {}".format(" ".join(cuda_post_cflags)))
flags.append("ldflags = {}".format(" ".join(ldflags)))

# Turn into absolute paths so we can emit them into the ninja build
# file wherever it is.
sources = [os.path.abspath(file) for file in sources]

# See https://ninja-build.org/build.ninja.html for reference.
compile_rule = ["rule compile"]
if IS_WINDOWS:
compile_rule.append(
" command = clang-cl /showIncludes $cflags -c $in /Fo$out $post_cflags"
)
compile_rule.append(" deps = msvc")
else:
compile_rule.append(
" command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags"
)
compile_rule.append(" depfile = $out.d")
compile_rule.append(" deps = gcc")

if with_cuda:
cuda_compile_rule = ["rule cuda_compile"]
nvcc_gendeps = ""
cuda_compile_rule.append(
" command = $nvcc {} $cuda_cflags -c $in -o $out $cuda_post_cflags".format(
nvcc_gendeps
)
)

# Emit one build rule per source to enable incremental build.
build = []
for source_file, object_file in zip(sources, objects):
is_cuda_source = _is_cuda_file(source_file) and with_cuda
rule = "cuda_compile" if is_cuda_source else "compile"
if IS_WINDOWS:
source_file = source_file.replace(":", "$:")
object_file = object_file.replace(":", "$:")
source_file = source_file.replace(" ", "$ ")
object_file = object_file.replace(" ", "$ ")
build.append("build {}: {} {}".format(object_file, rule, source_file))

if library_target is not None:
link_rule = ["rule link"]
if IS_WINDOWS:
link_rule.append(" command = clang-cl $in /nologo $ldflags /out:$out")
else:
link_rule.append(" command = $cxx $in $ldflags -o $out")

link = ["build {}: link {}".format(library_target, " ".join(objects))]
default = ["default {}".format(library_target)]
else:
link_rule, link, default = [], [], []

# 'Blocks' should be separated by newlines, for visual benefit.
blocks = [config, flags, compile_rule]
if with_cuda:
blocks.append(cuda_compile_rule)
blocks += [link_rule, build, link, default]
with open(path, "w") as build_file:
for block in blocks:
lines = "\n".join(block)
build_file.write("{}\n\n".format(lines))


class FileBaton:
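    # A tiny file-based lock: the process that creates the lock file
    # exclusively (O_CREAT | O_EXCL) runs the build; everyone else spins
    # in wait() until the file is removed by release().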
def __init__(self, lock_file_path, wait_seconds=0.1):
self.lock_file_path = lock_file_path
self.wait_seconds = wait_seconds
self.fd = None

def try_acquire(self):
try:
self.fd = os.open(self.lock_file_path, os.O_CREAT | os.O_EXCL)
return True
except FileExistsError:
return False

def wait(self):
while os.path.exists(self.lock_file_path):
time.sleep(self.wait_seconds)

def release(self):
if self.fd is not None:
os.close(self.fd)

os.remove(self.lock_file_path)


#####################################################################
# Phase 6
#####################################################################


def _build_with_ninja(build_dir: str, verbose: bool, error_prefix: str):
command = ["ninja", "-v"]
env = os.environ.copy()
try:
sys.stdout.flush()
sys.stderr.flush()
stdout_fileno = 1
subprocess.run(
command,
stdout=stdout_fileno if verbose else subprocess.PIPE,
stderr=subprocess.STDOUT,
cwd=build_dir,
check=True,
env=env,
)
except subprocess.CalledProcessError as e:
with open(os.path.join(build_dir, "build.ninja")) as f:
lines = f.readlines()
print(lines)
_, error, _ = sys.exc_info()
message = error_prefix
if hasattr(error, "output") and error.output:
message += ": {}".format(error.output.decode())
raise RuntimeError(message) from e


def build(
name: str,
sources: Union[str, List[str]],
extra_cflags: Union[str, List[str]] = [],
extra_cuda_cflags: Union[str, List[str]] = [],
extra_ldflags: Union[str, List[str]] = [],
extra_include_paths: Union[str, List[str]] = [],
with_cuda: Optional[bool] = None,
build_dir: Optional[str] = None,
verbose: bool = False,
abi_tag: Optional[int] = None,
) -> str:
r"""Build a Custom Op with ninja in the way of just-in-time (JIT).

To build the custom op, a Ninja build file is emitted, which is used to
compile the given sources into a dynamic library.

By default, the directory to which the build file is emitted and the
resulting library compiled to is ``<tmp>/mge_custom_op/<name>``, where
``<tmp>`` is the temporary folder on the current platform and ``<name>``
the name of the custom op. This location can be overridden in two ways.
First, if the ``MGE_CUSTOM_OP_DIR`` environment variable is set, it
replaces ``<tmp>/mge_custom_op`` and all custom ops will be compiled
into subfolders of this directory. Second, if the ``build_dir``
argument to this function is supplied, it overrides the entire path, i.e.
the library will be compiled into that folder directly.

To compile the sources, the default system compiler (``c++``) is used,
which can be overridden by setting the ``CXX`` environment variable. To pass
additional arguments to the compilation process, ``extra_cflags`` or
``extra_ldflags`` can be provided. For example, to compile your custom op
with optimizations, pass ``extra_cflags=['-O3']``. You can also use
``extra_cflags`` to pass further include directories.

CUDA support with mixed compilation is provided. Simply pass CUDA source
files (``.cu`` or ``.cuh``) along with other sources. Such files will be
detected and compiled with nvcc rather than the C++ compiler. This includes
passing the CUDA lib64 directory as a library directory, and linking
``cudart``. You can pass additional flags to nvcc via
``extra_cuda_cflags``, just like with ``extra_cflags`` for C++. Various
heuristics for finding the CUDA install directory are used, which usually
work fine. If not, setting the ``CUDA_ROOT_DIR`` environment variable is the
safest option. If you use cuDNN, please also set the ``CUDNN_ROOT_DIR``
environment variable.

Args:
name: The name of the custom op to build.
sources: A list of relative or absolute paths to C++ source files.
extra_cflags: optional list of compiler flags to forward to the build.
extra_cuda_cflags: optional list of compiler flags to forward to nvcc
when building CUDA sources.
extra_ldflags: optional list of linker flags to forward to the build.
extra_include_paths: optional list of include directories to forward
to the build.
with_cuda: Determines whether CUDA headers and libraries are added to
the build. If set to ``None`` (default), this value is
automatically determined based on the existence of ``.cu`` or
``.cuh`` in ``sources``. Set it to ``True`` to force CUDA headers
and libraries to be included.
build_dir: optional path to use as build workspace.
verbose: If ``True``, turns on verbose logging of load steps.
abi_tag: Determines the value of the macro ``_GLIBCXX_USE_CXX11_ABI``
used by gcc; should be ``0`` or ``1``.

Returns:
the compiled dynamic library path

"""

# phase 1: prepare config
if abi_tag is not None:
global MGE_ABI_VER
MGE_ABI_VER = abi_tag

def strlist(args, name):
assert isinstance(args, str) or isinstance(
args, list
), "{} must be str or list[str]".format(name)
if isinstance(args, str):
return [args]
for arg in args:
assert isinstance(arg, str)
args = [arg.strip() for arg in args]
return args

sources = strlist(sources, "sources")
extra_cflags = strlist(extra_cflags, "extra_cflags")
extra_cuda_cflags = strlist(extra_cuda_cflags, "extra_cuda_cflags")
extra_ldflags = strlist(extra_ldflags, "extra_ldflags")
extra_include_paths = strlist(extra_include_paths, "extra_include_paths")

with_cuda = any(map(_is_cuda_file, sources)) if with_cuda is None else with_cuda
with_cudnn = any(["cudnn" in f for f in extra_ldflags])

if CUDA_ROOT_DIR is None and with_cuda:
print(
"No CUDA runtime found, please set {}=/path/to/your/cuda_root_dir".format(
ev_cuda_root_dir
)
)
if CUDNN_ROOT_DIR is None and with_cudnn:
print(
"Cannot find the cuDNN root directory, please set {}=/path/to/your/cudnn_root_dir".format(
ev_cudnn_root_dir
)
)

build_dir = os.path.abspath(
_get_build_dir(name) if build_dir is None else build_dir
)
if not os.path.exists(build_dir):
os.makedirs(build_dir, exist_ok=True)

if verbose:
print("Using {} to build megengine custom op".format(build_dir))

# phase 2: version check
version, old_version = version_check(
name,
sources,
[extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths],
build_dir,
with_cuda,
with_cudnn,
abi_tag,
)
if verbose:
if version != old_version and old_version is not None:
print(
"Input conditions of custom op {} have changed, bumping to version {}".format(
name, version
)
)
print("Building custom op {} with version {}".format(name, version))
if version == old_version:
if verbose:
print(
"No modifications detected for {}, skipping build step...".format(name)
)
# still return the path of the already-built library (instead of
# None), so that build_and_load() can load it
name = "{}_v{}".format(name, version)
name += ".dll" if IS_WINDOWS else ".so"
return os.path.join(build_dir, name)
name = "{}_v{}".format(name, version)

# phase 3: compiler and ninja check
_check_ninja_availability()
_check_compiler_compatibility()

# phase 4: setup the compile flags
user_includes, system_includes = _setup_includes(
extra_include_paths, with_cuda, with_cudnn
)
common_cflags = _setup_common_cflags(user_includes, system_includes)
cuda_cflags = (
_setup_cuda_cflags(common_cflags, extra_cuda_cflags) if with_cuda else None
)
ldflags = _setup_ldflags(extra_ldflags, with_cuda, with_cudnn)

if IS_WINDOWS:
cflags = common_cflags + COMMON_MSVC_FLAGS + extra_cflags
cflags = _nt_quote_args(cflags)
else:
cflags = common_cflags + ["-fPIC", "-std=c++14"] + extra_cflags

ldflags = _add_shared_flag(ldflags)
if sys.platform.startswith("darwin"):
ldflags.append("-undefined dynamic_lookup")
elif IS_WINDOWS:
ldflags += ["/link"]
ldflags = _nt_quote_args(ldflags)

baton = FileBaton(os.path.join(build_dir, "lock"))
if baton.try_acquire():
try:
# phase 5: generate ninja build file
objs = [_obj_file_path(src) for src in sources]
name += ".dll" if IS_WINDOWS else ".so"

build_file_path = os.path.join(build_dir, "build.ninja")
if verbose:
print("Emitting ninja build file {}".format(build_file_path))
_dump_ninja_file(
path=build_file_path,
cflags=cflags,
post_cflags=None,
cuda_cflags=cuda_cflags,
cuda_post_cflags=None,
sources=sources,
objects=objs,
ldflags=ldflags,
library_target=name,
with_cuda=with_cuda,
)

# phase 6: build with ninja
if verbose:
print(
"Compiling and linking your custom op {}".format(
os.path.join(build_dir, name)
)
)
_build_with_ninja(build_dir, verbose, "compiling error")
finally:
baton.release()
else:
baton.wait()

return os.path.join(build_dir, name)


def build_and_load(
name: str,
sources: Union[str, List[str]],
extra_cflags: Union[str, List[str]] = [],
extra_cuda_cflags: Union[str, List[str]] = [],
extra_ldflags: Union[str, List[str]] = [],
extra_include_paths: Union[str, List[str]] = [],
with_cuda: Optional[bool] = None,
build_dir: Optional[str] = None,
verbose: bool = False,
abi_tag: Optional[int] = None,
) -> str:
r"""Build and Load a Custom Op with ninja in the way of just-in-time (JIT).
Same as the function ``build()`` but load the built dynamic library.

Args:
same as ``build()``

Returns:
the compiled dynamic library path

"""

lib_path = build(
name,
sources,
extra_cflags,
extra_cuda_cflags,
extra_ldflags,
extra_include_paths,
with_cuda,
build_dir,
verbose,
abi_tag,
)
if verbose:
print("Load the compiled custom op {}".format(lib_path))
load(lib_path)
return lib_path
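
An end-to-end sketch of the intended workflow (the op name, source file, and flags are hypothetical):

from megengine.core.ops import custom
from megengine.utils import custom_op_tools

lib_path = custom_op_tools.build_and_load(
    "my_elem_op",                # hypothetical op library name
    sources=["my_elem_op.cpp"],  # hypothetical source file
    extra_cflags=["-O3"],
    verbose=True,
    abi_tag=custom.get_custom_op_abi_tag(),
)
# ops registered by the library are now attributes of `custom`
print(custom._get_custom_op_list())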

imperative/python/src/ops.cpp (+7 -0)

@@ -766,6 +766,13 @@ void init_custom(pybind11::module m) {
m.def("_install", &install_custom);
m.def("_uninstall", &uninstall_custom);
m.def("_get_custom_op_list", &get_custom_op_list);
m.def("get_custom_op_abi_tag", [](void) -> int {
int ret = 0;
#ifdef _GLIBCXX_USE_CXX11_ABI
ret = _GLIBCXX_USE_CXX11_ABI;
#endif
return ret;
});

static PyMethodDef method_def = {
#ifdef METH_FASTCALL


imperative/python/test/unit/core/custom_opsrc/elem_add.cpp (+140 -0)

@@ -0,0 +1,140 @@
/**
* \file imperative/python/test/unit/core/custom_opsrc/elem_add.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "megbrain/custom/custom.h"

CUSTOM_OP_REG_BEGIN(ElemAddSmooth)

void forward_device_infer(
const std::vector<Device>& inputs, const Param& params,
std::vector<Device>& outputs) {
outputs[0] = inputs[0];
}

void forward_shape_infer(
const std::vector<Shape>& inputs, const Param& params,
std::vector<Shape>& outputs) {
outputs[0] = inputs[0];
}

void forward_dtype_infer(
const std::vector<DType>& inputs, const Param& params,
std::vector<DType>& outputs) {
outputs[0] = inputs[0];
}

void forward_format_infer(
const std::vector<Format>& inputs, const Param& params,
std::vector<Format>& outputs) {
outputs[0] = inputs[0];
}

template <typename scalar_t>
void forward_kernel(
const scalar_t* input0, const scalar_t* input1, scalar_t* output, size_t len,
float smooth) {
for (size_t i = 0; i < len; ++i) {
output[i] = input0[i] + input1[i];
if (output[i] < 0)
output[i] += smooth;
else
output[i] -= smooth;
}
}

void forward_compute(
const std::vector<Tensor>& inputs, const Param& params,
std::vector<Tensor>& outputs) {
DISPATCH_SIGN_INT_AND_FLOAT_TYPES(
outputs[0].dtype(), "forward_compute", ([&]() {
forward_kernel<scalar_t>(
inputs[0].data<scalar_t>(), inputs[1].data<scalar_t>(),
outputs[0].data<scalar_t>(), outputs[0].size(),
params["smooth"].as<float>());
}));
}

CUSTOM_OP_REG(ElemAddSmoothForward)
.set_description(
"Custom ElemAdd Operator With a Smooth Parameter, "
"which is used to verify the CPU kernel")
.add_input("lhs")
.add_input("rhs")
.add_output("output")
.add_param("smooth", 0.f)
.set_device_infer(forward_device_infer)
.set_shape_infer(forward_shape_infer)
.set_dtype_infer(forward_dtype_infer)
.set_format_infer(forward_format_infer)
.set_compute(forward_compute);

void backward_device_infer(
const std::vector<Device>& ograds, const Param& params,
std::vector<Device>& igrads) {
igrads[0] = ograds[0];
igrads[1] = ograds[0];
}

void backward_shape_infer(
const std::vector<Shape>& ograds, const Param& params,
std::vector<Shape>& igrads) {
igrads[0] = ograds[0];
igrads[1] = ograds[0];
}

void backward_dtype_infer(
const std::vector<DType>& ograds, const Param& params,
std::vector<DType>& igrads) {
igrads[0] = ograds[0];
igrads[1] = ograds[0];
}

void backward_format_infer(
const std::vector<Format>& ograds, const Param& params,
std::vector<Format>& igrads) {
igrads[0] = ograds[0];
igrads[1] = ograds[0];
}

template <typename scalar_t>
void backward_kernel(
const scalar_t* ograd, scalar_t* igrad0, scalar_t* igrad1, size_t len) {
for (size_t i = 0; i < len; ++i) {
igrad0[i] = ograd[i];
igrad1[i] = ograd[i];
}
}

void backward_compute(
const std::vector<Tensor>& ograds, const Param& params,
std::vector<Tensor>& igrads) {
DISPATCH_SIGN_INT_AND_FLOAT_TYPES(
igrads[0].dtype(), "backward_compute", ([&]() {
backward_kernel<scalar_t>(
ograds[0].data<scalar_t>(), igrads[0].data<scalar_t>(),
igrads[1].data<scalar_t>(), igrads[0].size());
}));
}

CUSTOM_OP_REG(ElemAddSmoothBackward)
.set_description(
"Custom ElemAdd Operator With a Smooth Parameter, "
"which is used to verify the CPU kernel")
.add_input("ograd")
.add_output("igrad_lhs")
.add_output("igrad_rhs")
.set_device_infer(backward_device_infer)
.set_shape_infer(backward_shape_infer)
.set_dtype_infer(backward_dtype_infer)
.set_format_infer(backward_format_infer)
.set_compute(backward_compute);

CUSTOM_OP_REG_END(ElemAddSmooth)
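
Once built and loaded, the forward op can be applied from Python roughly as below. This is a sketch: it assumes the generated op maker accepts its params as keyword arguments and that ``apply`` returns a tuple of output tensors, neither of which is shown in this diff.

import numpy as np

from megengine.core._imperative_rt.core2 import apply
from megengine.core.ops import custom
from megengine.tensor import Tensor

lhs = Tensor(np.random.randn(4, 8).astype("float32"))
rhs = Tensor(np.random.randn(4, 8).astype("float32"))
op = custom.ElemAddSmoothForward(smooth=0.1)  # assumed kwarg mapping
(out,) = apply(op, lhs, rhs)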

imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp (+65 -0)

@@ -0,0 +1,65 @@
/**
* \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "./matmul_scale.h"
#include "megbrain/custom/custom.h"

CUSTOM_OP_REG_BEGIN(MatMulScale)

void forward_shape_infer(
const std::vector<Shape>& inputs, const Param& params,
std::vector<Shape>& outputs) {
outputs[0] = {inputs[0][0], inputs[1][1]};
}

void forward_compute(
const std::vector<Tensor>& inputs, const Param& params,
std::vector<Tensor>& outputs) {
matmul_forward_helper(
inputs[0], inputs[1], outputs[0], inputs[0].shape()[0],
inputs[0].shape()[1], inputs[1].shape()[1], params["scale"].as<float>());
}

CUSTOM_OP_REG(MatMulScaleForward)
.add_inputs(2)
.add_outputs(1)
.add_param("scale", 1.0f)
.set_shape_infer(forward_shape_infer)
.set_compute("cuda", forward_compute);

void backward_shape_infer(
const std::vector<Shape>& ograd_and_inputs, const Param& params,
std::vector<Shape>& outputs) {
outputs[0] = ograd_and_inputs[1];
outputs[1] = ograd_and_inputs[2];
}

void backward_compute(
const std::vector<Tensor>& ograd_and_inputs, const Param& params,
std::vector<Tensor>& igrads) {
matmul_backward_lhs_helper(
ograd_and_inputs[2], ograd_and_inputs[0], igrads[0],
ograd_and_inputs[1].shape()[0], ograd_and_inputs[1].shape()[1],
ograd_and_inputs[2].shape()[1], params["scale"].as<float>());
matmul_backward_rhs_helper(
ograd_and_inputs[1], ograd_and_inputs[0], igrads[1],
ograd_and_inputs[1].shape()[0], ograd_and_inputs[1].shape()[1],
ograd_and_inputs[2].shape()[1], params["scale"].as<float>());
}

CUSTOM_OP_REG(MatMulScaleBackward)
.add_inputs(3)
.add_outputs(2)
.add_param("scale", 1.0f)
.set_shape_infer(backward_shape_infer)
.set_compute("cuda", backward_compute);

CUSTOM_OP_REG_END(MatMulScale)

imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu (+97 -0)

@@ -0,0 +1,97 @@
/**
* \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.cu
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include "./matmul_scale.h"

using namespace custom;

// matmul_forward for Mat_mxk * Mat_kxn
template <typename T>
__global__ void matmul_forward_naive(
const T* lhs, const T* rhs, T* res, size_t M, size_t K, size_t N, float scale) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;

T acc = 0;
for (int i = 0; i < K; ++i)
acc += lhs[row * K + i] * rhs[i * N + col];
res[row * N + col] = acc * scale;
}

// matmul_backward_lhs for Mat_mxk * Mat_kxn = Mat_mxn
// that is Mat_mxn * Mat_nxk
template <typename T>
__global__ void matmul_backward_lhs_naive(
const T* rhs, const T* ograd, T* lhs_grad, size_t M, size_t K, size_t N,
float scale) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
T acc = 0;
for (int i = 0; i < N; ++i)
acc += ograd[row * N + i] * rhs[col * N + i];
lhs_grad[row * K + col] = acc / scale;
}

// matmul_backward_rhs for Mat_mxk * Mat_kxn = Mat_mxn
// that is Mat_kxm * Mat_mxn
template <typename T>
__global__ void matmul_backward_rhs_naive(
const T* lhs, const T* ograd, T* rhs_grad, size_t M, size_t K, size_t N,
float scale) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
T acc = 0;
for (int i = 0; i < M; ++i)
acc += lhs[i * K + row] * ograd[i * N + col];
rhs_grad[row * N + col] = acc / scale;
}

void matmul_forward_helper(
const Tensor& lhs, const Tensor& rhs, Tensor& res, size_t M, size_t K, size_t N,
float scale) {
dim3 block(1, 1);
dim3 grid(N / block.x, M / block.y);
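    // with block(1, 1) the grid launches exactly one thread per output
    // element, so this naive reference kernel needs no bounds checks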

DISPATCH_INT_AND_FLOAT_TYPES(res.dtype(), "matmul_forward", ([&]() {
matmul_forward_naive<scalar_t><<<grid, block>>>(
lhs.data<scalar_t>(), rhs.data<scalar_t>(),
res.data<scalar_t>(), M, K, N, scale);
}));
}

void matmul_backward_lhs_helper(
const Tensor& rhs, const Tensor& ograd, Tensor& lhs_grad, size_t M, size_t K,
size_t N, float scale) {
dim3 block(1, 1);
dim3 grid(K / block.x, M / block.y);
DISPATCH_INT_AND_FLOAT_TYPES(
lhs_grad.dtype(), "matmul_backward_lhs", ([&]() {
matmul_backward_lhs_naive<scalar_t><<<grid, block>>>(
rhs.data<scalar_t>(), ograd.data<scalar_t>(),
lhs_grad.data<scalar_t>(), M, K, N, scale);
}));
}

void matmul_backward_rhs_helper(
const Tensor& lhs, const Tensor& ograd, Tensor& rhs_grad, size_t M, size_t K,
size_t N, float scale) {
dim3 block(1, 1);
dim3 grid(N / block.x, K / block.y);
DISPATCH_INT_AND_FLOAT_TYPES(
rhs_grad.dtype(), "matmul_backward_rhs", ([&]() {
matmul_backward_rhs_naive<scalar_t><<<grid, block>>>(
lhs.data<scalar_t>(), ograd.data<scalar_t>(),
rhs_grad.data<scalar_t>(), M, K, N, scale);
}));
}

imperative/python/test/unit/core/custom_opsrc/matmul_scale.h (+24 -0)

@@ -0,0 +1,24 @@
/**
* \file imperative/python/test/unit/core/custom_opsrc/matmul_scale.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "megbrain/custom/custom.h"

using Tensor = custom::Tensor;

void matmul_forward_helper(
const Tensor& lhs, const Tensor& rhs, Tensor& res, size_t M, size_t K, size_t N,
float scale);
void matmul_backward_lhs_helper(
const Tensor& rhs, const Tensor& ograd, Tensor& lhs_grad, size_t M, size_t K,
size_t N, float scale);
void matmul_backward_rhs_helper(
const Tensor& lhs, const Tensor& ograd, Tensor& rhs_grad, size_t M, size_t K,
size_t N, float scale);

imperative/python/test/unit/core/test_custom_op.py (+111 -0)

@@ -0,0 +1,111 @@
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

import os
import platform
import shutil
import sys

import numpy as np
import pytest

import megengine
import megengine.functional as F
import megengine.optimizer as optim
from megengine import jit
from megengine.autodiff import Function, GradManager
from megengine.core._imperative_rt.core2 import apply
from megengine.core.ops import custom
from megengine.device import get_device_count
from megengine.module import Conv2d, Linear, Module
from megengine.random import normal
from megengine.tensor import Parameter, Tensor
from megengine.utils import custom_op_tools


def compare(ref, real):
if ref.shape != real.shape:
real = real.T
np.testing.assert_allclose(ref, real, rtol=1e-3, atol=1e-5)


def build_and_clean(test_func):
def wrapper():
cur_dir_path = os.path.dirname(os.path.abspath(__file__))
build_path = os.path.join(cur_dir_path, "custom_opsrc", "build")
mgb_root_path = os.path.dirname(
os.path.dirname(
os.path.dirname(os.path.dirname(os.path.dirname(cur_dir_path)))
)
)
extra_include_paths = [os.path.join(mgb_root_path, "src", "custom", "include")]
extra_ld_flags = []

if sys.platform != "win32":
ld_path = os.environ.get("LD_LIBRARY_PATH")
if ld_path is not None:
ld_dirs = ld_path.split(":")
for ld_dir in ld_dirs:
if os.path.exists(ld_dir) and os.path.isdir(ld_dir):
for lib in os.listdir(ld_dir):
if "megengine_shared" in lib:
extra_ld_flags += [
"-L{} -Wl,-rpath,{}".format(ld_dir, ld_dir)
]
break

if get_device_count("gpu") > 0:
custom_opsrc = [
os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cpp"),
os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cu"),
]
else:
custom_opsrc = [os.path.join(cur_dir_path, "custom_opsrc", "elem_add.cpp")]

lib_path = custom_op_tools.build_and_load(
"test_op",
custom_opsrc,
extra_include_paths=extra_include_paths,
extra_ldflags=extra_ld_flags,
build_dir=build_path,
verbose=False,
abi_tag=custom.get_custom_op_abi_tag(),
)
test_func()

custom.unload(lib_path)
if os.path.exists(build_path):
shutil.rmtree(build_path)

return wrapper


@pytest.mark.skipif(
get_device_count("gpu") > 0, reason="elem_add operator is only supported on CPU"
)
@build_and_clean
def test_custom_op_cpu_build():
assert "ElemAddSmoothForward" in custom._get_custom_op_list()
assert "ElemAddSmoothBackward" in custom._get_custom_op_list()
assert hasattr(custom, "ElemAddSmoothForward")
assert hasattr(custom, "ElemAddSmoothBackward")


@pytest.mark.skipif(
platform.system() == "Darwin",
reason="GPU kernel is only support on Linux and Windows",
)
@pytest.mark.skipif(
get_device_count("gpu") < 1, reason="matmul scale operator is only supported on GPU"
)
@build_and_clean
def test_custom_op_gpu_build():
assert "MatMulScaleForward" in custom._get_custom_op_list()
assert "MatMulScaleBackward" in custom._get_custom_op_list()
assert hasattr(custom, "MatMulScaleForward")
assert hasattr(custom, "MatMulScaleBackward")

scripts/whl/macos/macos_build_whl.sh (+1 -0)

@@ -171,6 +171,7 @@ function do_build() {
mkdir -p staging

cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
cp -a ${SRC_DIR}/src/custom/include staging/megengine/core/include/
cd ${BUILD_DIR}/staging/megengine/core
rt_file=`ls _imperative_rt.*.so`
echo "rt file is: ${rt_file}"


scripts/whl/manylinux2014/do_build_common.sh (+1 -0)

@@ -151,6 +151,7 @@ do
rm -rf staging
mkdir -p staging
cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
cp -a ${SRC_DIR}/src/custom/include/megbrain staging/megengine/core/include

cd ${BUILD_DIR}/staging/megengine/core
mkdir -p lib/ucx


scripts/whl/windows/windows_build_whl.sh (+4 -1)

@@ -77,11 +77,13 @@ CUBLAS_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublas6
CURAND_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/curand64_10.dll"
CUBLASLT_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublasLt64_10.dll"
CUDART_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cudart64_101.dll"
MGE_EXPORT_LIB="${SRC_DIR}/build_dir/host/build/src/megengine_shared.dll"
MGE_EXPORT_DLL="${SRC_DIR}/build_dir/host/build/src/megengine_shared.dll"
MGE_EXPORT_LIB="${SRC_DIR}/build_dir/host/build/src/megengine_shared.lib"

function depend_real_copy() {
REAL_DST=$1
echo "real copy lib to $1"
cp "${MGE_EXPORT_DLL}" ${REAL_DST}
cp "${MGE_EXPORT_LIB}" ${REAL_DST}

if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
@@ -190,6 +192,7 @@ function do_build() {
rm -rf staging
mkdir -p staging
cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
cp -a ${SRC_DIR}/src/custom/include/megbrain staging/megengine/core/include/
cd ${BUILD_DIR}/staging/megengine/core
rt_file=`ls _imperative_rt.*.pyd`
echo "rt file is: ${rt_file}"


src/CMakeLists.txt (+13 -10)

@@ -1,3 +1,8 @@
# force define a SHARED target for the whl build: when building for APPLE
# we force BUILD_SHARED_LIBS=OFF, as required by Xcode
set(MGE_SHARED_LIB megengine_shared)
set(MGE_SHARED_LIB ${MGE_SHARED_LIB} PARENT_SCOPE)

if(MGE_WITH_JIT_MLIR)
add_subdirectory(jit/include/megbrain/jit/mlir/ir)
endif()
@@ -206,32 +211,30 @@ set (_VER_FILE ${PROJECT_SOURCE_DIR}/src/version.ld)

# Build as SHARED or STATIC depending on BUILD_SHARED_LIBS=ON/OFF
add_library(megengine)
# force define a SHARED target for the whl build: when building for APPLE
# we force BUILD_SHARED_LIBS=OFF, as required by Xcode
add_library(megengine_shared SHARED)
add_library(${MGE_SHARED_LIB} SHARED)
target_link_libraries(megengine PRIVATE ${MGE_CUDA_LIBS})
target_link_libraries(megengine PUBLIC megbrain megdnn)
target_link_libraries(megengine_shared PUBLIC megbrain megdnn)
target_link_libraries(megengine_shared PRIVATE ${MGE_CUDA_LIBS})
target_link_libraries(${MGE_SHARED_LIB} PUBLIC megbrain megdnn)
target_link_libraries(${MGE_SHARED_LIB} PRIVATE ${MGE_CUDA_LIBS})
if (UNIX AND NOT APPLE)
target_link_options(megengine PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE})
set_target_properties(megengine PROPERTIES LINK_DEPENDS ${_VER_FILE})
target_link_options(megengine_shared PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE})
set_target_properties(megengine_shared PROPERTIES LINK_DEPENDS ${_VER_FILE})
target_link_options(${MGE_SHARED_LIB} PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE})
set_target_properties(${MGE_SHARED_LIB} PROPERTIES LINK_DEPENDS ${_VER_FILE})
endif()
if(WIN32 OR MSVC)
target_compile_definitions(megbrain PRIVATE MGE_DLL_EXPORT)
target_compile_definitions(megdnn PRIVATE MGE_DLL_EXPORT)
target_compile_definitions(megengine PRIVATE MGE_DLL_EXPORT)
target_compile_definitions(megengine_shared PRIVATE MGE_DLL_EXPORT)
target_compile_definitions(${MGE_SHARED_LIB} PRIVATE MGE_DLL_EXPORT)
# please do not use WINDOWS_EXPORT_ALL_SYMBOLS, as the symbol count exceeds 65535 when built with CUDA
#set_target_properties(megengine PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
#set_target_properties(megengine_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
#set_target_properties(${MGE_SHARED_LIB} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
endif()
if (MGE_WITH_DISTRIBUTED)
message(VERBOSE "megengine configured to link megray")
target_link_libraries(megengine PUBLIC megray)
target_link_libraries(megengine_shared PUBLIC megray)
target_link_libraries(${MGE_SHARED_LIB} PUBLIC megray)
endif()
# Do not export targets if MGE_WITH_DISTRIBUTED is on. MegRay is not ready
# for this.


src/custom/impl/manager.cpp (+19 -7)

@@ -18,12 +18,31 @@

#ifndef _WIN32
#include <dlfcn.h>
#else
#include <windows.h>
#endif

using namespace mgb;

namespace custom {

#ifdef _WIN32
#define RTLD_LAZY 0

void* dlopen(const char* file, int) {
return static_cast<void*>(LoadLibrary(file));
}

int dlclose(void* handle) {
return static_cast<int>(FreeLibrary(static_cast<HMODULE>(handle)));
}

const char* dlerror(void) {
static char win_err_info[] = "no dlerror info in windows";
return win_err_info;
}
#endif

CustomOpManager* CustomOpManager::inst(void) {
static CustomOpManager op_manager;
return &op_manager;
@@ -127,7 +146,6 @@ std::vector<RunTimeId> CustomOpManager::op_id_list(void) {
return ret;
}

#ifndef _WIN32
CustomLib::CustomLib(const std::string& path, int mode = RTLD_LAZY)
: m_handle(nullptr, [](void* handle) { dlclose(handle); }) {
auto op_list_before_load = CustomOpManager::inst()->op_name_list();
@@ -146,12 +164,6 @@ CustomLib::CustomLib(const std::string& path, int mode = RTLD_LAZY)
}
}
}
#else
CustomLib::CustomLib(const std::string& path, int mode = 0)
: m_handle(nullptr, [](void* handle) {}) {
mgb_assert(false, "custom op is only supported on Linux now");
}
#endif

const std::vector<std::string>& CustomLib::ops_in_lib(void) const {
return m_ops;


src/custom/include/megbrain/custom/custom.h (+2 -1)

@@ -16,7 +16,8 @@
#include "tensor.h"

namespace custom {
std::shared_ptr<CustomOp> op_insert(std::string opname, uint32_t version);
MGE_WIN_DECLSPEC_FUC std::shared_ptr<CustomOp> op_insert(
std::string opname, uint32_t version);
}

#define CUSTOM_OP_REG(OpName) \


src/custom/include/megbrain/custom/op.h (+47 -53)

@@ -32,27 +32,26 @@ namespace custom {

using RunTimeId = uint64_t;

class ArgInfo {
class MGE_WIN_DECLSPEC_FUC ArgInfo {
CUSTOM_PIMPL_CLS_DECL(ArgInfo);
MGE_WIN_DECLSPEC_FUC ArgInfo(
const std::string& name, const std::string& desc,
ArgInfo(const std::string& name, const std::string& desc,
const std::unordered_set<std::string>& dtypes, const int& ndim,
const std::string& mem_stgy);

MGE_WIN_DECLSPEC_FUC const std::string& name(void) const;
MGE_WIN_DECLSPEC_FUC const std::string& desc(void) const;
MGE_WIN_DECLSPEC_FUC const std::unordered_set<std::string>& dtypes(void) const;
MGE_WIN_DECLSPEC_FUC int ndim(void) const;
MGE_WIN_DECLSPEC_FUC const std::string& mem_strategy(void) const;
const std::string& name(void) const;
const std::string& desc(void) const;
const std::unordered_set<std::string>& dtypes(void) const;
int ndim(void) const;
const std::string& mem_strategy(void) const;

MGE_WIN_DECLSPEC_FUC std::string str() const;
std::string str() const;
};

class CustomOp {
class MGE_WIN_DECLSPEC_FUC CustomOp {
std::unique_ptr<void, void_deleter> m_impl;

public:
MGE_WIN_DECLSPEC_FUC CustomOp(const std::string& op_type, uint32_t version);
CustomOp(const std::string& op_type, uint32_t version);
PREVENT_COPY_AND_ASSIGN(CustomOp);

using DeviceInferFuncPtr =
@@ -71,70 +70,65 @@ public:
void (*)(const std::vector<Tensor>&, const Param&, std::vector<Tensor>&);

// write for forward
MGE_WIN_DECLSPEC_FUC CustomOp& set_device_infer(DeviceInferFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_shape_infer(ShapeInferFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_dtype_infer(DTypeInferFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_format_infer(FormatInferFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_preprocess(PreprocessFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_preprocess(
const std::string& device, PreprocessFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_postprocess(PostprocessFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_postprocess(
const std::string& device, PostprocessFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_compute(ComputeFuncPtr func);
MGE_WIN_DECLSPEC_FUC CustomOp& set_compute(
const std::string& device, ComputeFuncPtr func);

MGE_WIN_DECLSPEC_FUC CustomOp& set_description(const std::string& op_desc);
MGE_WIN_DECLSPEC_FUC CustomOp& add_input(
CustomOp& set_device_infer(DeviceInferFuncPtr func);
CustomOp& set_shape_infer(ShapeInferFuncPtr func);
CustomOp& set_dtype_infer(DTypeInferFuncPtr func);
CustomOp& set_format_infer(FormatInferFuncPtr func);
CustomOp& set_preprocess(PreprocessFuncPtr func);
CustomOp& set_preprocess(const std::string& device, PreprocessFuncPtr func);
CustomOp& set_postprocess(PostprocessFuncPtr func);
CustomOp& set_postprocess(const std::string& device, PostprocessFuncPtr func);
CustomOp& set_compute(ComputeFuncPtr func);
CustomOp& set_compute(const std::string& device, ComputeFuncPtr func);

CustomOp& set_description(const std::string& op_desc);
CustomOp& add_input(
const std::string& name, const std::string& desc,
const std::initializer_list<std::string>& legal_dtypes = {"float32"},
int dims = -1, const std::string& mem_stgy = "default");
MGE_WIN_DECLSPEC_FUC CustomOp& add_output(
CustomOp& add_output(
const std::string& name, const std::string& desc,
const std::initializer_list<std::string>& legal_dtypes = {"float32"},
int dims = -1, const std::string& mem_stgy = "default");
MGE_WIN_DECLSPEC_FUC CustomOp& add_input(
CustomOp& add_input(
const std::string& name,
const std::initializer_list<std::string>& legal_dtypes = {"float32"},
int dims = -1, const std::string& mem_stgy = "default");
MGE_WIN_DECLSPEC_FUC CustomOp& add_output(
CustomOp& add_output(
const std::string& name,
const std::initializer_list<std::string>& legal_dtypes = {"float32"},
int dims = -1, const std::string& mem_stgy = "default");
MGE_WIN_DECLSPEC_FUC CustomOp& add_inputs(const size_t& input_num);
MGE_WIN_DECLSPEC_FUC CustomOp& add_outputs(const size_t& output_num);
MGE_WIN_DECLSPEC_FUC CustomOp& add_param(
const std::string& name, const ParamVal& default_val);
MGE_WIN_DECLSPEC_FUC CustomOp& add_param(
CustomOp& add_inputs(const size_t& input_num);
CustomOp& add_outputs(const size_t& output_num);
CustomOp& add_param(const std::string& name, const ParamVal& default_val);
CustomOp& add_param(
const std::string& name, const std::string& desc,
const ParamVal& default_val);

// read
MGE_WIN_DECLSPEC_FUC std::string op_type(void) const;
MGE_WIN_DECLSPEC_FUC std::string op_desc(void) const;
MGE_WIN_DECLSPEC_FUC RunTimeId runtime_id(void) const;
MGE_WIN_DECLSPEC_FUC size_t input_num(void) const;
MGE_WIN_DECLSPEC_FUC size_t output_num(void) const;
MGE_WIN_DECLSPEC_FUC std::string str(void) const;
MGE_WIN_DECLSPEC_FUC const ParamInfo& param_info(void) const;
MGE_WIN_DECLSPEC_FUC ArgInfo input_info(size_t idx) const;
MGE_WIN_DECLSPEC_FUC ArgInfo output_info(size_t idx) const;
MGE_WIN_DECLSPEC_FUC const std::vector<ArgInfo>& inputs_info(void) const;
MGE_WIN_DECLSPEC_FUC const std::vector<ArgInfo>& outputs_info(void) const;
std::string op_type(void) const;
std::string op_desc(void) const;
RunTimeId runtime_id(void) const;
size_t input_num(void) const;
size_t output_num(void) const;
std::string str(void) const;
const ParamInfo& param_info(void) const;
ArgInfo input_info(size_t idx) const;
ArgInfo output_info(size_t idx) const;
const std::vector<ArgInfo>& inputs_info(void) const;
const std::vector<ArgInfo>& outputs_info(void) const;

// use
MGE_WIN_DECLSPEC_FUC std::vector<Device> infer_output_device(
std::vector<Device> infer_output_device(
const std::vector<Device>&, const Param&) const;
MGE_WIN_DECLSPEC_FUC std::vector<Shape> infer_output_shape(
std::vector<Shape> infer_output_shape(
const std::vector<Shape>&, const Param&) const;
MGE_WIN_DECLSPEC_FUC std::vector<DType> infer_output_dtype(
std::vector<DType> infer_output_dtype(
const std::vector<DType>&, const Param&) const;
MGE_WIN_DECLSPEC_FUC std::vector<Format> infer_output_format(
std::vector<Format> infer_output_format(
const std::vector<Format>&, const Param&) const;
MGE_WIN_DECLSPEC_FUC void compute(
const std::vector<Tensor>&, const Param&, std::vector<Tensor>&) const;
void compute(const std::vector<Tensor>&, const Param&, std::vector<Tensor>&) const;
};

} // namespace custom
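
For reference, a minimal sketch of how the builder interface above chains together when registering an op. It follows the pattern of the new tests under imperative/python/test/unit/core/custom_opsrc; the CUSTOM_OP_REG* macros live in custom.h (not shown in this hunk), and the "Scale" op, its "factor" param, and the "x86" device string are illustrative assumptions.

#include "megbrain/custom/custom.h"

CUSTOM_OP_REG_BEGIN(Scale)

using namespace custom;

// matches the infer_output_shape signature above
std::vector<Shape> scale_shape_infer(
        const std::vector<Shape>& inputs, const Param& /*params*/) {
    return {inputs[0]};  // the output keeps the input shape
}

// matches the compute signature above
void scale_compute(
        const std::vector<Tensor>& inputs, const Param& params,
        std::vector<Tensor>& outputs) {
    float factor = params["factor"].as<float>();
    // ... write inputs[0] * factor into outputs[0] ...
    (void)factor;
}

CUSTOM_OP_REG(Scale)
        .set_description("multiply the input tensor by a scalar factor")
        .add_input("lhs", "the tensor to scale", {"float32"}, 2)
        .add_output("res", "the scaled tensor", {"float32"}, 2)
        .add_param("factor", 1.0f)
        .set_shape_infer(scale_shape_infer)
        .set_compute("x86", scale_compute);

CUSTOM_OP_REG_END(Scale)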

+ 10
- 10
src/custom/include/megbrain/custom/param.h View File

@@ -23,7 +23,7 @@ class ParamInfoImpl;
class ParamImpl;

// Schema of a param element
class ParamSchema {
class MGE_WIN_DECLSPEC_FUC ParamSchema {
CUSTOM_PIMPL_CLS_DECL(ParamSchema);
ParamSchema(
const std::string& name, const ParamVal& value,
@@ -36,7 +36,7 @@ class ParamSchema {
std::string str(void) const;
};

class ParamInfo {
class MGE_WIN_DECLSPEC_FUC ParamInfo {
CUSTOM_PIMPL_CLS_DECL(ParamInfo);

void set_tag(const std::string&);
@@ -46,16 +46,16 @@ class ParamInfo {
const std::vector<ParamSchema>& meta(void) const;
};

class Param {
class MGE_WIN_DECLSPEC_FUC Param {
CUSTOM_PIMPL_CLS_DECL(Param);

MGE_WIN_DECLSPEC_FUC Param(const ParamInfo&);
MGE_WIN_DECLSPEC_FUC ParamVal& operator[](const std::string&);
MGE_WIN_DECLSPEC_FUC const ParamVal& operator[](const std::string&) const;
MGE_WIN_DECLSPEC_FUC const std::unordered_map<std::string, ParamVal>& raw() const;
MGE_WIN_DECLSPEC_FUC bool exist(const std::string& name) const;
MGE_WIN_DECLSPEC_FUC std::string to_bytes(void) const;
MGE_WIN_DECLSPEC_FUC void from_bytes(const std::string&);
Param(const ParamInfo&);
ParamVal& operator[](const std::string&);
const ParamVal& operator[](const std::string&) const;
const std::unordered_map<std::string, ParamVal>& raw() const;
bool exist(const std::string& name) const;
std::string to_bytes(void) const;
void from_bytes(const std::string&);
};

MGE_WIN_DECLSPEC_FUC bool operator==(const Param&, const Param&);
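
A short sketch of the Param surface above: dictionary-style access, an existence check, and the to_bytes/from_bytes round trip. The "factor" and "mode" keys are hypothetical.

#include <cassert>
#include <string>
#include "megbrain/custom/param.h"

void inspect_params(const custom::Param& params) {
    float factor = params["factor"].as<float>();  // typed read (hypothetical key)
    if (params.exist("mode")) {                   // optional entry
        const std::string& mode = params["mode"].as<std::string>();
        (void)mode;
    }
    custom::Param copy = params;                  // pimpl-backed copy
    copy.from_bytes(params.to_bytes());           // serialize, then restore
    assert(copy == params);
    (void)factor;
}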


+ 40
- 31
src/custom/include/megbrain/custom/param_val.h View File

@@ -169,21 +169,21 @@ std::string vec2str(const std::vector<T>& vec) {
* Con1: user need to set the type explicitly when class template instantiation
* Con2: ParamVal<int> can not be assigned to ParamVal<double>
*/
class ParamVal {
class MGE_WIN_DECLSPEC_FUC ParamVal {
std::unique_ptr<void, void_deleter> m_ptr;
ParamDynType m_type;

public:
template <typename T>
MGE_WIN_DECLSPEC_FUC ParamVal(const T& val);
ParamVal(const T& val);
template <typename T>
MGE_WIN_DECLSPEC_FUC ParamVal(const std::initializer_list<T>& val);
ParamVal(const std::initializer_list<T>& val);

MGE_WIN_DECLSPEC_FUC ParamVal();
MGE_WIN_DECLSPEC_FUC ParamVal(const char* str);
MGE_WIN_DECLSPEC_FUC ParamVal(const std::initializer_list<const char*>& strs);
MGE_WIN_DECLSPEC_FUC ParamVal(const std::vector<const char*>& strs);
MGE_WIN_DECLSPEC_FUC ParamVal(const ParamVal& rhs);
ParamVal();
ParamVal(const char* str);
ParamVal(const std::initializer_list<const char*>& strs);
ParamVal(const std::vector<const char*>& strs);
ParamVal(const ParamVal& rhs);

template <typename T>
ParamVal& operator=(const T& rhs);
@@ -196,30 +196,39 @@ public:
ParamVal& operator=(const ParamVal& rhs);

template <typename T>
MGE_WIN_DECLSPEC_FUC const T& as(void) const;
const T& as(void) const;
template <typename T>
MGE_WIN_DECLSPEC_FUC T& as(void);

MGE_WIN_DECLSPEC_FUC const void* raw_ptr(void) const;
MGE_WIN_DECLSPEC_FUC void* raw_ptr(void);
MGE_WIN_DECLSPEC_FUC ParamDynType type(void) const;
MGE_WIN_DECLSPEC_FUC std::string str(void) const;
MGE_WIN_DECLSPEC_FUC size_t size(void) const;

MGE_WIN_DECLSPEC_FUC static std::string to_bytes(const ParamVal& value);
MGE_WIN_DECLSPEC_FUC static ParamVal from_bytes(
const std::string& bytes, size_t& offset);

friend ParamVal operator+(const ParamVal& lhs, const ParamVal& rhs);
friend ParamVal operator-(const ParamVal& lhs, const ParamVal& rhs);
friend ParamVal operator*(const ParamVal& lhs, const ParamVal& rhs);
friend ParamVal operator/(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator==(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator!=(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator>(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator<(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator>=(const ParamVal& lhs, const ParamVal& rhs);
friend bool operator<=(const ParamVal& lhs, const ParamVal& rhs);
T& as(void);

const void* raw_ptr(void) const;
void* raw_ptr(void);
ParamDynType type(void) const;
std::string str(void) const;
size_t size(void) const;

static std::string to_bytes(const ParamVal& value);
static ParamVal from_bytes(const std::string& bytes, size_t& offset);

MGE_WIN_DECLSPEC_FUC friend ParamVal operator+(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend ParamVal operator-(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend ParamVal operator*(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend ParamVal operator/(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator!=(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator>(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator<(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator>=(
const ParamVal& lhs, const ParamVal& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator<=(
const ParamVal& lhs, const ParamVal& rhs);
};

ParamVal operator+(const ParamVal& lhs, const ParamVal& rhs);
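
ParamVal erases its element type at runtime, which is what the pro/con comment above trades against a class template. A sketch of the resulting dynamic behavior, assuming mixed arithmetic promotes to the wider numeric type:

#include <string>
#include "megbrain/custom/param_val.h"

void param_val_demo() {
    custom::ParamVal i = 3;              // holds an int
    custom::ParamVal d = 2.5;            // holds a double
    custom::ParamVal s = "nchw";         // const char* is stored as a string
    custom::ParamVal v = {1, 2, 3};      // list value

    custom::ParamVal sum = i + d;        // mixed arithmetic (promotion assumed)
    std::string repr = sum.str();        // printable form
    d = i;                               // assignment may retype the value

    int& raw = i.as<int>();              // typed view of the erased storage
    raw += 1;
    (void)repr; (void)s; (void)v;
}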


+ 70
- 64
src/custom/include/megbrain/custom/tensor.h View File

@@ -30,9 +30,9 @@ namespace custom {
#define CUSTOM_DEVICE_TYPE_ENUM_DECL(custom_type, builtin_type, builtin_str) \
custom_type,

class Device {
MGE_WIN_DECLSPEC_FUC const void* impl() const;
MGE_WIN_DECLSPEC_FUC Device(const void* impl);
class MGE_WIN_DECLSPEC_FUC Device {
const void* impl() const;
Device(const void* impl);
CUSTOM_PIMPL_CLS_DECL(Device);

public:
@@ -40,19 +40,19 @@ public:
CUSTOM_FOR_EACH_DEVICE_TYPE(CUSTOM_DEVICE_TYPE_ENUM_DECL)
};

MGE_WIN_DECLSPEC_FUC Device(const std::string& device);
MGE_WIN_DECLSPEC_FUC Device(const char* device);
MGE_WIN_DECLSPEC_FUC Device(DeviceEnum device);
Device(const std::string& device);
Device(const char* device);
Device(DeviceEnum device);

MGE_WIN_DECLSPEC_FUC std::string str(void) const;
MGE_WIN_DECLSPEC_FUC DeviceEnum enumv(void) const;
std::string str(void) const;
DeviceEnum enumv(void) const;

MGE_WIN_DECLSPEC_FUC static bool is_legal(const std::string& device);
MGE_WIN_DECLSPEC_FUC static bool is_legal(DeviceEnum device);
MGE_WIN_DECLSPEC_FUC static std::vector<std::string> legal_devices(void);
static bool is_legal(const std::string& device);
static bool is_legal(DeviceEnum device);
static std::vector<std::string> legal_devices(void);

friend class Tensor;
friend bool operator==(const Device& lhs, const Device& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(const Device& lhs, const Device& rhs);
CUSTOM_DATA_ADAPTOR_FRIEND_DECL;
};

@@ -60,23 +60,23 @@ using DeviceEnum = Device::DeviceEnum;

bool operator==(const Device& lhs, const Device& rhs);

class Shape {
MGE_WIN_DECLSPEC_FUC const void* impl() const;
MGE_WIN_DECLSPEC_FUC Shape(const void* impl);
class MGE_WIN_DECLSPEC_FUC Shape {
const void* impl() const;
Shape(const void* impl);
CUSTOM_PIMPL_CLS_DECL(Shape);

public:
MGE_WIN_DECLSPEC_FUC Shape(const std::vector<size_t>& rhs);
MGE_WIN_DECLSPEC_FUC Shape(const std::initializer_list<size_t>& rhs);
Shape(const std::vector<size_t>& rhs);
Shape(const std::initializer_list<size_t>& rhs);

size_t& operator[](size_t idx);
size_t operator[](size_t idx) const;

MGE_WIN_DECLSPEC_FUC void ndim(size_t dim);
MGE_WIN_DECLSPEC_FUC size_t ndim(void) const;
void ndim(size_t dim);
size_t ndim(void) const;

friend class Tensor;
friend bool operator==(const Shape& lhs, const Shape& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(const Shape& lhs, const Shape& rhs);
CUSTOM_DATA_ADAPTOR_FRIEND_DECL;
};

@@ -104,9 +104,9 @@ using bfloat16_t = uint16_t;

#define CUSTOM_DTYPE_ENUM_DECL(custom_type, builtin_type, ctype) custom_type,

class DType {
MGE_WIN_DECLSPEC_FUC const void* impl() const;
MGE_WIN_DECLSPEC_FUC DType(const void* impl);
class MGE_WIN_DECLSPEC_FUC DType {
const void* impl() const;
DType(const void* impl);
CUSTOM_PIMPL_CLS_DECL(DType);

public:
@@ -114,27 +114,33 @@ public:
CUSTOM_FOR_EACH_TENSOR_DATA_TYPE(CUSTOM_DTYPE_ENUM_DECL)
};

MGE_WIN_DECLSPEC_FUC DType(const std::string& dtype);
MGE_WIN_DECLSPEC_FUC DType(const char* dtype);
MGE_WIN_DECLSPEC_FUC DType(
const std::string& dtype, float scale, uint8_t zero_point = 0);
MGE_WIN_DECLSPEC_FUC DType(const char* dtype, float scale, uint8_t zero_point = 0);
MGE_WIN_DECLSPEC_FUC DType(DTypeEnum dtype);
MGE_WIN_DECLSPEC_FUC DType(DTypeEnum dtype, float scale, uint8_t zero_point = 0);

MGE_WIN_DECLSPEC_FUC std::string str(void) const;
MGE_WIN_DECLSPEC_FUC DTypeEnum enumv() const;
MGE_WIN_DECLSPEC_FUC float scale(void) const;
MGE_WIN_DECLSPEC_FUC uint8_t zero_point(void) const;
DType(const std::string& dtype);
DType(const char* dtype);
DType(const std::string& dtype, float scale, uint8_t zero_point = 0);
DType(const char* dtype, float scale, uint8_t zero_point = 0);
DType(DTypeEnum dtype);
DType(DTypeEnum dtype, float scale, uint8_t zero_point = 0);

std::string str(void) const;
DTypeEnum enumv() const;
float scale(void) const;
uint8_t zero_point(void) const;
template <typename T>
MGE_WIN_DECLSPEC_FUC bool is_compatible(void) const;
bool is_compatible(void) const;

MGE_WIN_DECLSPEC_FUC static bool is_legal(const std::string& dtype);
MGE_WIN_DECLSPEC_FUC static bool is_legal(const DTypeEnum& dtype);
MGE_WIN_DECLSPEC_FUC static std::vector<std::string> legal_dtypes(void);
static bool is_legal(const std::string& dtype);
static bool is_legal(const DTypeEnum& dtype);
static std::vector<std::string> legal_dtypes(void);

friend class Tensor;
friend bool operator==(const DType& lhs, const DType& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(const DType& lhs, const DType& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(
const DType& lhs, const std::string& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(const DType& lhs, const char* rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(
const std::string& lhs, const DType& rhs);
MGE_WIN_DECLSPEC_FUC friend bool operator==(const char* lhs, const DType& rhs);

CUSTOM_DATA_ADAPTOR_FRIEND_DECL;
};

@@ -180,45 +186,45 @@ bool operator==(const DType& lhs, const char* rhs);
bool operator==(const std::string& lhs, const DType& rhs);
bool operator==(const char* lhs, const DType& rhs);

class Format {
MGE_WIN_DECLSPEC_FUC const void* impl() const;
MGE_WIN_DECLSPEC_FUC Format(const void* impl);
class MGE_WIN_DECLSPEC_FUC Format {
const void* impl() const;
Format(const void* impl);
CUSTOM_PIMPL_CLS_DECL(Format);

public:
MGE_WIN_DECLSPEC_FUC Format(const std::string& format);
MGE_WIN_DECLSPEC_FUC Format(const char* format);
Format(const std::string& format);
Format(const char* format);

MGE_WIN_DECLSPEC_FUC std::string str(void) const;
MGE_WIN_DECLSPEC_FUC bool is_default(void) const;
std::string str(void) const;
bool is_default(void) const;

friend class Tensor;
CUSTOM_DATA_ADAPTOR_FRIEND_DECL;
};

class Tensor {
class MGE_WIN_DECLSPEC_FUC Tensor {
void* m_tensor;

MGE_WIN_DECLSPEC_FUC const void* impl(void) const;
MGE_WIN_DECLSPEC_FUC Tensor(const void* impl);
const void* impl(void) const;
Tensor(const void* impl);

MGE_WIN_DECLSPEC_FUC const size_t* shapes_raw(void) const;
MGE_WIN_DECLSPEC_FUC const ptrdiff_t* strides_raw(void) const;
const size_t* shapes_raw(void) const;
const ptrdiff_t* strides_raw(void) const;

public:
Tensor() = delete;
MGE_WIN_DECLSPEC_FUC Tensor(const Tensor& rhs);
MGE_WIN_DECLSPEC_FUC Tensor& operator=(const Tensor& rhs);
MGE_WIN_DECLSPEC_FUC Shape shape(void) const;
MGE_WIN_DECLSPEC_FUC DType dtype(void) const;
MGE_WIN_DECLSPEC_FUC Format format(void) const;
MGE_WIN_DECLSPEC_FUC Device device(void) const;
MGE_WIN_DECLSPEC_FUC size_t size(void) const;
MGE_WIN_DECLSPEC_FUC std::vector<ptrdiff_t> stride(void) const;
MGE_WIN_DECLSPEC_FUC float scale(void) const;
MGE_WIN_DECLSPEC_FUC uint8_t zero_point(void) const;
Tensor(const Tensor& rhs);
Tensor& operator=(const Tensor& rhs);
Shape shape(void) const;
DType dtype(void) const;
Format format(void) const;
Device device(void) const;
size_t size(void) const;
std::vector<ptrdiff_t> stride(void) const;
float scale(void) const;
uint8_t zero_point(void) const;

void* data(void);
const void* data(void) const;
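
Putting the Tensor accessors together: a sketch of a CPU kernel body over contiguous float32 data. It assumes size() counts elements and that data() on a "default"-format tensor may be cast to the element type.

#include "megbrain/custom/tensor.h"
#include "megbrain/custom/utils.h"

void relu_kernel(const custom::Tensor& in, custom::Tensor& out) {
    custom_assert(
            in.dtype() == "float32", "unsupported dtype: %s",
            in.dtype().str().c_str());

    const float* src = static_cast<const float*>(in.data());
    float* dst = static_cast<float*>(out.data());
    for (size_t i = 0; i < in.size(); ++i)  // size(): element count (assumed)
        dst[i] = src[i] > 0.f ? src[i] : 0.f;
}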


+ 28
- 5
src/custom/include/megbrain/custom/utils.h View File

@@ -19,10 +19,19 @@

namespace custom {

void assert_failed_log(
#ifndef MGE_WIN_DECLSPEC_FUC
#ifdef _WIN32
#define MGE_WIN_DECLSPEC_FUC __declspec(dllexport)
#else
#define MGE_WIN_DECLSPEC_FUC
#endif
#endif

MGE_WIN_DECLSPEC_FUC void assert_failed_log(
const char* file, int line, const char* func, const char* expr,
const char* msg_fmt, ...);

#ifndef _WIN32
#define custom_expect(expr, msg...) \
if (!(expr)) { \
assert_failed_log(__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, ##msg); \
@@ -33,8 +42,22 @@ void assert_failed_log(
assert_failed_log(__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, ##msg); \
} \
assert((expr))
#else
#define custom_expect(expr, ...) \
if (!(expr)) { \
assert_failed_log( \
__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, __VA_ARGS__); \
}

#define custom_assert(expr, ...) \
if (!(expr)) { \
assert_failed_log( \
__FILE__, __LINE__, __PRETTY_FUNCTION__, #expr, __VA_ARGS__); \
} \
assert((expr))
#endif

class UnImpleWarnLog {
class MGE_WIN_DECLSPEC_FUC UnImpleWarnLog {
public:
UnImpleWarnLog(
const std::string& func, const std::string& attr, const std::string& val);
@@ -54,9 +77,9 @@ void impl_deleter(void* ptr) {
std::unique_ptr<void, void_deleter> m_impl; \
\
public: \
MGE_WIN_DECLSPEC_FUC Cls(); \
MGE_WIN_DECLSPEC_FUC Cls(const Cls& rhs); \
MGE_WIN_DECLSPEC_FUC Cls& operator=(const Cls& rhs)
Cls(); \
Cls(const Cls& rhs); \
Cls& operator=(const Cls& rhs)

#define CUSTOM_PIMPL_CLS_DEFINE(Cls) \
Cls::Cls() : m_impl(new Cls##Impl(), impl_deleter<Cls##Impl>) {} \
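
Call sites are unaffected by the new _WIN32 branch: the GNU msg... form and the standard __VA_ARGS__ form expand the same way, with one caveat — without ## pasting, the Windows variants need at least one variadic argument. A usage sketch:

#include <vector>
#include "megbrain/custom/tensor.h"
#include "megbrain/custom/utils.h"

void check_inputs(const std::vector<custom::Tensor>& inputs) {
    custom_expect(
            !inputs.empty(), "expected at least one input, got %zu",
            inputs.size());
    custom_assert(
            inputs[0].shape().ndim() == 2, "expect a 2-D input, got %zu dims",
            inputs[0].shape().ndim());
}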

