|
|
@@ -10,6 +10,7 @@ |
|
|
|
* implied. |
|
|
|
*/ |
|
|
|
|
|
|
|
#include "llvm/Pass.h" |
|
|
|
#include "megbrain_build_config.h" |
|
|
|
#if MGB_JIT && MGB_JIT_MLIR |
|
|
|
|
|
|
@@ -21,6 +22,7 @@ |
|
|
|
#include "megbrain/comp_node_env.h" |
|
|
|
#include "megbrain/jit/mlir/ir/dialect.h" |
|
|
|
#include "megbrain/jit/mlir/ir/passes.h" |
|
|
|
#include "megbrain/utils/timer.h" |
|
|
|
|
|
|
|
#include <mlir/Conversion/GPUCommon/GPUCommonPass.h> |
|
|
|
#include <mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h> |
|
|
@@ -36,6 +38,11 @@ |
|
|
|
#include <mlir/Transforms/Passes.h> |
|
|
|
|
|
|
|
#include <llvm/Support/TargetSelect.h> |
|
|
|
#include <llvm/IRReader/IRReader.h> |
|
|
|
#include <llvm/Linker/Linker.h> |
|
|
|
|
|
|
|
#include <dlfcn.h> |
|
|
|
#include <dirent.h> |
|
|
|
|
|
|
|
using namespace mgb; |
|
|
|
using namespace jit; |
|
|
@@ -59,6 +66,61 @@ mlir::OwnedBlob compile_ptx_to_cubin(const std::string ptx, mlir::Location, |
|
|
|
return result; |
|
|
|
} |
|
|
|
|
|
|
|
std::unique_ptr<llvm::Module> translate_module_to_nvvm_ir_and_link_device( |
|
|
|
Operation* m) { |
|
|
|
std::unique_ptr<llvm::Module> module = mlir::translateModuleToNVVMIR(m); |
|
|
|
auto get_device_path = []() -> std::string { |
|
|
|
auto cuda_path = getenv("CUDA_BIN_PATH"); |
|
|
|
std::string device_dir; |
|
|
|
if (!cuda_path) { |
|
|
|
char cuda_lib_path[PATH_MAX]; |
|
|
|
auto handle = dlopen("libcudart.so", RTLD_GLOBAL | RTLD_LAZY); |
|
|
|
mgb_assert(handle != nullptr, "%s", dlerror()); |
|
|
|
mgb_assert(dlinfo(handle, RTLD_DI_ORIGIN, &cuda_lib_path) != -1, |
|
|
|
"%s", dlerror()); |
|
|
|
device_dir = |
|
|
|
std::string(cuda_lib_path) + "/../../../nvvm/libdevice/"; |
|
|
|
mgb_assert(!dlclose(handle), "fail to dlclose handle"); |
|
|
|
} else { |
|
|
|
device_dir = std::string(cuda_path) + "/nvvm/libdevice/"; |
|
|
|
} |
|
|
|
|
|
|
|
DIR* dirp; |
|
|
|
struct dirent* directory; |
|
|
|
dirp = opendir(device_dir.c_str()); |
|
|
|
if (dirp) { |
|
|
|
while ((directory = readdir(dirp)) != nullptr) { |
|
|
|
if (!strncmp(directory->d_name, "libdevice", 9)) { |
|
|
|
closedir(dirp); |
|
|
|
return device_dir + std::string(directory->d_name); |
|
|
|
} |
|
|
|
} |
|
|
|
closedir(dirp); |
|
|
|
} |
|
|
|
return {}; |
|
|
|
}; |
|
|
|
|
|
|
|
//! load libdevice.bc |
|
|
|
llvm::SMDiagnostic err; |
|
|
|
auto libdevice_path = get_device_path(); |
|
|
|
std::unique_ptr<llvm::Module> mlib = llvm::parseIRFile( |
|
|
|
libdevice_path.c_str(), err, module->getContext()); |
|
|
|
if (mlib.get()) { |
|
|
|
mlib->setTargetTriple(module->getTargetTriple()); |
|
|
|
mlib->setDataLayout(module->getDataLayout()); |
|
|
|
|
|
|
|
RealTimer timer; |
|
|
|
mgb_assert( |
|
|
|
!llvm::Linker::linkModules(*module, std::move(mlib), |
|
|
|
llvm::Linker::Flags::LinkOnlyNeeded), |
|
|
|
"failed to parse ir file libdevice.bc"); |
|
|
|
mgb_log("MLIR JIT: link libdevice.bc, used: %.3fms", timer.get_msecs()); |
|
|
|
} else { |
|
|
|
mgb_log_warn("Fail to load bitcode file %s", libdevice_path.c_str()); |
|
|
|
} |
|
|
|
return module; |
|
|
|
} |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
void add_cpu_lowering_pass(mlir::PassManager& manager) { |
|
|
@@ -80,7 +142,8 @@ void add_cpu_lowering_pass(mlir::PassManager& manager) { |
|
|
|
} |
|
|
|
|
|
|
|
#if MGB_CUDA |
|
|
|
void add_cuda_lowering_pass(mlir::PassManager& manager, CompNode cn) { |
|
|
|
void add_cuda_lowering_pass(mlir::PassManager& manager, |
|
|
|
const std::string& target_chip) { |
|
|
|
{ |
|
|
|
mlir::OpPassManager& opt_pm = manager.nest<mlir::FuncOp>(); |
|
|
|
opt_pm.addPass(mlir::createCanonicalizerPass()); |
|
|
@@ -99,12 +162,10 @@ void add_cuda_lowering_pass(mlir::PassManager& manager, CompNode cn) { |
|
|
|
auto& kernel_pm = manager.nest<gpu::GPUModuleOp>(); |
|
|
|
kernel_pm.addPass(mlir::createLowerGpuOpsToNVVMOpsPass()); |
|
|
|
|
|
|
|
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; |
|
|
|
kernel_pm.addPass(mlir::createConvertGPUKernelToBlobPass( |
|
|
|
mlir::translateModuleToNVVMIR, compile_ptx_to_cubin, |
|
|
|
"nvptx64-nvidia-cuda", |
|
|
|
ssprintf("sm_%d%d", prop.major, prop.minor), "+ptx60", |
|
|
|
MLIRCUDAExecutable::sm_blob_annotation)); |
|
|
|
translate_module_to_nvvm_ir_and_link_device, |
|
|
|
compile_ptx_to_cubin, "nvptx64-nvidia-cuda", target_chip, |
|
|
|
"+ptx60", MLIRCUDAExecutable::sm_blob_annotation)); |
|
|
|
} |
|
|
|
} |
|
|
|
#endif |
|
|
@@ -134,21 +195,29 @@ void MLIRCompiler::run_lowering_pass(mlir::OwningModuleRef& module, |
|
|
|
CompNode cn) { |
|
|
|
mgb_assert(cn.device_type() == m_device_type); |
|
|
|
mlir::PassManager manager(module->getContext()); |
|
|
|
std::string target_chip; |
|
|
|
switch (m_device_type) { |
|
|
|
case CompNode::DeviceType::CPU: |
|
|
|
add_cpu_lowering_pass(manager); |
|
|
|
break; |
|
|
|
#if MGB_CUDA |
|
|
|
case CompNode::DeviceType::CUDA: |
|
|
|
add_cuda_lowering_pass(manager, cn); |
|
|
|
case CompNode::DeviceType::CUDA: { |
|
|
|
auto&& prop = |
|
|
|
CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; |
|
|
|
std::string target_chip = |
|
|
|
ssprintf("sm_%d%d", prop.major, prop.minor); |
|
|
|
add_cuda_lowering_pass(manager, target_chip); |
|
|
|
break; |
|
|
|
} |
|
|
|
#endif |
|
|
|
default: |
|
|
|
mgb_throw(InternalError, "Unsupport device type: %d", |
|
|
|
static_cast<int>(m_device_type)); |
|
|
|
break; |
|
|
|
} |
|
|
|
RealTimer timer; |
|
|
|
mgb_assert(mlir::succeeded(manager.run(*module))); |
|
|
|
mgb_log("MLIR JIT: run lowering pass used: %.3f ms", timer.get_msecs()); |
|
|
|
} |
|
|
|
|
|
|
|
std::unique_ptr<Executable> MLIRCompiler::do_compile( |
|
|
|