diff --git a/src/jit/impl/mlir/compiler.cpp b/src/jit/impl/mlir/compiler.cpp
index d22d46ba..c148099a 100644
--- a/src/jit/impl/mlir/compiler.cpp
+++ b/src/jit/impl/mlir/compiler.cpp
@@ -10,6 +10,7 @@
  * implied.
  */
 
+#include "llvm/Pass.h"
 #include "megbrain_build_config.h"
 #if MGB_JIT && MGB_JIT_MLIR
 
@@ -21,6 +22,7 @@
 #include "megbrain/comp_node_env.h"
 #include "megbrain/jit/mlir/ir/dialect.h"
 #include "megbrain/jit/mlir/ir/passes.h"
+#include "megbrain/utils/timer.h"
 
 #include <...>
 #include <...>
@@ -36,6 +38,11 @@
 #include <...>
 #include <...>
+#include <llvm/IRReader/IRReader.h>
+#include <llvm/Linker/Linker.h>
+
+#include <dirent.h>
+#include <dlfcn.h>
 
 using namespace mgb;
 using namespace jit;
 
@@ -59,6 +66,61 @@ mlir::OwnedBlob compile_ptx_to_cubin(const std::string ptx, mlir::Location,
     return result;
 }
 
+std::unique_ptr<llvm::Module> translate_module_to_nvvm_ir_and_link_device(
+        Operation* m) {
+    std::unique_ptr<llvm::Module> module = mlir::translateModuleToNVVMIR(m);
+    auto get_device_path = []() -> std::string {
+        auto cuda_path = getenv("CUDA_BIN_PATH");
+        std::string device_dir;
+        if (!cuda_path) {
+            char cuda_lib_path[PATH_MAX];
+            auto handle = dlopen("libcudart.so", RTLD_GLOBAL | RTLD_LAZY);
+            mgb_assert(handle != nullptr, "%s", dlerror());
+            mgb_assert(dlinfo(handle, RTLD_DI_ORIGIN, &cuda_lib_path) != -1,
+                       "%s", dlerror());
+            device_dir =
+                    std::string(cuda_lib_path) + "/../../../nvvm/libdevice/";
+            mgb_assert(!dlclose(handle), "failed to dlclose handle");
+        } else {
+            device_dir = std::string(cuda_path) + "/nvvm/libdevice/";
+        }
+
+        DIR* dirp;
+        struct dirent* directory;
+        dirp = opendir(device_dir.c_str());
+        if (dirp) {
+            while ((directory = readdir(dirp)) != nullptr) {
+                if (!strncmp(directory->d_name, "libdevice", 9)) {
+                    closedir(dirp);
+                    return device_dir + std::string(directory->d_name);
+                }
+            }
+            closedir(dirp);
+        }
+        return {};
+    };
+
+    //! load libdevice.bc
+    llvm::SMDiagnostic err;
+    auto libdevice_path = get_device_path();
+    std::unique_ptr<llvm::Module> mlib = llvm::parseIRFile(
+            libdevice_path.c_str(), err, module->getContext());
+    if (mlib.get()) {
+        mlib->setTargetTriple(module->getTargetTriple());
+        mlib->setDataLayout(module->getDataLayout());
+
+        RealTimer timer;
+        mgb_assert(!llvm::Linker::linkModules(
+                           *module, std::move(mlib),
+                           llvm::Linker::Flags::LinkOnlyNeeded),
+                   "failed to link libdevice.bc into the kernel module");
+        mgb_log("MLIR JIT: link libdevice.bc, used: %.3fms",
+                timer.get_msecs());
+    } else {
+        mgb_log_warn("Failed to load bitcode file %s",
+                     libdevice_path.c_str());
+    }
+    return module;
+}
+
 #endif
 
 void add_cpu_lowering_pass(mlir::PassManager& manager) {
@@ -80,7 +142,8 @@ void add_cpu_lowering_pass(mlir::PassManager& manager) {
 }
 
 #if MGB_CUDA
-void add_cuda_lowering_pass(mlir::PassManager& manager, CompNode cn) {
+void add_cuda_lowering_pass(mlir::PassManager& manager,
+                            const std::string& target_chip) {
     {
         mlir::OpPassManager& opt_pm = manager.nest<mlir::FuncOp>();
         opt_pm.addPass(mlir::createCanonicalizerPass());
@@ -99,12 +162,10 @@
 
         auto& kernel_pm = manager.nest<mlir::gpu::GPUModuleOp>();
         kernel_pm.addPass(mlir::createLowerGpuOpsToNVVMOpsPass());
-        auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
         kernel_pm.addPass(mlir::createConvertGPUKernelToBlobPass(
-                mlir::translateModuleToNVVMIR, compile_ptx_to_cubin,
-                "nvptx64-nvidia-cuda",
-                ssprintf("sm_%d%d", prop.major, prop.minor), "+ptx60",
-                MLIRCUDAExecutable::sm_blob_annotation));
+                translate_module_to_nvvm_ir_and_link_device,
+                compile_ptx_to_cubin, "nvptx64-nvidia-cuda", target_chip,
+                "+ptx60", MLIRCUDAExecutable::sm_blob_annotation));
     }
 }
 #endif
@@ -134,21 +195,29 @@ void MLIRCompiler::run_lowering_pass(mlir::OwningModuleRef& module,
                                      CompNode cn) {
     mgb_assert(cn.device_type() == m_device_type);
     mlir::PassManager manager(module->getContext());
+    std::string target_chip;
     switch (m_device_type) {
         case CompNode::DeviceType::CPU:
             add_cpu_lowering_pass(manager);
             break;
 #if MGB_CUDA
-        case CompNode::DeviceType::CUDA:
-            add_cuda_lowering_pass(manager, cn);
+        case CompNode::DeviceType::CUDA: {
+            auto&& prop =
+                    CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
+            target_chip =
+                    ssprintf("sm_%d%d", prop.major, prop.minor);
+            add_cuda_lowering_pass(manager, target_chip);
             break;
+        }
 #endif
         default:
             mgb_throw(InternalError, "Unsupport device type: %d",
                       static_cast<int>(m_device_type));
             break;
     }
+    RealTimer timer;
     mgb_assert(mlir::succeeded(manager.run(*module)));
+    mgb_log("MLIR JIT: run lowering pass used: %.3f ms", timer.get_msecs());
 }
 
 std::unique_ptr<Executable> MLIRCompiler::do_compile(
diff --git a/src/jit/impl/mlir/ir/common.cpp b/src/jit/impl/mlir/ir/common.cpp
index e46ba6c2..6979a335 100644
--- a/src/jit/impl/mlir/ir/common.cpp
+++ b/src/jit/impl/mlir/ir/common.cpp
@@ -66,7 +66,6 @@ mlir::Value ValueBuilderHelper::const_val(float val) {
 }
 
 cb(neg, NegFOp);
-cb(abs, AbsFOp);
 cb(ceil, CeilFOp);
 cb(cos, CosOp);
 cb(exp, ExpOp);
@@ -79,6 +78,10 @@ cb(sqrt, SqrtOp);
 cb(tanh, TanhOp);
 #undef cb
 
+mlir::Value ValueBuilderHelper::abs(mlir::Value lhs) {
+    return max(lhs, neg(lhs));
+}
+
 mlir::Value ValueBuilderHelper::floor(mlir::Value lhs) {
     //! FIXME use standard floor when upgrade llvm
     return neg(ceil(neg(lhs)));
diff --git a/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp b/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
index dddabef5..4ecfc86e 100644
--- a/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
+++ b/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
@@ -266,9 +266,6 @@ public:
         target.addLegalDialect();
         target.addIllegalDialect();
 
-        patterns.insert(
-                &getContext(), &launch_op);
-
 #define cb(_op, _) _op##Lowering,
diff --git a/src/jit/test/codegen.cpp b/src/jit/test/codegen.cpp
--- a/src/jit/test/codegen.cpp
+++ b/src/jit/test/codegen.cpp
@@ -273,6 +273,20 @@ TYPED_TEST(TestJITMlirUnaryElemwise, run) {
     run_mlir_mode(cn);
 }
 
+#define SKIP_MODE(_mode)                                     \
+    if (TypeParam::mode == opr::Elemwise::Mode::_mode) {     \
+        printf("skip\n");                                    \
+        return;                                              \
+    }
+TYPED_TEST(TestJITMlirUnaryElemwise, runGpu) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+
+    SKIP_MODE(SIN);
+
+    run_mlir_mode(cn);
+}
+
 ///////////////////////// binary ///////////////////////////////
 // clang-format off
 #define FOREACH_BINARY_MODE(cb) \
@@ -319,6 +333,12 @@ TYPED_TEST(TestJITMlirBinaryElemwise, run) {
     run_mlir_mode(cn);
 }
 
+TYPED_TEST(TestJITMlirBinaryElemwise, runGpu) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    run_mlir_mode(cn);
+}
+
 ///////////////////////// ternary ///////////////////////////////
 // clang-format off
 #define FOREACH_TERNARY_MODE(cb) \
@@ -345,6 +365,14 @@ TYPED_TEST(TestJITMlirTernaryElemwise, run) {
     run_mlir_mode(cn);
 }
 
+TYPED_TEST(TestJITMlirTernaryElemwise, runGpu) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    run_mlir_mode(cn);
+}
+
+#undef SKIP_MODE
+
 #endif
 
 #endif  // MGB_JIT
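
Note on the libdevice lookup added in compiler.cpp: the NVVM module produced by translateModuleToNVVMIR calls __nv_* math intrinsics whose definitions live in libdevice, so the patch looks for libdevice*.bc either under $CUDA_BIN_PATH/nvvm/libdevice/ or relative to wherever the dynamic loader found libcudart.so. The sketch below exercises that same lookup in isolation; it is illustrative only and not part of the patch. find_libdevice is a hypothetical helper name, and it assumes a Linux/glibc host where dlinfo(RTLD_DI_ORIGIN) is available (hence _GNU_SOURCE) and where libcudart.so sits three directories below the CUDA root, mirroring the patch's "../../../nvvm/libdevice/" relative path.

// Standalone sketch of the libdevice discovery used by
// translate_module_to_nvvm_ir_and_link_device (illustrative only).
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1  // for dlinfo()/RTLD_DI_ORIGIN on glibc
#endif
#include <dirent.h>
#include <dlfcn.h>
#include <limits.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>

// find_libdevice is a hypothetical name used only in this sketch.
static std::string find_libdevice() {
    std::string device_dir;
    if (const char* cuda_path = std::getenv("CUDA_BIN_PATH")) {
        // an explicitly configured CUDA install root wins
        device_dir = std::string(cuda_path) + "/nvvm/libdevice/";
    } else {
        // otherwise derive the CUDA root from where libcudart.so was loaded,
        // using the same "../../../nvvm/libdevice/" climb as the patch
        char origin[PATH_MAX];
        void* handle = dlopen("libcudart.so", RTLD_GLOBAL | RTLD_LAZY);
        if (!handle)
            return {};
        if (dlinfo(handle, RTLD_DI_ORIGIN, origin) == -1) {
            dlclose(handle);
            return {};
        }
        device_dir = std::string(origin) + "/../../../nvvm/libdevice/";
        dlclose(handle);
    }

    // pick the first libdevice*.bc found in that directory
    DIR* dirp = opendir(device_dir.c_str());
    if (!dirp)
        return {};
    std::string result;
    for (dirent* ent = readdir(dirp); ent; ent = readdir(dirp)) {
        if (!std::strncmp(ent->d_name, "libdevice", 9)) {
            result = device_dir + ent->d_name;
            break;
        }
    }
    closedir(dirp);
    return result;
}

int main() {
    std::string path = find_libdevice();
    std::printf("libdevice: %s\n",
                path.empty() ? "<not found>" : path.c_str());
}

In the patch itself, the file found this way is then loaded with llvm::parseIRFile and merged into the NVVM module via llvm::Linker::linkModules with LinkOnlyNeeded, so only the __nv_* definitions the kernel actually references are pulled into the PTX compilation.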