diff --git a/src/jit/impl/mlir/compiler.cpp b/src/jit/impl/mlir/compiler.cpp
index d22d46ba..c148099a 100644
--- a/src/jit/impl/mlir/compiler.cpp
+++ b/src/jit/impl/mlir/compiler.cpp
@@ -10,6 +10,7 @@
  * implied.
  */
 
+#include "llvm/Pass.h"
 #include "megbrain_build_config.h"
 #if MGB_JIT && MGB_JIT_MLIR
 
@@ -21,6 +22,7 @@
 #include "megbrain/comp_node_env.h"
 #include "megbrain/jit/mlir/ir/dialect.h"
 #include "megbrain/jit/mlir/ir/passes.h"
+#include "megbrain/utils/timer.h"
 
 #include <...>
 #include <...>
@@ -36,6 +38,11 @@
 #include <...>
 #include <...>
+#include <llvm/IRReader/IRReader.h>
+#include <llvm/Linker/Linker.h>
+
+#include <dirent.h>
+#include <dlfcn.h>
 
 using namespace mgb;
 using namespace jit;
 
@@ -59,6 +66,61 @@ mlir::OwnedBlob compile_ptx_to_cubin(const std::string ptx, mlir::Location,
     return result;
 }
 
+std::unique_ptr<llvm::Module> translate_module_to_nvvm_ir_and_link_device(
+        Operation* m) {
+    std::unique_ptr<llvm::Module> module = mlir::translateModuleToNVVMIR(m);
+    auto get_device_path = []() -> std::string {
+        auto cuda_path = getenv("CUDA_BIN_PATH");
+        std::string device_dir;
+        if (!cuda_path) {
+            char cuda_lib_path[PATH_MAX];
+            auto handle = dlopen("libcudart.so", RTLD_GLOBAL | RTLD_LAZY);
+            mgb_assert(handle != nullptr, "%s", dlerror());
+            mgb_assert(dlinfo(handle, RTLD_DI_ORIGIN, &cuda_lib_path) != -1,
+                       "%s", dlerror());
+            device_dir =
+                    std::string(cuda_lib_path) + "/../../../nvvm/libdevice/";
+            mgb_assert(!dlclose(handle), "failed to dlclose handle");
+        } else {
+            device_dir = std::string(cuda_path) + "/nvvm/libdevice/";
+        }
+
+        DIR* dirp;
+        struct dirent* directory;
+        dirp = opendir(device_dir.c_str());
+        if (dirp) {
+            while ((directory = readdir(dirp)) != nullptr) {
+                if (!strncmp(directory->d_name, "libdevice", 9)) {
+                    closedir(dirp);
+                    return device_dir + std::string(directory->d_name);
+                }
+            }
+            closedir(dirp);
+        }
+        return {};
+    };
+
+    //! load libdevice.bc
+    llvm::SMDiagnostic err;
+    auto libdevice_path = get_device_path();
+    std::unique_ptr<llvm::Module> mlib = llvm::parseIRFile(
+            libdevice_path.c_str(), err, module->getContext());
+    if (mlib.get()) {
+        mlib->setTargetTriple(module->getTargetTriple());
+        mlib->setDataLayout(module->getDataLayout());
+
+        RealTimer timer;
+        mgb_assert(!llvm::Linker::linkModules(
+                           *module, std::move(mlib),
+                           llvm::Linker::Flags::LinkOnlyNeeded),
+                   "failed to link libdevice.bc into the kernel module");
+        mgb_log("MLIR JIT: link libdevice.bc, used: %.3fms",
+                timer.get_msecs());
+    } else {
+        mgb_log_warn("Failed to load bitcode file %s",
+                     libdevice_path.c_str());
+    }
+    return module;
+}
+
 #endif
 
 void add_cpu_lowering_pass(mlir::PassManager& manager) {
@@ -80,7 +142,8 @@ void add_cpu_lowering_pass(mlir::PassManager& manager) {
 }
 
 #if MGB_CUDA
-void add_cuda_lowering_pass(mlir::PassManager& manager, CompNode cn) {
+void add_cuda_lowering_pass(mlir::PassManager& manager,
+                            const std::string& target_chip) {
     {
         mlir::OpPassManager& opt_pm = manager.nest<mlir::FuncOp>();
         opt_pm.addPass(mlir::createCanonicalizerPass());
@@ -99,12 +162,10 @@
 
         auto& kernel_pm = manager.nest<mlir::gpu::GPUModuleOp>();
         kernel_pm.addPass(mlir::createLowerGpuOpsToNVVMOpsPass());
-        auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
         kernel_pm.addPass(mlir::createConvertGPUKernelToBlobPass(
-                mlir::translateModuleToNVVMIR, compile_ptx_to_cubin,
-                "nvptx64-nvidia-cuda",
-                ssprintf("sm_%d%d", prop.major, prop.minor), "+ptx60",
-                MLIRCUDAExecutable::sm_blob_annotation));
+                translate_module_to_nvvm_ir_and_link_device,
+                compile_ptx_to_cubin, "nvptx64-nvidia-cuda", target_chip,
+                "+ptx60", MLIRCUDAExecutable::sm_blob_annotation));
     }
 }
 #endif
@@ -134,21 +195,29 @@ void MLIRCompiler::run_lowering_pass(mlir::OwningModuleRef& module,
                                      CompNode cn) {
     mgb_assert(cn.device_type() == m_device_type);
     mlir::PassManager manager(module->getContext());
+    std::string target_chip;
     switch (m_device_type) {
         case CompNode::DeviceType::CPU:
             add_cpu_lowering_pass(manager);
             break;
 #if MGB_CUDA
-        case CompNode::DeviceType::CUDA:
-            add_cuda_lowering_pass(manager, cn);
+        case CompNode::DeviceType::CUDA: {
+            auto&& prop =
+                    CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
+            target_chip =
+                    ssprintf("sm_%d%d", prop.major, prop.minor);
+            add_cuda_lowering_pass(manager, target_chip);
             break;
+        }
 #endif
         default:
             mgb_throw(InternalError, "Unsupport device type: %d",
                       static_cast<int>(m_device_type));
             break;
     }
+    RealTimer timer;
     mgb_assert(mlir::succeeded(manager.run(*module)));
+    mgb_log("MLIR JIT: run lowering pass used: %.3f ms", timer.get_msecs());
 }
 
 std::unique_ptr<Executable> MLIRCompiler::do_compile(
diff --git a/src/jit/impl/mlir/ir/common.cpp b/src/jit/impl/mlir/ir/common.cpp
index e46ba6c2..6979a335 100644
--- a/src/jit/impl/mlir/ir/common.cpp
+++ b/src/jit/impl/mlir/ir/common.cpp
@@ -66,7 +66,6 @@ mlir::Value ValueBuilderHelper::const_val(float val) {
 }
 
 cb(neg, NegFOp);
-cb(abs, AbsFOp);
 cb(ceil, CeilFOp);
 cb(cos, CosOp);
 cb(exp, ExpOp);
@@ -79,6 +78,10 @@ cb(sqrt, SqrtOp);
 cb(tanh, TanhOp);
 #undef cb
 
+mlir::Value ValueBuilderHelper::abs(mlir::Value lhs) {
+    return max(lhs, neg(lhs));
+}
+
 mlir::Value ValueBuilderHelper::floor(mlir::Value lhs) {
     //! FIXME use standard floor when upgrade llvm
     return neg(ceil(neg(lhs)));
diff --git a/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp b/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
index dddabef5..4ecfc86e 100644
--- a/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
+++ b/src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
@@ -266,9 +266,6 @@ public:
         target.addLegalDialect();
         target.addIllegalDialect();
 
-        patterns.insert(
-                &getContext(), &launch_op);
-
 #define cb(_op, _) _op##Lowering,
diff --git a/src/jit/test/codegen.cpp b/src/jit/test/codegen.cpp
--- a/src/jit/test/codegen.cpp
+++ b/src/jit/test/codegen.cpp
@@ -273,6 +273,20 @@ TYPED_TEST(TestJITMlirUnaryElemwise, run) {
     run_mlir_mode(cn);
 }
 
+#define SKIP_MODE(_mode)                                     \
+    if (TypeParam::mode == opr::Elemwise::Mode::_mode) {     \
+        printf("skip\n");                                    \
+        return;                                              \
+    }
+TYPED_TEST(TestJITMlirUnaryElemwise, runGpu) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+
+    SKIP_MODE(SIN);
+
+    run_mlir_mode(cn);
+}
+
 ///////////////////////// binary ///////////////////////////////
 // clang-format off
 #define FOREACH_BINARY_MODE(cb) \
@@ -319,6 +333,12 @@ TYPED_TEST(TestJITMlirBinaryElemwise, run) {
     run_mlir_mode(cn);
 }
 
+TYPED_TEST(TestJITMlirBinaryElemwise, runGpu) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    run_mlir_mode(cn);
+}
+
 ///////////////////////// ternary ///////////////////////////////
 // clang-format off
 #define FOREACH_TERNARY_MODE(cb) \
@@ -345,6 +365,14 @@ TYPED_TEST(TestJITMlirTernaryElemwise, run) {
     run_mlir_mode(cn);
 }
 
+TYPED_TEST(TestJITMlirTernaryElemwise, runGpu) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    run_mlir_mode(cn);
+}
+
+#undef SKIP_MODE
+
 #endif
 
 #endif  // MGB_JIT
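
Note on the libdevice lookup added in compiler.cpp: the NVVM module produced by translateModuleToNVVMIR calls __nv_* math intrinsics whose definitions live in libdevice, so the patch looks for libdevice*.bc either under $CUDA_BIN_PATH/nvvm/libdevice/ or relative to wherever the dynamic loader found libcudart.so. The sketch below exercises that same lookup in isolation; it is illustrative only and not part of the patch. find_libdevice is a hypothetical helper name, and it assumes a Linux/glibc host where dlinfo(RTLD_DI_ORIGIN) is available (hence _GNU_SOURCE) and where libcudart.so sits three directories below the CUDA root, mirroring the patch's "../../../nvvm/libdevice/" relative path.

// Standalone sketch of the libdevice discovery used by
// translate_module_to_nvvm_ir_and_link_device (illustrative only).
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1  // for dlinfo()/RTLD_DI_ORIGIN on glibc
#endif
#include <dirent.h>
#include <dlfcn.h>
#include <limits.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>

// find_libdevice is a hypothetical name used only in this sketch.
static std::string find_libdevice() {
    std::string device_dir;
    if (const char* cuda_path = std::getenv("CUDA_BIN_PATH")) {
        // an explicitly configured CUDA install root wins
        device_dir = std::string(cuda_path) + "/nvvm/libdevice/";
    } else {
        // otherwise derive the CUDA root from where libcudart.so was loaded,
        // using the same "../../../nvvm/libdevice/" climb as the patch
        char origin[PATH_MAX];
        void* handle = dlopen("libcudart.so", RTLD_GLOBAL | RTLD_LAZY);
        if (!handle)
            return {};
        if (dlinfo(handle, RTLD_DI_ORIGIN, origin) == -1) {
            dlclose(handle);
            return {};
        }
        device_dir = std::string(origin) + "/../../../nvvm/libdevice/";
        dlclose(handle);
    }

    // pick the first libdevice*.bc found in that directory
    DIR* dirp = opendir(device_dir.c_str());
    if (!dirp)
        return {};
    std::string result;
    for (dirent* ent = readdir(dirp); ent; ent = readdir(dirp)) {
        if (!std::strncmp(ent->d_name, "libdevice", 9)) {
            result = device_dir + ent->d_name;
            break;
        }
    }
    closedir(dirp);
    return result;
}

int main() {
    std::string path = find_libdevice();
    std::printf("libdevice: %s\n",
                path.empty() ? "<not found>" : path.c_str());
}

In the patch itself, the file found this way is then loaded with llvm::parseIRFile and merged into the NVVM module via llvm::Linker::linkModules with LinkOnlyNeeded, so only the __nv_* definitions the kernel actually references are pulled into the PTX compilation.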