diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2bb5d3b6..9ae9366f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -537,6 +537,11 @@ set(MGB_CUDA ${MGE_WITH_CUDA})
 set(MEGDNN_WITH_CUDA ${MGE_WITH_CUDA})
 
+# ROCM
+set(MGB_ROCM ${MGE_WITH_ROCM})
+set(MEGDNN_WITH_ROCM ${MGE_WITH_ROCM})
+
+
 # CAMBRICON
 set(MGB_CAMBRICON ${MGE_WITH_CAMBRICON})
 set(MEGDNN_WITH_CAMBRICON ${MGE_WITH_CAMBRICON})
diff --git a/dnn/include/hcc_detail/hcc_defs_epilogue.h b/dnn/include/hcc_detail/hcc_defs_epilogue.h
new file mode 100644
index 00000000..3fbfe868
--- /dev/null
+++ b/dnn/include/hcc_detail/hcc_defs_epilogue.h
@@ -0,0 +1,18 @@
+/**
+ * \file dnn/include/hcc_detail/hcc_defs_epilogue.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#ifdef __HIP_PLATFORM_HCC__
+#undef __HIP_PLATFORM_HCC__
+#else
+#error "hcc_defs_epilogue.h must be included after hcc_defs_prologue.h"
+#endif
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/include/hcc_detail/hcc_defs_prologue.h b/dnn/include/hcc_detail/hcc_defs_prologue.h
new file mode 100644
index 00000000..a5938115
--- /dev/null
+++ b/dnn/include/hcc_detail/hcc_defs_prologue.h
@@ -0,0 +1,14 @@
+/**
+ * \file dnn/include/hcc_detail/hcc_defs_prologue.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#define __HIP_PLATFORM_HCC__
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/include/hip_header.h b/dnn/include/hip_header.h
new file mode 100644
index 00000000..a9ef6f15
--- /dev/null
+++ b/dnn/include/hip_header.h
@@ -0,0 +1,35 @@
+/**
+ * \file dnn/include/hip_header.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+/**
+ * \remarks The files in the subdirectory include/hip are copied from the HIP
+ * headers provided by ROCm-Developer-Tools/HIP, which can be found at
+ * https://github.com/ROCm-Developer-Tools/HIP. These files are included so
+ * that MegDNN can be compiled with both the CUDA and ROCm backends, with the
+ * two backends sharing the same code.
+ */
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <rocblas.h>
+#pragma GCC diagnostic pop
+
+#if !defined(__HIP_PLATFORM_HCC__)
+#error "platform macro __HIP_PLATFORM_HCC__ must be defined"
+#endif
+
+// vim: syntax=cpp.doxygen
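For reference, the intended include discipline for the headers above: a ROCm translation unit starts with the prologue (which defines `__HIP_PLATFORM_HCC__`) and then pulls in the HIP headers through `hip_header.h`, which refuses to compile if the macro is missing; the epilogue undefines the macro and errors out if the prologue was skipped. A minimal sketch (illustrative, not part of this diff):

```cpp
// sketch: how the ROCm sources below consume the prologue/epilogue pair
#include "hcc_detail/hcc_defs_prologue.h"  // defines __HIP_PLATFORM_HCC__
#include "hip_header.h"  // HIP headers; checks that the macro is set

// ... operator / kernel code using hipStream_t, __half, etc. ...
```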
diff --git a/dnn/include/megcore_cdefs.h b/dnn/include/megcore_cdefs.h
index 7d38b649..ac44ef30 100644
--- a/dnn/include/megcore_cdefs.h
+++ b/dnn/include/megcore_cdefs.h
@@ -19,6 +19,7 @@
 typedef enum {
     megcorePlatformCPU = 1,
     megcorePlatformCUDA = 4,
+    megcorePlatformROCM = 6,
     megcorePlatformCambricon = 7,
     megcorePlatformAtlas = 8,
 } megcorePlatform_t;
diff --git a/dnn/include/megcore_rocm.h b/dnn/include/megcore_rocm.h
new file mode 100644
index 00000000..2a99cb46
--- /dev/null
+++ b/dnn/include/megcore_rocm.h
@@ -0,0 +1,70 @@
+/**
+ * \file dnn/include/megcore_rocm.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "./megcore.h"
+
+#include "hip_header.h"
+#include "megdnn/internal/visibility_prologue.h"
+
+namespace megcore {
+struct ROCMContext {
+    hipStream_t stream = nullptr;
+
+    static std::atomic_bool sm_miopen_algo_search;
+    static inline bool enable_miopen_algo_search() {
+        return sm_miopen_algo_search.load();
+    }
+    static inline void enable_miopen_algo_search(bool enable_algo_search) {
+        sm_miopen_algo_search.store(enable_algo_search);
+    }
+
+    //! device pointer to buffer for error reporting from kernels
+    AsyncErrorInfo* error_info = nullptr;
+
+    ROCMContext() = default;
+
+    ROCMContext(hipStream_t s, AsyncErrorInfo* e) : stream{s}, error_info{e} {}
+};
+
+megcoreStatus_t createComputingHandleWithROCMContext(
+        megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
+        unsigned int flags, const ROCMContext& ctx);
+
+megcoreStatus_t getROCMContext(megcoreComputingHandle_t handle,
+                               ROCMContext* ctx);
+
+// Set MIOpen algo search enabled or disabled
+megcoreStatus_t enableMIOpenAlgoSearch(bool enable_algo_search = true);
+
+// Find out whether MIOpen algo search is enabled or disabled
+megcoreStatus_t getMIOpenAlgoSearchStatus(bool* algo_search_enabled);
+}  // namespace megcore
+
+static inline megcoreStatus_t megcoreCreateComputingHandleWithROCMStream(
+        megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
+        unsigned int flags, hipStream_t stream) {
+    megcore::ROCMContext ctx;
+    ctx.stream = stream;
+    return megcore::createComputingHandleWithROCMContext(compHandle, devHandle,
+                                                         flags, ctx);
+}
+
+static inline megcoreStatus_t megcoreGetROCMStream(
+        megcoreComputingHandle_t handle, hipStream_t* stream) {
+    megcore::ROCMContext ctx;
+    auto ret = megcore::getROCMContext(handle, &ctx);
+    *stream = ctx.stream;
+    return ret;
+}
+
+#include "megdnn/internal/visibility_epilogue.h"
+
+// vim: syntax=cpp.doxygen
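The helpers above mirror the CUDA-side megcore API. Assuming the usual generic megcore entry points (`megcoreCreateDeviceHandle`, not shown in this diff) behave as on the CUDA backend, wiring an existing HIP stream into a megdnn handle would look roughly like this sketch:

```cpp
// sketch: build a megdnn Handle on top of an existing hipStream_t
hipStream_t stream;
hipStreamCreate(&stream);

megcoreDeviceHandle_t dev_handle;
megcoreCreateDeviceHandle(&dev_handle, megcorePlatformROCM, /*deviceID=*/0);

megcoreComputingHandle_t comp_handle;
megcoreCreateComputingHandleWithROCMStream(&comp_handle, dev_handle,
                                           /*flags=*/0, stream);

// Handle::make dispatches on megcorePlatformROCM (see handle.cpp below)
auto handle = megdnn::Handle::make(comp_handle);
```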
diff --git a/dnn/include/megdnn/handle.h b/dnn/include/megdnn/handle.h
index cc64b4d7..f594f604 100644
--- a/dnn/include/megdnn/handle.h
+++ b/dnn/include/megdnn/handle.h
@@ -33,6 +33,7 @@ class Handle {
         ARMV7 = 4,
         AARCH64 = 5,
         CUDA = 6,
+        ROCM = 11,
         ATLAS = 13,
         CAMBRICON = 12,
     };
@@ -71,6 +72,13 @@ class Handle {
     template <typename Opr>
     std::unique_ptr<Opr> create_cuda_operator();
 #endif
+#if MEGDNN_WITH_ROCM
+    static std::unique_ptr<Handle> make_rocm_handle(
+            megcoreComputingHandle_t computing_handle);
+    template <typename Opr>
+    std::unique_ptr<Opr> create_rocm_operator();
+#endif
+
     virtual ~Handle();
diff --git a/dnn/scripts/gen_elemwise_kern_impls.py b/dnn/scripts/gen_elemwise_kern_impls.py
index 05f4e579..8a230fbc 100755
--- a/dnn/scripts/gen_elemwise_kern_impls.py
+++ b/dnn/scripts/gen_elemwise_kern_impls.py
@@ -11,6 +11,7 @@ def main():
         description='generate elemwise impl files',
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--type', type=str, choices=['cuda',
+                        'hip',
                         'cpp'], default='cpp',
                         help='generate cuda/hip kernel file')
     parser.add_argument('output', help='output directory')
@@ -21,6 +22,8 @@ def main():
 
     if args.type == 'cuda':
         cpp_ext = 'cu'
+    elif args.type == 'hip':
+        cpp_ext = 'cpp.hip'
     else:
         assert args.type == 'cpp'
         cpp_ext = 'cpp'
diff --git a/dnn/scripts/gen_elemwise_special_kern_impls.py b/dnn/scripts/gen_elemwise_special_kern_impls.py
index 2e75e720..dc92f4c6 100755
--- a/dnn/scripts/gen_elemwise_special_kern_impls.py
+++ b/dnn/scripts/gen_elemwise_special_kern_impls.py
@@ -11,6 +11,7 @@ def main():
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--type', type=str, choices=[
                         'cuda',
+                        'hip'
                         ], default='cuda',
                         help='generate cuda/hip elemwise special kernel file')
@@ -22,6 +23,9 @@ def main():
 
     if args.type == 'cuda':
         cpp_ext = 'cu'
+    else:
+        assert args.type == 'hip'
+        cpp_ext = 'cpp.hip'
 
     for dtype in DTYPES.keys():
         fname = 'special_{}.{}'.format(dtype, cpp_ext)
diff --git a/dnn/src/common/handle.cpp b/dnn/src/common/handle.cpp
index 506c4314..4aafabb0 100644
--- a/dnn/src/common/handle.cpp
+++ b/dnn/src/common/handle.cpp
@@ -93,6 +93,13 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle,
         MIDOUT_END();
 #endif
     }
+    else if (platform == megcorePlatformROCM) {
+#if MEGDNN_WITH_ROCM
+        return make_rocm_handle(computing_handle);
+#else
+        return nullptr;
+#endif
+    }
     else if (platform == megcorePlatformCambricon) {
 #if MEGDNN_WITH_CAMBRICON
         return make_unique<cambricon::HandleImpl>(computing_handle);
@@ -193,6 +200,14 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle,
 #if MEGDNN_WITH_ATLAS
         CASE(ATLAS, atlas);
 #endif
+#if MEGDNN_WITH_ROCM
+        case HandleType::ROCM: {
+            MIDOUT_BEGIN(HandleOpr, Opr, midout_iv(HandleType::ROCM)) {
+                return create_rocm_operator<Opr>();
+            }
+            MIDOUT_END();
+        }
+#endif
 #if MEGDNN_WITH_CAMBRICON
         CASE(CAMBRICON, cambricon);
 #endif
diff --git a/dnn/src/common/megcore/common/computing_context.cpp b/dnn/src/common/megcore/common/computing_context.cpp
index d5d119e2..b178291b 100644
--- a/dnn/src/common/megcore/common/computing_context.cpp
+++ b/dnn/src/common/megcore/common/computing_context.cpp
@@ -18,6 +18,10 @@
 #endif
 
+#if MEGDNN_WITH_ROCM
+#include "src/rocm/megcore/computing_context.hpp"
+#endif
+
 #if MEGDNN_WITH_CAMBRICON
 #include "src/cambricon/megcore/cambricon_computing_context.hpp"
 #endif
@@ -41,6 +45,10 @@ std::unique_ptr<ComputingContext> ComputingContext::make(
     case megcorePlatformCUDA:
         return make_unique<cuda::CUDAComputingContext>(dev_handle, flags);
 #endif
+#if MEGDNN_WITH_ROCM
+    case megcorePlatformROCM:
+        return make_rocm_computing_context(dev_handle, flags);
+#endif
 #if MEGDNN_WITH_CAMBRICON
     case megcorePlatformCambricon:
         return make_unique<cambricon::CambriconComputingContext>(dev_handle,
diff --git a/dnn/src/common/megcore/common/device_context.cpp b/dnn/src/common/megcore/common/device_context.cpp
index 47a75d07..a77b0be7 100644
--- a/dnn/src/common/megcore/common/device_context.cpp
+++ b/dnn/src/common/megcore/common/device_context.cpp
@@ -15,6 +15,9 @@
 #if MEGDNN_WITH_CUDA
 #include "src/cuda/megcore/cuda_device_context.hpp"
 #endif
+#if MEGDNN_WITH_ROCM
+#include "src/rocm/megcore/device_context.hpp"
+#endif
 #if MEGDNN_WITH_CAMBRICON
 #include "src/cambricon/megcore/cambricon_device_context.hpp"
 #endif
@@ -36,6 +39,10 @@ std::unique_ptr<DeviceContext> DeviceContext::make(megcorePlatform_t platform,
     case megcorePlatformCUDA:
         return make_unique<cuda::CUDADeviceContext>(deviceID, flags);
 #endif
+#if MEGDNN_WITH_ROCM
+    case megcorePlatformROCM:
+        return make_rocm_device_context(deviceID, flags);
+#endif
 #if MEGDNN_WITH_CAMBRICON
     case megcorePlatformCambricon:
         return make_unique<cambricon::CambriconDeviceContext>(deviceID,
diff --git a/dnn/src/rocm/add_update/add_update.cpp.hip b/dnn/src/rocm/add_update/add_update.cpp.hip
new file mode 100644
index 00000000..a000be66
--- /dev/null
+++ b/dnn/src/rocm/add_update/add_update.cpp.hip
@@ -0,0 +1,28 @@
+/**
+ * \file src/rocm/add_update/add_update.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "./add_update.h.hip"
+
+namespace megdnn {
+namespace rocm {
+
+#define cb(_dtype)                                                          \
+    INST_RUN_ELEMWISE(AddUpdateKernOp<DTypeTrait<_dtype>::ctype>,           \
+                      DTypeTrait<_dtype>::ctype, 1);                        \
+    INST_RUN_ELEMWISE(AddUpdateKernOpNonContig<DTypeTrait<_dtype>::ctype>,  \
+                      DTypeTrait<_dtype>::ctype, 2);
+
+MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+
+}  // namespace rocm
+}  // namespace megdnn
+
+
+// vim: ft=cpp syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/add_update/add_update.h.hip b/dnn/src/rocm/add_update/add_update.h.hip
new file mode 100644
index 00000000..33e158ba
--- /dev/null
+++ b/dnn/src/rocm/add_update/add_update.h.hip
@@ -0,0 +1,61 @@
+/**
+ *
+ * \file src/rocm/add_update/add_update.h.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+
+#pragma once
+
+#include "hip_header.h"
+#include "src/rocm/elemwise_helper.h.hip"
+
+#if MEGDNN_CC_HOST
+#include "megdnn/oprs.h"
+#endif
+
+namespace megdnn {
+namespace rocm {
+
+    template <typename ctype>
+    struct AddUpdateKernOp {
+        ctype *dst;
+        ctype alpha, beta, bias;
+
+        __device__ void operator() (uint32_t idx, ctype delta) {
+            dst[idx] = dst[idx] * alpha + delta * beta + bias;
+        }
+
+#if MEGDNN_CC_HOST
+        AddUpdateKernOp(const TensorND &dest, const AddUpdate::Param &param):
+            dst{dest.ptr<ctype>()},
+            alpha(param.alpha), beta(param.beta), bias(param.bias)
+        {
+        }
+#endif
+    };
+
+    template <typename ctype>
+    struct AddUpdateKernOpNonContig {
+        ctype alpha, beta, bias;
+
+        __device__ void operator() (uint32_t /*idx*/, ctype &dst, ctype delta) {
+            dst = dst * alpha + delta * beta + bias;
+        }
+
+#if MEGDNN_CC_HOST
+        AddUpdateKernOpNonContig(const AddUpdate::Param &param):
+            alpha(param.alpha), beta(param.beta), bias(param.bias)
+        {
+        }
+#endif
+    };
+
+}
+}
+
+// vim: ft=cpp syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/add_update/opr_impl.cpp b/dnn/src/rocm/add_update/opr_impl.cpp
new file mode 100644
index 00000000..36230ead
--- /dev/null
+++ b/dnn/src/rocm/add_update/opr_impl.cpp
@@ -0,0 +1,67 @@
+/**
+ * \file dnn/src/rocm/add_update/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "./opr_impl.h"
+#include "src/rocm/add_update/add_update.h.hip"
+
+#include "src/common/utils.h"
+
+using namespace megdnn;
+using namespace rocm;
+
+void AddUpdateForwardImpl::exec(_megdnn_tensor_inout dest,
+                                _megdnn_tensor_in delta) {
+    check_exec(dest.layout, delta.layout);
+    if (!dest.layout.is_contiguous()) {
+        return exec_noncontig(dest, delta);
+    }
+    ElemwiseOpParamN<1> param;
+    param[0] = delta;
+    param[0].layout = param[0].layout.broadcast(dest.layout);
+    param.init_from_given_tensor();
+    auto stream = hip_stream(handle());
+    switch (dest.layout.dtype.enumv()) {
+#define cb(_dt)                                                 \
+    case DTypeTrait<_dt>::enumv: {                              \
+        using ctype = DTypeTrait<_dt>::ctype;                   \
+        return run_elemwise<AddUpdateKernOp<ctype>, ctype, 1>(  \
+                param, stream, {dest, m_param});                \
+    }
+        MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+#undef cb
+
+        default:
+            megdnn_throw(megdnn_mangle("unsupported dtype for AddUpdate"));
+    }
+}
+
+void AddUpdateForwardImpl::exec_noncontig(_megdnn_tensor_inout dest,
+                                          _megdnn_tensor_in delta) {
+    ElemwiseOpParamN<2> param = make_param(dest, delta);
+    auto stream = hip_stream(handle());
+    switch (dest.layout.dtype.enumv()) {
+#define cb(_dt)                                                          \
+    case DTypeTrait<_dt>::enumv: {                                       \
+        using ctype = DTypeTrait<_dt>::ctype;                            \
+        return run_elemwise<AddUpdateKernOpNonContig<ctype>, ctype, 2>(  \
+                param, stream, {m_param});                               \
+    }
+        MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+#undef cb
+
+        default:
+            megdnn_throw(megdnn_mangle("unsupported dtype for AddUpdate"));
+    }
+}
+
+// vim: syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/add_update/opr_impl.h b/dnn/src/rocm/add_update/opr_impl.h
new file mode 100644
index 00000000..28babaa6
--- /dev/null
+++ b/dnn/src/rocm/add_update/opr_impl.h
@@ -0,0 +1,35 @@
+/**
+ * \file dnn/src/rocm/add_update/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "megdnn/oprs.h"
+#include "src/common/add_update_helper.h"
+#include "src/rocm/utils.h"
+
+namespace megdnn {
+namespace rocm {
+
+class AddUpdateForwardImpl final : public AddUpdateForwardHelper {
+    void exec_noncontig(_megdnn_tensor_inout dest, _megdnn_tensor_in delta);
+
+public:
+    using AddUpdateForwardHelper::AddUpdateForwardHelper;
+
+    void exec(_megdnn_tensor_inout dest, _megdnn_tensor_in delta) override;
+
+    bool is_thread_safe() const override { return true; }
+};
+
+}  // namespace rocm
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/argmxx/argmxx.cpp.hip b/dnn/src/rocm/argmxx/argmxx.cpp.hip
new file mode 100644
index 00000000..13fd678d
--- /dev/null
+++ b/dnn/src/rocm/argmxx/argmxx.cpp.hip
@@ -0,0 +1,26 @@
+/**
+ * \file src/rocm/argmxx/argmxx.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "hip_header.h"
+#include "src/common/argmxx_helper.h"
+
+#include "src/rocm/reduce_helper.h.hip"
+#include "megdnn/dtype.h"
+
+namespace megdnn {
+namespace rocm {
+
+#define INST(_dt)                                                             \
+    INST_REDUCE(argmxx::ArgmxxOp<DTypeTrait<_dt>::ctype MEGDNN_COMMA false>,  \
+                false);                                                       \
+    INST_REDUCE(argmxx::ArgmxxOp<DTypeTrait<_dt>::ctype MEGDNN_COMMA true>,   \
+                false);
+
+    MEGDNN_FOREACH_COMPUTING_DTYPE(INST)
+
+}  // namespace rocm
+}  // namespace megdnn
diff --git a/dnn/src/rocm/argmxx/opr_impl.cpp b/dnn/src/rocm/argmxx/opr_impl.cpp
new file mode 100644
index 00000000..b61ac2b3
--- /dev/null
+++ b/dnn/src/rocm/argmxx/opr_impl.cpp
@@ -0,0 +1,129 @@
+/**
+ * \file dnn/src/rocm/argmxx/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "src/rocm/argmxx/opr_impl.h"
+
+#include "src/rocm/utils.h"
+#include "src/common/reduce_helper.h"
+#include "src/common/argmxx_helper.h"
+#include "src/rocm/reduce_helper.h.hip"
+
+namespace {
+
+using namespace megdnn;
+using namespace rocm;
+using namespace argmxx;
+
+template <typename T, bool is_max>
+size_t get_workspace_in_bytes_impl(const TensorLayout &src,
+        const TensorLayout & /* dst */,
+        size_t axis)
+{
+    size_t A, B, C;
+    reduce::get_ABC(src, A, B, C, axis);
+    return get_reduce_workspace_in_bytes<argmxx::ArgmxxOp<T, is_max>>(
+            A, B, C);
+}
+
+template <typename T, bool is_max>
+void exec_impl(const T *src, int *dst, void *workspace,
+        size_t A, size_t B, size_t C,
+        hipStream_t stream)
+{
+    argmxx::ArgmxxOp<T, is_max> opr(const_cast<T*>(src), dst, A, B, C);
+    run_reduce<argmxx::ArgmxxOp<T, is_max>, false>(
+            (typename argmxx::ArgmxxOp<T, is_max>::wtype *)workspace,
+            A, B, C,
+            stream, opr);
+    after_kernel_launch();
+}
+
+}  // anonymous namespace
+
+namespace megdnn {
+namespace rocm {
+
+size_t ArgmaxForwardImpl::get_workspace_in_bytes(const TensorLayout &src,
+        const TensorLayout &dst)
+{
+#define cb(dt) \
+    if (src.dtype == dt()) { \
+        using ctype = typename DTypeTrait<dt>::ctype; \
+        return get_workspace_in_bytes_impl<ctype, true>(src, dst, \
+                param().axis); \
+    }
+    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
+#undef cb
+    megdnn_assert_internal(false);
+}
+
+void ArgmaxForwardImpl::exec(_megdnn_tensor_in src,
+        _megdnn_tensor_out dst,
+        _megdnn_workspace workspace)
+{
+    check_exec(src.layout, dst.layout, workspace.size);
+    size_t A, B, C;
+    reduce::get_ABC(src.layout, A, B, C, param().axis);
+    auto stream = hip_stream(handle());
+#define cb(dt) \
+    if (src.layout.dtype.enumv() == DTypeTrait<dt>::enumv) { \
+        using ctype = typename DTypeTrait<dt>::ctype; \
+        exec_impl<ctype, true>(src.ptr<ctype>(), \
+                dst.ptr<dt_int32>(), \
+                workspace.raw_ptr, \
+                A, B, C, stream); \
+        return; \
+    }
+    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
+#undef cb
+    megdnn_throw(megdnn_mangle(ssprintf("Unsupported DType: %s",
+                    src.layout.dtype.name())));
+}
+
+size_t ArgminForwardImpl::get_workspace_in_bytes(const TensorLayout &src,
+        const TensorLayout &dst)
+{
+#define cb(dt) \
+    if (src.dtype == dt()) { \
+        using ctype = typename DTypeTrait<dt>::ctype; \
+        return get_workspace_in_bytes_impl<ctype, false>(src, dst, \
+                param().axis); \
+    }
+    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
+#undef cb
+    megdnn_assert_internal(false);
+}
+
+void ArgminForwardImpl::exec(_megdnn_tensor_in src,
+        _megdnn_tensor_out dst,
+        _megdnn_workspace workspace)
+{
+    check_exec(src.layout, dst.layout, workspace.size);
+    size_t A, B, C;
+    reduce::get_ABC(src.layout, A, B, C, param().axis);
+    auto stream = hip_stream(handle());
+#define cb(dt) \
+    if (src.layout.dtype.enumv() == DTypeTrait<dt>::enumv) { \
+        using ctype = typename DTypeTrait<dt>::ctype; \
+        exec_impl<ctype, false>(src.ptr<ctype>(), \
+                dst.ptr<dt_int32>(), \
+                workspace.raw_ptr, \
+                A, B, C, stream); \
+        return; \
+    }
+    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
+#undef cb
+    megdnn_throw(megdnn_mangle(ssprintf("Unsupported DType: %s",
+                    src.layout.dtype.name())));
+}
+
+}  // namespace rocm
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/argmxx/opr_impl.h b/dnn/src/rocm/argmxx/opr_impl.h
new file mode 100644
index 00000000..54cf198f
--- /dev/null
+++ b/dnn/src/rocm/argmxx/opr_impl.h
@@ -0,0 +1,41 @@
+/**
+ * \file dnn/src/rocm/argmxx/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+#include "megdnn/oprs.h"
+
+namespace megdnn {
+namespace rocm {
+
+class ArgmaxForwardImpl final: public ArgmaxForward {
+    public:
+        using ArgmaxForward::ArgmaxForward;
+        void exec(_megdnn_tensor_in src,
+                _megdnn_tensor_out dst,
+                _megdnn_workspace workspace) override;
+        size_t get_workspace_in_bytes(const TensorLayout &src,
+                const TensorLayout &dst) override;
+};
+
+class ArgminForwardImpl: public ArgminForward {
+    public:
+        using ArgminForward::ArgminForward;
+        void exec(_megdnn_tensor_in src,
+                _megdnn_tensor_out dst,
+                _megdnn_workspace) override;
+        size_t get_workspace_in_bytes(const TensorLayout &src,
+                const TensorLayout &dst) override;
+};
+
+}  // namespace rocm
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
+
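`ArgmxxOp` (defined in `src/common/argmxx_helper.h` and shared with the CUDA backend) expresses argmax/argmin as an ordinary reduction whose accumulator carries the running key together with its index, which is why the implementation above needs nothing beyond `run_reduce` and a workspace. Schematically (a simplified sketch, not the actual helper):

```cpp
// sketch of the accumulator idea behind argmxx::ArgmxxOp<T, is_max>
template <typename T, bool is_max>
struct ArgmxxSketch {
    struct wtype {  // reduction state: candidate key and its position
        T key;
        int idx;
    };
    static wtype apply(wtype a, wtype b) {  // associative combinator
        bool pick_a = is_max ? a.key >= b.key : a.key <= b.key;
        return pick_a ? a : b;
    }
};
```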
diff --git a/dnn/src/rocm/batched_matrix_mul/opr_impl.cpp b/dnn/src/rocm/batched_matrix_mul/opr_impl.cpp
new file mode 100644
index 00000000..3237e8ea
--- /dev/null
+++ b/dnn/src/rocm/batched_matrix_mul/opr_impl.cpp
@@ -0,0 +1,119 @@
+/**
+ * \file dnn/src/rocm/batched_matrix_mul/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "./opr_impl.h"
+
+#include "src/common/utils.cuh"
+#include "src/rocm/handle.h"
+#include "src/rocm/utils.h"
+
+namespace megdnn {
+namespace rocm {
+
+void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B,
+                                       _megdnn_tensor_out C,
+                                       _megdnn_workspace workspace) {
+    check_exec(A.layout, B.layout, C.layout, workspace.size);
+    auto dtype = A.layout.dtype;
+    megdnn_assert(dtype.category() == DTypeCategory::FLOAT &&
+                  param().format == param::MatrixMul::Format::DEFAULT);
+
+    if (dtype == dtype::Float32() ||
+        MEGDNN_FLOAT16_SELECT(dtype == dtype::Float16(), false)) {
+        auto batch = A.layout.shape[0];
+        auto m = C.layout.shape[1], n = C.layout.shape[2];
+        auto k = A.layout.shape[param().transposeA ? 1 : 2];
+        auto handle = concrete_handle(this->handle());
+        auto rocblas_handle_ = handle->get_rocblas_handle();
+
+        auto io32_c32 = [&]() {
+            auto zero = handle->zero_device();
+            auto one = handle->one_device();
+            rocblas_check(rocblas_sgemm_strided_batched(
+                    rocblas_handle_,
+                    param().transposeB ? rocblas_operation_transpose
+                                       : rocblas_operation_none,
+                    param().transposeA ? rocblas_operation_transpose
+                                       : rocblas_operation_none,
+                    n, m, k, one, B.ptr<dt_float32>(),
+                    (rocblas_int)(B.layout.stride[1]),
+                    (rocblas_int)(B.layout.stride[0]), A.ptr<dt_float32>(),
+                    (rocblas_int)(A.layout.stride[1]),
+                    (rocblas_int)(A.layout.stride[0]), zero,
+                    C.ptr<dt_float32>(), (rocblas_int)(C.layout.stride[1]),
+                    (rocblas_int)(C.layout.stride[0]), (rocblas_int)(batch)));
+        };
+
+#if !MEGDNN_DISABLE_FLOAT16
+        auto io16_c32 = [&]() {
+            auto zero = handle->zero_device();
+            auto one = handle->one_device();
+            int32_t solution_index = 0;
+            uint32_t flags = 1;
+            size_t ws_size = 0;
+
+            rocblas_check(rocblas_gemm_strided_batched_ex(
+                    rocblas_handle_,
+                    param().transposeB ? rocblas_operation_transpose
+                                       : rocblas_operation_none,
+                    param().transposeA ? rocblas_operation_transpose
+                                       : rocblas_operation_none,
+                    n, m, k, one, B.raw_ptr, rocblas_datatype_f16_r,
+                    B.layout.stride[1], B.layout.stride[0], A.raw_ptr,
+                    rocblas_datatype_f16_r, A.layout.stride[1],
+                    A.layout.stride[0], zero, C.raw_ptr, rocblas_datatype_f16_r,
+                    C.layout.stride[1], C.layout.stride[0], C.raw_ptr,
+                    rocblas_datatype_f16_r, C.layout.stride[1],
+                    C.layout.stride[0], batch, rocblas_datatype_f32_r,
+                    rocblas_gemm_algo_standard, solution_index, flags, &ws_size,
+                    nullptr));
+        };
+
+        auto io16_c16 = [&]() {
+            auto zero_half = handle->zero_device_h();
+            auto one_half = handle->one_device_h();
+            rocblas_check(rocblas_hgemm_strided_batched(
+                    rocblas_handle_,
+                    param().transposeB ? rocblas_operation_transpose
+                                       : rocblas_operation_none,
+                    param().transposeA ? rocblas_operation_transpose
+                                       : rocblas_operation_none,
+                    n, m, k, reinterpret_cast<const rocblas_half*>(one_half),
+                    static_cast<const rocblas_half*>(B.raw_ptr),
+                    B.layout.stride[1], B.layout.stride[0],
+                    static_cast<const rocblas_half*>(A.raw_ptr),
+                    A.layout.stride[1], A.layout.stride[0],
+                    reinterpret_cast<const rocblas_half*>(zero_half),
+                    static_cast<rocblas_half*>(C.raw_ptr),
+                    C.layout.stride[1], C.layout.stride[0], batch));
+
+        };
+#endif
+
+        if (dtype == dtype::Float32()) {
+            io32_c32();
+        }
+#if !MEGDNN_DISABLE_FLOAT16
+        else {
+            if (param().compute_mode == Param::ComputeMode::FLOAT32) {
+                io16_c32();
+            } else {
+                io16_c16();
+            }
+        }
+#endif
+    }
+}
+
+}  // namespace rocm
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
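Note the operand order in the calls above: rocBLAS, like cuBLAS, assumes column-major matrices while megdnn tensors are row-major, so the implementation computes Cᵀ = Bᵀ·Aᵀ by passing B before A and swapping m and n. A standalone illustration of the same trick with a plain (non-batched) SGEMM:

```cpp
// row-major C[m][n] = A[m][k] * B[k][n] on a column-major BLAS:
// reinterpret the row-major buffers as transposed column-major matrices
// and compute C^T = B^T * A^T, i.e. pass (n, m, k) and B before A
float alpha = 1.f, beta = 0.f;
rocblas_sgemm(handle, rocblas_operation_none, rocblas_operation_none,
              n, m, k, &alpha,
              B, n,   // B^T: n-by-k, leading dimension n
              A, k,   // A^T: k-by-m, leading dimension k
              &beta,
              C, n);  // C^T: n-by-m, leading dimension n
```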
diff --git a/dnn/src/rocm/batched_matrix_mul/opr_impl.h b/dnn/src/rocm/batched_matrix_mul/opr_impl.h
new file mode 100644
index 00000000..a4dfbc14
--- /dev/null
+++ b/dnn/src/rocm/batched_matrix_mul/opr_impl.h
@@ -0,0 +1,39 @@
+/**
+ * \file dnn/src/rocm/batched_matrix_mul/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+#include "megdnn/oprs.h"
+
+namespace megdnn {
+namespace rocm {
+
+class BatchedMatrixMulForwardImpl : public BatchedMatrixMulForward {
+public:
+    using BatchedMatrixMulForward::BatchedMatrixMulForward;
+    BatchedMatrixMulForwardImpl(Handle* handle)
+            : BatchedMatrixMul(handle),
+              m_opr(handle->create_operator<MatrixMulForward>()) {}
+    void exec(_megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C,
+              _megdnn_workspace workspace) override;
+    size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
+                                  const TensorLayout&) override {
+        return 0;
+    }
+
+    bool is_thread_safe() const override { return true; }
+
+private:
+    std::unique_ptr<MatrixMulForward> m_opr;
+};
+
+}  // namespace rocm
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/checksum/kern.cpp.hip b/dnn/src/rocm/checksum/kern.cpp.hip
new file mode 100644
index 00000000..88740edd
--- /dev/null
+++ b/dnn/src/rocm/checksum/kern.cpp.hip
@@ -0,0 +1,81 @@
+/**
+ * \file src/rocm/checksum/kern.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "hip_header.h"
+#include "./kern.h.hip"
+
+#include "src/rocm/reduce_helper.h.hip"
+
+namespace megdnn {
+namespace rocm {
+namespace checksum {
+
+namespace {
+struct ChecksumOp {
+    typedef uint32_t wtype;
+    const uint32_t* src;
+    uint32_t* dst;
+
+    static const uint32_t INIT = 0;
+
+    __host__ __device__ void write(uint32_t idx, uint32_t val) {
+        dst[idx] = val;
+    }
+
+    __host__ __device__ static uint32_t apply(uint32_t a, uint32_t b) {
+        return a + b;
+    }
+};
+
+struct NonFourAlignedChecksumOp : ChecksumOp {
+    __host__ __device__ uint32_t read(uint32_t idx) {
+        uint8_t* data = (uint8_t*)(src + idx);
+        return (data[0] | ((uint32_t)data[1] << 8) | ((uint32_t)data[2] << 16) |
+                ((uint32_t)data[3] << 24)) *
+               (idx + 1);
+    }
+};
+
+struct FourAlignedChecksumOp : ChecksumOp {
+    __host__ __device__ uint32_t read(uint32_t idx) {
+        return src[idx] * (idx + 1);
+    }
+};
+
+}  // anonymous namespace
+
+void calc(uint32_t* dest, const uint32_t* buf, uint32_t* workspace,
+          size_t nr_elem, hipStream_t stream) {
+    if (!nr_elem)
+        return;
+    if (reinterpret_cast<uintptr_t>(buf) & 0b11) {
+        NonFourAlignedChecksumOp op;
+        op.src = buf;
+        op.dst = dest;
+        run_reduce<NonFourAlignedChecksumOp, false>(workspace, 1, nr_elem, 1,
+                                                    stream, op);
+    } else {
+        FourAlignedChecksumOp op;
+        op.src = buf;
+        op.dst = dest;
+        run_reduce<FourAlignedChecksumOp, false>(workspace, 1, nr_elem, 1,
+                                                 stream, op);
+    }
+}
+
+size_t get_workspace_in_bytes(size_t nr_elem) {
+    return get_reduce_workspace_in_bytes<ChecksumOp>(1, nr_elem, 1);
+}
+
+}  // namespace checksum
+}  // namespace rocm
+}  // namespace megdnn
+
+
+// vim: ft=cpp syntax=cpp.doxygen
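The checksum computed above is a position-weighted sum with uint32 wraparound: word i (assembled byte-by-byte in little-endian order when the buffer is not 4-byte aligned) contributes `word * (i + 1)`. A host-side reference of the same definition, handy for validating the kernel, could be:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// host reference for checksum::calc: sum of word[i] * (i + 1) modulo 2^32;
// memcpy gives the same unaligned-safe little-endian load as
// NonFourAlignedChecksumOp (assuming a little-endian host)
uint32_t checksum_ref(const uint8_t* data, size_t nr_elem) {
    uint32_t sum = 0;
    for (size_t i = 0; i < nr_elem; ++i) {
        uint32_t word;
        std::memcpy(&word, data + i * 4, sizeof(word));
        sum += word * static_cast<uint32_t>(i + 1);
    }
    return sum;
}
```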
diff --git a/dnn/src/rocm/checksum/kern.h.hip b/dnn/src/rocm/checksum/kern.h.hip
new file mode 100644
index 00000000..c21b976e
--- /dev/null
+++ b/dnn/src/rocm/checksum/kern.h.hip
@@ -0,0 +1,28 @@
+/**
+ * \file src/rocm/checksum/kern.h.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+
+#pragma once
+
+#include "hip_header.h"
+
+namespace megdnn {
+namespace rocm {
+namespace checksum {
+
+void calc(uint32_t* dest, const uint32_t* buf, uint32_t* workspace,
+          size_t nr_elem, hipStream_t stream);
+
+size_t get_workspace_in_bytes(size_t nr_elem);
+
+}  // namespace checksum
+}  // namespace rocm
+}  // namespace megdnn
+
+// vim: ft=cpp syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/checksum/opr_impl.cpp b/dnn/src/rocm/checksum/opr_impl.cpp
new file mode 100644
index 00000000..d3be35c4
--- /dev/null
+++ b/dnn/src/rocm/checksum/opr_impl.cpp
@@ -0,0 +1,68 @@
+/**
+ * \file dnn/src/rocm/checksum/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "./opr_impl.h"
+#include "src/rocm/checksum/kern.h.hip"
+
+#include "src/common/utils.h"
+#include "src/rocm/reduce_helper.h.hip"
+
+#include <algorithm>
+
+using namespace megdnn;
+using namespace rocm;
+
+namespace {
+
+WorkspaceBundle get_wbundle(const TensorLayout& data) {
+    size_t size_all = data.shape[0], size_ints = size_all / sizeof(uint32_t);
+    size_t part1 = checksum::get_workspace_in_bytes(size_ints);
+    size_t part2 = sizeof(ChecksumForward::Result::checksum);
+    return {nullptr, {part1, part2}};
+}
+
+}  // anonymous namespace
+
+size_t ChecksumForwardImpl::get_workspace_in_bytes(const TensorLayout& data) {
+    auto wbundle = get_wbundle(data);
+    return wbundle.total_size_in_bytes();
+}
+
+ChecksumForward::Result ChecksumForwardImpl::exec(_megdnn_tensor_in data,
+                                                  _megdnn_workspace workspace) {
+    auto wbundle = get_wbundle(data.layout);
+    wbundle.set(workspace.raw_ptr);
+    Result result;
+    memset(&result, 0, sizeof(result));
+    check_exec(data.layout, workspace.size);
+    auto stream = hip_stream(handle());
+
+    auto ptr = static_cast<uint8_t*>(data.raw_ptr);
+    size_t size_all = data.layout.shape[0],
+           size_ints = size_all / sizeof(uint32_t);
+    auto last_val_size = std::min<size_t>(size_all, 4);
+    hip_check(hipMemcpyAsync(&result.last_val, ptr + size_all - last_val_size,
+                             last_val_size, hipMemcpyDeviceToHost, stream));
+    if (size_ints) {
+        checksum::calc(static_cast<uint32_t*>(wbundle.get(1)),
+                       static_cast<uint32_t*>(data.raw_ptr),
+                       static_cast<uint32_t*>(wbundle.get(0)), size_ints,
+                       stream);
+        hip_check(hipMemcpyAsync(&result.checksum, wbundle.get(1),
+                                 sizeof(result.checksum), hipMemcpyDeviceToHost,
+                                 stream));
+    }
+    hip_check(hipStreamSynchronize(stream));
+    return result;
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/checksum/opr_impl.h b/dnn/src/rocm/checksum/opr_impl.h
new file mode 100644
index 00000000..a76915fe
--- /dev/null
+++ b/dnn/src/rocm/checksum/opr_impl.h
@@ -0,0 +1,35 @@
+/**
+ * \file dnn/src/rocm/checksum/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "megdnn/oprs.h"
+#include "src/rocm/utils.h"
+
+namespace megdnn {
+namespace rocm {
+
+class ChecksumForwardImpl final : public ChecksumForward {
+public:
+    using ChecksumForward::ChecksumForward;
+
+    size_t get_workspace_in_bytes(const TensorLayout&) override;
+
+    bool is_thread_safe() const override { return true; }
+
+    Result exec(_megdnn_tensor_in data, _megdnn_workspace workspace) override;
+};
+
+}  // namespace rocm
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/convolution/backward_data/algo.cpp b/dnn/src/rocm/convolution/backward_data/algo.cpp
new file mode 100644
index 00000000..8a14527e
--- /dev/null
+++ b/dnn/src/rocm/convolution/backward_data/algo.cpp
@@ -0,0 +1,95 @@
+/**
+ * \file dnn/src/rocm/convolution/backward_data/algo.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "./algo.h"
+#include "src/rocm/utils.h"
+
+using namespace megdnn;
+using namespace rocm;
+
+ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
+    all_algos.push_back(&miopen);
+    all_algos.push_back(&matmul);
+    all_algos.push_back(&chanwise);
+    non_miopen_algos.push_back(&matmul);
+    non_miopen_algos.push_back(&chanwise);
+    miopen_algos.push_back(&miopen);
+}
+
+ConvolutionBackwardDataImpl::AlgoPack ConvolutionBackwardDataImpl::sm_algo_pack;
+
+ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs(
+        ConvolutionBackwardDataImpl* o, const TensorLayout& filter,
+        const TensorLayout& diff, const TensorLayout& grad)
+        : SizeArgs(o, o->check_layout_fwd(grad, filter, diff), diff, grad) {}
+
+ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs(
+        ConvolutionBackwardDataImpl* o, const CanonizedFilterMeta& filter,
+        const TensorLayout& diff, const TensorLayout& grad)
+        : handle{concrete_handle(o->handle())},
+          filter_meta{filter},
+          diff_layout{&diff},
+          grad_layout{&grad},
+          opr{o} {}
+
+ConvolutionBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs(
+        ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
+        _megdnn_tensor_in diff, _megdnn_tensor_out grad,
+        _megdnn_workspace workspace)
+        : SizeArgs(opr, filter.layout, diff.layout, grad.layout),
+          filter_tensor{&filter},
+          diff_tensor{&diff},
+          grad_tensor{&grad},
+          workspace{workspace} {}
+
+std::string ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::to_string() const {
+    auto&& fm = filter_meta;
+    MEGDNN_MARK_USED_VAR(fm);
+    return megdnn_mangle(ssprintf(
+            "filter=%u{%u,%u,%u,%u}, diff=%s, grad=%s, "
+            "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s",
+            fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1],
+            diff_layout->to_string().c_str(), grad_layout->to_string().c_str(),
+            fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1],
+            fm.dilation[0], fm.dilation[1], !fm.should_flip,
+            diff_layout->dtype.name(), grad_layout->dtype.name()));
+}
+
+convolution::MIOpenCacheKey
+ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::to_miopen_algo_cache_key()
+        const {
+    convolution::MIOpenCacheKey res;
+    res.miopen_handle = reinterpret_cast<uintptr_t>(handle->miopen_handle());
+    res.batch = grad_layout->operator[](0);
+    res.IC = grad_layout->operator[](1);
+    res.IH = grad_layout->operator[](2);
+    res.IW = grad_layout->operator[](3);
+    res.OH = diff_layout->operator[](2);
+    res.OW = diff_layout->operator[](3);
+    res.FH = filter_meta.spatial[0];
+    res.FW = filter_meta.spatial[1];
+    res.SH = filter_meta.stride[0];
+    res.SW = filter_meta.stride[1];
+    res.PH = filter_meta.padding[0];
+    res.PW = filter_meta.padding[1];
+    res.DH = filter_meta.dilation[0];
+    res.DW = filter_meta.dilation[1];
+    res.group = filter_meta.group;
+    res.ocpg = filter_meta.ocpg;
+    res.icpg = filter_meta.icpg;
+    res.dtype_enum = static_cast<uint32_t>(diff_layout->dtype.enumv());
+    res.exhaustive_search =
+            static_cast<bool>(handle->enable_miopen_algo_search());
+    res.OC = res.group * res.ocpg;
+    return res;
+}
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/convolution/backward_data/algo.h b/dnn/src/rocm/convolution/backward_data/algo.h
new file mode 100644
index 00000000..7efd76d0
--- /dev/null
+++ b/dnn/src/rocm/convolution/backward_data/algo.h
@@ -0,0 +1,155 @@
+/**
+ * \file dnn/src/rocm/convolution/backward_data/algo.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "src/rocm/convolution/helper.h"
+
+namespace megdnn {
+namespace rocm {
+
+/*!
+ * \brief base class for convolution algos
+ *
+ */
+class ConvolutionBackwardDataImpl::AlgoBase : public Algorithm {
+protected:
+    ~AlgoBase() = default;
+
+public:
+    struct SizeArgs {
+        HandleImpl* handle;
+        CanonizedFilterMeta filter_meta;
+        const TensorLayout *diff_layout, *grad_layout;
+        ConvolutionBackwardDataImpl* opr;
+
+        std::string to_string() const;
+        convolution::MIOpenCacheKey to_miopen_algo_cache_key() const;
+        void init_desc(convolution::MIOpenBwdDataDescs& desc) const {
+            desc.set(filter_meta, *diff_layout, *grad_layout, opr->param());
+        }
+        SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
+                 const TensorLayout& diff, const TensorLayout& grad);
+        SizeArgs(ConvolutionBackwardDataImpl* opr,
+                 const CanonizedFilterMeta& filter, const TensorLayout& diff,
+                 const TensorLayout& grad);
+
+        convolution::ForwardSizeArgs as_fwd_args() const {
+            return {handle, grad_layout, filter_meta, diff_layout};
+        }
+    };
+    struct ExecArgs : public SizeArgs {
+        const TensorND *filter_tensor, *diff_tensor, *grad_tensor;
+        Workspace workspace;
+
+        ExecArgs(ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
+                 _megdnn_tensor_in diff, _megdnn_tensor_out grad,
+                 _megdnn_workspace workspace);
+    };
+    virtual bool is_available(const SizeArgs& args) const = 0;
+    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
+    virtual void exec(const ExecArgs& args) const = 0;
+
+    bool is_available_wk(const SizeArgs& args, size_t limit) {
+        return is_available(args) && get_workspace_in_bytes(args) <= limit;
+    }
+    bool is_available_reproducible(
+            const SizeArgs& args, bool reproducible = true,
+            size_t limit = std::numeric_limits<size_t>::max()) {
+        return (!reproducible || is_reproducible()) &&
+               is_available_wk(args, limit);
+    }
+
+    AlgoBase& check_workspace(const SizeArgs& args,
+                              const Workspace& workspace) {
+        auto req = get_workspace_in_bytes(args);
+        megdnn_assert(req <= workspace.size,
+                      "conv bwd data algo %s: "
+                      "required workspace %zu bytes, got %zu",
+                      name(), req, workspace.size);
+        return *this;
+    }
+
+    virtual bool is_miopen() const { return false; }
+};
+
+class ConvolutionBackwardDataImpl::AlgoMIOpen final : public AlgoBase {
+    bool m_is_reproducible;
+    const char* m_name;
+
+    miopenConvBwdDataAlgorithm_t find_best_algo(const ExecArgs& args);
+
+public:
+    AlgoMIOpen() = delete;
+    AlgoMIOpen(bool is_reproducible) : m_is_reproducible(is_reproducible) {}
+
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
+    void exec(const ExecArgs& args) const override;
+
+    bool is_reproducible() const override { return m_is_reproducible; }
+
+    const char* name() const override {
+        return "MIOpenConvolutionBackwardData";
+    }
+
+    bool is_miopen() const override { return true; }
+    static convolution::MIOpenCache<SizeArgs, miopenConvBwdDataAlgorithm_t>
+            sm_miopen_algo_cache;
+    static convolution::MIOpenCache<SizeArgs, size_t> sm_miopen_ws_cache;
+};
+
+class ConvolutionBackwardDataImpl::AlgoMatmul final : public AlgoBase {
+    template <typename T>
+    static void exec_internal(const ExecArgs& args);
+
+public:
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
+    void exec(const ExecArgs& args) const override;
+
+    const char* name() const override { return "MATMUL"; }
+    bool is_reproducible() const override { return true; }
+};
+
+class ConvolutionBackwardDataImpl::AlgoChanwise final : public AlgoBase {
+public:
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
+    void exec(const ExecArgs& args) const override;
+
+    const char* name() const override { return "CHANNEL_WISE"; }
+    bool is_reproducible() const override { return true; }
+};
+
+class ConvolutionBackwardDataImpl::AlgoPack {
+    // defined in miopen.cpp
+    void fill_miopen_algos();
+
+    AlgoPack(const AlgoPack&) = delete;
+    AlgoPack& operator=(const AlgoPack&) = delete;
+
+public:
+    AlgoPack();
+
+    AlgoMIOpen miopen{true};
+    AlgoMatmul matmul;
+    AlgoChanwise chanwise;
+
+    std::vector<AlgoBase*>
+            //! all algorithms
+            all_algos, miopen_algos, non_miopen_algos;
+};
+
+}  // namespace rocm
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
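This AlgoBase interface is deliberately the same shape as the CUDA backend's, so algorithm-selection logic can be shared. A caller would pick the first usable algorithm roughly as follows (a sketch; it assumes `sm_algo_pack` is reachable from the call site):

```cpp
// sketch: first backward-data algorithm that fits a workspace limit,
// optionally restricted to reproducible algorithms
using Impl = megdnn::rocm::ConvolutionBackwardDataImpl;

Impl::AlgoBase* choose(const Impl::AlgoBase::SizeArgs& args,
                       size_t workspace_limit, bool reproducible) {
    for (auto algo : Impl::sm_algo_pack.all_algos) {
        if (algo->is_available_reproducible(args, reproducible,
                                            workspace_limit))
            return algo;
    }
    return nullptr;  // no applicable algorithm
}
```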
diff --git a/dnn/src/rocm/convolution/backward_data/chanwise.cpp b/dnn/src/rocm/convolution/backward_data/chanwise.cpp
new file mode 100644
index 00000000..1897504a
--- /dev/null
+++ b/dnn/src/rocm/convolution/backward_data/chanwise.cpp
@@ -0,0 +1,56 @@
+/**
+ * \file dnn/src/rocm/convolution/backward_data/chanwise.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include "./algo.h"
+#include "src/rocm/utils.h"
+#include "src/rocm/convolution/chanwise/kern.h.hip"
+
+using namespace megdnn;
+using namespace rocm;
+using namespace convolution;
+
+bool ConvolutionBackwardDataImpl::AlgoChanwise::is_available(
+        const SizeArgs& args) const {
+    auto&& fm = args.filter_meta;
+    return args.filter_meta.format == Param::Format::NCHW &&
+           args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
+           args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
+           fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 &&
+           fm.dilation[1] == 1 && !fm.should_flip;
+}
+
+size_t ConvolutionBackwardDataImpl::AlgoChanwise::get_workspace_in_bytes(
+        const SizeArgs&) const {
+    return 0;
+}
+
+void ConvolutionBackwardDataImpl::AlgoChanwise::exec(
+        const ExecArgs& args) const {
+    auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args());
+    auto stream = hip_stream(args.handle);
+    switch (args.diff_layout->dtype.enumv()) {
+#define cb(_dt)                                                          \
+    case DTypeTrait<_dt>::enumv: {                                       \
+        using ctype = DTypeTrait<_dt>::ctype;                            \
+        return chanwise::run_bwd_data(args.grad_tensor->ptr<ctype>(),    \
+                                      args.diff_tensor->ptr<ctype>(),    \
+                                      args.filter_tensor->ptr<ctype>(),  \
+                                      kparam, stream);                   \
+    }
+        MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
+#undef cb
+        default:
+            break;
+    }
+    megdnn_assert_internal(0);
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/convolution/backward_data/matmul.cpp b/dnn/src/rocm/convolution/backward_data/matmul.cpp
new file mode 100644
index 00000000..e7c085ab
--- /dev/null
+++ b/dnn/src/rocm/convolution/backward_data/matmul.cpp
@@ -0,0 +1,94 @@
+/**
+ * \file dnn/src/rocm/convolution/backward_data/matmul.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include "./algo.h"
+#include "src/rocm/utils.h"
+#include "src/rocm/convolution/helper.h"
+#include "src/rocm/convolution/im2col.h.hip"
+
+using namespace megdnn;
+using namespace rocm;
+
+bool ConvolutionBackwardDataImpl::AlgoMatmul::is_available(
+        const SizeArgs& args) const {
+    auto&& fm = args.filter_meta;
+    return args.filter_meta.format == Param::Format::NCHW &&
+           args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
+           args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
+           fm.group == 1 && fm.spatial_ndim == 2;
+}
+
+size_t ConvolutionBackwardDataImpl::AlgoMatmul::get_workspace_in_bytes(
+        const SizeArgs& args) const {
+    return matmul_get_workspace_bundle(args.as_fwd_args())
+            .total_size_in_bytes();
+}
+
+void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs& args) const {
+#define cb(DType)                                          \
+    if (args.diff_layout->dtype == DType()) {              \
+        using ctype = typename DTypeTrait<DType>::ctype;   \
+        exec_internal<ctype>(args);                        \
+        return;                                            \
+    }
+    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
+#undef cb
+
+    megdnn_assert_internal(0);
+}
+
+template <typename T>
+void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal(
+        const ExecArgs& args) {
+    auto&& fm = args.filter_meta;
+    size_t N = args.grad_layout->shape[0], IC = fm.icpg,
+           IH = args.grad_layout->shape[2], IW = args.grad_layout->shape[3],
+           OC = fm.ocpg, OH = args.diff_layout->shape[2],
+           OW = args.diff_layout->shape[3], FH = fm.spatial[0],
+           FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1],
+           SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0],
+           DW = fm.dilation[1];
+    auto stream = hip_stream(args.handle);
+    auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args());
+    wbundle.set(args.workspace.raw_ptr);
+    T* diff_t = static_cast<T*>(wbundle.get(0));
+    T* col = static_cast<T*>(wbundle.get(1));
+    {
+        // transpose diff
+        TensorLayout froml({N, OC * OH * OW}, typename DTypeTrait<T>::dtype()),
+                tol(froml);
+        froml.stride[0] = args.diff_layout->stride[0];
+        tol.stride[0] = 1;
+        tol.stride[1] = N;
+        TensorND from(args.diff_tensor->ptr<T>(), froml), to(diff_t, tol);
+        args.handle->relayout_opr()->exec(from, to);
+    }
+    {
+        // take gemm grad
+        TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait<T>::dtype()),
+                Bl({IC * FH * FW, OH * OW * N},
+                   typename DTypeTrait<T>::dtype()),
+                Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype());
+        TensorND A(args.filter_tensor->ptr<T>(), Al), B(col, Bl), C(diff_t, Cl);
+        if (fm.should_flip) {
+            convolution::flip_filter(args.as_fwd_args(),
+                                     wbundle.get_workspace(2), A.raw_ptr);
+        }
+        args.handle->matmul_aT_opr()->exec(A, C, B, Workspace());
+    }
+    {
+        convolution::col2im(col, args.grad_tensor->ptr<T>(), N,
+                            args.grad_layout->stride[0], IC, IH, IW, FH, FW,
+                            OH, OW, PH, PW, SH, SW, DH, DW, stream);
+    }
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/convolution/backward_data/miopen.cpp b/dnn/src/rocm/convolution/backward_data/miopen.cpp
new file mode 100644
index 00000000..87873242
--- /dev/null
+++ b/dnn/src/rocm/convolution/backward_data/miopen.cpp
@@ -0,0 +1,108 @@
+/**
+ * \file dnn/src/rocm/convolution/backward_data/miopen.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "./algo.h"
+
+#include "src/rocm/utils.h"
+#include "src/rocm/miopen_wrapper.h"
+#include "src/rocm/convolution/helper.h"
+
+using namespace megdnn;
+using namespace rocm;
+using namespace convolution;
+
+MIOpenCache<ConvolutionBackwardDataImpl::AlgoBase::SizeArgs,
+            miopenConvBwdDataAlgorithm_t>
+        ConvolutionBackwardDataImpl::AlgoMIOpen::sm_miopen_algo_cache;
+MIOpenCache<ConvolutionBackwardDataImpl::AlgoBase::SizeArgs, size_t>
+        ConvolutionBackwardDataImpl::AlgoMIOpen::sm_miopen_ws_cache;
+
+bool ConvolutionBackwardDataImpl::AlgoMIOpen::is_available(
+        const SizeArgs& args) const {
+    MIOpenBwdDataDescs D;
+    if (!is_miopen_supported(args.as_fwd_args()))
+        return false;
+    auto got = sm_miopen_ws_cache.get(args);
+    if (got.first)
+        return true;
+    args.init_desc(D);
+    size_t workspace_size;
+    auto status = miopenConvolutionBackwardDataGetWorkSpaceSize(
+            args.handle->miopen_handle(), D.diff_desc.desc, D.filter_desc.desc,
+            D.conv_desc.desc, D.grad_desc.desc, &workspace_size);
+    if (status == miopenStatusSuccess) {
+        sm_miopen_ws_cache.set(args, workspace_size);
+        return true;
+    }
+    return false;
+}
+
+size_t ConvolutionBackwardDataImpl::AlgoMIOpen::get_workspace_in_bytes(
+        const SizeArgs& args) const {
+    auto got = sm_miopen_ws_cache.get(args);
+    if (got.first)
+        return got.second;
+    MIOpenBwdDataDescs D;
+    args.init_desc(D);
+    size_t workspace_size;
+    auto status = miopenConvolutionBackwardDataGetWorkSpaceSize(
+            args.handle->miopen_handle(), D.diff_desc.desc, D.filter_desc.desc,
+            D.conv_desc.desc, D.grad_desc.desc, &workspace_size);
+    megdnn_assert(status == miopenStatusSuccess,
+                  "conv bwd_data get workspace failed: %s; info: %s",
+                  miopenGetErrorString(status), args.to_string().c_str());
+    sm_miopen_ws_cache.set(args, workspace_size);
+    return workspace_size;
+}
+
+miopenConvBwdDataAlgorithm_t
+ConvolutionBackwardDataImpl::AlgoMIOpen::find_best_algo(const ExecArgs& args) {
+    auto find_algo = sm_miopen_algo_cache.get(args);
+    if (find_algo.first)
+        return find_algo.second;
+    bool exhaustive_search = args.handle->enable_miopen_algo_search();
+    MIOpenBwdDataDescs D;
+    args.init_desc(D);
+    const int req_algo_count = 1;
+    int ret_algo_count;
+    miopenConvAlgoPerf_t algo_perf;
+    miopen_check(miopenFindConvolutionBackwardDataAlgorithm(
+            args.handle->miopen_handle(), D.diff_desc.desc,
+            args.diff_tensor->raw_ptr, D.filter_desc.desc,
+            args.filter_tensor->raw_ptr, D.conv_desc.desc, D.grad_desc.desc,
+            args.grad_tensor->raw_ptr, req_algo_count, &ret_algo_count,
+            &algo_perf, args.workspace.raw_ptr, args.workspace.size,
+            exhaustive_search));
+    sm_miopen_algo_cache.set(args, algo_perf.bwd_data_algo);
+    return algo_perf.bwd_data_algo;
+}
+
+void ConvolutionBackwardDataImpl::AlgoMIOpen::exec(const ExecArgs& args) const {
+    MIOpenBwdDataDescs D;
+    args.init_desc(D);
+    auto algo = const_cast<ConvolutionBackwardDataImpl::AlgoMIOpen*>(this)
+                        ->find_best_algo(args);
+    float alpha = 1.0f, beta = 0.0f;
+    auto status = miopenConvolutionBackwardData(
+            args.handle->miopen_handle(), &alpha, D.diff_desc.desc,
+            args.diff_tensor->raw_ptr, D.filter_desc.desc,
+            args.filter_tensor->raw_ptr, D.conv_desc.desc, algo, &beta,
+            D.grad_desc.desc, args.grad_tensor->raw_ptr, args.workspace.raw_ptr,
+            args.workspace.size);
+    megdnn_assert(status == miopenStatusSuccess,
+                  "conv bwd_data failed: %s; info: %s",
+                  miopenGetErrorString(status), args.to_string().c_str());
+}
+
+void ConvolutionBackwardDataImpl::AlgoPack::fill_miopen_algos() {}
+
+// vim: syntax=cpp.doxygen
+ */ +#include "hcc_detail/hcc_defs_prologue.h" + +#include "./algo.h" + +#include "src/rocm/utils.h" +#include "src/rocm/miopen_wrapper.h" +#include "src/rocm/convolution/helper.h" + +using namespace megdnn; +using namespace rocm; +using namespace convolution; + +MIOpenCache + ConvolutionBackwardDataImpl::AlgoMIOpen::sm_miopen_algo_cache; +MIOpenCache + ConvolutionBackwardDataImpl::AlgoMIOpen::sm_miopen_ws_cache; + +bool ConvolutionBackwardDataImpl::AlgoMIOpen::is_available( + const SizeArgs& args) const { + MIOpenBwdDataDescs D; + if (!is_miopen_supported(args.as_fwd_args())) + return false; + auto got = sm_miopen_ws_cache.get(args); + if (got.first) + return true; + args.init_desc(D); + size_t workspace_size; + auto status = miopenConvolutionBackwardDataGetWorkSpaceSize( + args.handle->miopen_handle(), D.diff_desc.desc, D.filter_desc.desc, + D.conv_desc.desc, D.grad_desc.desc, &workspace_size); + if (status == miopenStatusSuccess) { + sm_miopen_ws_cache.set(args, workspace_size); + return true; + } + return false; +} + +size_t ConvolutionBackwardDataImpl::AlgoMIOpen::get_workspace_in_bytes( + const SizeArgs& args) const { + auto got = sm_miopen_ws_cache.get(args); + if (got.first) + return got.second; + MIOpenBwdDataDescs D; + args.init_desc(D); + size_t workspace_size; + auto status = miopenConvolutionBackwardDataGetWorkSpaceSize( + args.handle->miopen_handle(), D.diff_desc.desc, D.filter_desc.desc, + D.conv_desc.desc, D.grad_desc.desc, &workspace_size); + megdnn_assert(status == miopenStatusSuccess, + "conv bwd_data get workspace failed: %s; info: %s", + miopenGetErrorString(status), args.to_string().c_str()); + sm_miopen_ws_cache.set(args, workspace_size); + return workspace_size; +} + +miopenConvBwdDataAlgorithm_t +ConvolutionBackwardDataImpl::AlgoMIOpen::find_best_algo(const ExecArgs& args) { + auto find_algo = sm_miopen_algo_cache.get(args); + if (find_algo.first) + return find_algo.second; + bool exhaustive_search = args.handle->enable_miopen_algo_search(); + MIOpenBwdDataDescs D; + args.init_desc(D); + const int req_algo_count = 1; + int ret_algo_count; + miopenConvAlgoPerf_t algo_perf; + miopen_check(miopenFindConvolutionBackwardDataAlgorithm( + args.handle->miopen_handle(), D.diff_desc.desc, + args.diff_tensor->raw_ptr, D.filter_desc.desc, + args.filter_tensor->raw_ptr, D.conv_desc.desc, D.grad_desc.desc, + args.grad_tensor->raw_ptr, req_algo_count, &ret_algo_count, + &algo_perf, args.workspace.raw_ptr, args.workspace.size, + exhaustive_search)); + sm_miopen_algo_cache.set(args, algo_perf.bwd_data_algo); + return algo_perf.bwd_data_algo; +} + +void ConvolutionBackwardDataImpl::AlgoMIOpen::exec(const ExecArgs& args) const { + MIOpenBwdDataDescs D; + args.init_desc(D); + auto algo = const_cast(this) + ->find_best_algo(args); + float alpha = 1.0f, beta = 0.0f; + auto status = miopenConvolutionBackwardData( + args.handle->miopen_handle(), &alpha, D.diff_desc.desc, + args.diff_tensor->raw_ptr, D.filter_desc.desc, + args.filter_tensor->raw_ptr, D.conv_desc.desc, algo, &beta, + D.grad_desc.desc, args.grad_tensor->raw_ptr, args.workspace.raw_ptr, + args.workspace.size); + megdnn_assert(status == miopenStatusSuccess, + "conv bwd_data failed: %s; info: %s", + miopenGetErrorString(status), args.to_string().c_str()); +} + +void ConvolutionBackwardDataImpl::AlgoPack::fill_miopen_algos() {} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/backward_filter/algo.cpp b/dnn/src/rocm/convolution/backward_filter/algo.cpp new file mode 100644 index 00000000..8b01d13d --- 
/dev/null +++ b/dnn/src/rocm/convolution/backward_filter/algo.cpp @@ -0,0 +1,98 @@ +/** + * \file dnn/src/rocm/convolution/backward_filter/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "hcc_detail/hcc_defs_prologue.h" + +#include "./algo.h" +#include "src/rocm/utils.h" + +using namespace megdnn; +using namespace rocm; + +ConvolutionBackwardFilterImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&miopen); + all_algos.push_back(&matmul); + all_algos.push_back(&chanwise); + non_miopen_algos.push_back(&matmul); + non_miopen_algos.push_back(&chanwise); + non_miopen_algos.push_back(all_algos.back()); + miopen_algos.push_back(&miopen); +} + +ConvolutionBackwardFilterImpl::AlgoPack + ConvolutionBackwardFilterImpl::sm_algo_pack; + +ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( + ConvolutionBackwardFilterImpl* o, const TensorLayout& src, + const TensorLayout& diff, const TensorLayout& grad) + : SizeArgs(o, src, diff, o->check_layout_fwd(src, grad, diff)) {} + +ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( + ConvolutionBackwardFilterImpl* o, const TensorLayout& src, + const TensorLayout& diff, const CanonizedFilterMeta& grad) + : handle{concrete_handle(o->handle())}, + src_layout{&src}, + diff_layout{&diff}, + grad_filter_meta{grad}, + opr{o} {} + +ConvolutionBackwardFilterImpl::AlgoBase::ExecArgs::ExecArgs( + ConvolutionBackwardFilterImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace) + : SizeArgs(opr, src.layout, diff.layout, grad.layout), + src_tensor{&src}, + diff_tensor{&diff}, + grad_tensor{&grad}, + workspace{workspace} {} + +std::string ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::to_string() + const { + auto&& fm = grad_filter_meta; + MEGDNN_MARK_USED_VAR(fm); + return megdnn_mangle(ssprintf( + "src=%s diff=%s grad_filter=%u{%u,%u,%u,%u}, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", + src_layout->to_string().c_str(), diff_layout->to_string().c_str(), + fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1], + fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], + fm.dilation[0], fm.dilation[1], !fm.should_flip, + src_layout->dtype.name(), diff_layout->dtype.name())); +} + +convolution::MIOpenCacheKey +ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::to_miopen_algo_cache_key() + const { + convolution::MIOpenCacheKey res; + res.miopen_handle = reinterpret_cast(handle->miopen_handle()); + res.batch = src_layout->operator[](0); + res.IC = src_layout->operator[](1); + res.IH = src_layout->operator[](2); + res.IW = src_layout->operator[](3); + res.OH = diff_layout->operator[](2); + res.OW = diff_layout->operator[](3); + res.FH = grad_filter_meta.spatial[0]; + res.FW = grad_filter_meta.spatial[1]; + res.SH = grad_filter_meta.stride[0]; + res.SW = grad_filter_meta.stride[1]; + res.PH = grad_filter_meta.padding[0]; + res.PW = grad_filter_meta.padding[1]; + res.DH = grad_filter_meta.dilation[0]; + res.DW = grad_filter_meta.dilation[1]; + res.group = grad_filter_meta.group; + res.ocpg = grad_filter_meta.ocpg; + res.icpg = grad_filter_meta.icpg; + res.dtype_enum = static_cast(src_layout->dtype.enumv()); + res.exhaustive_search = + 
static_cast(handle->enable_miopen_algo_search()); + res.OC = res.group * res.ocpg; + return res; +} +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/backward_filter/algo.h b/dnn/src/rocm/convolution/backward_filter/algo.h new file mode 100644 index 00000000..dfd4a788 --- /dev/null +++ b/dnn/src/rocm/convolution/backward_filter/algo.h @@ -0,0 +1,154 @@ +/** + * \file dnn/src/rocm/convolution/backward_filter/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include "src/rocm/convolution/helper.h" + +namespace megdnn { +namespace rocm { + +/*! + * \brief base class for convolution algos + * + */ +class ConvolutionBackwardFilterImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs { + HandleImpl* handle; + const TensorLayout *src_layout, *diff_layout; + CanonizedFilterMeta grad_filter_meta; + ConvolutionBackwardFilterImpl* opr; + + std::string to_string() const; + convolution::MIOpenCacheKey to_miopen_algo_cache_key() const; + void init_desc(convolution::MIOpenBwdFilterDescs& desc) const { + desc.set(*src_layout, *diff_layout, grad_filter_meta, opr->param()); + } + SizeArgs(ConvolutionBackwardFilterImpl* opr, const TensorLayout& src, + const TensorLayout& diff, const TensorLayout& grad); + SizeArgs(ConvolutionBackwardFilterImpl* opr, const TensorLayout& src, + const TensorLayout& diff, const CanonizedFilterMeta& grad); + + convolution::ForwardSizeArgs as_fwd_args() const { + return {handle, src_layout, grad_filter_meta, diff_layout}; + } + }; + struct ExecArgs : public SizeArgs { + const TensorND *src_tensor, *diff_tensor, *grad_tensor; + Workspace workspace; + + ExecArgs(ConvolutionBackwardFilterImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "conv bwd filter algo %s: " + "required workspace %zu bytes, got %zu", + name(), req, workspace.size); + return *this; + } + + virtual bool is_miopen() const { return false; } +}; + +class ConvolutionBackwardFilterImpl::AlgoMIOpen final : public AlgoBase { + bool m_is_reproducible; + const char* m_name; + + miopenConvBwdWeightsAlgorithm_t find_best_algo(const ExecArgs& args); + +public: + AlgoMIOpen() = delete; + AlgoMIOpen(bool is_reproducible) : m_is_reproducible(is_reproducible) {} + + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool 
is_reproducible() const override { return m_is_reproducible; } + + const char* name() const override { + return "MIOpenConvolutionBackwardFilter"; + } + + bool is_miopen() const override { return true; } + static convolution::MIOpenCache + sm_miopen_algo_cache; + static convolution::MIOpenCache sm_miopen_ws_cache; +}; + +class ConvolutionBackwardFilterImpl::AlgoMatmul final : public AlgoBase { + template + static void exec_internal(const ExecArgs& args); + +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return "MATMUL"; } + bool is_reproducible() const override { return true; } +}; + +class ConvolutionBackwardFilterImpl::AlgoChanwise final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return "CHANNEL_WISE"; } + bool is_reproducible() const override { return true; } +}; + +class ConvolutionBackwardFilterImpl::AlgoPack { + void fill_miopen_algos(); + + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + AlgoPack(); + + AlgoMIOpen miopen{true}; + AlgoMatmul matmul; + AlgoChanwise chanwise; + + std::vector + //! all algorithms + all_algos, miopen_algos, non_miopen_algos; +}; + +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/backward_filter/chanwise.cpp b/dnn/src/rocm/convolution/backward_filter/chanwise.cpp new file mode 100644 index 00000000..c16ede27 --- /dev/null +++ b/dnn/src/rocm/convolution/backward_filter/chanwise.cpp @@ -0,0 +1,55 @@ +/** + * \file dnn/src/rocm/convolution/backward_filter/chanwise.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
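One detail worth noting before the individual algorithm files: the AlgoPack above keeps three overlapping views (all_algos, miopen_algos, non_miopen_algos) over the same statically allocated algorithm objects, with sm_algo_pack as the single shared instance. A minimal sketch of how a caller is expected to consume such a list — the helper name and driver below are hypothetical, not part of this diff:

    #include <cstddef>
    #include <vector>

    // Hypothetical helper: return the first algorithm that is available for
    // the given problem and whose workspace fits the limit, else nullptr.
    template <typename Algo>
    Algo* pick_first_usable(const std::vector<Algo*>& algos,
                            const typename Algo::SizeArgs& args,
                            size_t workspace_limit) {
        for (auto* algo : algos) {
            // is_available_wk() is defined on AlgoBase above: available AND
            // its workspace requirement fits into the given limit.
            if (algo->is_available_wk(args, workspace_limit))
                return algo;
        }
        return nullptr;  // caller falls back, e.g. to the MIOpen heuristic
    }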
+ */
+
+#include "./algo.h"
+#include "src/rocm/utils.h"
+#include "src/rocm/convolution/chanwise/kern.h.hip"
+
+using namespace megdnn;
+using namespace rocm;
+using namespace convolution;
+
+bool ConvolutionBackwardFilterImpl::AlgoChanwise::is_available(
+        const SizeArgs& args) const {
+    auto&& fm = args.grad_filter_meta;
+    return fm.format == Param::Format::NCHW &&
+           args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
+           args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
+           fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 &&
+           fm.dilation[1] == 1 && !fm.should_flip;
+}
+
+size_t ConvolutionBackwardFilterImpl::AlgoChanwise::get_workspace_in_bytes(
+        const SizeArgs&) const {
+    return 0;
+}
+
+void ConvolutionBackwardFilterImpl::AlgoChanwise::exec(
+        const ExecArgs& args) const {
+    auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args());
+    auto stream = hip_stream(args.handle);
+    switch (args.diff_layout->dtype.enumv()) {
+#define cb(_dt)                                                          \
+    case DTypeTrait<_dt>::enumv: {                                       \
+        using ctype = DTypeTrait<_dt>::ctype;                            \
+        return chanwise::run_bwd_filter(                                 \
+                args.grad_tensor->ptr<ctype>(),                          \
+                args.src_tensor->ptr<ctype>(),                           \
+                args.diff_tensor->ptr<ctype>(), kparam, stream);         \
+    }
+        MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
+#undef cb
+        default:
+            break;
+    }
+    megdnn_assert_internal(0);
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/convolution/backward_filter/matmul.cpp b/dnn/src/rocm/convolution/backward_filter/matmul.cpp
new file mode 100644
index 00000000..462a0ea8
--- /dev/null
+++ b/dnn/src/rocm/convolution/backward_filter/matmul.cpp
@@ -0,0 +1,102 @@
+/**
+ * \file dnn/src/rocm/convolution/backward_filter/matmul.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
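To make the guard in AlgoChanwise::is_available above concrete: the kernel only handles depthwise convolutions (fm.icpg == 1) in NCHW layout, floating-point dtypes, unit dilation, and cross-correlation mode (!fm.should_flip); everything else falls through to the MATMUL or MIOpen algorithms. Illustrative shapes that qualify, assuming MegDNN's usual {group, ocpg, icpg, FH, FW} grouped-filter layout:

    src  : {N=8, C=32, H=28, W=28}, float32
    grad : group=32, ocpg=1, icpg=1, FH=3, FW=3   (depthwise 3x3)
    diff : {8, 32, 26, 26}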
+ */ + +#include "./algo.h" +#include "src/rocm/utils.h" +#include "src/rocm/convolution/helper.h" +#include "src/rocm/convolution/im2col.h.hip" + +using namespace megdnn; +using namespace rocm; + +bool ConvolutionBackwardFilterImpl::AlgoMatmul::is_available( + const SizeArgs& args) const { + auto&& fm = args.grad_filter_meta; + return fm.format == Param::Format::NCHW && + args.diff_layout->dtype.category() == DTypeCategory::FLOAT && + args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 && + fm.group == 1 && fm.spatial_ndim == 2; +} + +size_t ConvolutionBackwardFilterImpl::AlgoMatmul::get_workspace_in_bytes( + const SizeArgs& args) const { + return matmul_get_workspace_bundle(args.as_fwd_args()) + .total_size_in_bytes(); +} + +void ConvolutionBackwardFilterImpl::AlgoMatmul::exec( + const ExecArgs& args) const { +#define cb(DType) \ + if (args.diff_layout->dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + exec_internal(args); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + + megdnn_assert_internal(0); +} + +template +void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal( + const ExecArgs& args) { + auto&& fm = args.grad_filter_meta; + size_t N = args.src_layout->shape[0], IC = fm.icpg, + IH = args.src_layout->shape[2], IW = args.src_layout->shape[3], + OC = fm.ocpg, OH = args.diff_layout->shape[2], + OW = args.diff_layout->shape[3], FH = fm.spatial[0], + FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1], + SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0], + DW = fm.dilation[1]; + auto stream = hip_stream(args.handle); + auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args()); + wbundle.set(args.workspace.raw_ptr); + T* diff_t = static_cast(wbundle.get(0)); + T* col = static_cast(wbundle.get(1)); + { + // transpose diff + TensorLayout froml({N, OC * OH * OW}, typename DTypeTrait::dtype()), + tol(froml); + froml.stride[0] = args.diff_layout->stride[0]; + tol.stride[0] = 1; + tol.stride[1] = N; + TensorND from(args.diff_tensor->ptr(), froml), to(diff_t, tol); + args.handle->relayout_opr()->exec(from, to); + } + { + convolution::im2col(args.src_tensor->ptr(), col, N, + args.src_tensor->layout.stride[0], IC, IH, IW, + FH, FW, OH, OW, PH, PW, SH, SW, DH, DW, stream); + } + { + // take gemm grad + TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait::dtype()), + Bl({IC * FH * FW, OH * OW * N}, + typename DTypeTrait::dtype()), + Cl({OC, OH * OW * N}, typename DTypeTrait::dtype()); + TensorND A(args.grad_tensor->ptr(), Al), B(col, Bl), C(diff_t, Cl); + if (fm.should_flip) { + A.raw_ptr = wbundle.get(2); + } + args.handle->matmul_bT_opr()->exec(C, B, A, Workspace()); + + if (fm.should_flip) { + convolution::flip_filter( + args.as_fwd_args(), + {static_cast(args.grad_tensor->raw_ptr), + wbundle.get_size(2)}, + A.raw_ptr); + } + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/backward_filter/miopen.cpp b/dnn/src/rocm/convolution/backward_filter/miopen.cpp new file mode 100644 index 00000000..d4e6e5a3 --- /dev/null +++ b/dnn/src/rocm/convolution/backward_filter/miopen.cpp @@ -0,0 +1,110 @@ +/** + * \file dnn/src/rocm/convolution/backward_filter/miopen.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
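In AlgoMatmul::exec_internal above, the backward-filter problem reduces to a single GEMM: with A = filter viewed as {OC, IC*FH*FW}, B = im2col(src) as {IC*FH*FW, OH*OW*N}, and C = the transposed diff as {OC, OH*OW*N}, the forward pass would compute C = A*B, so the filter gradient is recovered as A = C*B^T — exactly what matmul_bT_opr()->exec(C, B, A, ...) performs, with an extra flip_filter pass when the convolution is not cross-correlation. A concrete shape check, using assumed values N=2, IC=4, OC=8, OH=OW=10 and a 3x3 filter: A is 8x36, B is 36x200, C is 8x200, and C*B^T is 8x36 as required.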
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "hcc_detail/hcc_defs_prologue.h" + +#include "./algo.h" + +#include "src/rocm/utils.h" +#include "src/rocm/miopen_wrapper.h" +#include "src/rocm/convolution/helper.h" + +using namespace megdnn; +using namespace rocm; +using namespace convolution; + +MIOpenCache + ConvolutionBackwardFilterImpl::AlgoMIOpen::sm_miopen_algo_cache; +MIOpenCache + ConvolutionBackwardFilterImpl::AlgoMIOpen::sm_miopen_ws_cache; + +bool ConvolutionBackwardFilterImpl::AlgoMIOpen::is_available( + const SizeArgs& args) const { + MIOpenBwdFilterDescs D; + if (!is_miopen_supported(args.as_fwd_args())) + return false; + auto got = sm_miopen_ws_cache.get(args); + if (got.first) + return true; + args.init_desc(D); + size_t workspace_size; + auto status = miopenConvolutionBackwardWeightsGetWorkSpaceSize( + args.handle->miopen_handle(), D.diff_desc.desc, D.src_desc.desc, + D.conv_desc.desc, D.grad_desc.desc, &workspace_size); + if (status == miopenStatusSuccess) { + sm_miopen_ws_cache.set(args, workspace_size); + return true; + } + return false; +} + +size_t ConvolutionBackwardFilterImpl::AlgoMIOpen::get_workspace_in_bytes( + const SizeArgs& args) const { + auto got = sm_miopen_ws_cache.get(args); + if (got.first) + return got.second; + MIOpenBwdFilterDescs D; + args.init_desc(D); + size_t workspace_size; + auto status = miopenConvolutionBackwardWeightsGetWorkSpaceSize( + args.handle->miopen_handle(), D.diff_desc.desc, D.src_desc.desc, + D.conv_desc.desc, D.grad_desc.desc, &workspace_size); + megdnn_assert(status == miopenStatusSuccess, + "conv bwd_filter get workspace failed: %s; info: %s", + miopenGetErrorString(status), args.to_string().c_str()); + sm_miopen_ws_cache.set(args, workspace_size); + return workspace_size; +} + +miopenConvBwdWeightsAlgorithm_t +ConvolutionBackwardFilterImpl::AlgoMIOpen::find_best_algo(const ExecArgs& args) { + auto find_algo = sm_miopen_algo_cache.get(args); + if (find_algo.first) + return find_algo.second; + bool exhaustive_search = args.handle->enable_miopen_algo_search(); + MIOpenBwdFilterDescs D; + args.init_desc(D); + const int req_algo_count = 1; + int ret_algo_count; + miopenConvAlgoPerf_t algo_perf; + miopen_check(miopenFindConvolutionBackwardWeightsAlgorithm( + args.handle->miopen_handle(), D.diff_desc.desc, + args.diff_tensor->raw_ptr, D.src_desc.desc, + args.src_tensor->raw_ptr, D.conv_desc.desc, D.grad_desc.desc, + args.grad_tensor->raw_ptr, req_algo_count, &ret_algo_count, + &algo_perf, args.workspace.raw_ptr, args.workspace.size, + exhaustive_search)); +// algo_perf.bwd_weights_algo = miopenConvolutionBwdWeightsAlgoGEMM; + sm_miopen_algo_cache.set(args, algo_perf.bwd_weights_algo); + return algo_perf.bwd_weights_algo; +} + +void ConvolutionBackwardFilterImpl::AlgoMIOpen::exec( + const ExecArgs& args) const { + MIOpenBwdFilterDescs D; + args.init_desc(D); + auto algo = const_cast(this) + ->find_best_algo(args); + float alpha = 1.0f, beta = 0.0f; + auto status = miopenConvolutionBackwardWeights( + args.handle->miopen_handle(), &alpha, D.diff_desc.desc, + args.diff_tensor->raw_ptr, D.src_desc.desc, + args.src_tensor->raw_ptr, D.conv_desc.desc, algo, &beta, + D.grad_desc.desc, args.grad_tensor->raw_ptr, args.workspace.raw_ptr, + args.workspace.size); + megdnn_assert(status == miopenStatusSuccess, + "conv bwd_filter failed: %s; info: %s", + 
miopenGetErrorString(status), args.to_string().c_str()); +} + +void ConvolutionBackwardFilterImpl::AlgoPack::fill_miopen_algos() {} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/chanwise/bwd_data.cpp.hip b/dnn/src/rocm/convolution/chanwise/bwd_data.cpp.hip new file mode 100644 index 00000000..4d67dfc0 --- /dev/null +++ b/dnn/src/rocm/convolution/chanwise/bwd_data.cpp.hip @@ -0,0 +1,173 @@ +/** + * \file src/rocm/convolution/chanwise/bwd_data.cpp.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + */ + +#include "hip_header.h" +#include "./kern.h.hip" +#include "./kern_helper.h.hip" + +using namespace megdnn; +using namespace rocm; +using namespace convolution; +using namespace chanwise; + +namespace { + +// grid idx is (inp_chl, worker_index) +// each y-slice of a block works on an (N, IH, IW) spatial image at given +// inp_chl +template +__global__ void kern_bwd_data(T* src_grad, const T* dst_grad, const T* flt_tot, + Param param) { + extern __shared__ uint8_t flt_storage[]; + + T* const flt = reinterpret_cast(flt_storage); + + const uint32_t N = param.batch, IC = param.src_chl, ic = blockIdx.x, + IH = param.src_h, IW = param.src_w, + CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul, + FH = FH_SET ? FH_SET : param.flt_h, + FW = FW_SET ? FW_SET : param.flt_w, FSIZE = FH * FW, + PH = param.pad_h, PW = param.pad_w, + SH = SH_SET ? SH_SET : param.stride_h, + SW = SW_SET ? SW_SET : param.stride_w, OH = param.out_h, + OW = param.out_w, TOT_OUT = N * IH * IW; + + block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL); + dst_grad += ic * CHL_MUL * OH * OW; + src_grad += ic * IH * IW; + + uint32_t out_idx_ = blockIdx.y * blockDim.x + threadIdx.x, + nr_out_per_launch = blockDim.x * gridDim.y; + for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) { + uint32_t out_idx = out_idx_, n, ih, iw; + out_idx = div_mod(out_idx, IW, iw); + out_idx = div_mod(out_idx, IH, ih); + n = out_idx; + + const T* dst_grad_base = dst_grad + n * (IC * CHL_MUL * OH * OW); + + T sum(0); + + // o >= max(0, floor_div((i+P-F+1), S)) + uint32_t ohmin = max(int32_t(ih + PH - FH + SH), 0) / SH, + owmin = max(int32_t(iw + PW - FW + SW), 0) / SW, + ohmax = min((ih + PH) / SH, OH - 1), + owmax = min((iw + PW) / SW, OW - 1); + if (SH_SET == 1 && SW_SET == 1 && FH_SET && FW_SET) { +#pragma unroll + for (uint32_t doh = 0; doh < FH; ++doh) { + uint32_t oh = ohmin + doh; + if (oh <= ohmax) { + uint32_t fh = ih - oh * SH + PH; +#pragma unroll + for (uint32_t dow = 0; dow < FW; ++dow) { + uint32_t ow = owmin + dow; + if (ow <= owmax) { + uint32_t fw = iw - ow * SW + PW; + const T* pd = dst_grad_base + oh * OW + ow; + const T* pf = flt + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; + ++chl_mul) { + sum += *pd * *pf; + pd += OH * OW; + pf += FSIZE; + } + } + } + } + } + } else { + for (uint32_t oh = ohmin; oh <= ohmax; ++oh) { + uint32_t fh = ih - oh * SH + PH; + for (uint32_t ow = owmin; ow <= owmax; ++ow) { + uint32_t fw = iw - ow * SW + PW; + const T* pd = dst_grad_base + oh * OW + ow; + const T* pf = flt + fh * FW + fw; +#pragma unroll + for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; ++chl_mul) { + sum += *pd * *pf; + pd += OH * OW; + pf += FSIZE; + } + } + } + } + + src_grad[(n * (IC * IH) + ih) * IW + iw] = sum; + } +} + +template +class KernDispatch { +public: + typedef void (*kern_ptr_t)(T*, const T*, const T*, Param); + + static 
kern_ptr_t dispatch(int chl_mul, int fh, int fw, int sh, int sw) { + if (chl_mul == 1) { + if (fh == 3 && fw == 3) + return d1<1, 3, 3>(sh, sw); + if (fh == 4 && fw == 4) + return d1<1, 4, 4>(sh, sw); + } + return d1<0, 0, 0>(sh, sw); + } + +private: + template + static kern_ptr_t d1(int sh, int sw) { + if (sh == 1 && sw == 1) + return kern_bwd_data; + if (sh == 1 && sw == 2) + return kern_bwd_data; + if (sh == 2 && sw == 1) + return kern_bwd_data; + if (sh == 2 && sw == 2) + return kern_bwd_data; + return kern_bwd_data; + } +}; + +} // anonymous namespace + +template +void chanwise::run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, + const Param& param, hipStream_t stream) { + typename KernDispatch::kern_ptr_t kern = + KernDispatch::dispatch(param.chl_mul, param.flt_h, param.flt_w, + param.stride_h, param.stride_w); + int nr_thread = 256, nr_out_dimx = param.src_h * param.src_w * param.batch; + dim3 nr_block(param.src_chl, + std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); + uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); + kern<<>>(src_grad, dst_grad, flt, + param); + after_kernel_launch(); +} + +namespace megdnn { +namespace rocm { +namespace convolution { +namespace chanwise { + +#define INST(_dt) \ + template void run_bwd_data( \ + DTypeTrait<_dt>::ctype*, const DTypeTrait<_dt>::ctype*, \ + const DTypeTrait<_dt>::ctype*, const Param&, hipStream_t); +MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST) +#undef INST +#undef DO_INST + +} // namespace chanwise +} // namespace convolution +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/rocm/convolution/chanwise/bwd_filter.cpp.hip b/dnn/src/rocm/convolution/chanwise/bwd_filter.cpp.hip new file mode 100644 index 00000000..fa2496e2 --- /dev/null +++ b/dnn/src/rocm/convolution/chanwise/bwd_filter.cpp.hip @@ -0,0 +1,193 @@ +/** + * \file src/rocm/convolution/chanwise/bwd_filter.cpp.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + */ + +#include "hip_header.h" +#include "./kern.h.hip" +#include "./kern_helper.h.hip" + +const uint32_t WARP_SIZE = 32, BATCH_UNROLL = 4; + +using namespace megdnn; +using namespace rocm; +using namespace convolution; +using namespace chanwise; + +namespace { + +/*! + * \brief compute grad w.r.t. 
filter + * + * block dim: out_id * kern_id + * threads with the same out_id computes grad for corresponding kernel element + * \tparam nr_thpf number of threads for one element in the filter; must be + * power of 2; + */ +template +__global__ void kern_bwd_filter(T* flt_grad, const T* src, const T* dst_grad, + Param param) { + const uint32_t N = param.batch, IC = param.src_chl, IH = param.src_h, + IW = param.src_w, CHL_MUL = param.chl_mul, FH = param.flt_h, + FW = param.flt_w, PH = param.pad_h, PW = param.pad_w, + SH = param.stride_h, SW = param.stride_w, OH = param.out_h, + OW = param.out_w, SRC_BATCH_STRIDE = IC * IH * IW, + DST_BATCH_STRIDE = IC * CHL_MUL * OH * OW, + BLKDIM_X = blockDim.x / nr_thpf, + THREADID_X = threadIdx.x / nr_thpf, + OUT_IDX = blockIdx.x * BLKDIM_X + THREADID_X; + + uint32_t ic, chl_mul, fh, fw; + { + uint32_t i = OUT_IDX; + i = div_mod(i, FW, fw); + i = div_mod(i, FH, fh); + i = div_mod(i, CHL_MUL, chl_mul); + ic = i; + } + if (ic >= IC) { + return; + } + src += ic * IH * IW; + dst_grad += (ic * CHL_MUL + chl_mul) * OH * OW; + + const uint32_t oh_lo = max(int32_t(PH - fh + SH - 1), 0) / SH, + oh_hi = min((IH - 1 + PH - fh) / SH + 1, OH), + ow_lo = max(int32_t(PW - fw + SW - 1), 0) / SW, + ow_hi = min((IW - 1 + PW - fw) / SW + 1, OW), + oblk_h = oh_hi - oh_lo, oblk_w = ow_hi - ow_lo, + oblk_tot = oblk_h * oblk_w * + ((N + BATCH_UNROLL - 1) / BATCH_UNROLL), + tid = threadIdx.x % nr_thpf; + + if (IH + PH < fh + 1 || oh_lo >= oh_hi || IW + PW < fw + 1 || + ow_lo >= ow_hi) { + if (!tid) + flt_grad[OUT_IDX] = 0; + return; + } + + T sum(0); + for (uint32_t oblk_idx = tid; oblk_idx < oblk_tot; oblk_idx += nr_thpf) { + uint32_t n, oh, ow; + n = div_mod(div_mod(oblk_idx, oblk_w, ow), oblk_h, oh) * BATCH_UNROLL; + oh += oh_lo; + ow += ow_lo; + uint32_t ih = oh * SH - PH + fh, iw = ow * SW - PW + fw, + soff = ih * IW + iw + n * SRC_BATCH_STRIDE, + doff = oh * OW + ow + n * DST_BATCH_STRIDE; +#pragma unroll + for (uint32_t i = 0; i < BATCH_UNROLL; ++i) { + if (!i || n + i < N) { + sum += src[soff] * dst_grad[doff]; + } + soff += SRC_BATCH_STRIDE; + doff += DST_BATCH_STRIDE; + } + } + + if (nr_thpf == 1) { + flt_grad[OUT_IDX] = sum; + } else { + // reduce all sums in a block + extern __shared__ uint8_t shared_storage[]; + volatile T* thread_sum = reinterpret_cast(shared_storage); + thread_sum += THREADID_X * nr_thpf; + thread_sum[tid] = sum; +#pragma unroll + for (uint32_t i = nr_thpf / 2; i; i >>= 1) { + bool cond = nr_thpf >= i * 2 && tid < i; + if (i >= WARP_SIZE) { + __syncthreads(); + } + if (cond) { + T v0 = thread_sum[tid], v1 = v0 + thread_sum[tid + i]; + thread_sum[tid] = v1; + } + } + + if (!tid) + flt_grad[OUT_IDX] = thread_sum[0]; + } +} + +} // anonymous namespace + +template +void convolution::chanwise::run_bwd_filter(T* filter_grad, const T* src, + const T* dst_grad, + const Param& param, + hipStream_t stream) { + void (*kern)(T*, const T*, const T*, Param) = NULL; + uint32_t nr_thread = 256, + nr_thpf = std::min( + nr_thread, + std::max(1, param.out_h * param.out_w * + param.batch / + (BATCH_UNROLL * 16))); + + // find nearest power-of-2 of nr_thpf + do { +#define CK(_n) \ + if (nr_thpf >= _n) { \ + kern = kern_bwd_filter; \ + nr_thpf = _n; \ + break; \ + } + CK(1 << 10); + CK(1 << 9); + CK(1 << 8); + CK(1 << 7); + CK(1 << 6); + CK(1 << 5); + CK(1 << 4); + CK(1 << 3); + CK(1 << 2); + CK(1 << 1); + CK(1 << 0); +#undef CK + } while (0); + + megdnn_assert(kern); + nr_thread = 256; + + uint32_t nr_flt_per_blk = nr_thread / nr_thpf; + while (nr_flt_per_blk * nr_thpf 
% WARP_SIZE) + --nr_flt_per_blk; + megdnn_assert(nr_flt_per_blk); + + int nr_block = + DIVUP(param.flt_h * param.flt_w * param.src_chl * param.chl_mul, + nr_flt_per_blk); + nr_thread = nr_flt_per_blk * nr_thpf; + uint32_t shared = nr_thread * 2 * sizeof(T); + hipLaunchKernelGGL(kern, nr_block, nr_thread, shared, stream, filter_grad, + src, dst_grad, param); + after_kernel_launch(); +} + +namespace megdnn { +namespace rocm { +namespace convolution { +namespace chanwise { + +#define DO_INST(_ct) \ + template void run_bwd_filter(_ct*, const _ct*, const _ct*, const Param&, \ + hipStream_t); +#define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST) + +#undef INST +#undef DO_INST + +} // namespace chanwise +} // namespace convolution +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/rocm/convolution/chanwise/fwd.cpp.hip b/dnn/src/rocm/convolution/chanwise/fwd.cpp.hip new file mode 100644 index 00000000..f6f4ae90 --- /dev/null +++ b/dnn/src/rocm/convolution/chanwise/fwd.cpp.hip @@ -0,0 +1,132 @@ +/** + * \file src/rocm/convolution/chanwise/fwd.cpp.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + */ + +#include "hip_header.h" +#include "./kern.h.hip" +#include "./kern_helper.h.hip" + +using namespace megdnn; +using namespace rocm; +using namespace convolution; +using namespace chanwise; + +namespace { + +// grid idx is (inp_chl, worker_index) +// each y-slice of a block works on an (N, CHL_MUL, OH, OW) spatial image at +// given inp_chl +template +__global__ void kern_fwd(T* dst, const T* src, const T* flt_tot, Param param) { + extern __shared__ uint8_t flt_storage[]; + + T* const flt = reinterpret_cast(flt_storage); + + const uint32_t N = param.batch, IC = param.src_chl, ic = blockIdx.x, + IH = param.src_h, IW = param.src_w, + CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul, + FH = FH_SET ? FH_SET : param.flt_h, + FW = FW_SET ? 
FW_SET : param.flt_w, FSIZE = FH * FW, + PH = param.pad_h, PW = param.pad_w, SH = param.stride_h, + SW = param.stride_w, OH = param.out_h, OW = param.out_w, + TOT_OUT = N * CHL_MUL * OH * OW; + + block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL); + + uint32_t out_idx_ = blockIdx.y * blockDim.x + threadIdx.x, + nr_out_per_launch = blockDim.x * gridDim.y; + for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) { + uint32_t out_idx = out_idx_, n, chl_mul, oh, ow; + out_idx = div_mod(out_idx, OW, ow); + out_idx = div_mod(out_idx, OH, oh); + if (CHL_MUL_SET == 1) { + chl_mul = 0; + n = out_idx; + } else { + n = div_mod(out_idx, CHL_MUL, chl_mul); + } + + int ih = int(oh * SH) - int(PH), iw = int(ow * SW) - int(PW); + const T* flt_base = flt + chl_mul * FSIZE; + const T* src_base = src + int(((n * IC + ic) * IH + ih) * IW + iw); + + T sum(0); + + if (FH_SET && FW_SET) { +#pragma unroll + for (uint32_t fh = 0; fh < FH; ++fh) { + if (static_cast(fh + ih) < IH) { +#pragma unroll + for (uint32_t fw = 0; fw < FW; ++fw) { + if (static_cast(fw + iw) < IW) { + sum += flt_base[fh * FW + fw] * + src_base[fh * IW + fw]; + } + } + } + } + } else { + int fhmax = min(int(FH), int(IH - ih)), + fwmax = min(int(FW), int(IW - iw)); + for (int fh = max(0, -ih); fh < fhmax; ++fh) { + for (int fw = max(0, -iw); fw < fwmax; ++fw) { + sum += flt_base[fh * FW + fw] * src_base[fh * IW + fw]; + } + } + } + dst[(((n * IC + ic) * CHL_MUL + chl_mul) * OH + oh) * OW + ow] = sum; + } +} + +} // anonymous namespace + +template +void chanwise::run_fwd(T* dst, const T* src, const T* flt, const Param& param, + hipStream_t stream) { + void (*kern)(T*, const T*, const T*, Param); + if (param.chl_mul == 1) { + if (param.flt_h == 3 && param.flt_w == 3) { + kern = kern_fwd; + } else if (param.flt_h == 4 && param.flt_w == 4) { + kern = kern_fwd; + } else { + kern = kern_fwd; + } + } else { + kern = kern_fwd; + } + int nr_thread = 256, + nr_out_dimx = param.out_h * param.out_w * param.batch * param.chl_mul; + dim3 nr_block(param.src_chl, + std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); + uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); + kern<<>>(dst, src, flt, param); + after_kernel_launch(); +} + +namespace megdnn { +namespace rocm { +namespace convolution { +namespace chanwise { + +#define DO_INST(_ct) \ + template void run_fwd(_ct*, const _ct*, const _ct*, const Param&, \ + hipStream_t); +#define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype) + +MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST) + +#undef INST +#undef DO_INST + +} // namespace chanwise +} // namespace convolution +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/rocm/convolution/chanwise/kern.h.hip b/dnn/src/rocm/convolution/chanwise/kern.h.hip new file mode 100644 index 00000000..2c06cd6f --- /dev/null +++ b/dnn/src/rocm/convolution/chanwise/kern.h.hip @@ -0,0 +1,71 @@ +/** + * \file src/rocm/convolution/chanwise/kern.h.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
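A worked example of the launch configuration chosen by run_fwd above, assuming float32 and src {N=16, C=32, H=W=28} with chl_mul=1, a 3x3 filter, and 26x26 output:

    nr_thread   = 256
    nr_out_dimx = 26*26*16*1 = 10816
    nr_block    = dim3(32, min(512, max(10816 / (256*4), 1))) = dim3(32, 10)
    shared      = 1*3*3*sizeof(float) = 36 bytes  (the per-channel filter slice)

Each block's x-index owns one input channel, the y-dimension strides over the flattened (N, CHL_MUL, OH, OW) outputs, and the filter slice for that channel is staged once into shared memory by block_memcpy.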
+ */ +#pragma once + +#include "src/rocm/utils.h.hip" + +#include +#include "hip_header.h" + +#if MEGDNN_CC_HOST +#include "src/rocm/convolution/helper.h" +#endif + +namespace megdnn { +namespace rocm { +namespace convolution { +namespace chanwise { + +struct Param { + uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w; +#if MEGDNN_CC_HOST + static Param from_fwd_args(const ForwardSizeArgs& args) { +#define U(v) static_cast(v) + auto&& src = args.src_layout->shape; + auto&& dst = args.dst_layout->shape; + auto&& fm = args.filter_meta; + size_t c_pos, hw_pos; + if (fm.format == param::Convolution::Format::NCHW) { + c_pos = 1; + hw_pos = 2; + } else { + c_pos = 3; + hw_pos = 1; + } + return { + U(src[0]), U(src[c_pos]), U(src[hw_pos]), + U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]), + U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]), + U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]), + U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]), + }; +#undef U + } +#endif +}; + +template +void run_fwd(T* dst, const T* src, const T* flt, const Param& param, + hipStream_t stream); + +template +void run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, + const Param& param, hipStream_t stream); + +template +void run_bwd_filter(T* filter_grad, const T* src, const T* dst_grad, + const Param& param, hipStream_t stream); + +} // namespace chanwise +} // namespace convolution +} // namespace rocm +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/chanwise/kern_helper.h.hip b/dnn/src/rocm/convolution/chanwise/kern_helper.h.hip new file mode 100644 index 00000000..7876c612 --- /dev/null +++ b/dnn/src/rocm/convolution/chanwise/kern_helper.h.hip @@ -0,0 +1,51 @@ +/** + * \file src/rocm/convolution/chanwise/kern_helper.h.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + */ +#pragma once + +#include "megdnn/dtype.h" +#include "src/rocm/utils.h.hip" + +#include +#include +#include "hip_header.h" + +namespace megdnn { +namespace rocm { +namespace convolution { +namespace chanwise { + +/*! + * \brief return a / b and set mod to a % b + */ +__device__ __forceinline__ uint32_t div_mod(uint32_t a, uint32_t b, + uint32_t& mod) { + uint32_t ret = a / b; + mod = a - ret * b; + return ret; +} + +/*! + * \brief copy a 2D matrix by all threads in a block + * \param rs row stride + */ +template +__device__ __forceinline__ void block_memcpy(T* dst, const T* src, + uint32_t size) { + for (uint32_t i = threadIdx.x; i < size; i += blockDim.x) { + dst[i] = src[i]; + } + __syncthreads(); +} + +} // namespace chanwise +} // namespace convolution +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/rocm/convolution/forward/1x1.cpp b/dnn/src/rocm/convolution/forward/1x1.cpp new file mode 100644 index 00000000..580472fc --- /dev/null +++ b/dnn/src/rocm/convolution/forward/1x1.cpp @@ -0,0 +1,130 @@ +/** + * \file dnn/src/rocm/convolution/forward/1x1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
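The div_mod helper in kern_helper.h.hip above is how every kernel in this patch peels mixed-radix coordinates off a flattened index. A host-side sketch with the same semantics as the device helper (assumed equivalent; the real one is __device__-only):

    #include <cstdint>
    #include <cstdio>

    // Same contract as the device helper: returns a / b, stores a % b in mod.
    static uint32_t div_mod(uint32_t a, uint32_t b, uint32_t& mod) {
        uint32_t ret = a / b;
        mod = a - ret * b;
        return ret;
    }

    int main() {
        // Decompose a flattened (n, oh, ow) output index, as the kernels do.
        uint32_t OW = 26, OH = 26, out_idx = 3000, ow, oh, n;
        uint32_t t = div_mod(out_idx, OW, ow);  // ow = 3000 % 26 = 10
        n = div_mod(t, OH, oh);                 // oh = 115 % 26 = 11, n = 4
        std::printf("n=%u oh=%u ow=%u\n", n, oh, ow);
        return 0;
    }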
+ */ + +#include "./algo.h" +#include "src/rocm/handle.h" +#include "src/rocm/utils.h.hip" + +using namespace megdnn; +using namespace rocm; +using namespace convolution; + +bool ConvolutionForwardImpl::Algo1x1::is_available(const SizeArgs& args) const { + auto&& fm = args.filter_meta; + const size_t MAX_WORKSPACE_SIZE = 2147483648; // 2 * 1024^3 + + if (!(fm.format == Param::Format::NCHW && + args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 && + (fm.dtype.enumv() == DTypeEnum::Float32 || + fm.dtype.enumv() == DTypeEnum::Float16) && + fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 && + fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 && + fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 && + fm.stride[1] == 1)) + return false; + if (get_workspace_in_bytes(args) > MAX_WORKSPACE_SIZE) { + return false; + } + return true; +} + +void ConvolutionForwardImpl::Algo1x1::extract_matmul_layouts( + const SizeArgs& args, TensorLayout& A, TensorLayout& B, + TensorLayout& C) { + auto&& fm = args.filter_meta; + A = {{fm.ocpg, fm.icpg}, fm.dtype}; + B.ndim = 2; + B.shape[0] = args.src_layout->shape[1]; + B.shape[1] = args.src_layout->shape[2] * args.src_layout->shape[3]; + B.stride[0] = args.src_layout->stride[1]; + B.stride[1] = 1; + B.dtype = args.src_layout->dtype; + C = {{args.dst_layout->shape[1], B.shape[1]}, args.dst_layout->dtype}; +} +size_t ConvolutionForwardImpl::Algo1x1::get_workspace_in_bytes( + const SizeArgs& args) const { + TensorLayout A, B, C; + extract_matmul_layouts(args, A, B, C); + return args.handle->matmul_opr()->get_workspace_in_bytes(A, B, C); +} +void ConvolutionForwardImpl::Algo1x1::exec(const ExecArgs& args) const { + TensorND A, B, C; + extract_matmul_layouts(args, A.layout, B.layout, C.layout); + A.raw_ptr = args.filter_tensor->raw_ptr; + B.raw_ptr = args.src_tensor->raw_ptr; + C.raw_ptr = args.dst_tensor->raw_ptr; + size_t batch = args.src_layout->shape[0]; + auto mm = args.handle->matmul_opr(); + auto strd_B = args.src_layout->stride[0] * args.src_layout->dtype.size(), + strd_C = args.dst_layout->stride[0] * args.dst_layout->dtype.size(); + for (size_t i = 0; i < batch; ++i) { + mm->exec(A, B, C, args.workspace); + incr_voidp(B.raw_ptr, strd_B); + incr_voidp(C.raw_ptr, strd_C); + } +} + +/* + * Funcitons to handle large batch + */ +bool ConvolutionForwardImpl::Algo1x1LargeBatch::is_available( + const SizeArgs& args) const { + auto&& fm = args.filter_meta; + return fm.format == Param::Format::NCHW && + args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 && + (fm.dtype.enumv() == DTypeEnum::Float32 || + fm.dtype.enumv() == DTypeEnum::Float16) && + fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 && + fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 && + fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 && + fm.stride[1] == 1; +} + +void ConvolutionForwardImpl::Algo1x1LargeBatch::extract_matmul_layouts( + const SizeArgs& args, TensorLayout& A, TensorLayout& B, + TensorLayout& C) { + auto&& fm = args.filter_meta; + // A {N, OC, IC} + // B {N, IC, H * W} + // C {N, OC, H * W} + size_t batched = args.src_layout->shape[0]; + A = {{batched, fm.ocpg, fm.icpg}, fm.dtype}; + A.stride[0] = 0; + B.ndim = 3; + B.shape[1] = args.src_layout->shape[1]; + B.shape[2] = args.src_layout->shape[2] * args.src_layout->shape[3]; + B.shape[0] = batched; + B.stride[2] = 1; + B.stride[1] = args.src_layout->stride[1]; + B.stride[0] = args.src_layout->stride[0]; + B.dtype = args.src_layout->dtype; + 
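    // Note on the layout being assembled here: A.stride[0] == 0 makes the
    // single {OC, IC} filter matrix alias itself along the batch dimension,
    // so one batched-matmul call computes C[i] = A * B[i] for all N samples
    // without materializing N copies of the filter. B likewise reuses the
    // source tensor's own batch/channel strides, so no input copy is needed.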
C = {{args.dst_layout->shape[0], args.dst_layout->shape[1], B.shape[2]}, + args.dst_layout->dtype}; +} + +size_t ConvolutionForwardImpl::Algo1x1LargeBatch::get_workspace_in_bytes( + const SizeArgs& args) const { + TensorLayout A, B, C; + extract_matmul_layouts(args, A, B, C); + return args.handle->batched_matrix_mul()->get_workspace_in_bytes(A, B, C); +} + +void ConvolutionForwardImpl::Algo1x1LargeBatch::exec( + const ExecArgs& args) const { + TensorND A, B, C; + extract_matmul_layouts(args, A.layout, B.layout, C.layout); + A.raw_ptr = args.filter_tensor->raw_ptr; + B.raw_ptr = args.src_tensor->raw_ptr; + C.raw_ptr = args.dst_tensor->raw_ptr; + auto mm = args.handle->batched_matrix_mul(); + mm->exec(A, B, C, args.workspace); +} +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/forward/algo.cpp b/dnn/src/rocm/convolution/forward/algo.cpp new file mode 100644 index 00000000..df4db044 --- /dev/null +++ b/dnn/src/rocm/convolution/forward/algo.cpp @@ -0,0 +1,100 @@ +/** + * \file dnn/src/rocm/convolution/forward/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "hcc_detail/hcc_defs_prologue.h" + +#include "./algo.h" +#include "src/rocm/utils.h" + +using namespace megdnn; +using namespace rocm; + +ConvolutionForwardImpl::AlgoPack::AlgoPack() { + miopen_algos.push_back(&miopen); + non_miopen_algos.push_back(&matmul); + non_miopen_algos.push_back(&inplace_matmul); + non_miopen_algos.push_back(&a1x1); + non_miopen_algos.push_back(&batched_matrix_mul); + non_miopen_algos.push_back(&chanwise); + + all_algos.push_back(&matmul); + all_algos.push_back(&inplace_matmul); + all_algos.push_back(&a1x1); + all_algos.push_back(&batched_matrix_mul); + all_algos.push_back(&chanwise); + all_algos.push_back(&miopen); +} + +ConvolutionForwardImpl::AlgoPack ConvolutionForwardImpl::sm_algo_pack; + +ConvolutionForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvolutionForwardImpl* o, + const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) + : SizeArgs(o, src, o->check_layout_fwd(src, filter, dst), dst) {} + +ConvolutionForwardImpl::AlgoBase::SizeArgs::SizeArgs( + ConvolutionForwardImpl* o, const TensorLayout& src, + const CanonizedFilterMeta& filter, const TensorLayout& dst) + : ForwardSizeArgs{concrete_handle(o->handle()), &src, filter, &dst}, + opr{o} {} + +ConvolutionForwardImpl::AlgoBase::ExecArgs::ExecArgs( + ConvolutionForwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in filter, _megdnn_tensor_out dst, + _megdnn_workspace workspace) + : SizeArgs(opr, src.layout, filter.layout, dst.layout), + src_tensor{&src}, + filter_tensor{&filter}, + dst_tensor{&dst}, + workspace{workspace} {} + +std::string ConvolutionForwardImpl::AlgoBase::SizeArgs::to_string() const { + auto&& fm = filter_meta; + MEGDNN_MARK_USED_VAR(fm); + return megdnn_mangle(ssprintf( + "src=%s, filter=%u{%u,%u,%u,%u}, dst=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", + src_layout->to_string().c_str(), fm.group, fm.ocpg, fm.icpg, + fm.spatial[0], fm.spatial[1], dst_layout->to_string().c_str(), + fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], + fm.dilation[0], fm.dilation[1], !fm.should_flip, + src_layout->dtype.name(), 
dst_layout->dtype.name())); +} + +convolution::MIOpenCacheKey +ConvolutionForwardImpl::AlgoBase::SizeArgs::to_miopen_algo_cache_key() const { + convolution::MIOpenCacheKey res; + res.miopen_handle = reinterpret_cast(handle->miopen_handle()); + res.batch = src_layout->operator[](0); + res.IC = src_layout->operator[](1); + res.IH = src_layout->operator[](2); + res.IW = src_layout->operator[](3); + res.OH = dst_layout->operator[](2); + res.OW = dst_layout->operator[](3); + res.FH = filter_meta.spatial[0]; + res.FW = filter_meta.spatial[1]; + res.SH = filter_meta.stride[0]; + res.SW = filter_meta.stride[1]; + res.PH = filter_meta.padding[0]; + res.PW = filter_meta.padding[1]; + res.DH = filter_meta.dilation[0]; + res.DW = filter_meta.dilation[1]; + res.group = filter_meta.group; + res.ocpg = filter_meta.ocpg; + res.icpg = filter_meta.icpg; + res.dtype_enum = static_cast(src_layout->dtype.enumv()); + res.exhaustive_search = + static_cast(handle->enable_miopen_algo_search()); + res.OC = res.group * res.ocpg; + return res; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/forward/algo.h b/dnn/src/rocm/convolution/forward/algo.h new file mode 100644 index 00000000..f38cf8d3 --- /dev/null +++ b/dnn/src/rocm/convolution/forward/algo.h @@ -0,0 +1,194 @@ +/** + * \file dnn/src/rocm/convolution/forward/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/oprs.h" + +#include "src/common/utils.h" +#include "src/rocm/convolution/helper.h" +#include "src/rocm/convolution/opr_impl.h" +#include "src/rocm/handle.h" + +#include + +namespace megdnn { +namespace rocm { + +/*! 
+ * \brief base class for convolution algos + * + */ +class ConvolutionForwardImpl::AlgoBase : public Algorithm { +protected: + ~AlgoBase() = default; + +public: + struct SizeArgs : public convolution::ForwardSizeArgs { + ConvolutionForwardImpl* opr; + + std::string to_string() const; + convolution::MIOpenCacheKey to_miopen_algo_cache_key() const; + void init_desc(convolution::MIOpenForwardDescs& desc) const { + desc.set(*src_layout, filter_meta, *dst_layout, opr->param()); + } + SizeArgs(ConvolutionForwardImpl* opr, const TensorLayout& src, + const TensorLayout& filter, const TensorLayout& dst); + SizeArgs(ConvolutionForwardImpl* opr, const TensorLayout& src, + const CanonizedFilterMeta& filter, const TensorLayout& dst); + }; + struct ExecArgs : public SizeArgs { + const TensorND *src_tensor, *filter_tensor, *dst_tensor; + Workspace workspace; + + ExecArgs(ConvolutionForwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in filter, _megdnn_tensor_out dst, + _megdnn_workspace workspace); + }; + virtual bool is_available(const SizeArgs& args) const = 0; + virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_wk(const SizeArgs& args, size_t limit) { + return is_available(args) && get_workspace_in_bytes(args) <= limit; + } + bool is_available_reproducible( + const SizeArgs& args, bool reproducible = true, + size_t limit = std::numeric_limits::max()) { + return (!reproducible || is_reproducible()) && + is_available_wk(args, limit); + } + + AlgoBase& check_workspace(const SizeArgs& args, + const Workspace& workspace) { + auto req = get_workspace_in_bytes(args); + megdnn_assert(req <= workspace.size, + "conv fwd algo %s: required workspace %zu bytes, got %zu", + name(), req, workspace.size); + return *this; + } + + virtual bool is_miopen() const { return false; } +}; + +class ConvolutionForwardImpl::AlgoMIOpen final : public AlgoBase { + bool m_is_reproducible; + const char* m_name; + + miopenConvFwdAlgorithm_t find_best_algo(const ExecArgs& args); + +public: + AlgoMIOpen() = delete; + AlgoMIOpen(bool is_reproducible) : m_is_reproducible(is_reproducible) {} + + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + bool is_reproducible() const override { return m_is_reproducible; } + + const char* name() const override { return "MIOpenConvolutionForward"; } + + bool is_miopen() const override { return true; } + + static convolution::MIOpenCache + sm_miopen_algo_cache; + static convolution::MIOpenCache sm_miopen_ws_cache; +}; + +class ConvolutionForwardImpl::AlgoMatmul final : public AlgoBase { + template + static void exec_internal(const ExecArgs& args); + +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return "MATMUL"; } + bool is_reproducible() const override { return true; } +}; + +//! 
compute small matmul in the kernel +class ConvolutionForwardImpl::AlgoInplaceMatmul final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return "INPLACE_MATMUL"; } + bool is_reproducible() const override { return true; } +}; + +//! optimized 1x1 conv +class ConvolutionForwardImpl::Algo1x1 final : public AlgoBase { + static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A, + TensorLayout& B, TensorLayout& C); + +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return "1x1"; } + bool is_reproducible() const override { return true; } +}; + +//! optimized 1x1 conv when input data batchsize is larger than 32 +class ConvolutionForwardImpl::Algo1x1LargeBatch final : public AlgoBase { + static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A, + TensorLayout& B, TensorLayout& C); + +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return "LARGE_BATCH_1x1"; } + bool is_reproducible() const override { return true; } +}; + +class ConvolutionForwardImpl::AlgoChanwise final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return "CHANNEL_WISE"; } + bool is_reproducible() const override { return true; } +}; + +class ConvolutionForwardImpl::AlgoPack { + // defined in miopen.cpp + void fill_miopen_algos(); + + AlgoPack(const AlgoPack&) = delete; + AlgoPack& operator=(const AlgoPack&) = delete; + +public: + AlgoPack(); + + AlgoMIOpen miopen{true}; + AlgoMatmul matmul; + AlgoInplaceMatmul inplace_matmul; + Algo1x1 a1x1; + Algo1x1LargeBatch batched_matrix_mul; + AlgoChanwise chanwise; + + std::vector + //! all algorithms + all_algos, miopen_algos, non_miopen_algos; +}; + +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/forward/chanwise.cpp b/dnn/src/rocm/convolution/forward/chanwise.cpp new file mode 100644 index 00000000..2f46bcda --- /dev/null +++ b/dnn/src/rocm/convolution/forward/chanwise.cpp @@ -0,0 +1,54 @@ +/** + * \file dnn/src/rocm/convolution/forward/chanwise.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
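The forward AlgoPack mirrors the backward-filter one: the specialized algorithms come first in all_algos and MIOpen last, so a first-match scan naturally prefers the hand-written kernels. A sketch of such a scan using the reproducibility filter from AlgoBase — the driver function below is hypothetical and namespaces are omitted:

    // Hypothetical driver: first reproducible algorithm that fits, or nullptr.
    ConvolutionForwardImpl::AlgoBase* find_reproducible(
            const std::vector<ConvolutionForwardImpl::AlgoBase*>& algos,
            const ConvolutionForwardImpl::AlgoBase::SizeArgs& args,
            size_t workspace_limit) {
        for (auto* algo : algos) {
            if (algo->is_available_reproducible(args, /*reproducible=*/true,
                                                workspace_limit))
                return algo;
        }
        return nullptr;
    }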
+ */
+
+#include "./algo.h"
+#include "src/rocm/utils.h"
+#include "src/rocm/convolution/chanwise/kern.h.hip"
+
+using namespace megdnn;
+using namespace rocm;
+using namespace convolution;
+
+bool ConvolutionForwardImpl::AlgoChanwise::is_available(
+        const SizeArgs& args) const {
+    auto&& fm = args.filter_meta;
+    return fm.format == Param::Format::NCHW &&
+           args.src_layout->dtype.category() == DTypeCategory::FLOAT &&
+           args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
+           fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 &&
+           fm.dilation[1] == 1 && !fm.should_flip;
+}
+
+size_t ConvolutionForwardImpl::AlgoChanwise::get_workspace_in_bytes(
+        const SizeArgs&) const {
+    return 0;
+}
+
+void ConvolutionForwardImpl::AlgoChanwise::exec(const ExecArgs& args) const {
+    auto kparam = chanwise::Param::from_fwd_args(args);
+    auto stream = hip_stream(args.handle);
+    switch (args.src_layout->dtype.enumv()) {
+#define cb(_dt)                                                         \
+    case DTypeTrait<_dt>::enumv: {                                      \
+        using ctype = DTypeTrait<_dt>::ctype;                           \
+        return chanwise::run_fwd(args.dst_tensor->ptr<ctype>(),         \
+                                 args.src_tensor->ptr<ctype>(),         \
+                                 args.filter_tensor->ptr<ctype>(),      \
+                                 kparam, stream);                       \
+    }
+        MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
+#undef cb
+        default:
+            break;
+    }
+    megdnn_assert_internal(0);
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/convolution/forward/inplace_matmul.cpp b/dnn/src/rocm/convolution/forward/inplace_matmul.cpp
new file mode 100644
index 00000000..80b51e6c
--- /dev/null
+++ b/dnn/src/rocm/convolution/forward/inplace_matmul.cpp
@@ -0,0 +1,49 @@
+/**
+ * \file dnn/src/rocm/convolution/forward/inplace_matmul.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
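The cb(_dt) switch in AlgoChanwise::exec above is MegDNN's standard dtype dispatch: a macro expands one case per floating dtype, binding the C type before calling the templated kernel launcher. A distilled, self-contained version of the pattern — the enum and ctypes below are stand-ins for the real DType traits machinery, not part of this diff:

    #include <cstdio>

    enum class DTypeEnum { Float32, Float16 };

    template <typename T>
    void run_kernel(const char* name) { std::printf("run %s\n", name); }

    void dispatch(DTypeEnum dt) {
        switch (dt) {
    #define cb(_enum, _ctype)          \
        case DTypeEnum::_enum:         \
            return run_kernel<_ctype>(#_enum);
            cb(Float32, float)
            cb(Float16, short)  // placeholder ctype for this sketch only
    #undef cb
        }
    }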
+ */ + +#include "./algo.h" +#include "./inplace_matmul_impl.h.hip" + +using namespace megdnn; +using namespace rocm; + +bool ConvolutionForwardImpl::AlgoInplaceMatmul::is_available( + const SizeArgs& args) const { + auto&& fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCHW && + args.src_layout->dtype == dtype::Float32() && fm.group == 1 && + fm.spatial_ndim == 2 && fm.dilation[0] == 1 && fm.dilation[1] == 1; +} + +size_t ConvolutionForwardImpl::AlgoInplaceMatmul::get_workspace_in_bytes( + const SizeArgs&) const { + return 0; +} + +void ConvolutionForwardImpl::AlgoInplaceMatmul::exec( + const ExecArgs& args) const { + auto&& fm = args.filter_meta; + size_t N = args.src_layout->shape[0], IC = fm.icpg, + IH = args.src_layout->shape[2], IW = args.src_layout->shape[3], + OC = fm.ocpg, OH = args.dst_layout->shape[2], + OW = args.dst_layout->shape[3], FH = fm.spatial[0], + FW = fm.spatial[1]; + auto stream = args.handle->stream(); + convolution::exec_inplace_matmul_fwd( + args.src_tensor->ptr(), + args.filter_tensor->ptr(), + args.dst_tensor->ptr(), N, args.src_layout->stride[0], + args.dst_layout->stride[0], IC, IH, IW, OC, OH, OW, FH, FW, + fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], + !fm.should_flip, stream); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/forward/inplace_matmul_impl.cpp.hip b/dnn/src/rocm/convolution/forward/inplace_matmul_impl.cpp.hip new file mode 100644 index 00000000..09fe7f0e --- /dev/null +++ b/dnn/src/rocm/convolution/forward/inplace_matmul_impl.cpp.hip @@ -0,0 +1,377 @@ +/** + * \file src/rocm/convolution/forward/inplace_matmul_impl.cpp.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + */ +#include "./inplace_matmul_impl.h.hip" +#include "src/rocm/utils.h.hip" + +using namespace megdnn; +using namespace rocm; + +namespace { + +struct BufferFetcherTexture { + hipTextureObject_t tex; + + __device__ __forceinline__ float get(uint32_t offset) { + return tex1Dfetch(tex, offset); + } +}; + +struct BufferFetcherRaw { + const float* ptr; + + __device__ __forceinline__ float get(uint32_t offset) { + return ptr[offset]; + } +}; + +struct BufferFetcherTextureHost { + bool init_succ; + BufferFetcherTexture val; + + BufferFetcherTextureHost(float* p, const size_t n); + + ~BufferFetcherTextureHost() { reset(); } + + void reset() { + if (init_succ) { + hip_check(hipDestroyTextureObject(val.tex)); + init_succ = false; + } + } +}; + +BufferFetcherTextureHost::BufferFetcherTextureHost(float* p, const size_t n) { + init_succ = false; + hipTextureObject_t tex_obj; + + hipResourceDesc res_desc; + memset(&res_desc, 0, sizeof(hipResourceDesc)); + res_desc.resType = hipResourceTypeLinear; + res_desc.res.linear.devPtr = static_cast(p); + res_desc.res.linear.sizeInBytes = n * sizeof(float); + res_desc.res.linear.desc = + hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat); + hipTextureDesc tex_desc; + memset(&tex_desc, 0, sizeof(hipTextureDesc)); + if (hipCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL) == + hipSuccess) { + val.tex = tex_obj; + init_succ = true; + } else { + hipGetLastError(); // reset error + } +} + +template +struct KernelPtr { + typedef void (*type)(BufferFetcher, BufferFetcher, float*, uint32_t, + uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t); +}; + +//! 
1 -> 0xffffffff, 0 -> 0x00000000 +__device__ __forceinline__ uint32_t bool_as_mask(uint32_t cond) { + return (!cond) - 1u; +} + +union FloatAndU32 { + float f; + uint32_t u; +}; + +//! \p mask must be either all 1 or 0 bits +template +__device__ __forceinline__ float visit_with_mask(BufferFetcher buf, + uint32_t offset, + uint32_t mask) { + FloatAndU32 f; + f.f = buf.get(offset & mask); + f.u &= mask; + return f.f; +} + +template +__global__ void conv_kernel(BufferFetcher src, BufferFetcher filter, float* dst, + const uint32_t INP_BS, const uint32_t OUT_BS, + const uint32_t IC, const uint32_t IH, + const uint32_t IW, const uint32_t OC, + const uint32_t OH, const uint32_t OW, + const uint32_t FH, const uint32_t FW, + const uint32_t SH, const uint32_t SW, + const uint32_t PH, const uint32_t PW) { + const uint32_t BM = BY < BX ? BY : BX; + const uint32_t n = blockIdx.z; + const uint32_t tidx = threadIdx.x; + const uint32_t tidy = threadIdx.y; + const uint32_t posx = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t posy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t posx2 = posx << 2; + const uint32_t posy2 = posy << 2; + const uint32_t heightA = OC; + const uint32_t widthA = IC * FH * FW; + const uint32_t heightB = widthA; + const uint32_t widthB = OH * OW; + const uint32_t oh0 = (posx2 + 0) / OW * SH; + const uint32_t ow0 = (posx2 + 0) % OW * SW; + const uint32_t op0 = oh0 * IW + ow0; + const uint32_t oh1 = (posx2 + 1) / OW * SH; + const uint32_t ow1 = (posx2 + 1) % OW * SW; + const uint32_t op1 = oh1 * IW + ow1; + const uint32_t oh2 = (posx2 + 2) / OW * SH; + const uint32_t ow2 = (posx2 + 2) % OW * SW; + const uint32_t op2 = oh2 * IW + ow2; + const uint32_t oh3 = (posx2 + 3) / OW * SH; + const uint32_t ow3 = (posx2 + 3) % OW * SW; + const uint32_t op3 = oh3 * IW + ow3; + const uint32_t FP = FH * FW; + __shared__ float4 localA[BY][BM]; + __shared__ float4 localB[BM][BX]; + uint32_t i = 0u; + uint32_t offsetA = posy2 * widthA + tidx; + uint32_t offsetB = n * INP_BS - PH * IW - PW; + float4 sum0 = {0.0f, 0.0f, 0.0f, 0.0f}, sum1 = {0.0f, 0.0f, 0.0f, 0.0f}, + sum2 = {0.0f, 0.0f, 0.0f, 0.0f}, sum3 = {0.0f, 0.0f, 0.0f, 0.0f}; + uint32_t fh = tidy / FW % FH; + uint32_t fw = tidy % FW; + uint32_t ic = tidy / (FH * FW); + uint32_t icm = tidy % (FH * FW); + + const uint32_t fhs = BM / FW % FH; + const uint32_t fws = BM % FW; + const uint32_t ics = BM / (FH * FW); + const uint32_t icms = BM % (FH * FW); + + for (; i < widthA; i += BM, offsetA += BM) { + // load localA + if (tidx < BM) { + localA[tidy][tidx].x = filter.get(offsetA + 0 * widthA); + localA[tidy][tidx].y = filter.get(offsetA + 1 * widthA); + localA[tidy][tidx].z = filter.get(offsetA + 2 * widthA); + localA[tidy][tidx].w = filter.get(offsetA + 3 * widthA); + } + + // load localB + uint32_t fh2, fw2; + if (is_xcorr) { + fh2 = fh; + fw2 = fw; + } else { + fh2 = FH - fh - 1; + fw2 = FW - fw - 1; + } + + if (tidy < BM) { + uint32_t tmp = offsetB + (ic * IH + (fh2)) * IW + (fw2), + ok = bool_as_mask(tidy + i < heightB), + p0 = bool_as_mask(fh2 + oh0 >= PH && fh2 + oh0 < IH + PH && + fw2 + ow0 >= PW && fw2 + ow0 < IW + PW), + p1 = bool_as_mask(fh2 + oh1 >= PH && fh2 + oh1 < IH + PH && + fw2 + ow1 >= PW && fw2 + ow1 < IW + PW), + p2 = bool_as_mask(fh2 + oh2 >= PH && fh2 + oh2 < IH + PH && + fw2 + ow2 >= PW && fw2 + ow2 < IW + PW), + p3 = bool_as_mask(fh2 + oh3 >= PH && fh2 + oh3 < IH + PH && + fw2 + ow3 >= PW && fw2 + ow3 < IW + PW); + localB[tidy][tidx].x = visit_with_mask(src, tmp + op0, ok & p0); + localB[tidy][tidx].y = 
visit_with_mask(src, tmp + op1, ok & p1); + localB[tidy][tidx].z = visit_with_mask(src, tmp + op2, ok & p2); + localB[tidy][tidx].w = visit_with_mask(src, tmp + op3, ok & p3); + } + + __syncthreads(); + + for (uint32_t j = 0u; j < BM; ++j) { + float4 tmpA = localA[tidy][j]; + float4 tmpB = localB[j][tidx]; + sum0.x += tmpA.x * tmpB.x; + sum0.y += tmpA.x * tmpB.y; + sum0.z += tmpA.x * tmpB.z; + sum0.w += tmpA.x * tmpB.w; + sum1.x += tmpA.y * tmpB.x; + sum1.y += tmpA.y * tmpB.y; + sum1.z += tmpA.y * tmpB.z; + sum1.w += tmpA.y * tmpB.w; + sum2.x += tmpA.z * tmpB.x; + sum2.y += tmpA.z * tmpB.y; + sum2.z += tmpA.z * tmpB.z; + sum2.w += tmpA.z * tmpB.w; + sum3.x += tmpA.w * tmpB.x; + sum3.y += tmpA.w * tmpB.y; + sum3.z += tmpA.w * tmpB.z; + sum3.w += tmpA.w * tmpB.w; + } + + fw += fws; + fh += fhs; + fh += (fw >= FW); + fh -= (fh >= FH) * FH; + fw -= (fw >= FW) * FW; + + ic += ics; + icm += icms; + ic += (icm >= FP); + icm -= (icm >= FP) * FP; + __syncthreads(); + } + const uint32_t dst_idx = n * OUT_BS + posy2 * widthB + posx2; + bool y0 = (posy2 + 0 < heightA); + bool y1 = (posy2 + 1 < heightA); + bool y2 = (posy2 + 2 < heightA); + bool y3 = (posy2 + 3 < heightA); + bool x0 = (posx2 + 0 < widthB); + bool x1 = (posx2 + 1 < widthB); + bool x2 = (posx2 + 2 < widthB); + bool x3 = (posx2 + 3 < widthB); + if (y0) { + if (x0) + dst[dst_idx + 0 * widthB + 0] = sum0.x; + if (x1) + dst[dst_idx + 0 * widthB + 1] = sum0.y; + if (x2) + dst[dst_idx + 0 * widthB + 2] = sum0.z; + if (x3) + dst[dst_idx + 0 * widthB + 3] = sum0.w; + } + if (y1) { + if (x0) + dst[dst_idx + 1 * widthB + 0] = sum1.x; + if (x1) + dst[dst_idx + 1 * widthB + 1] = sum1.y; + if (x2) + dst[dst_idx + 1 * widthB + 2] = sum1.z; + if (x3) + dst[dst_idx + 1 * widthB + 3] = sum1.w; + } + if (y2) { + if (x0) + dst[dst_idx + 2 * widthB + 0] = sum2.x; + if (x1) + dst[dst_idx + 2 * widthB + 1] = sum2.y; + if (x2) + dst[dst_idx + 2 * widthB + 2] = sum2.z; + if (x3) + dst[dst_idx + 2 * widthB + 3] = sum2.w; + } + if (y3) { + if (x0) + dst[dst_idx + 3 * widthB + 0] = sum3.x; + if (x1) + dst[dst_idx + 3 * widthB + 1] = sum3.y; + if (x2) + dst[dst_idx + 3 * widthB + 2] = sum3.z; + if (x3) + dst[dst_idx + 3 * widthB + 3] = sum3.w; + } +} + +} // anonymous namespace + +void convolution::exec_inplace_matmul_fwd( + const float* src, const float* filter, float* dst, size_t N, + size_t INP_BS, size_t OUT_BS, size_t IC, size_t IH, size_t IW, + size_t OC, size_t OH, size_t OW, size_t FH, size_t FW, size_t PH, + size_t PW, size_t SH, size_t SW, bool is_xcorr, hipStream_t stream) { + BufferFetcherTextureHost src_tex(const_cast(src), N * INP_BS), + filter_tex(const_cast(filter), OC * IC * FH * FW); + + BufferFetcherRaw src_buf, filter_buf; + src_buf.ptr = src; + filter_buf.ptr = filter; + if (!src_tex.init_succ || !filter_tex.init_succ) { + src_tex.reset(); + filter_tex.reset(); + } + int m = OC; + int n = OH * OW; + int BY = 1; + int BX = 1; + if (m <= 64) { + while (BY < 16 && (BY << 2) < m) + BY <<= 1; + BX = 256 / BY; + } else if (n <= 64) { + while (BX < 16 && (BX << 2) < n) + BX <<= 1; + BY = 256 / BX; + } else { + BX = BY = 16; + } + dim3 blocks((OH * OW + BX * 4 - 1) / (BX * 4), (OC + BY * 4 - 1) / (BY * 4), + N); + dim3 threads(BX, BY); +#define DISPATCH_BX_BY(BX, BY) \ + do { \ + if (src_tex.init_succ) { \ + KernelPtr::type kptr; \ + if (is_xcorr) { \ + kptr = conv_kernel; \ + } else { \ + kptr = conv_kernel; \ + } \ + kptr<<>>( \ + src_tex.val, filter_tex.val, dst, INP_BS, OUT_BS, IC, IH, \ + IW, OC, OH, OW, FH, FW, SH, SW, PH, PW); \ + } else { \ + 
 KernelPtr<BufferFetcherRaw>::type kptr; \ + if (is_xcorr) { \ + kptr = conv_kernel<BY, BX, true, BufferFetcherRaw>; \ + } else { \ + kptr = conv_kernel<BY, BX, false, BufferFetcherRaw>; \ + } \ + kptr<<<blocks, threads, 0, stream>>>( \ + src_buf, filter_buf, dst, INP_BS, OUT_BS, IC, IH, IW, OC, \ + OH, OW, FH, FW, SH, SW, PH, PW); \ + } \ + } while (0) +#define DISPATCH_BX(BX) \ + do { \ + DISPATCH_BX_BY(BX, 256 / BX); \ + } while (0) +#define DISPATCH() \ + do { \ + switch (BX) { \ + case 1: \ + DISPATCH_BX(1); \ + break; \ + case 2: \ + DISPATCH_BX(2); \ + break; \ + case 4: \ + DISPATCH_BX(4); \ + break; \ + case 8: \ + DISPATCH_BX(8); \ + break; \ + case 16: \ + DISPATCH_BX(16); \ + break; \ + case 32: \ + DISPATCH_BX(32); \ + break; \ + case 64: \ + DISPATCH_BX(64); \ + break; \ + case 128: \ + DISPATCH_BX(128); \ + break; \ + case 256: \ + DISPATCH_BX(256); \ + break; \ + default: \ + report_error("no usable kernel"); \ + } \ + } while (0) + DISPATCH(); +#undef DISPATCH +#undef DISPATCH_BX +#undef DISPATCH_BX_BY + after_kernel_launch(); +} + +// vim: syntax=cpp.doxygen
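The dispatcher above always launches 256-thread blocks and only chooses their shape: each thread accumulates a 4x4 tile, so a (BX, BY) block covers 4*BY rows (output channels) by 4*BX columns (output pixels) of the implicit GEMM. A standalone restatement of that heuristic with a worked example (sketch, not part of the patch):

    #include <cstdio>

    // Shape a 256-thread block for an m x n (OC x OH*OW) problem.
    static void choose_block(int m, int n, int& BX, int& BY) {
        BX = BY = 1;
        if (m <= 64) {           // few output channels: widen the block
            while (BY < 16 && (BY << 2) < m) BY <<= 1;
            BX = 256 / BY;
        } else if (n <= 64) {    // few output pixels: make the block tall
            while (BX < 16 && (BX << 2) < n) BX <<= 1;
            BY = 256 / BX;
        } else {
            BX = BY = 16;        // large problem: square 16x16 block
        }
    }

    int main() {
        int BX, BY;
        choose_block(/*m=OC*/ 32, /*n=OH*OW*/ 4096, BX, BY);
        printf("BX=%d BY=%d\n", BX, BY);  // BX=32 BY=8 -> 32 rows x 128 cols
    }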
+ */ + +#include "./algo.h" +#include "src/rocm/utils.h" +#include "src/rocm/utils.h.hip" +#include "src/rocm/convolution/helper.h" +#include "src/rocm/convolution/im2col.h.hip" + +using namespace megdnn; +using namespace rocm; + +bool ConvolutionForwardImpl::AlgoMatmul::is_available( + const SizeArgs& args) const { + auto&& fm = args.filter_meta; + return args.filter_meta.format == Param::Format::NCHW && + args.src_layout->dtype.category() == DTypeCategory::FLOAT && + args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 && + fm.group == 1 && fm.spatial_ndim == 2; +} + +size_t ConvolutionForwardImpl::AlgoMatmul::get_workspace_in_bytes( + const SizeArgs& args) const { + return matmul_get_workspace_bundle(args).total_size_in_bytes(); +} + +void ConvolutionForwardImpl::AlgoMatmul::exec(const ExecArgs& args) const { +#define cb(DType) \ + if (args.src_layout->dtype == DType()) { \ + using ctype = typename DTypeTrait::ctype; \ + exec_internal(args); \ + return; \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) +#undef cb + + megdnn_assert_internal(0); +} + +template +void ConvolutionForwardImpl::AlgoMatmul::exec_internal(const ExecArgs& args) { + auto&& fm = args.filter_meta; + size_t N = args.src_layout->shape[0], IC = fm.icpg, + IH = args.src_layout->shape[2], IW = args.src_layout->shape[3], + OC = fm.ocpg, OH = args.dst_layout->shape[2], + OW = args.dst_layout->shape[3], FH = fm.spatial[0], + FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1], + SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0], + DW = fm.dilation[1]; + auto stream = hip_stream(args.handle); + auto wbundle = matmul_get_workspace_bundle(args); + wbundle.set(args.workspace.raw_ptr); + T* dst_t = static_cast(wbundle.get(0)); + T* col = static_cast(wbundle.get(1)); + convolution::im2col(args.src_tensor->ptr(), col, N, + args.src_layout->stride[0], IC, IH, IW, FH, FW, OH, + OW, PH, PW, SH, SW, DH, DW, stream); + TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait::dtype()), + Bl({IC * FH * FW, OH * OW * N}, typename DTypeTrait::dtype()), + Cl({OC, OH * OW * N}, typename DTypeTrait::dtype()); + TensorND A(args.filter_tensor->ptr(), Al), B(col, Bl), C(dst_t, Cl); + if (fm.should_flip) { + convolution::flip_filter(args, wbundle.get_workspace(2), A.raw_ptr); + } + args.handle->matmul_opr()->exec(A, B, C, Workspace()); + TensorLayout C2l({OC * OH * OW, N}, typename DTypeTrait::dtype()), + C3l = C2l; + C3l.stride[0] = 1; + C3l.stride[1] = args.dst_tensor->layout.stride[0]; + TensorND C2(dst_t, C2l); + TensorND C3(args.dst_tensor->ptr(), C3l); + args.handle->relayout_opr()->exec(C2, C3); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/forward/miopen.cpp b/dnn/src/rocm/convolution/forward/miopen.cpp new file mode 100644 index 00000000..1ed75d17 --- /dev/null +++ b/dnn/src/rocm/convolution/forward/miopen.cpp @@ -0,0 +1,111 @@ +/** + * \file dnn/src/rocm/convolution/forward/miopen.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "hcc_detail/hcc_defs_prologue.h" + +#include "./algo.h" + +#include +#include "src/rocm/convolution/helper.h" +#include "src/rocm/miopen_wrapper.h" +#include "src/rocm/utils.h" + +using namespace megdnn; +using namespace rocm; +using namespace convolution; + +MIOpenCache + ConvolutionForwardImpl::AlgoMIOpen::sm_miopen_algo_cache; +MIOpenCache + ConvolutionForwardImpl::AlgoMIOpen::sm_miopen_ws_cache; + +bool ConvolutionForwardImpl::AlgoMIOpen::is_available( + const SizeArgs& args) const { + if (!is_miopen_supported(args)) + return false; + auto got = sm_miopen_ws_cache.get(args); + if (got.first) + return true; + MIOpenForwardDescs D; + args.init_desc(D); + size_t workspace_size; + auto status = miopenConvolutionForwardGetWorkSpaceSize( + args.handle->miopen_handle(), D.filter_desc.desc, D.src_desc.desc, + D.conv_desc.desc, D.dst_desc.desc, &workspace_size); + if (status == miopenStatusSuccess) { + sm_miopen_ws_cache.set(args, workspace_size); + return true; + } + return false; +} + +size_t ConvolutionForwardImpl::AlgoMIOpen::get_workspace_in_bytes( + const SizeArgs& args) const { + auto got = sm_miopen_ws_cache.get(args); + if (got.first) + return got.second; + MIOpenForwardDescs D; + args.init_desc(D); + size_t workspace_size; + auto status = miopenConvolutionForwardGetWorkSpaceSize( + args.handle->miopen_handle(), D.filter_desc.desc, D.src_desc.desc, + D.conv_desc.desc, D.dst_desc.desc, &workspace_size); + megdnn_assert(status == miopenStatusSuccess, + "conv fwd get workspace failed: %s; info: %s", + miopenGetErrorString(status), args.to_string().c_str()); + sm_miopen_ws_cache.set(args, workspace_size); + return workspace_size; +} + +miopenConvFwdAlgorithm_t ConvolutionForwardImpl::AlgoMIOpen::find_best_algo( + const ExecArgs& args) { + auto find_algo = sm_miopen_algo_cache.get(args); + if (find_algo.first) + return find_algo.second; + bool exhaustive_search = args.handle->enable_miopen_algo_search(); + MIOpenForwardDescs D; + args.init_desc(D); + const int req_algo_count = 1; + int ret_algo_count; + miopenConvAlgoPerf_t algo_perf; + miopen_check(miopenFindConvolutionForwardAlgorithm( + args.handle->miopen_handle(), D.src_desc.desc, + args.src_tensor->raw_ptr, D.filter_desc.desc, + args.filter_tensor->raw_ptr, D.conv_desc.desc, D.dst_desc.desc, + args.dst_tensor->raw_ptr, req_algo_count, &ret_algo_count, + &algo_perf, args.workspace.raw_ptr, args.workspace.size, + exhaustive_search)); + sm_miopen_algo_cache.set(args, algo_perf.fwd_algo); + return algo_perf.fwd_algo; +} + +void ConvolutionForwardImpl::AlgoMIOpen::exec(const ExecArgs& args) const { + MIOpenForwardDescs D; + args.init_desc(D); + auto algo = const_cast(this) + ->find_best_algo(args); + float alpha = 1.0f, beta = 0.0f; + auto status = miopenConvolutionForward( + args.handle->miopen_handle(), &alpha, D.src_desc.desc, + args.src_tensor->raw_ptr, D.filter_desc.desc, + args.filter_tensor->raw_ptr, D.conv_desc.desc, algo, &beta, + D.dst_desc.desc, args.dst_tensor->raw_ptr, args.workspace.raw_ptr, + args.workspace.size); + megdnn_assert(status == miopenStatusSuccess, + "conv fwd failed: %s; info: %s", miopenGetErrorString(status), + args.to_string().c_str()); +} + +void ConvolutionForwardImpl::AlgoPack::fill_miopen_algos() { + megdnn_throw("MIOpen has implemented auto-tuning in the framework, so we do not need to choose algorithms manually"); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/helper.cpp b/dnn/src/rocm/convolution/helper.cpp new file mode 100644 index 00000000..cb52a21c --- 
diff --git a/dnn/src/rocm/convolution/helper.cpp b/dnn/src/rocm/convolution/helper.cpp new file mode 100644 index 00000000..cb52a21c --- /dev/null +++ b/dnn/src/rocm/convolution/helper.cpp @@ -0,0 +1,102 @@ +/** + * \file dnn/src/rocm/convolution/helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "hcc_detail/hcc_defs_prologue.h" + +#include "./helper.h" +#include "./forward/algo.h" +#include "./backward_data/algo.h" +#include "./backward_filter/algo.h" + +using namespace megdnn; +using namespace rocm; +using namespace convolution; + +bool convolution::is_miopen_supported(const ForwardSizeArgs& args) { + //! TODO: We only support NCHW format now. It seems MIOpen does not + //! support NHWC or NCHW4 yet + if (args.filter_meta.format != param::Convolution::Format::NCHW) { + return false; + } + auto& fm = args.filter_meta; + //! TODO: It seems MIOpen does not support non-xcorr convolution + return !fm.should_flip; +} + +std::string MIOpenCacheKey::to_string_binary() const { + std::string ret(sizeof(MIOpenCacheKey), '\0'); + auto ptr = reinterpret_cast<MIOpenCacheKey*>(&ret[0]); + *ptr = *this; + return ret; +} + +template <typename Args, typename ValueType> +void MIOpenCache<Args, ValueType>::set(const Args& args, ValueType val) { + std::string key = args.to_miopen_algo_cache_key().to_string_binary(); + std::lock_guard<std::mutex> guard{m_mtx}; + m_cache[key] = val; +} + +template <typename Args, typename ValueType> +std::pair<bool, ValueType> MIOpenCache<Args, ValueType>::get(const Args& args) { + std::string key = args.to_miopen_algo_cache_key().to_string_binary(); + std::lock_guard<std::mutex> guard{m_mtx}; + auto search = m_cache.find(key); + bool find = search != m_cache.end(); + ValueType val = ValueType(); + if (find) { + val = search->second; + } + return std::make_pair(find, val); +} + +#define INST(_opr, _miopen_algo) \ + template class megdnn::rocm::convolution::MIOpenCache< \ + _opr::AlgoBase::SizeArgs, _miopen_algo>; \ + template class megdnn::rocm::convolution::MIOpenCache< \ + _opr::AlgoBase::SizeArgs, size_t>; + +INST(ConvolutionForwardImpl, miopenConvFwdAlgorithm_t); +INST(ConvolutionBackwardDataImpl, miopenConvBwdDataAlgorithm_t); +INST(ConvolutionBackwardFilterImpl, miopenConvBwdWeightsAlgorithm_t); + +WorkspaceBundle convolution::matmul_get_workspace_bundle( + const ForwardSizeArgs& args) { + auto dtype = args.src_layout->dtype; + auto&& fm = args.filter_meta; + megdnn_assert(fm.group == 1); + auto N = args.src_layout->shape[0]; + auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1]; + auto OH = args.dst_layout->shape[2], OW = args.dst_layout->shape[3]; + SmallVector<size_t> sizes{dtype.size() * args.dst_layout->total_nr_elems(), + dtype.size() * IC * FH * FW * OH * OW * N}; + if (args.filter_meta.should_flip) { + sizes.push_back(dtype.size() * OC * IC * FH * FW); + } + return {nullptr, std::move(sizes)}; +} + +void convolution::flip_filter(const ForwardSizeArgs& args, + const Workspace& workspace, void*& raw_ptr) { + auto&& fm = args.filter_meta; + megdnn_assert(fm.group == 1 && fm.spatial_ndim == 2); + auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1]; + auto dtype = fm.dtype; + megdnn_assert(workspace.size >= dtype.size() * OC * IC * FH * FW); + + TensorND src{raw_ptr, {{OC, IC, FH, FW}, dtype}}, + dst{workspace.raw_ptr + (FH * FW - 1) * dtype.size(), src.layout}; + dst.layout.stride[2] = -dst.layout.stride[2]; + dst.layout.stride[3] = -dst.layout.stride[3]; + args.handle->relayout_opr()->exec(src, dst); + raw_ptr = workspace.raw_ptr; +} + +// vim: syntax=cpp.doxygen
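MIOpenCacheKey::to_string_binary above reuses the raw bytes of the POD key struct as the std::string key of the unordered_map, avoiding a hand-written hash. The same idea in a self-contained form (using memcpy rather than the pointer cast; both assume a trivially copyable struct):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <string>
    #include <unordered_map>

    struct ToyKey {
        uint32_t batch, ic, oc;
        int exhaustive_search;
    };

    static std::string to_string_binary(const ToyKey& k) {
        std::string ret(sizeof(ToyKey), '\0');
        std::memcpy(&ret[0], &k, sizeof(ToyKey));  // bytes become the map key
        return ret;
    }

    int main() {
        std::unordered_map<std::string, int> cache;
        ToyKey k{1, 3, 8, 0};
        cache[to_string_binary(k)] = 42;
        printf("%d\n", cache.at(to_string_binary(k)));  // 42
    }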
diff --git a/dnn/src/rocm/convolution/helper.h b/dnn/src/rocm/convolution/helper.h new file mode 100644 index 00000000..0029ba55 --- /dev/null +++ b/dnn/src/rocm/convolution/helper.h @@ -0,0 +1,139 @@ +/** + * \file dnn/src/rocm/convolution/helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "./opr_impl.h" +#include "src/rocm/miopen_wrapper.h" +#include "src/rocm/handle.h" +#include "src/common/utils.h" +#include "src/common/algo_chooser.h" + +#include + +namespace megdnn { +namespace rocm { +namespace convolution { + +struct MIOpenCacheKey { + int64_t miopen_handle; + uint32_t batch, IC, IH, IW, OC, OH, OW, FH, FW, SH, SW, PH, PW, DH, DW, + group, ocpg, icpg, dtype_enum; + int exhaustive_search; + std::string to_string_binary() const; +}; + +//! FIXME: MIOpenCache to avoid calling find() and GetWorkSpaceSize() +//! redundantly +template <typename Args, typename ValueType> +class MIOpenCache { + using HashMap = std::unordered_map<std::string, ValueType>; + HashMap m_cache; + std::mutex m_mtx; + +public: + MIOpenCache() = default; + ~MIOpenCache() noexcept = default; + void set(const Args& args, ValueType val); + std::pair<bool, ValueType> get(const Args& args); +}; + +using CanonizedFilterMeta = ConvolutionForward::CanonizedFilterMeta; + +//! conv size descriptor in the forward view +struct ForwardSizeArgs { + HandleImpl* handle; + const TensorLayout* src_layout; + CanonizedFilterMeta filter_meta; + const TensorLayout* dst_layout; +}; + +//! whether miopen is supported for a filter meta +bool is_miopen_supported(const ForwardSizeArgs& args); + +//! get workspace bundle for matmul algo +WorkspaceBundle matmul_get_workspace_bundle(const ForwardSizeArgs& args); + +/*! + * \brief flip conv filter + * + * Flip conv filter pointed by \p raw_ptr, store result in workspace, and + * change \p raw_ptr to workspace. + */ +void flip_filter(const ForwardSizeArgs& args, const Workspace& workspace, + void*& raw_ptr);
 + +struct MIOpenForwardDescs { + TensorDesc src_desc, filter_desc, dst_desc; + ConvDesc conv_desc; + void set(const TensorLayout& src, const CanonizedFilterMeta& filter, + const TensorLayout& dst, const param::Convolution& param) { + src_desc.set(src, param.format); + auto&& group = filter.group; + auto&& ocpg = filter.ocpg; + auto&& icpg = filter.icpg; + auto&& fh = filter.spatial[0]; + auto&& fw = filter.spatial[1]; + TensorLayout filter_layout{{group * ocpg, icpg, fh, fw}, filter.dtype}; + filter_desc.set(filter_layout, param.format); + dst_desc.set(dst, param.format); + bool is_depthwise = param.sparse == param::Convolution::Sparse::GROUP && + (icpg == 1) && (ocpg == 1); + conv_desc.set(param, filter.group, is_depthwise); + } +}; + +struct MIOpenBwdDataDescs { + TensorDesc diff_desc, filter_desc, grad_desc; + ConvDesc conv_desc; + void set(const CanonizedFilterMeta& filter, const TensorLayout& diff, + const TensorLayout& grad, const param::Convolution& param) { + auto&& group = filter.group; + auto&& ocpg = filter.ocpg; + auto&& icpg = filter.icpg; + auto&& fh = filter.spatial[0]; + auto&& fw = filter.spatial[1]; + TensorLayout filter_layout{{group * ocpg, icpg, fh, fw}, filter.dtype}; + filter_desc.set(filter_layout, param.format); + diff_desc.set(diff, param.format); + grad_desc.set(grad, param.format); + bool is_depthwise = param.sparse == param::Convolution::Sparse::GROUP && + (icpg == 1) && (ocpg == 1); + conv_desc.set(param, filter.group, is_depthwise); + } +}; + +struct MIOpenBwdFilterDescs { + TensorDesc diff_desc, src_desc, grad_desc; + ConvDesc conv_desc; + void set(const TensorLayout& src, const TensorLayout& diff, + const CanonizedFilterMeta& grad, const param::Convolution& param) { + src_desc.set(src, param.format); + diff_desc.set(diff, param.format); + auto&& group = grad.group; + auto&& ocpg = grad.ocpg; + auto&& icpg = grad.icpg; + auto&& fh = grad.spatial[0]; + auto&& fw = grad.spatial[1]; + TensorLayout grad_layout{{group * ocpg, icpg, fh, fw}, grad.dtype}; + grad_desc.set(grad_layout, param.format); + bool is_depthwise = param.sparse == param::Convolution::Sparse::GROUP && + (icpg == 1) && (ocpg == 1); + conv_desc.set(param, grad.group, is_depthwise); + } +}; + +//! TODO: MIOpen does not support non-xcorr convolution for now; support is +//! expected in the future. +} // namespace convolution +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cpp.doxygen
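flip_filter, declared in helper.h above and defined in helper.cpp, reverses both spatial axes of the filter without a dedicated kernel: the destination view starts at the last spatial element and negates the spatial strides, so an ordinary relayout copy writes the data back to front. The trick in one dimension (standalone sketch):

    #include <cstdio>

    int main() {
        int src[5] = {1, 2, 3, 4, 5};
        int dst[5];
        int* base = dst + 4;    // view starts at the last element
        int stride = -1;        // negative stride walks backwards
        for (int i = 0; i < 5; ++i)
            base[i * stride] = src[i];  // a plain copy through the view
        for (int i = 0; i < 5; ++i)
            printf("%d ", dst[i]);      // 5 4 3 2 1
        printf("\n");
    }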
+ */ +#include "./im2col.h.hip" +#include "megdnn/dtype.h" +#include "src/rocm/utils.h.hip" + +using namespace megdnn; +using namespace rocm; + +namespace { + +template +__global__ void im2col_kernel(const T* im, T* col, uint32_t N, uint32_t INP_BS, + uint32_t IC, uint32_t IH, uint32_t IW, + uint32_t FH, uint32_t FW, uint32_t OH, + uint32_t OW, uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW, uint32_t DH, + uint32_t DW) { + uint32_t n = threadIdx.x + blockIdx.y * blockDim.x; + uint32_t ow = threadIdx.y + blockIdx.z * blockDim.y; + uint32_t oh = blockIdx.x % OH; + uint32_t fw = blockIdx.x / OH % FW; + uint32_t fh = blockIdx.x / OH / FW % FH; + uint32_t ic = blockIdx.x / OH / FW / FH; + if (n < N && ow < OW) { + uint32_t didx = blockIdx.x * OW * N + ow * N + n; + uint32_t ih = -PH + oh * SH + fh * DH; + uint32_t iw = -PW + ow * SW + fw * DW; + col[didx] = (ih < IH && iw < IW + ? im[n * INP_BS + ic * IH * IW + ih * IW + iw] + : T(0.0f)); + } +} + +template +__global__ void col2im_kernel(const T* col, T* im, uint32_t N, uint32_t INP_BS, + uint32_t IC, uint32_t IH, uint32_t IW, + uint32_t FH, uint32_t FW, uint32_t OH, + uint32_t OW, uint32_t PH, uint32_t PW, + uint32_t SH, uint32_t SW, uint32_t DH, + uint32_t DW) { + uint32_t iw = threadIdx.x + blockIdx.y * blockDim.x; + uint32_t ih = threadIdx.y + blockIdx.z * blockDim.y; + uint32_t ic = blockIdx.x % IC; + uint32_t n = blockIdx.x / IC; + if (iw < IW && ih < IH) { + T res(0); + for (uint32_t fh = 0; fh < FH; ++fh) { + uint32_t anchorh = ih + PH - fh * DH; + if (anchorh < OH * SH && anchorh % SH == 0) { + uint32_t oh = anchorh / SH; + for (uint32_t fw = 0; fw < FW; ++fw) { + uint32_t anchorw = iw + PW - fw * DW; + if (anchorw < OW * SW && anchorw % SW == 0) { + uint32_t ow = anchorw / SW; + res += col[ic * FH * FW * OH * OW * N + + fh * FW * OH * OW * N + fw * OH * OW * N + + oh * OW * N + ow * N + n]; + } + } + } + } + im[n * INP_BS + ic * IH * IW + ih * IW + iw] = res; + } +} + +} // anonymous namespace + +template +void convolution::im2col(const T* im, T* col, size_t N, size_t INP_BS, + size_t IC, size_t IH, size_t IW, size_t FH, size_t FW, + size_t OH, size_t OW, size_t PH, size_t PW, size_t SH, + size_t SW, size_t DH, size_t DW, hipStream_t stream) { + dim3 threads(NR_THREADS_X, NR_THREADS_Y); + dim3 blocks(IC * FH * FW * OH, DIVUP(N, NR_THREADS_X), + DIVUP(OW, NR_THREADS_Y)); + hipLaunchKernelGGL(im2col_kernel, blocks, threads, 0, stream, im, col, N, + INP_BS, IC, IH, IW, FH, FW, OH, OW, PH, PW, SH, SW, DH, + DW); + after_kernel_launch(); +} + +template +void convolution::col2im(const T* col, T* im, size_t N, size_t INP_BS, + size_t IC, size_t IH, size_t IW, size_t FH, size_t FW, + size_t OH, size_t OW, size_t PH, size_t PW, size_t SH, + size_t SW, size_t DH, size_t DW, hipStream_t stream) { + dim3 threads(NR_THREADS_X, NR_THREADS_Y); + dim3 blocks(N * IC, DIVUP(IW, NR_THREADS_X), DIVUP(IH, NR_THREADS_Y)); + hipLaunchKernelGGL(col2im_kernel, blocks, threads, 0, stream, col, im, N, + INP_BS, IC, IH, IW, FH, FW, OH, OW, PH, PW, SH, SW, DH, + DW); + after_kernel_launch(); +} + +namespace megdnn { +namespace rocm { +namespace convolution { + +#define DO_INST(T) \ + template void im2col(const T* im, T* col, size_t N, size_t INP_BS, \ + size_t IC, size_t IH, size_t IW, size_t FH, \ + size_t FW, size_t OH, size_t OW, size_t PH, \ + size_t PW, size_t SH, size_t SW, size_t DH, \ + size_t DW, hipStream_t stream); \ + template void col2im(const T* col, T* im, size_t N, size_t INP_BS, \ + size_t IC, size_t IH, size_t IW, size_t FH, \ + size_t FW, 
diff --git a/dnn/src/rocm/convolution/im2col.h.hip b/dnn/src/rocm/convolution/im2col.h.hip new file mode 100644 index 00000000..1d5b46e6 --- /dev/null +++ b/dnn/src/rocm/convolution/im2col.h.hip @@ -0,0 +1,34 @@ +/** + * \file src/rocm/convolution/im2col.h.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + */ +#pragma once + +#include "hip_header.h" + +namespace megdnn { +namespace rocm { +namespace convolution { + +//! col is of shape (ic*fh*fw, oh*ow*n) +template <typename T> +void im2col(const T* im, T* col, size_t N, size_t INP_BS, size_t IC, size_t IH, + size_t IW, size_t FH, size_t FW, size_t OH, size_t OW, size_t PH, + size_t PW, size_t SH, size_t SW, size_t DH, size_t DW, // dilation + hipStream_t stream); + +template <typename T> +void col2im(const T* col, T* im, size_t N, size_t INP_BS, size_t IC, size_t IH, + size_t IW, size_t FH, size_t FW, size_t OH, size_t OW, size_t PH, + size_t PW, size_t SH, size_t SW, size_t DH, size_t DW, // dilation + hipStream_t stream); + +} // namespace convolution +} // namespace rocm +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen
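The opr_impl.cpp hunk that follows builds MIOPEN_VERSION_STR with the classic two-level stringification idiom: the inner macro stringizes its argument literally, and the extra level of indirection forces the version macros to expand first. A minimal illustration:

    #include <cstdio>

    #define TO_STRING2(v) #v
    #define TO_STRING(v) TO_STRING2(v)
    #define MAJOR 2

    int main() {
        printf("%s\n", TO_STRING2(MAJOR));  // prints "MAJOR" (no expansion)
        printf("%s\n", TO_STRING(MAJOR));   // prints "2" (expand, then stringize)
    }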
diff --git a/dnn/src/rocm/convolution/opr_impl.cpp b/dnn/src/rocm/convolution/opr_impl.cpp new file mode 100644 index 00000000..bfa2c079 --- /dev/null +++ b/dnn/src/rocm/convolution/opr_impl.cpp @@ -0,0 +1,284 @@ +/** + * \file dnn/src/rocm/convolution/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "hcc_detail/hcc_defs_prologue.h" + +#include "./backward_data/algo.h" +#include "./backward_filter/algo.h" +#include "./forward/algo.h" +#include "./opr_impl.h" +#include "src/common/algo_chooser.h" + +#include "src/rocm/utils.h" + +using namespace megdnn; +using namespace rocm; + +#define TO_STRING2(v) #v +#define TO_STRING(v) TO_STRING2(v) +#define MIOPEN_VERSION_STR \ + TO_STRING(MIOPEN_VERSION_MAJOR) \ + "." TO_STRING(MIOPEN_VERSION_MINOR) "." TO_STRING(MIOPEN_VERSION_PATCH) + +/* ============== ConvolutionForwardImpl ============== */ +ConvolutionForwardImpl::Algorithm* +ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) { + auto fm = check_layout_fwd(src, filter, dst); + return get_algorithm_heuristic(src, fm, dst, workspace_limit_in_bytes, + reproducible); +} + +ConvolutionForwardImpl::Algorithm* +ConvolutionForwardImpl::get_algorithm_heuristic( + const TensorLayout& src, const CanonizedFilterMeta& filter, + const TensorLayout& dst, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, src, filter, dst); + + //! MIOpen auto-tuning needs to run with actual tensors, so we cannot get + //! best algorithm here. + if (is_miopen_supported(args)) { + auto algo = megdnn::get_reproducible_algo<ConvolutionForwardImpl>( + sm_algo_pack.miopen_algos[0], reproducible); + if (algo) + return algo; + } + + if (args.filter_meta.group > 1) { + if (sm_algo_pack.chanwise.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.chanwise; + } + } + + auto prefer_1x1 = [&args, reproducible, workspace_limit_in_bytes]() { + const size_t MAX_BATCH_SIZE_FOR_1x1_MAT_ALGO = 4; + size_t batch_size = args.src_layout->shape[0]; + + if (batch_size > MAX_BATCH_SIZE_FOR_1x1_MAT_ALGO) { + return false; + } + return sm_algo_pack.a1x1.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes); + }; + + if (prefer_1x1()) { + return &sm_algo_pack.a1x1; + } + + auto prefer_1x1_large_batch = [&args, reproducible, + workspace_limit_in_bytes]() { + const size_t MIN_BATCH_SIZE_FOR_1x1_LARGE_BATCH_ALGO = 32; + size_t batch_size = args.src_layout->shape[0]; + + if (batch_size < MIN_BATCH_SIZE_FOR_1x1_LARGE_BATCH_ALGO) { + return false; + } + return sm_algo_pack.batched_matrix_mul.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes); + }; + + if (prefer_1x1_large_batch()) { + return &sm_algo_pack.batched_matrix_mul; + } + + if (reproducible) { + return megdnn::get_reproducible_algo<ConvolutionForwardImpl>( + sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, + "rocm conv fwd"); + } else { + return megdnn::get_usable_algo<ConvolutionForwardImpl>( + sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, + "rocm conv fwd"); + } +} + +std::vector<ConvolutionForwardImpl::Algorithm*> +ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) { + return megdnn::get_all_algorithms<ConvolutionForwardImpl>( + {this, src, filter, dst}); +} + +size_t ConvolutionForwardImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, const PreprocessedFilter*) { + AlgoBase::SizeArgs args(this, src, filter, dst); + return get_algorithm(this, src, args.filter_meta, dst) + ->get_workspace_in_bytes(args); +} + +void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + const PreprocessedFilter*, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, src, filter, dst, workspace); + auto algo = get_algorithm(this, src.layout, args.filter_meta, dst.layout); + algo->check_workspace(args, workspace).exec(args); +} + +const char* ConvolutionForwardImpl::get_algorithm_set_name() const { + return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR; +} + +/* ============== ConvolutionBackwardDataImpl ============== */ + +void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); + auto algo = get_algorithm(this, args.filter_meta, diff.layout, grad.layout); + algo->check_workspace(args, workspace).exec(args); +} + +std::vector<ConvolutionBackwardDataImpl::Algorithm*> +ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) { + return megdnn::get_all_algorithms<ConvolutionBackwardDataImpl>( + {this, filter, diff, grad}); +} + +ConvolutionBackwardDataImpl::Algorithm* +ConvolutionBackwardDataImpl::get_algorithm_heuristic( + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + auto fm = check_layout_fwd(grad, filter, diff); + return get_algorithm_heuristic(fm, diff, grad,
 workspace_limit_in_bytes, + reproducible); +} + +ConvolutionBackwardDataImpl::Algorithm* +ConvolutionBackwardDataImpl::get_algorithm_heuristic( + const CanonizedFilterMeta& filter, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, filter, diff, grad); + + if (is_miopen_supported(args.as_fwd_args())) { + auto algo = megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>( + sm_algo_pack.miopen_algos[0], reproducible); + if (algo) + return algo; + } + + if (args.filter_meta.group > 1 && + sm_algo_pack.chanwise.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + return &sm_algo_pack.chanwise; + } + + if (reproducible) { + return megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>( + sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, + "rocm conv bwd_data"); + } else { + return megdnn::get_usable_algo<ConvolutionBackwardDataImpl>( + sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, + "rocm conv bwd_data"); + } +} + +size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad) { + AlgoBase::SizeArgs args(this, filter, diff, grad); + return get_algorithm(this, args.filter_meta, diff, grad) + ->get_workspace_in_bytes(args); +} + +const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const { + return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR; +} + +/* ============== ConvolutionBackwardFilterImpl ============== */ + +void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { + AlgoBase::ExecArgs args(this, src, diff, grad, workspace); + auto algo = + get_algorithm(this, src.layout, diff.layout, args.grad_filter_meta); + algo->check_workspace(args, workspace).exec(args); +} + +std::vector<ConvolutionBackwardFilterImpl::Algorithm*> +ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) { + return megdnn::get_all_algorithms<ConvolutionBackwardFilterImpl>( + {this, src, diff, grad}); +} + +ConvolutionBackwardFilterImpl::Algorithm* +ConvolutionBackwardFilterImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + auto fm = check_layout_fwd(src, grad, diff); + return get_algorithm_heuristic(src, diff, fm, workspace_limit_in_bytes, + reproducible); +} + +ConvolutionBackwardFilterImpl::Algorithm* +ConvolutionBackwardFilterImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& diff, + const CanonizedFilterMeta& grad, size_t workspace_limit_in_bytes, + bool reproducible) { + AlgoBase::SizeArgs args(this, src, diff, grad); + + if (is_miopen_supported(args.as_fwd_args())) { + auto algo = + megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>( + sm_algo_pack.miopen_algos[0], reproducible); + if (algo) + return algo; + } + + if (args.grad_filter_meta.group > 1 && + sm_algo_pack.chanwise.is_available_reproducible( + args, reproducible, workspace_limit_in_bytes)) { + // prefer special chanwise impl + return &sm_algo_pack.chanwise; + } + + if (reproducible) { + return megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>( + sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, + "rocm conv bwd_filter"); + } else { + return megdnn::get_usable_algo<ConvolutionBackwardFilterImpl>( + sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, + "rocm conv bwd_filter"); + } +} + +size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& diff, + const
 TensorLayout& grad) { + AlgoBase::SizeArgs args(this, src, diff, grad); + return get_algorithm(this, src, diff, args.grad_filter_meta) + ->get_workspace_in_bytes(args); +} + +const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const { + return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/opr_impl.h b/dnn/src/rocm/convolution/opr_impl.h new file mode 100644 index 00000000..a19fbc89 --- /dev/null +++ b/dnn/src/rocm/convolution/opr_impl.h @@ -0,0 +1,154 @@ +/** + * \file dnn/src/rocm/convolution/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once + +#include "megdnn/oprs/nn.h" +#include "src/common/utils.h" + +namespace megdnn { +namespace rocm { + +class ConvolutionForwardImpl : public ConvolutionForward { +public: + using ConvolutionForward::ConvolutionForward; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + const PreprocessedFilter* preprocessed_filter, + _megdnn_workspace workspace) override; + std::vector<Algorithm*> get_all_algorithms( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const CanonizedFilterMeta& filter, + const TensorLayout& dst, + size_t workspace_limit_in_bytes, + bool reproducible); + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst, + const PreprocessedFilter*) override; + + size_t get_preprocess_workspace_in_bytes(const TensorLayout&, + const TensorLayout&, + const TensorLayout&) override { + return 0; + } + + void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, + const TensorLayout&, PreprocessedFilter*, + _megdnn_workspace) override { + megdnn_throw("convolution exec_preprocess has not been implemented yet"); + } + + SmallVector<TensorLayout> deduce_preprocessed_filter_layout( + const TensorLayout&, const TensorLayout&, + const TensorLayout&) override { + return {}; + } + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoMIOpen; + class AlgoMatmul; + class AlgoInplaceMatmul; + class Algo1x1; + class Algo1x1LargeBatch; + class AlgoChanwise; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +class ConvolutionBackwardDataImpl : public ConvolutionBackwardData { +public: + using ConvolutionBackwardData::ConvolutionBackwardData; + void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) override; + std::vector<Algorithm*> get_all_algorithms( + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible) override; + Algorithm* get_algorithm_heuristic(const CanonizedFilterMeta& filter, + const TensorLayout& diff, + const
 TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible); + size_t get_workspace_in_bytes(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) override; + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoMIOpen; + class AlgoMatmul; + class AlgoChanwise; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +class ConvolutionBackwardFilterImpl : public ConvolutionBackwardFilter { +public: + using ConvolutionBackwardFilter::ConvolutionBackwardFilter; + void exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace) override; + std::vector<Algorithm*> get_all_algorithms( + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible) override; + Algorithm* get_algorithm_heuristic(const TensorLayout& src, + const TensorLayout& diff, + const CanonizedFilterMeta& grad, + size_t workspace_limit_in_bytes, + bool reproducible); + size_t get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) override; + const char* get_algorithm_set_name() const override; + + class AlgoBase; + class AlgoMIOpen; + class AlgoMatmul; + class AlgoChanwise; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +private: + static AlgoPack sm_algo_pack; +}; + +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/elemwise/kern_impl.inl b/dnn/src/rocm/elemwise/kern_impl.inl new file mode 100644 index 00000000..fb6b287b --- /dev/null +++ b/dnn/src/rocm/elemwise/kern_impl.inl @@ -0,0 +1,36 @@ +/** + * \file dnn/src/rocm/elemwise/kern_impl.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#ifndef KERN_IMPL_MODE +#error "KERN_IMPL_MODE, KERN_IMPL_ARITY and KERN_IMPL_CTYPE must be defined" +#endif + +#include "src/rocm/elemwise/kern_wrapper.h.hip" + +namespace megdnn { +namespace rocm { + +#define cb(_mode) \ + typedef ElemwiseKern<megcorePlatformROCM, \ + param_enumv::Elemwise::Mode::_mode, KERN_IMPL_CTYPE> \ + KernImpl##_mode; \ + typedef ElemArithKernWrapper<KERN_IMPL_ARITY, KernImpl##_mode> \ + Wrapper##_mode; \ + INST_RUN_ELEMWISE(Wrapper##_mode, KERN_IMPL_CTYPE, KERN_IMPL_ARITY); + +KERN_IMPL_MODE(cb) + +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cpp.doxygen
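kern_impl.inl above is the shared half of an X-macro scheme: each tiny generated kimpl/*.cpp.hip file (they begin a few hunks below) defines KERN_IMPL_MODE, KERN_IMPL_ARITY, and KERN_IMPL_CTYPE and then includes the .inl, which instantiates one elemwise kernel per translation unit to keep compile time and memory bounded. The callback shape of KERN_IMPL_MODE(cb), reduced to a runnable toy (MODE_ENABLE stands in for MEGDNN_ELEMWISE_MODE_ENABLE):

    #include <cstdio>

    #define MODE_ENABLE(_mode, cb) cb(_mode)
    #define KERN_IMPL_MODE(cb) MODE_ENABLE(ABS_GRAD, cb)

    #define cb(_mode) \
        static void kern_##_mode() { printf("instantiated %s\n", #_mode); }
    KERN_IMPL_MODE(cb)  // expands to: static void kern_ABS_GRAD() { ... }
    #undef cb

    int main() { kern_ABS_GRAD(); }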
diff --git a/dnn/src/rocm/elemwise/kern_wrapper.h.hip b/dnn/src/rocm/elemwise/kern_wrapper.h.hip new file mode 100644 index 00000000..b97cf944 --- /dev/null +++ b/dnn/src/rocm/elemwise/kern_wrapper.h.hip @@ -0,0 +1,62 @@ +/** + * \file src/rocm/elemwise/kern_wrapper.h.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \brief helper for implementing elemwise oprs + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "src/rocm/elemwise_helper.h.hip" +#include "src/common/elemwise/kern_defs.cuh" + +namespace megdnn { +namespace rocm { + +template <int arity, class KernImpl> +struct ElemArithKernWrapper; + +template <class KernImpl> +struct ElemArithKernWrapper<1, KernImpl> { + typedef typename KernImpl::ctype ctype; + ctype* dst; + +#if MEGDNN_CC_CUDA + __device__ void operator()(uint32_t idx, ctype x) { + dst[idx] = KernImpl::apply(x); + } +#endif +}; +template <class KernImpl> +struct ElemArithKernWrapper<2, KernImpl> { + typedef typename KernImpl::ctype ctype; + ctype* dst; + +#if MEGDNN_CC_CUDA + __device__ void operator()(uint32_t idx, ctype x, ctype y) { + dst[idx] = KernImpl::apply(x, y); + } +#endif +}; +template <class KernImpl> +struct ElemArithKernWrapper<3, KernImpl> { + typedef typename KernImpl::ctype ctype; + ctype* dst; + +#if MEGDNN_CC_CUDA + __device__ void operator()(uint32_t idx, ctype x, ctype y, ctype z) { + dst[idx] = KernImpl::apply(x, y, z); + } +#endif +}; + +} // namespace rocm +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + + diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_float16.cpp.hip new file mode 100644 index 00000000..d4f8ac33 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_float32.cpp.hip new file mode 100644 index 00000000..4b8c7696 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_int16.cpp.hip new file mode 100644 index 00000000..fe2bb209 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_int32.cpp.hip new file mode 100644 index 00000000..062685a7 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_int8.cpp.hip new file mode 100644 index 00000000..bd883a99 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_uint8.cpp.hip new file mode 100644 index
00000000..185c733b --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_GRAD_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_dt_float16.cpp.hip new file mode 100644 index 00000000..665208ea --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_dt_float32.cpp.hip new file mode 100644 index 00000000..6bd3fd01 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_dt_int16.cpp.hip new file mode 100644 index 00000000..6d0a1d42 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_dt_int32.cpp.hip new file mode 100644 index 00000000..b7468e36 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_dt_int8.cpp.hip new file mode 100644 index 00000000..9af9fc33 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ABS_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ABS_dt_uint8.cpp.hip new file mode 100644 index 00000000..c197ee12 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ABS_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ACOS_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ACOS_dt_float16.cpp.hip new file mode 100644 index 00000000..9a072a73 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ACOS_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ACOS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git 
a/dnn/src/rocm/elemwise/kimpl/ACOS_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ACOS_dt_float32.cpp.hip new file mode 100644 index 00000000..c8382465 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ACOS_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ACOS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ADD_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ADD_dt_float16.cpp.hip new file mode 100644 index 00000000..d1097cee --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ADD_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/ADD_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ADD_dt_float32.cpp.hip new file mode 100644 index 00000000..04e414d8 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ADD_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ADD_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ADD_dt_int16.cpp.hip new file mode 100644 index 00000000..2692639b --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ADD_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ADD_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ADD_dt_int32.cpp.hip new file mode 100644 index 00000000..2a8b63ab --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ADD_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ADD_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ADD_dt_int8.cpp.hip new file mode 100644 index 00000000..a9ff809f --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ADD_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ADD_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ADD_dt_uint8.cpp.hip new file mode 100644 index 00000000..fd4c23d0 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ADD_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ASIN_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ASIN_dt_float16.cpp.hip new file mode 100644 index 00000000..20b2a7c8 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ASIN_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ASIN, cb) +#define 
KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/ASIN_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ASIN_dt_float32.cpp.hip new file mode 100644 index 00000000..a7852fa9 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ASIN_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ASIN, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ATAN2_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ATAN2_dt_float16.cpp.hip new file mode 100644 index 00000000..e30a5931 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ATAN2_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ATAN2, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/ATAN2_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ATAN2_dt_float32.cpp.hip new file mode 100644 index 00000000..7024dbaa --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ATAN2_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ATAN2, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/CEIL_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/CEIL_dt_float16.cpp.hip new file mode 100644 index 00000000..e5051bb2 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/CEIL_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/CEIL_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/CEIL_dt_float32.cpp.hip new file mode 100644 index 00000000..c3f91b79 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/CEIL_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_float16.cpp.hip new file mode 100644 index 00000000..6025e7e0 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_float32.cpp.hip new file mode 100644 index 00000000..90d61a5f --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_int16.cpp.hip 
b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_int16.cpp.hip new file mode 100644 index 00000000..81bd6fe1 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_int32.cpp.hip new file mode 100644 index 00000000..63d9211a --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_int8.cpp.hip new file mode 100644 index 00000000..cb8b92d3 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_uint8.cpp.hip new file mode 100644 index 00000000..fd1b9437 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/COND_LEQ_MOV_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/COS_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/COS_dt_float16.cpp.hip new file mode 100644 index 00000000..c3b061ed --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/COS_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/COS_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/COS_dt_float32.cpp.hip new file mode 100644 index 00000000..89b9f12c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/COS_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COS, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/EQ_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/EQ_dt_float16.cpp.hip new file mode 100644 index 00000000..2492fcb8 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/EQ_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/EQ_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/EQ_dt_float32.cpp.hip new file mode 100644 index 00000000..3dbdaf9d --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/EQ_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py 
+#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/EQ_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/EQ_dt_int16.cpp.hip new file mode 100644 index 00000000..1887146f --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/EQ_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/EQ_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/EQ_dt_int32.cpp.hip new file mode 100644 index 00000000..2518d6ff --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/EQ_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/EQ_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/EQ_dt_int8.cpp.hip new file mode 100644 index 00000000..d0ca968f --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/EQ_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/EQ_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/EQ_dt_uint8.cpp.hip new file mode 100644 index 00000000..6c62949c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/EQ_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ERFCINV_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ERFCINV_dt_float16.cpp.hip new file mode 100644 index 00000000..98315e0e --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ERFCINV_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFCINV, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/ERFCINV_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ERFCINV_dt_float32.cpp.hip new file mode 100644 index 00000000..e337f0c6 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ERFCINV_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFCINV, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ERFC_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ERFC_dt_float16.cpp.hip new file mode 100644 index 00000000..2f0894cc --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ERFC_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFC, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/ERFC_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ERFC_dt_float32.cpp.hip new file mode 100644 index 00000000..9dd164d5 --- /dev/null +++ 
b/dnn/src/rocm/elemwise/kimpl/ERFC_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFC, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ERFINV_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ERFINV_dt_float16.cpp.hip new file mode 100644 index 00000000..37b4a3f4 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ERFINV_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFINV, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/ERFINV_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ERFINV_dt_float32.cpp.hip new file mode 100644 index 00000000..a022e82c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ERFINV_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFINV, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ERF_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ERF_dt_float16.cpp.hip new file mode 100644 index 00000000..2156e847 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ERF_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERF, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/ERF_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ERF_dt_float32.cpp.hip new file mode 100644 index 00000000..3b86ad21 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ERF_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERF, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/EXPM1_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/EXPM1_dt_float16.cpp.hip new file mode 100644 index 00000000..daaed095 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/EXPM1_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EXPM1, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/EXPM1_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/EXPM1_dt_float32.cpp.hip new file mode 100644 index 00000000..8acc8cd2 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/EXPM1_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EXPM1, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/EXP_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/EXP_dt_float16.cpp.hip new file mode 100644 index 00000000..57e07652 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/EXP_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EXP, cb) +#define KERN_IMPL_ARITY 1 +#define 
KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/EXP_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/EXP_dt_float32.cpp.hip new file mode 100644 index 00000000..cbf23a51 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/EXP_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EXP, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FAST_TANH_GRAD_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FAST_TANH_GRAD_dt_float16.cpp.hip new file mode 100644 index 00000000..68034e3f --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FAST_TANH_GRAD_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/FAST_TANH_GRAD_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FAST_TANH_GRAD_dt_float32.cpp.hip new file mode 100644 index 00000000..16614d4d --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FAST_TANH_GRAD_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FAST_TANH_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FAST_TANH_dt_float16.cpp.hip new file mode 100644 index 00000000..128142cf --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FAST_TANH_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/FAST_TANH_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FAST_TANH_dt_float32.cpp.hip new file mode 100644 index 00000000..7c67ca34 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FAST_TANH_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_float16.cpp.hip new file mode 100644 index 00000000..102a4455 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_float32.cpp.hip new file mode 100644 index 00000000..c22574b6 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git 
a/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_int16.cpp.hip new file mode 100644 index 00000000..0c5eadea --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_int32.cpp.hip new file mode 100644 index 00000000..23408ae3 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_int8.cpp.hip new file mode 100644 index 00000000..aa6005ea --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_uint8.cpp.hip new file mode 100644 index 00000000..5aa2fa74 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FLOOR_DIV_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FLOOR_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FLOOR_dt_float16.cpp.hip new file mode 100644 index 00000000..aa434531 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FLOOR_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/FLOOR_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FLOOR_dt_float32.cpp.hip new file mode 100644 index 00000000..b64b99c7 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FLOOR_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float16.cpp.hip new file mode 100644 index 00000000..255dca30 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float32.cpp.hip new file mode 100644 index 00000000..c183462b --- /dev/null +++ 
b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_H_SWISH_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_float16.cpp.hip new file mode 100644 index 00000000..f1541b7a --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_float32.cpp.hip new file mode 100644 index 00000000..a9aa59ae --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_int16.cpp.hip new file mode 100644 index 00000000..86038f27 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_int32.cpp.hip new file mode 100644 index 00000000..6f1a21b7 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_int8.cpp.hip new file mode 100644 index 00000000..dd2771dd --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_uint8.cpp.hip new file mode 100644 index 00000000..229d7b69 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_RELU_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float16.cpp.hip new file mode 100644 index 00000000..7bd8b0f5 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by 
gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float32.cpp.hip new file mode 100644 index 00000000..48656fc4 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_SIGMOID_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_TANH_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_TANH_dt_float16.cpp.hip new file mode 100644 index 00000000..86ea8f2a --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_TANH_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_TANH_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_TANH_dt_float32.cpp.hip new file mode 100644 index 00000000..349b33ea --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_ADD_TANH_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_MUL_ADD3_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_MUL_ADD3_dt_float16.cpp.hip new file mode 100644 index 00000000..5716afe2 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_MUL_ADD3_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_MUL_ADD3, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/FUSE_MUL_ADD3_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/FUSE_MUL_ADD3_dt_float32.cpp.hip new file mode 100644 index 00000000..7e4134cb --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/FUSE_MUL_ADD3_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_MUL_ADD3, cb) +#define KERN_IMPL_ARITY 3 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/H_SWISH_GRAD_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/H_SWISH_GRAD_dt_float16.cpp.hip new file mode 100644 index 00000000..4e03c1e1 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/H_SWISH_GRAD_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/H_SWISH_GRAD_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/H_SWISH_GRAD_dt_float32.cpp.hip new file mode 100644 index 00000000..8fbfc156 --- /dev/null +++ 
b/dnn/src/rocm/elemwise/kimpl/H_SWISH_GRAD_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/H_SWISH_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/H_SWISH_dt_float16.cpp.hip new file mode 100644 index 00000000..a97d4aaf --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/H_SWISH_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/H_SWISH_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/H_SWISH_dt_float32.cpp.hip new file mode 100644 index 00000000..6f42839c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/H_SWISH_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LEQ_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_float16.cpp.hip new file mode 100644 index 00000000..786c2feb --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/LEQ_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_float32.cpp.hip new file mode 100644 index 00000000..3d1f4970 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LEQ_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_int16.cpp.hip new file mode 100644 index 00000000..33f503a9 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LEQ_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_int32.cpp.hip new file mode 100644 index 00000000..c7e04327 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LEQ_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_int8.cpp.hip new file mode 100644 index 00000000..7c7bebcd --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git 
a/dnn/src/rocm/elemwise/kimpl/LEQ_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_uint8.cpp.hip new file mode 100644 index 00000000..ef977f91 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LEQ_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LOG1P_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LOG1P_dt_float16.cpp.hip new file mode 100644 index 00000000..2f95257b --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LOG1P_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG1P, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/LOG1P_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LOG1P_dt_float32.cpp.hip new file mode 100644 index 00000000..7fe27d28 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LOG1P_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG1P, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LOG_SUM_EXP_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LOG_SUM_EXP_dt_float16.cpp.hip new file mode 100644 index 00000000..b9eb2b37 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LOG_SUM_EXP_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG_SUM_EXP, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/LOG_SUM_EXP_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LOG_SUM_EXP_dt_float32.cpp.hip new file mode 100644 index 00000000..c5ea7054 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LOG_SUM_EXP_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG_SUM_EXP, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LOG_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LOG_dt_float16.cpp.hip new file mode 100644 index 00000000..cda065e6 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LOG_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/LOG_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LOG_dt_float32.cpp.hip new file mode 100644 index 00000000..56b1cfd6 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LOG_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LOG, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LT_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LT_dt_float16.cpp.hip new file mode 100644 index 00000000..2bd4bb7f --- /dev/null +++ 
b/dnn/src/rocm/elemwise/kimpl/LT_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/LT_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LT_dt_float32.cpp.hip new file mode 100644 index 00000000..bfd1c942 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LT_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LT_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LT_dt_int16.cpp.hip new file mode 100644 index 00000000..484f8cfe --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LT_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LT_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LT_dt_int32.cpp.hip new file mode 100644 index 00000000..d44e5041 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LT_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LT_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LT_dt_int8.cpp.hip new file mode 100644 index 00000000..1ae62018 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LT_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/LT_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/LT_dt_uint8.cpp.hip new file mode 100644 index 00000000..a18d0913 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/LT_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MAX_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MAX_dt_float16.cpp.hip new file mode 100644 index 00000000..580efc07 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MAX_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/MAX_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MAX_dt_float32.cpp.hip new file mode 100644 index 00000000..fc13cb74 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MAX_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MAX_dt_int16.cpp.hip 
b/dnn/src/rocm/elemwise/kimpl/MAX_dt_int16.cpp.hip new file mode 100644 index 00000000..b49743e1 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MAX_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MAX_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MAX_dt_int32.cpp.hip new file mode 100644 index 00000000..c9649f9c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MAX_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MAX_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MAX_dt_int8.cpp.hip new file mode 100644 index 00000000..e0e24df0 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MAX_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MAX_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MAX_dt_uint8.cpp.hip new file mode 100644 index 00000000..bf1a78a3 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MAX_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MIN_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MIN_dt_float16.cpp.hip new file mode 100644 index 00000000..26c8df53 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MIN_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/MIN_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MIN_dt_float32.cpp.hip new file mode 100644 index 00000000..d3a40eff --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MIN_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MIN_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MIN_dt_int16.cpp.hip new file mode 100644 index 00000000..787b8d21 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MIN_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MIN_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MIN_dt_int32.cpp.hip new file mode 100644 index 00000000..a7621fdb --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MIN_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git 
a/dnn/src/rocm/elemwise/kimpl/MIN_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MIN_dt_int8.cpp.hip new file mode 100644 index 00000000..598a3f06 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MIN_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MIN_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MIN_dt_uint8.cpp.hip new file mode 100644 index 00000000..393347fb --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MIN_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MOD_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MOD_dt_float16.cpp.hip new file mode 100644 index 00000000..0f5d6e14 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MOD_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/MOD_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MOD_dt_float32.cpp.hip new file mode 100644 index 00000000..38a18d02 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MOD_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MOD_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MOD_dt_int16.cpp.hip new file mode 100644 index 00000000..736a4c1a --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MOD_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MOD_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MOD_dt_int32.cpp.hip new file mode 100644 index 00000000..f4999db2 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MOD_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MOD_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MOD_dt_int8.cpp.hip new file mode 100644 index 00000000..af16999c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MOD_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MOD_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MOD_dt_uint8.cpp.hip new file mode 100644 index 00000000..65841790 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MOD_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MOD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 
+#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MUL_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MUL_dt_float16.cpp.hip new file mode 100644 index 00000000..8100f209 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MUL_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/MUL_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MUL_dt_float32.cpp.hip new file mode 100644 index 00000000..73293900 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MUL_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MUL_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MUL_dt_int16.cpp.hip new file mode 100644 index 00000000..8df90a7e --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MUL_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MUL_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MUL_dt_int32.cpp.hip new file mode 100644 index 00000000..96f7da3d --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MUL_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MUL_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MUL_dt_int8.cpp.hip new file mode 100644 index 00000000..5a90184e --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MUL_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/MUL_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/MUL_dt_uint8.cpp.hip new file mode 100644 index 00000000..334814b5 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/MUL_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_float16.cpp.hip new file mode 100644 index 00000000..1ef8ed1d --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_float32.cpp.hip new file mode 100644 index 00000000..290a1a03 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define 
KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_int16.cpp.hip new file mode 100644 index 00000000..ea506d31 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_int32.cpp.hip new file mode 100644 index 00000000..6d21f1e5 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_int8.cpp.hip new file mode 100644 index 00000000..74dba711 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_uint8.cpp.hip new file mode 100644 index 00000000..927f0fa1 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/NEGATE_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/POW_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/POW_dt_float16.cpp.hip new file mode 100644 index 00000000..d4ba6730 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/POW_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(POW, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/POW_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/POW_dt_float32.cpp.hip new file mode 100644 index 00000000..e9fb788e --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/POW_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(POW, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/RELU_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/RELU_dt_float16.cpp.hip new file mode 100644 index 00000000..e5393775 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/RELU_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/RELU_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/RELU_dt_float32.cpp.hip new file mode 100644 index 
00000000..d18e37c8 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/RELU_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/RELU_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/RELU_dt_int16.cpp.hip new file mode 100644 index 00000000..3eb24ed4 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/RELU_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/RELU_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/RELU_dt_int32.cpp.hip new file mode 100644 index 00000000..8c11a2e3 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/RELU_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/RELU_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/RELU_dt_int8.cpp.hip new file mode 100644 index 00000000..9330078e --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/RELU_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/RELU_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/RELU_dt_uint8.cpp.hip new file mode 100644 index 00000000..470bd051 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/RELU_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/RMULH_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/RMULH_dt_int16.cpp.hip new file mode 100644 index 00000000..0f21d7cb --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/RMULH_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RMULH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/RMULH_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/RMULH_dt_int32.cpp.hip new file mode 100644 index 00000000..2f125239 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/RMULH_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RMULH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/RMULH_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/RMULH_dt_int8.cpp.hip new file mode 100644 index 00000000..e2229ac1 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/RMULH_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RMULH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/RMULH_dt_uint8.cpp.hip 
b/dnn/src/rocm/elemwise/kimpl/RMULH_dt_uint8.cpp.hip new file mode 100644 index 00000000..89e247eb --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/RMULH_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(RMULH, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/ROUND_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ROUND_dt_float16.cpp.hip new file mode 100644 index 00000000..0e24f548 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ROUND_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ROUND, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/ROUND_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/ROUND_dt_float32.cpp.hip new file mode 100644 index 00000000..9660812d --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/ROUND_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ROUND, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SHL_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SHL_dt_int16.cpp.hip new file mode 100644 index 00000000..1ec354f7 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SHL_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SHL_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SHL_dt_int32.cpp.hip new file mode 100644 index 00000000..c62bcc4f --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SHL_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SHL_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SHL_dt_int8.cpp.hip new file mode 100644 index 00000000..906d29f3 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SHL_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SHL_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SHL_dt_uint8.cpp.hip new file mode 100644 index 00000000..50dae36e --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SHL_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHL, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SHR_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SHR_dt_int16.cpp.hip new file mode 100644 index 00000000..d9ecc70c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SHR_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHR, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" 
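[Reviewer note] Every generated file under dnn/src/rocm/elemwise/kimpl/ above and below follows the same four-line pattern: it defines KERN_IMPL_MODE, KERN_IMPL_ARITY and KERN_IMPL_CTYPE, then includes the shared "../kern_impl.inl", which expands those macros into the actual elementwise kernel instantiation; dt_float16 variants are additionally wrapped in #if !MEGDNN_DISABLE_FLOAT16 so half-precision kernels can be compiled out. Emitting one translation unit per (mode, ctype) pair keeps each hipcc invocation small and lets the instantiations compile in parallel. As a rough illustration, gen_elemwise_kern_impls.py plausibly works like the sketch below; the script itself is not part of this hunk, so the mode/arity table is a hypothetical stand-in, not the real one.

    # Minimal sketch of a generator producing the kimpl stubs in this diff.
    # Assumption: the real gen_elemwise_kern_impls.py derives its mode table
    # from the elemwise opr definitions; the table here is illustrative only.
    FLOAT_CTYPES = ["dt_float16", "dt_float32"]
    INT_CTYPES = ["dt_int8", "dt_uint8", "dt_int16", "dt_int32"]

    MODES = {  # mode name -> (arity, also_generate_int_types)
        "RELU": (1, True),
        "SIGMOID": (1, False),
        "SUB": (2, True),
        "FUSE_MUL_ADD3": (3, False),
    }

    def gen_one(mode, arity, ctype):
        lines = ["// generated by gen_elemwise_kern_impls.py"]
        fp16 = ctype == "dt_float16"
        if fp16:
            # float16 kernels are compiled out when MEGDNN_DISABLE_FLOAT16 is set
            lines.append("#if !MEGDNN_DISABLE_FLOAT16")
        lines += [
            "#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(%s, cb)" % mode,
            "#define KERN_IMPL_ARITY %d" % arity,
            "#define KERN_IMPL_CTYPE %s" % ctype,
            '#include "../kern_impl.inl"',
        ]
        if fp16:
            lines.append("#endif")
        return "\n".join(lines) + "\n"

    for mode, (arity, also_int) in MODES.items():
        for ctype in FLOAT_CTYPES + (INT_CTYPES if also_int else []):
            with open("%s_%s.cpp.hip" % (mode, ctype), "w") as f:
                f.write(gen_one(mode, arity, ctype))

Integer-only modes (e.g. SHL/SHR/RMULH above) would simply invert the type split. This layout mirrors the equivalent generated kimpl files in the CUDA backend, which is why the stubs themselves contain no ROCm-specific code: all backend-specific logic lives in the shared kern_impl.inl.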
diff --git a/dnn/src/rocm/elemwise/kimpl/SHR_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SHR_dt_int32.cpp.hip new file mode 100644 index 00000000..583a1554 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SHR_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHR, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SHR_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SHR_dt_int8.cpp.hip new file mode 100644 index 00000000..6a9bfba6 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SHR_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHR, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SHR_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SHR_dt_uint8.cpp.hip new file mode 100644 index 00000000..cff0b17b --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SHR_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SHR, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_float16.cpp.hip new file mode 100644 index 00000000..4b89026a --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_float32.cpp.hip new file mode 100644 index 00000000..cd70a27d --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_int16.cpp.hip new file mode 100644 index 00000000..65b55d7b --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_int32.cpp.hip new file mode 100644 index 00000000..21bde467 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_int8.cpp.hip new file mode 100644 index 00000000..3584305f --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_int8.cpp.hip @@ -0,0 
+1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_uint8.cpp.hip new file mode 100644 index 00000000..d339eea4 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SIGMOID_GRAD_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SIGMOID_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SIGMOID_dt_float16.cpp.hip new file mode 100644 index 00000000..baae5803 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SIGMOID_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/SIGMOID_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SIGMOID_dt_float32.cpp.hip new file mode 100644 index 00000000..4b4b1d8f --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SIGMOID_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SIN_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SIN_dt_float16.cpp.hip new file mode 100644 index 00000000..fdabffd0 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SIN_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIN, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/SIN_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SIN_dt_float32.cpp.hip new file mode 100644 index 00000000..2f1ea67c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SIN_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SIN, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SUB_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SUB_dt_float16.cpp.hip new file mode 100644 index 00000000..129bd04f --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SUB_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/SUB_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SUB_dt_float32.cpp.hip new file mode 100644 index 00000000..1b0aec6a --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SUB_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git 
a/dnn/src/rocm/elemwise/kimpl/SUB_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SUB_dt_int16.cpp.hip new file mode 100644 index 00000000..957627f1 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SUB_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SUB_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SUB_dt_int32.cpp.hip new file mode 100644 index 00000000..e41c6bcf --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SUB_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SUB_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SUB_dt_int8.cpp.hip new file mode 100644 index 00000000..4a0890e4 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SUB_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SUB_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SUB_dt_uint8.cpp.hip new file mode 100644 index 00000000..33a54a6a --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SUB_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_float16.cpp.hip new file mode 100644 index 00000000..7fe80c4c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_float32.cpp.hip new file mode 100644 index 00000000..9a759078 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_int16.cpp.hip new file mode 100644 index 00000000..0d2892f4 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_int32.cpp.hip new file mode 100644 index 00000000..c7f4b26c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define 
KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_int8.cpp.hip new file mode 100644 index 00000000..1d4df389 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_uint8.cpp.hip new file mode 100644 index 00000000..7c83a5c2 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/SWITCH_GT0_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_float16.cpp.hip new file mode 100644 index 00000000..5be50c8c --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_float32.cpp.hip new file mode 100644 index 00000000..0e259719 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_int16.cpp.hip new file mode 100644 index 00000000..4efd5978 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_int16.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int16 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_int32.cpp.hip new file mode 100644 index 00000000..69202693 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_int32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_int8.cpp.hip new file mode 100644 index 00000000..448aaf29 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_int8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_int8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_uint8.cpp.hip 
b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_uint8.cpp.hip new file mode 100644 index 00000000..e1fc7756 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/TANH_GRAD_dt_uint8.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH_GRAD, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_uint8 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/TANH_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/TANH_dt_float16.cpp.hip new file mode 100644 index 00000000..3c807b09 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/TANH_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/TANH_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/TANH_dt_float32.cpp.hip new file mode 100644 index 00000000..89184efd --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/TANH_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) +#define KERN_IMPL_ARITY 1 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/kimpl/TRUE_DIV_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/kimpl/TRUE_DIV_dt_float16.cpp.hip new file mode 100644 index 00000000..7e4779c4 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/TRUE_DIV_dt_float16.cpp.hip @@ -0,0 +1,7 @@ +// generated by gen_elemwise_kern_impls.py +#if !MEGDNN_DISABLE_FLOAT16 +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TRUE_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float16 +#include "../kern_impl.inl" +#endif diff --git a/dnn/src/rocm/elemwise/kimpl/TRUE_DIV_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/kimpl/TRUE_DIV_dt_float32.cpp.hip new file mode 100644 index 00000000..6792bbe3 --- /dev/null +++ b/dnn/src/rocm/elemwise/kimpl/TRUE_DIV_dt_float32.cpp.hip @@ -0,0 +1,5 @@ +// generated by gen_elemwise_kern_impls.py +#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(TRUE_DIV, cb) +#define KERN_IMPL_ARITY 2 +#define KERN_IMPL_CTYPE dt_float32 +#include "../kern_impl.inl" diff --git a/dnn/src/rocm/elemwise/opr_impl.cpp b/dnn/src/rocm/elemwise/opr_impl.cpp new file mode 100644 index 00000000..90a84977 --- /dev/null +++ b/dnn/src/rocm/elemwise/opr_impl.cpp @@ -0,0 +1,73 @@ +/** + * \file dnn/src/rocm/elemwise/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "hcc_detail/hcc_defs_prologue.h" + +#include "./opr_impl.h" +#include "midout.h" +#include "src/rocm/elemwise/kern_wrapper.h.hip" +#include "src/rocm/elemwise/special_kerns.h.hip" +#include "src/rocm/utils.h" + +namespace megdnn { +namespace rocm { + +#define on_arity_dispatched_cb_dtype(_dt) \ + if (m_dst->layout.dtype == _dt()) { \ + using dtrait = DTypeTrait<_dt>; \ + using ctype = dtrait::ctype; \ + auto stream = hip_stream(handle()); \ + return ModeDispatcher::run( \ + src, stream, m_param.mode, m_dst->ptr()); \ + } + +#define _cb_dispatch_mode(_m) \ + case Mode::_m: \ + do { \ + using KernImpl = \ + ElemwiseKern; \ + using Wrapper = ElemArithKernWrapper; \ + Wrapper wrapper; \ + wrapper.dst = static_cast(dst); \ + return run_elemwise(src, stream, wrapper); \ + } while (0); + +#define IMPL_MODE_DISPATCHER(_arity, _dtype_cat) \ + template \ + struct ElemwiseForwardImpl::ModeDispatcher<_arity, _dtype_cat, ctype> { \ + static constexpr int arity = _arity; \ + static void run(const ElemwiseOpParamN& src, \ + hipStream_t stream, Mode mode, void* dst) { \ + switch (mode) { \ + FOREACH(_cb_dispatch_mode) \ + default: \ + megdnn_throw("bad mode"); \ + } \ + } \ + } + +#include "src/common/elemwise/opr_impl_body.inl" + +template +void ElemwiseForwardImpl::impl_fuse_mul_add3(const ElemwiseOpParamN<3>& param) { + kern_fuse_mul_add3(m_dst->ptr(), param, + hip_stream(handle())); +} + +template +void ElemwiseForwardImpl::impl_fuse_mul_add4(const ElemwiseOpParamN<4>& param) { + kern_fuse_mul_add4(m_dst->ptr(), param, hip_stream(handle())); +} + +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/elemwise/opr_impl.h b/dnn/src/rocm/elemwise/opr_impl.h new file mode 100644 index 00000000..ec38961c --- /dev/null +++ b/dnn/src/rocm/elemwise/opr_impl.h @@ -0,0 +1,27 @@ +/** + * \file dnn/src/rocm/elemwise/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/common/elemwise/opr_impl_helper.h" + +namespace megdnn { +namespace rocm { + +class ElemwiseForwardImpl final : public ElemwiseForwardImplHelper { +#include "src/common/elemwise/opr_impl_class_def.inl" +}; + +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cpp.doxygen + diff --git a/dnn/src/rocm/elemwise/special_kerns.h.hip b/dnn/src/rocm/elemwise/special_kerns.h.hip new file mode 100644 index 00000000..b21ee120 --- /dev/null +++ b/dnn/src/rocm/elemwise/special_kerns.h.hip @@ -0,0 +1,31 @@ +/** + * \file src/rocm/elemwise/special_kerns.h.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \brief special elemwise opr rocm kernels + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
+ */
+
+#pragma once
+
+#include "src/rocm/elemwise_helper.h.hip"
+
+namespace megdnn {
+namespace rocm {
+
+template <bool c_is_scalar, typename ctype>
+void kern_fuse_mul_add3(ctype* dest, const ElemwiseOpParamN<3>& param,
+                        hipStream_t stream);
+
+template <typename ctype>
+void kern_fuse_mul_add4(ctype* dest, const ElemwiseOpParamN<4>& param,
+                        hipStream_t stream);
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: ft=cpp syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/elemwise/special_kerns.inl b/dnn/src/rocm/elemwise/special_kerns.inl
new file mode 100644
index 00000000..d6e47dc9
--- /dev/null
+++ b/dnn/src/rocm/elemwise/special_kerns.inl
@@ -0,0 +1,139 @@
+/**
+ * \file dnn/src/rocm/elemwise/special_kerns.inl
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "./special_kerns.h.hip"
+
+namespace megdnn {
+namespace rocm {
+namespace elemwise_intl {
+
+template <typename ctype, bool c_is_scalar>
+struct FuseMulAdd3Op {
+    typedef ctype* __restrict__ bufptr_t;
+    bufptr_t m_dst, m_src2;
+
+    __device__ __forceinline__ void operator()(uint32_t idx, int off0, int off1,
+                                               bufptr_t src0, bufptr_t src1) {
+        m_dst[idx] = src0[off0] * src1[off1] + m_src2[c_is_scalar ? 0 : off0];
+    }
+};
+
+template <typename ctype>
+struct FuseMulAdd4Op {
+    typedef ctype* __restrict__ bufptr_t;
+    bufptr_t m_dst, m_src2, m_src3;
+
+    __device__ __forceinline__ void operator()(uint32_t idx, int off0, int off1,
+                                               bufptr_t src0, bufptr_t src1) {
+        m_dst[idx] = static_cast<ctype>(src0[off0]) *
+                             static_cast<ctype>(src1[off1]) +
+                     static_cast<ctype>(m_src2[off0]) *
+                             static_cast<ctype>(m_src3[off1]);
+    }
+};
+
+//! wrap an op so the special OpCaller can be selected by template matching
+template <class Op>
+class FuseOpWrapper {
+    const Op& m_op;
+
+public:
+    FuseOpWrapper(const Op& op) : m_op(op) {}
+
+    operator const Op&() const { return m_op; }
+};
+
+template <class Op, class PVis0, class PVis1>
+struct OpCallerBinary<FuseOpWrapper<Op>, PVis0, PVis1> {
+    Op op;
+    PVis0 par0;
+    PVis1 par1;
+
+    __device__ __forceinline__ void thread_init(uint32_t idx) {
+        par0.thread_init(idx);
+        par1.thread_init(idx);
+    }
+
+    __device__ __forceinline__ void on(uint32_t idx) {
+        op(idx, par0.offset(idx), par1.offset(idx), par0.ptr(), par1.ptr());
+    }
+
+    __device__ __forceinline__ void next() {
+        par0.next();
+        par1.next();
+    }
+};
+
+template <class Op, class PVis>
+struct OpCallerUniform<FuseOpWrapper<Op>, 2, PVis> {
+    Op op;
+    PVis par[2];
+
+    __device__ __forceinline__ void thread_init(uint32_t idx) {
+        par[0].thread_init(idx);
+        par[1].thread_init(idx);
+    }
+
+    __device__ __forceinline__ void on(uint32_t idx) {
+        op(idx, par[0].offset(idx), par[1].offset(idx), par[0].ptr(),
+           par[1].ptr());
+    }
+
+    __device__ __forceinline__ void next() {
+        par[0].next();
+        par[1].next();
+    }
+};
+
+} // namespace elemwise_intl
+
+namespace {
+template <typename ctype, int arity, class Op>
+void run_fuse_elemwise(Op& op, const ElemwiseOpParamN<arity>& param,
+                       hipStream_t stream) {
+    param.assert_initialized();
+    ElemwiseOpParamN<2> p2 = *static_cast<const ElemwiseOpParamN<2>*>(
+            static_cast<const void*>(&param));
+    elemwise_intl::UserOpInvoker<elemwise_intl::FuseOpWrapper<Op>, ctype, 2>(
+            p2, stream, op);
+}
+} // anonymous namespace
+
+template <bool c_is_scalar, typename ctype>
+void kern_fuse_mul_add3(ctype* dest, const ElemwiseOpParamN<3>& param,
+                        hipStream_t stream) {
+    elemwise_intl::FuseMulAdd3Op<ctype, c_is_scalar> op;
+    op.m_dst = dest;
+    op.m_src2 = param[2].ptr<ctype>();
+    run_fuse_elemwise<ctype>(op, param, stream);
+}
+
+template <typename ctype>
+void kern_fuse_mul_add4(ctype* dest, const ElemwiseOpParamN<4>& param,
+                        hipStream_t stream) {
+    elemwise_intl::FuseMulAdd4Op<ctype> op;
+    op.m_dst = dest;
+    op.m_src2 = param[2].ptr<ctype>();
+    op.m_src3 = param[3].ptr<ctype>();
+    run_fuse_elemwise<ctype>(op, param, stream);
+}
+
+#define INST(_dt)                                                              \
+    template void kern_fuse_mul_add3<true>(                                    \
+            DTypeTrait<_dt>::ctype*, const ElemwiseOpParamN<3>&, hipStream_t); \
+    template void kern_fuse_mul_add3<false>(                                   \
+            DTypeTrait<_dt>::ctype*, const ElemwiseOpParamN<3>&, hipStream_t); \
+    template void kern_fuse_mul_add4(DTypeTrait<_dt>::ctype*,                  \
+                                     const ElemwiseOpParamN<4>&, hipStream_t);
+
+
+// vim: ft=cpp syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/elemwise/special_kimpl/special_dt_float16.cpp.hip b/dnn/src/rocm/elemwise/special_kimpl/special_dt_float16.cpp.hip
new file mode 100644
index 00000000..2e2f77b0
--- /dev/null
+++ b/dnn/src/rocm/elemwise/special_kimpl/special_dt_float16.cpp.hip
@@ -0,0 +1,8 @@
+// generated by gen_elemwise_special_kern_impls.py
+#if !MEGDNN_DISABLE_FLOAT16
+#include "../special_kerns.inl"
+INST(::megdnn::dtype::Float16)
+#undef INST
+}
+}
+#endif
diff --git a/dnn/src/rocm/elemwise/special_kimpl/special_dt_float32.cpp.hip b/dnn/src/rocm/elemwise/special_kimpl/special_dt_float32.cpp.hip
new file mode 100644
index 00000000..8d267018
--- /dev/null
+++ b/dnn/src/rocm/elemwise/special_kimpl/special_dt_float32.cpp.hip
@@ -0,0 +1,6 @@
+// generated by gen_elemwise_special_kern_impls.py
+#include "../special_kerns.inl"
+INST(::megdnn::dtype::Float32)
+#undef INST
+}
+}
diff --git a/dnn/src/rocm/elemwise/special_kimpl/special_dt_int16.cpp.hip b/dnn/src/rocm/elemwise/special_kimpl/special_dt_int16.cpp.hip
new file mode 100644
index 00000000..25063dcd
--- /dev/null
+++ b/dnn/src/rocm/elemwise/special_kimpl/special_dt_int16.cpp.hip
@@ -0,0 +1,6 @@
+// generated by gen_elemwise_special_kern_impls.py
+#include "../special_kerns.inl"
+INST(::megdnn::dtype::Int16) +#undef INST +} +} diff --git a/dnn/src/rocm/elemwise/special_kimpl/special_dt_int32.cpp.hip b/dnn/src/rocm/elemwise/special_kimpl/special_dt_int32.cpp.hip new file mode 100644 index 00000000..2a62bb21 --- /dev/null +++ b/dnn/src/rocm/elemwise/special_kimpl/special_dt_int32.cpp.hip @@ -0,0 +1,6 @@ +// generated by gen_elemwise_special_kern_impls.py +#include "../special_kerns.inl" +INST(::megdnn::dtype::Int32) +#undef INST +} +} diff --git a/dnn/src/rocm/elemwise/special_kimpl/special_dt_int8.cpp.hip b/dnn/src/rocm/elemwise/special_kimpl/special_dt_int8.cpp.hip new file mode 100644 index 00000000..69fbafe4 --- /dev/null +++ b/dnn/src/rocm/elemwise/special_kimpl/special_dt_int8.cpp.hip @@ -0,0 +1,6 @@ +// generated by gen_elemwise_special_kern_impls.py +#include "../special_kerns.inl" +INST(::megdnn::dtype::Int8) +#undef INST +} +} diff --git a/dnn/src/rocm/elemwise/special_kimpl/special_dt_uint8.cpp.hip b/dnn/src/rocm/elemwise/special_kimpl/special_dt_uint8.cpp.hip new file mode 100644 index 00000000..2ff1ec79 --- /dev/null +++ b/dnn/src/rocm/elemwise/special_kimpl/special_dt_uint8.cpp.hip @@ -0,0 +1,6 @@ +// generated by gen_elemwise_special_kern_impls.py +#include "../special_kerns.inl" +INST(::megdnn::dtype::Uint8) +#undef INST +} +} diff --git a/dnn/src/rocm/elemwise_helper.cpp b/dnn/src/rocm/elemwise_helper.cpp new file mode 100644 index 00000000..85dc7f34 --- /dev/null +++ b/dnn/src/rocm/elemwise_helper.cpp @@ -0,0 +1,177 @@ +/** + * \file dnn/src/rocm/elemwise_helper.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +#include "hcc_detail/hcc_defs_prologue.h" + +#include "src/rocm/utils.h" +#include "src/rocm/elemwise_helper.h.hip" +#include "megcore_cdefs.h" + +#include "src/common/utils.h" + +#include +#include +#include + +#define _cb_check_ndim(n) megdnn::TensorShape::MAX_NDIM == n || +static_assert(MEGDNN_FOREACH_TENSOR_NDIM(_cb_check_ndim) false, + "bad foreach ndim"); +#undef _cb_check_ndim + +namespace megdnn { +namespace rocm { + +// ParamElemVisitor::init impls +namespace elemwise_intl { + +template +void ParamElemVisitor::host_init(const TensorND& rv, + int /*grid_size*/, + int /*block_size*/) { + megdnn_assert(rv.layout.ndim && rv.layout.ndim <= ndim); + m_ptr = rv.ptr(); + for (size_t i = 0; i < rv.layout.ndim; ++i) { + m_stride[i] = rv.layout.stride[i]; + if (i + 1 < rv.layout.ndim) + m_shape_highdim[i] = rv.layout.shape[i + 1]; + } + for (int i = rv.layout.ndim - 1; i < ndim - 1; ++i) { + m_shape_highdim[i] = 1; + } + for (int i = rv.layout.ndim; i < ndim; ++i) { + m_stride[i] = 0; + } +} + +template +void ParamElemVisitor<3, ctype, BCAST_101>::host_init(const TensorND& rv, + int grid_size, + int block_size) { + uint32_t shape2, shape1; + int stride1; + if (rv.layout.ndim == 3) { + megdnn_assert(!rv.layout.stride[0] && !rv.layout.stride[2]); + shape1 = rv.layout[1]; + shape2 = rv.layout[2]; + stride1 = rv.layout.stride[1]; + } else { + megdnn_assert(rv.layout.ndim == 2 && !rv.layout.stride[1]); + shape1 = rv.layout[0]; + shape2 = rv.layout[1]; + stride1 = rv.layout.stride[0]; + } + m_ptr = rv.ptr(); + m_stride1 = stride1; + m_shape12.host_init(grid_size * block_size, shape2, shape1); +} + +template +void ParamElemVisitor<2, ctype, BCAST_10>::host_init(const TensorND& rv, + int grid_size, + int block_size) { + megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0]); + m_ptr = rv.ptr(); + m_stride1 = rv.layout.stride[1]; + m_shape1.host_init(grid_size * block_size, rv.layout.shape[1]); +} + +template +void ParamElemVisitor<2, ctype, BCAST_01>::host_init(const TensorND& rv, + int grid_size, + int block_size) { + megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[1]); + m_ptr = rv.ptr(); + m_stride0 = rv.layout.stride[0]; + m_shape1.host_init(grid_size * block_size, rv.layout.shape[1]); +} + +template +void ParamElemVisitor<1, ctype, BCAST_FULL>::host_init(const TensorND& rv, + int /*grid_size*/, + int /*block_size*/) { + megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0]); + m_ptr = rv.ptr(); +} + +#define INST(ndim, ctype, brd) template class ParamElemVisitor +#define INST_FOR_CTYPE \ + MEGDNN_FOREACH_TENSOR_NDIM(ndim_cb) \ + INST(3, ct, BCAST_101); \ + INST(2, ct, BCAST_10); \ + INST(2, ct, BCAST_01); \ + INST(1, ct, BCAST_FULL); + +#define ndim_cb(_ndim) INST(_ndim, ct, BCAST_OTHER); + +#define ct dt_byte +INST_FOR_CTYPE +#undef ct +#define ct dt_int32 +INST_FOR_CTYPE +#undef ct +#define ct dt_float32 +INST_FOR_CTYPE +#undef ct +#if !MEGDNN_DISABLE_FLOAT16 +#define ct dt_float16 +INST_FOR_CTYPE +#undef ct +#endif +#define ct dt_int8 +INST_FOR_CTYPE +#undef ct +#define ct dt_uint8 +INST_FOR_CTYPE +#undef ct +#define ct dt_int16 +INST_FOR_CTYPE +#undef ct +#define ct dt_quint8 +INST_FOR_CTYPE +#undef ct +#define ct dt_qint8 +INST_FOR_CTYPE +#undef ct +#define ct dt_qint32 +INST_FOR_CTYPE +#undef ct + +#undef ndim_cb + +#undef INST_FOR_CTYPE +#undef INST + +} // namespace elemwise_intl + +void elemwise_intl::get_launch_spec(const void* /*kern*/, size_t size, + int* grid_size, int* block_size) { + safe_size_in_kern(size); + const uint32_t blocks = 256; + 
+    *block_size = blocks;
+    int a = size / (blocks * 2), b = (size - 1) / (blocks * 3) + 1;
+    *grid_size = std::max(a, b);
+    if (!*grid_size) {
+        *block_size = std::min<int>(std::max<int>(size / 64, 1) * 32, 1024);
+        *grid_size = std::max<int>(size / *block_size, 1);
+    }
+    // because we unroll 3 times in the kernel
+    megdnn_assert(static_cast<size_t>(*block_size) * *grid_size * 3 >= size);
+}
+
+void elemwise_intl::on_bad_ndim(int ndim) {
+    megdnn_throw(ssprintf("invalid ndim: %d", ndim));
+    MEGDNN_MARK_USED_VAR(ndim);
+}
+} // namespace rocm
+} // namespace megdnn
+
+
+// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
+
diff --git a/dnn/src/rocm/elemwise_helper.h.hip b/dnn/src/rocm/elemwise_helper.h.hip
new file mode 100644
index 00000000..bd2c2e23
--- /dev/null
+++ b/dnn/src/rocm/elemwise_helper.h.hip
@@ -0,0 +1,598 @@
+/**
+ * \file src/rocm/elemwise_helper.h.hip
+ *
+ * This file is part of MegBrain, a deep learning framework developed by Megvii.
+ *
+ * \brief helper utilities for implementing element-wise kernels
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+
+#pragma once
+
+#include "hip_header.h"
+#include "src/rocm/utils.h.hip"
+#include "src/common/elemwise_helper.cuh"
+#include "src/rocm/int_fastdiv.h.hip"
+
+/*
+ * please note that all arithmetics on GPU are 32-bit for best performance; this
+ * limits max possible size
+ */
+
+namespace megdnn {
+namespace rocm {
+
+//! internals for element-wise
+namespace elemwise_intl {
+#define devfunc __device__ __forceinline__
+
+/*!
+ * \brief get hip launch specs for element-wise kernel
+ * \param kern kernel function address
+ * \param size total size of elements
+ */
+void get_launch_spec(const void* kern, size_t size, int* grid_size,
+                     int* block_size);
+
+MEGDNN_NORETURN void on_bad_ndim(int ndim);
+
+/*!
+ * \brief broadcast type
+ * BCAST_x[0]x[1]...: x[i] == !stride[i]
+ */
+enum BcastType { BCAST_OTHER, BCAST_101, BCAST_10, BCAST_01, BCAST_FULL };
+
+/*!
+ * \brief visitor to access an element in a tensor at given logic index
+ * \tparam ctype plain element ctype (i.e. ctype in DTypeTrait)
+ * \tparam brdcast_mask bit mask for broadcast of params; (i.e. stride[i] is
+ *      0 iff (brdcast_mask & (1<<(ndim-1-i))) is 1).
+ *
+ * host interface:
+ *      void host_init(
+ *              const TensorND &tensor, int grid_size, int block_size)
+ *
+ * device interface:
+ *      void thread_init(uint32_t idx)
+ *          called on thread entrance, with logical indexing; the index may
+ *          go beyond buffer range
+ *
+ *      ctype* ptr()
+ *          return buffer pointer; can be used by specialized OpCaller
+ *
+ *      void next()
+ *          called before moving to next chunk on each thread
+ *
+ *      int offset(uint32_t idx)
+ *          get physical offset from logical index
+ *
+ *      ctype& at(uint32_t idx)
+ *          ptr()[offset(idx)]
+ *
+ */
+template <int ndim, typename ctype, BcastType brd_type>
+class ParamElemVisitor;
+
+#define PARAM_ELEM_VISITOR_COMMON_DEV                              \
+    devfunc ctype* ptr() { return m_ptr; }                         \
+    devfunc ctype& at(uint32_t idx) { return m_ptr[offset(idx)]; }
+
+//! specialization for BCAST_OTHER
+template <int ndim, typename ctype>
+class ParamElemVisitor<ndim, ctype, BCAST_OTHER> {
+    ctype* __restrict m_ptr;
+    int m_stride[ndim];
+
+    //! m_shape_highdim[i] = original_shape[i + 1]
+#ifdef _MSC_VER
+    Uint32Fastdiv m_shape_highdim[ndim > 1 ? ndim - 1 : 1];
+#else
+    Uint32Fastdiv m_shape_highdim[ndim - 1];
+#endif
+
+public:
+    static const int NDIM = ndim;
+
+    void host_init(const TensorND& rv, int grid_size, int block_size);
+
+#if MEGDNN_CC_CUDA
+    devfunc void thread_init(uint32_t) {}
+
+    devfunc void next() {}
+
+    devfunc int offset(uint32_t idx) {
+        int offset = 0;
+#pragma unroll
+        for (int i = ndim - 1; i >= 1; --i) {
+            Uint32Fastdiv& shp = m_shape_highdim[i - 1];
+            uint32_t idx_div = idx / shp;
+            offset += (idx - idx_div * shp.divisor()) * m_stride[i];
+            idx = idx_div;
+        }
+        offset += idx * m_stride[0];
+        return offset;
+    }
+
+    PARAM_ELEM_VISITOR_COMMON_DEV
+#endif
+};
+
+/*!
+ * \brief specialization for ndim == 3 and BCAST_101
+ * (for dimshuffle 'x', 0, 'x')
+ *
+ * visit: idx / m_shape2 % m_shape1
+ */
+template <typename ctype>
+class ParamElemVisitor<3, ctype, BCAST_101> {
+    ctype* __restrict m_ptr;
+    StridedDivSeq2 m_shape12;
+    int m_stride1;
+
+public:
+    static const int NDIM = 3;
+
+    void host_init(const TensorND& rv, int grid_size, int block_size);
+
+#if MEGDNN_CC_CUDA
+    devfunc void thread_init(uint32_t idx) { m_shape12.device_init(idx); }
+
+    devfunc void next() { m_shape12.next(); }
+
+    devfunc int offset(uint32_t /* idx */) {
+        return m_shape12.get() * m_stride1;
+    }
+
+    PARAM_ELEM_VISITOR_COMMON_DEV
+#endif
+};
+
+/*!
+ * \brief specialization for ndim == 2 and BCAST_10
+ *
+ * visit: idx % m_shape1
+ */
+template <typename ctype>
+class ParamElemVisitor<2, ctype, BCAST_10> {
+    ctype* __restrict m_ptr;
+    StridedDivSeq m_shape1;
+    int m_stride1;
+
+public:
+    static const int NDIM = 2;
+
+    void host_init(const TensorND& rv, int grid_size, int block_size);
+
+#if MEGDNN_CC_CUDA
+    devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); }
+
+    devfunc void next() { m_shape1.next(); }
+
+    devfunc int offset(uint32_t /* idx */) { return m_shape1.r() * m_stride1; }
+
+    PARAM_ELEM_VISITOR_COMMON_DEV
+#endif
+};
+
+/*!
+ * \brief specialization for ndim == 2 and BCAST_01
+ *
+ * visit: idx / shape1
+ */
+template <typename ctype>
+class ParamElemVisitor<2, ctype, BCAST_01> {
+    ctype* __restrict m_ptr;
+    StridedDivSeq m_shape1;
+    int m_stride0;
+
+public:
+    static const int NDIM = 2;
+
+    void host_init(const TensorND& rv, int grid_size, int block_size);
+
+    devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); }
+
+    devfunc void next() { m_shape1.next(); }
+
+    devfunc int offset(uint32_t /* idx */) { return m_shape1.q() * m_stride0; }
+
+    PARAM_ELEM_VISITOR_COMMON_DEV
+};
+
+//! specialization for ndim == 1 and BCAST_FULL
+template <typename ctype>
+class ParamElemVisitor<1, ctype, BCAST_FULL> {
+    ctype* __restrict m_ptr;
+
+public:
+    static const int NDIM = 1;
+
+    void host_init(const TensorND& rv, int grid_size, int block_size);
+
+#if MEGDNN_CC_CUDA
+    devfunc void thread_init(uint32_t) {}
+
+    devfunc void next() {}
+
+    devfunc int offset(uint32_t idx) {
+        MEGDNN_MARK_USED_VAR(idx);
+        return 0;
+    }
+
+    PARAM_ELEM_VISITOR_COMMON_DEV
+#endif
+};
+
+#undef PARAM_ELEM_VISITOR_COMMON_DEV
+
+#if MEGDNN_CC_CUDA
+/*
+ * OpCaller is used to invoke user operator with loaded element arguments.
+ *
+ * device interface:
+ *      void thread_init(uint32_t idx);
+ *
+ *      void on(uint32_t idx);
+ *
+ *      void next();
+ */
+
+/*!
+ * \brief call user op directly without visiting any params (i.e. arity ==
+ *      0)
+ */
+template <class Op>
+struct OpCallerNull {
+    Op op;
+
+    devfunc void thread_init(uint32_t) {}
+
+    devfunc void on(uint32_t idx) { op(idx); }
+
+    devfunc void next() {}
+};
+
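+/*
+ * [annotation, not in the original patch] A concrete model of what the
+ * BCAST_OTHER offset() above computes, written as plain host C++ for a 2-D
+ * tensor (assumed semantics, for illustration only):
+ *
+ *     int offset_2d(uint32_t idx, const int stride[2], uint32_t shape1) {
+ *         uint32_t idx_div = idx / shape1;             // index on axis 0
+ *         return (idx - idx_div * shape1) * stride[1]  // axis-1 term
+ *                + idx_div * stride[0];                // axis-0 term
+ *     }
+ *
+ * E.g. a 3x4 tensor with strides {4, 1}: offset_2d(7) = 1*4 + 3*1 = 7; the
+ * BCAST_10 specialization (strides {0, 1}) instead yields 7 % 4 = 3. The
+ * Uint32Fastdiv / StridedDivSeq helpers compute these divisions without the
+ * hardware divider.
+ */
+
+/*!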
+ * \brief call an operator whose params are all promoted to the same ndim and
+ *      brdcast_mask
+ * \tparam PVis ParamElemVisitor class
+ */
+template <class Op, int arity, class PVis>
+struct OpCallerUniform;
+
+//! specialization for arity == 1
+template <class Op, class PVis>
+struct OpCallerUniform<Op, 1, PVis> {
+    Op op;
+    PVis par[1];
+
+    devfunc void thread_init(uint32_t idx) { par[0].thread_init(idx); }
+
+    devfunc void on(uint32_t idx) { op(idx, par[0].at(idx)); }
+
+    devfunc void next() { par[0].next(); }
+};
+//! specialization for arity == 2
+template <class Op, class PVis>
+struct OpCallerUniform<Op, 2, PVis> {
+    Op op;
+    PVis par[2];
+
+    devfunc void thread_init(uint32_t idx) {
+        par[0].thread_init(idx);
+        par[1].thread_init(idx);
+    }
+
+    devfunc void on(uint32_t idx) { op(idx, par[0].at(idx), par[1].at(idx)); }
+
+    devfunc void next() {
+        par[0].next();
+        par[1].next();
+    }
+};
+//! specialization for arity == 3
+template <class Op, class PVis>
+struct OpCallerUniform<Op, 3, PVis> {
+    Op op;
+    PVis par[3];
+
+    devfunc void thread_init(uint32_t idx) {
+        par[0].thread_init(idx);
+        par[1].thread_init(idx);
+        par[2].thread_init(idx);
+    }
+
+    devfunc void on(uint32_t idx) {
+        op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx));
+    }
+
+    devfunc void next() {
+        par[0].next();
+        par[1].next();
+        par[2].next();
+    }
+};
+
+/*!
+ * \brief call binary (i.e. arity == 2) operator with different param
+ *      visitors
+ */
+template <class Op, class PVis0, class PVis1>
+struct OpCallerBinary {
+    Op op;
+    PVis0 par0;
+    PVis1 par1;
+
+    devfunc void thread_init(uint32_t idx) {
+        par0.thread_init(idx);
+        par1.thread_init(idx);
+    }
+
+    devfunc void on(uint32_t idx) { op(idx, par0.at(idx), par1.at(idx)); }
+
+    devfunc void next() {
+        par0.next();
+        par1.next();
+    }
+};
+
+template <class OpCaller>
+__global__ void cuda_kern(OpCaller op_caller, uint32_t size) {
+    uint32_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x,
+             delta = hipBlockDim_x * hipGridDim_x;
+    // each thread works on at most 3 elements; see get_launch_spec
+    op_caller.thread_init(idx);
+    if (idx < size) {
+        op_caller.on(idx);
+        idx += delta;
+        if (idx < size) {
+            op_caller.next();
+            op_caller.on(idx);
+            idx += delta;
+            if (idx < size) {
+                op_caller.next();
+                op_caller.on(idx);
+            }
+        }
+    }
+}
+
+//! invoke a user Op passed to run_elemwise
+template <class Op, typename ctype, int arity>
+class UserOpInvoker;
+
+//! run op by promoting all params to same ndim
+template <class Op, typename ctype, int arity>
+class UserOpInvokerToSameNdim {
+    const ElemwiseOpParamN<arity>& m_param;
+    hipStream_t m_stream;
+    const Op& m_op;
+
+    void dispatch0() {
+        switch (m_param.max_ndim) {
+#define cb(ndim) \
+    case ndim:   \
+        return dispatch1<ndim>();
+            MEGDNN_FOREACH_TENSOR_NDIM(cb)
+#undef cb
+        }
+        on_bad_ndim(m_param.max_ndim);
+    }
+
+    template <int ndim>
+    void dispatch1() {
+        typedef OpCallerUniform<Op, arity,
+                                ParamElemVisitor<ndim, ctype, BCAST_OTHER>>
+                Caller;
+        size_t size = m_param.size;
+        int grid_size, block_size;
+        void (*fptr)(Caller, uint32_t) = cuda_kern<Caller>;
+        get_launch_spec(reinterpret_cast<const void*>(fptr), size, &grid_size,
+                        &block_size);
+
+        Caller caller;
+        caller.op = m_op;
+        for (int i = 0; i < arity; ++i)
+            caller.par[i].host_init(m_param[i], grid_size, block_size);
+
+        hipLaunchKernelGGL(fptr,
+                           dim3(grid_size), dim3(block_size), 0, m_stream,
+                           caller, size);
+        after_kernel_launch();
+    }
+
+public:
+    UserOpInvokerToSameNdim(const ElemwiseOpParamN<arity>& param,
+                            hipStream_t stream, const Op& op)
+            : m_param(param), m_stream(stream), m_op(op) {
+        dispatch0();
+    }
+};
+
+//! implement general case by UserOpInvokerToSameNdim
+template <class Op, typename ctype, int arity>
+class UserOpInvoker : public UserOpInvokerToSameNdim<Op, ctype, arity> {
+public:
+    UserOpInvoker(const ElemwiseOpParamN<arity>& param, hipStream_t stream,
+                  const Op& op)
+            : UserOpInvokerToSameNdim<Op, ctype, arity>(param, stream, op) {}
+};
+
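+/*
+ * [annotation, not in the original patch] UserOpInvokerToSameNdim is an
+ * instance of the dispatch pattern used throughout this header: a runtime
+ * value (m_param.max_ndim) is switched into a compile-time template
+ * parameter, so each (ndim, bcast) combination gets a fully specialized
+ * kernel. Minimal standalone sketch of the pattern:
+ *
+ *     template <int ndim>
+ *     void do_work();                  // fully specialized implementation
+ *
+ *     void dispatch(int runtime_ndim) {
+ *         switch (runtime_ndim) {
+ *             case 1: return do_work<1>();
+ *             case 2: return do_work<2>();
+ *             case 3: return do_work<3>();
+ *             default: on_bad_ndim(runtime_ndim);
+ *         }
+ *     }
+ */
+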
+//! specialization for arity == 0
+template <class Op, typename ctype>
+class UserOpInvoker<Op, ctype, 0> {
+public:
+    UserOpInvoker(const ElemwiseOpParamN<0>& param, hipStream_t stream,
+                  const Op& op) {
+        size_t size = param.size;
+        typedef OpCallerNull<Op> Caller;
+        Caller caller;
+        caller.op = op;
+        int grid_size, block_size;
+        void (*fptr)(Caller, uint32_t) = cuda_kern<Caller>;
+        get_launch_spec(reinterpret_cast<const void*>(fptr), size, &grid_size,
+                        &block_size);
+        hipLaunchKernelGGL(fptr,
+                           dim3(grid_size), dim3(block_size), 0, stream, caller,
+                           size);
+        after_kernel_launch();
+    }
+};
+
+#define DEFINE_BRDCAST_DISPATCH_RECEIVERS(_cb_header, _cb_dispatch, _stride) \
+    _cb_header(1) {                                                          \
+        const ptrdiff_t* stride = _stride;                                   \
+        if (!stride[0]) {                                                    \
+            return _cb_dispatch(1, BCAST_FULL);                              \
+        }                                                                    \
+        _cb_dispatch(1, BCAST_OTHER);                                        \
+    }                                                                        \
+    _cb_header(2) {                                                          \
+        const ptrdiff_t* stride = _stride;                                   \
+        if (!stride[0] && stride[1]) {                                       \
+            return _cb_dispatch(2, BCAST_10);                                \
+        }                                                                    \
+        if (stride[0] && !stride[1]) {                                       \
+            return _cb_dispatch(2, BCAST_01);                                \
+        }                                                                    \
+        _cb_dispatch(2, BCAST_OTHER);                                        \
+    }                                                                        \
+    _cb_header(3) {                                                          \
+        const ptrdiff_t* stride = _stride;                                   \
+        if (!stride[0] && stride[1] && !stride[2]) {                         \
+            return _cb_dispatch(3, BCAST_101);                               \
+        }                                                                    \
+        _cb_dispatch(3, BCAST_OTHER);                                        \
+    }
+
+//! specialization for binary opr
+template <class Op, typename ctype>
+class UserOpInvoker<Op, ctype, 2> {
+    bool m_invoked;
+    const ElemwiseOpParamN<2>& m_param;
+    hipStream_t m_stream;
+    const Op& m_op;
+
+    void fallback() {
+        megdnn_assert(!m_invoked);
+        UserOpInvokerToSameNdim<Op, ctype, 2>(m_param, m_stream, m_op);
+        m_invoked = true;
+    }
+
+    void dispatch0() {
+        switch (m_param[0].layout.ndim) {
+#define cb(ndim) \
+    case ndim:   \
+        return dispatch1_##ndim();
+            MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb)
+#undef cb
+        }
+        fallback();
+    }
+
+#define cb_header(ndim) void dispatch1_##ndim()
+#define cb_dispatch(ndim, brdcast_mask) \
+    dispatch2<ParamElemVisitor<ndim, ctype, brdcast_mask>>()
+    DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch,
+                                      m_param[0].layout.stride)
+#undef cb_header
+#undef cb_dispatch
+
+    template <class PVis0>
+    void dispatch2() {
+        switch (m_param[1].layout.ndim) {
+#define cb(ndim) \
+    case ndim:   \
+        return dispatch3_##ndim<PVis0>();
+            MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb)
+#undef cb
+        }
+        fallback();
+    }
+
+#define cb_header(ndim)    \
+    template <class PVis0> \
+    void dispatch3_##ndim()
+#define cb_dispatch(ndim, brdcast_mask) \
+    do_run<PVis0, ParamElemVisitor<ndim, ctype, brdcast_mask>>()
+    DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch,
+                                      m_param[1].layout.stride)
+#undef cb_header
+#undef cb_dispatch
+
+    template <class PVis0, class PVis1>
+    void do_run() {
+        megdnn_assert(!m_invoked);
+        m_invoked = true;
+        typedef OpCallerBinary<Op, PVis0, PVis1> Caller;
+        int grid_size, block_size;
+        void (*fptr)(Caller, uint32_t) = cuda_kern<Caller>;
+        size_t size = m_param.size;
+        get_launch_spec(reinterpret_cast<const void*>(fptr), size, &grid_size,
+                        &block_size);
+        Caller caller;
+        caller.op = m_op;
+        caller.par0.host_init(m_param[0], grid_size, block_size);
+        caller.par1.host_init(m_param[1], grid_size, block_size);
+        hipLaunchKernelGGL(fptr,
+                           dim3(grid_size), dim3(block_size), 0, m_stream,
+                           caller, size);
+        after_kernel_launch();
+    }
+
+public:
+    UserOpInvoker(const ElemwiseOpParamN<2>& param, hipStream_t stream,
+                  const Op& op)
+            : m_param(param), m_stream(stream), m_op(op) {
+        m_invoked = false;
+        dispatch0();
+        megdnn_assert(m_invoked);
+    }
+};
+
+#undef DEFINE_BRDCAST_DISPATCH_RECEIVERS
+
+#endif // MEGDNN_CC_CUDA
+
+#undef devfunc
+} // namespace elemwise_intl
+
+/*!
+ * \brief general element-wise kernel launcher
+ *
+ * \tparam arity number of params for the operator
+ * \param param param values for the operator; must have been initialized (i.e.
+ *      by calling ElemwiseOpParamN::init_from_given_tensor). The params
+ *      can have arbitrary layouts, as long as they share the same total number
+ *      of elements.
+ * \param op callable with a signature compatible with
+ *      `void op(uint32_t idx, ctype& param0, ..., ctype& param[arity - 1])`
+ *      if arity == 0, there is only an `idx` input
+ */
+template <class Op, typename ctype, int arity>
+void run_elemwise(const ElemwiseOpParamN<arity>& param, hipStream_t stream,
+                  const Op& op = Op());
+
+#if MEGDNN_CC_CUDA
+template <class Op, typename ctype, int arity>
+void run_elemwise(const ElemwiseOpParamN<arity>& param, hipStream_t stream,
+                  const Op& op) {
+    param.assert_initialized();
+    elemwise_intl::UserOpInvoker<Op, ctype, arity>(param, stream, op);
+}
+
+/*!
+ * \brief explicit instantiation of run_elemwise for given template params;
+ * used in .cu files, so corresponding run_elemwise can be called from .cpp
+ */
+#define INST_RUN_ELEMWISE(Op, ctype, arity)       \
+    template void run_elemwise<Op, ctype, arity>( \
+            const ElemwiseOpParamN<arity>&, hipStream_t, const Op&)
+#endif // MEGDNN_CC_CUDA
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
+
diff --git a/dnn/src/rocm/error_info.h.hip b/dnn/src/rocm/error_info.h.hip
new file mode 100644
index 00000000..1f09ca4e
--- /dev/null
+++ b/dnn/src/rocm/error_info.h.hip
@@ -0,0 +1,52 @@
+/**
+ * \file src/rocm/error_info.h.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+
+#pragma once
+
+#include "hip_header.h"
+#include "megcore_cdefs.h"
+#include "megdnn/arch.h"
+
+typedef megcore::AsyncErrorInfo AsyncErrorInfo;
+#if MEGDNN_CC_CUDA
+// we can not put this function into anonymous namespace, since it would cause
+// unused static func or undefined static func warning depending on whether you
+// define it
+namespace {
+#endif
+
+__device__ void set_async_error_info(AsyncErrorInfo* info, void* tracker,
+                                     const char* msg, int arg0 = 0,
+                                     int arg1 = 0, int arg2 = 0, int arg3 = 0)
+#if MEGDNN_CC_CUDA
+{
+    if (info && !atomicAdd(&info->nr_error, 1)) {
+        // use atomic expression to ensure that only the first error is reported
+        info->tracker_ptr = tracker;
+        char* ptr = info->msg;
+        char* ptr_end = ptr + sizeof(AsyncErrorInfo::msg) - 1;
+        while (ptr < ptr_end && *msg) {
+            *(ptr++) = *(msg++);
+        }
+        *ptr = 0;
+        info->msg_args[0] = arg0;
+        info->msg_args[1] = arg1;
+        info->msg_args[2] = arg2;
+        info->msg_args[3] = arg3;
+    }
+}
+#else
+        ;
+#endif
+
+#if MEGDNN_CC_CUDA
+} // anonymous namespace
+#endif
+
+// vim: ft=cpp syntax=cpp.doxygen
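[annotation — not part of the patch] A hypothetical device-side use of `set_async_error_info` above; `info` and `tracker` would come from the launching operator, and the kernel and parameter names here are invented for illustration:

    // Reports an out-of-range index at most once: only the first failing
    // thread wins the atomicAdd inside set_async_error_info and records its
    // message; msg_args are formatted on the host after being copied back.
    __global__ void check_idx(const int* idx, uint32_t n, uint32_t bound,
                              AsyncErrorInfo* info, void* tracker) {
        uint32_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
        if (i < n && static_cast<uint32_t>(idx[i]) >= bound) {
            set_async_error_info(info, tracker,
                                 "invalid index: idx=%d bound=%d",
                                 idx[i], static_cast<int>(bound));
        }
    }

diff --git a/dnn/src/rocm/eye/eye.cpp.hip b/dnn/src/rocm/eye/eye.cpp.hip
new file mode 100644
index 00000000..774233ed
--- /dev/null
+++ b/dnn/src/rocm/eye/eye.cpp.hip
@@ -0,0 +1,49 @@
+/**
+ * \file src/rocm/eye/eye.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.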
+ */ +#include "hcc_detail/hcc_defs_prologue.h" +#include "hip_header.h" +#include "megdnn/dtype.h" +#include "src/rocm/eye/eye.h.hip" +#include "src/rocm/utils.h.hip" + +namespace { + +template +__global__ void kernel(T* dst, uint32_t m, uint32_t n, int k) { + int32_t i = threadIdx.x + blockIdx.x * blockDim.x; + int32_t x = i % n; + int32_t y = i / n; + if (i < m * n) { + dst[i] = (y + k == x); + } +} + +} // anonymous namespace + +namespace megdnn { +namespace rocm { +namespace eye { + +template +void exec_internal(T* dst, size_t m, size_t n, int k, hipStream_t stream) { + hipLaunchKernelGGL((kernel), dim3(DIVUP(m * n, NR_THREADS)), + dim3(NR_THREADS), 0, stream, dst, m, n, k); + after_kernel_launch(); +} + +#define INST(T) \ + template void exec_internal(T*, size_t, size_t, int, hipStream_t); +#define cb(DType) INST(typename DTypeTrait::ctype) +MEGDNN_FOREACH_COMPUTING_DTYPE(cb) + +} // namespace eye +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/rocm/eye/eye.h.hip b/dnn/src/rocm/eye/eye.h.hip new file mode 100644 index 00000000..6a1f6f86 --- /dev/null +++ b/dnn/src/rocm/eye/eye.h.hip @@ -0,0 +1,23 @@ +/** + * \file src/rocm/eye/eye.h.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include "hip_header.h" + +namespace megdnn { +namespace rocm { +namespace eye { + +template +void exec_internal(T* dst, size_t m, size_t n, int k, hipStream_t stream); + +} // namespace eye +} // namespace rocm +} // namespace megdnn +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/rocm/eye/opr_impl.cpp b/dnn/src/rocm/eye/opr_impl.cpp new file mode 100644 index 00000000..d11c56a6 --- /dev/null +++ b/dnn/src/rocm/eye/opr_impl.cpp @@ -0,0 +1,35 @@ +/** + * \file dnn/src/rocm/eye/opr_impl.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "hcc_detail/hcc_defs_prologue.h" +#include "src/rocm/eye/opr_impl.h" + +#include "src/rocm/eye/eye.h.hip" +#include "src/rocm/utils.h" + +namespace megdnn { +namespace rocm { + +void EyeImpl::exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) { + check_exec(dst.layout, workspace.size); +#define cb(DType) \ + if (dst.layout.dtype.enumv() == DTypeTrait::enumv) { \ + using ctype = typename DTypeTrait::ctype; \ + eye::exec_internal(dst.ptr(), dst.layout.shape[0], \ + dst.layout.shape[1], param().k, \ + hip_stream(handle())); \ + } + MEGDNN_FOREACH_COMPUTING_DTYPE(cb) +#undef cb +} + +} // namespace rocm +} // namespace megdnn +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/rocm/eye/opr_impl.h b/dnn/src/rocm/eye/opr_impl.h new file mode 100644 index 00000000..9566285c --- /dev/null +++ b/dnn/src/rocm/eye/opr_impl.h @@ -0,0 +1,27 @@ +/** + * \file dnn/src/rocm/eye/opr_impl.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+#include "megdnn/oprs.h"
+
+namespace megdnn {
+namespace rocm {
+
+class EyeImpl final : public Eye {
+public:
+    using Eye::Eye;
+    void exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) override;
+    size_t get_workspace_in_bytes(const TensorLayout&) override { return 0; }
+};
+
+} // namespace rocm
+} // namespace megdnn
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
+
diff --git a/dnn/src/rocm/handle.cpp b/dnn/src/rocm/handle.cpp
new file mode 100644
index 00000000..cefbe8cf
--- /dev/null
+++ b/dnn/src/rocm/handle.cpp
@@ -0,0 +1,184 @@
+/**
+ * \file dnn/src/rocm/handle.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "src/common/handle_impl.h"
+#include "src/common/version_symbol.h"
+
+#include "src/rocm/handle.h"
+#include "src/rocm/miopen_with_check.h"
+#include "src/rocm/utils.h"
+
+#include "src/rocm/checksum/opr_impl.h"
+#include "src/rocm/convolution/opr_impl.h"
+#include "src/rocm/elemwise/opr_impl.h"
+#include "src/rocm/eye/opr_impl.h"
+#include "src/rocm/pooling/opr_impl.h"
+#include "src/rocm/reduce/opr_impl.h"
+#include "src/rocm/type_cvt/opr_impl.h"
+#include "src/rocm/add_update/opr_impl.h"
+#include "src/rocm/matrix_mul/opr_impl.h"
+#include "src/rocm/batched_matrix_mul/opr_impl.h"
+#include "src/rocm/indexing_one_hot/opr_impl.h"
+#include "src/rocm/rng/opr_impl.h"
+#include "src/rocm/relayout/opr_impl.h"
+#include "src/rocm/powc/opr_impl.h"
+#include "src/rocm/indexing_multi_axis_vec/opr_impl.h"
+#include "src/rocm/linspace/opr_impl.h"
+#include "src/rocm/argmxx/opr_impl.h"
+#include "src/rocm/sleep/opr_impl.h"
+
+#include
+
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+#define MIOPEN_VERSION_STR \
+    STR(MIOPEN_VERSION_MAJOR) \
+    "." STR(MIOPEN_VERSION_MINOR) "." STR(MIOPEN_VERSION_PATCH)
+
+#pragma message "compile with MIOpen " MIOPEN_VERSION_STR " "
+
+#undef STR
+#undef STR_HELPER
+
+namespace megdnn {
+std::unique_ptr<Handle> Handle::make_rocm_handle(megcoreComputingHandle_t computing_handle) {
+    return std::make_unique<rocm::HandleImpl>(computing_handle);
+}
+template <typename Opr>
+std::unique_ptr<Opr> Handle::create_rocm_operator() {
+    return static_cast<rocm::HandleImpl*>(this)->create_operator<Opr>();
+}
+#define INST(opr) \
+    template std::unique_ptr<opr> Handle::create_rocm_operator<opr>();
+MEGDNN_FOREACH_OPR_CLASS(INST)
+#undef INST
+}
+
+namespace megdnn {
+namespace rocm {
+
+HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle)
+        : HandleImplHelper(comp_handle, HandleType::ROCM) {
+    // Get megcore device handle
+    megcoreDeviceHandle_t dev_handle;
+    megcoreGetDeviceHandle(comp_handle, &dev_handle);
+    int dev_id;
+    megcoreGetDeviceID(dev_handle, &dev_id);
+    if (dev_id < 0) {
+        hip_check(hipGetDevice(&dev_id));
+    }
+    m_device_id = dev_id;
+    hip_check(hipGetDeviceProperties(&m_device_prop, dev_id));
+    // Get stream from MegCore computing handle.
+    //!
no version check + megcore::getROCMContext(comp_handle, &m_megcore_context); + rocblas_check(rocblas_create_handle(&m_rocblas_handle)); + //! must call miopenCreateWithStream() to create miopen handle, then the + //! rocblas_handle of miopen will set to be the same stream , otherwise + //! miopen create rocblas_handle with default stream + miopen_check(miopenCreateWithStream(&m_miopen_handle, stream())); + + // Set stream for miopen and rocblas handles. + rocblas_check(rocblas_set_stream(m_rocblas_handle, stream())); + + // Note that all rocblas scalars (alpha, beta) and scalar results such as + // dot output resides at device side. + rocblas_check(rocblas_set_pointer_mode(m_rocblas_handle, + rocblas_pointer_mode_device)); + + // init const scalars + hip_check(hipMalloc(&m_const_scalars, sizeof(ConstScalars))); + ConstScalars const_scalars_val; + const_scalars_val.init(); + hip_check(hipMemcpyAsync(m_const_scalars, &const_scalars_val, + sizeof(ConstScalars), hipMemcpyHostToDevice, + stream())); + hip_check(hipStreamSynchronize(stream())); +} + +HandleImpl::~HandleImpl() noexcept { + miopen_check(miopenDestroy(m_miopen_handle)); + rocblas_check(rocblas_destroy_handle(m_rocblas_handle)); + hip_check(hipFree(m_const_scalars)); +} + +void HandleImpl::ConstScalars::init() { +#if !MEGDNN_DISABLE_FLOAT16 + f16[0].megdnn_x = 0; + f16[1].megdnn_x = 1; +#endif + f32[0] = 0; + f32[1] = 1; + i32[0] = 0; + i32[1] = 1; +} + +template +std::unique_ptr HandleImpl::create_operator() { + megdnn_throw("unsupported rocm opr"); + return nullptr; +} + +size_t HandleImpl::alignment_requirement() const { + auto&& prop = m_device_prop; + MEGDNN_MARK_USED_VAR(prop); + //! for now, texture functions are not supported. + return 1u; +} + +bool HandleImpl::check_cross_dev_copy_constraint(const TensorLayout& src) { + // is contiguous or can be hold by + // relayout::param::try_copy_2d/try_copy_last_contig + return src.is_contiguous() || src.stride[src.ndim - 1] == 1; +} + +MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionBackwardData); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionBackwardFilter); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(ElemwiseForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(Eye); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(ChecksumForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(PoolingForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(PoolingBackward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(ReduceForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(TypeCvt); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(AddUpdateForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(MatrixMulForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(BatchedMatrixMulForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(IndexingOneHotForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(IndexingSetOneHotForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(UniformRNG); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(GaussianRNG); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(RelayoutForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(PowC); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(IndexingMultiAxisVec); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(IndexingSetMultiAxisVec); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(IndexingIncrMultiAxisVec); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(Linspace); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(ArgmaxForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(ArgminForward); +MEGDNN_SPECIALIZE_CREATE_OPERATOR(SleepForward); + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic ignored "-Winstantiation-after-specialization" 
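+// [annotation, not in the original patch] the pragma above is presumably
+// needed because MEGDNN_SPECIALIZE_CREATE_OPERATOR has already specialized
+// create_operator() for the oprs implemented on ROCm, while the FOREACH
+// below instantiates the generic template (which throws "unsupported rocm
+// opr") for every remaining opr class; clang warns when an explicit
+// instantiation follows a specialization.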
+MEGDNN_FOREACH_OPR_CLASS(MEGDNN_INST_CREATE_OPERATOR) +#pragma GCC diagnostic pop + +} // namespace rocm +} // namespace megdnn + +MEGDNN_VERSION_SYMBOL(HIP, HIP_VERSION); +MEGDNN_VERSION_SYMBOL3(MIOPEN, MIOPEN_VERSION_MAJOR, MIOPEN_VERSION_MINOR, + MIOPEN_VERSION_PATCH); +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/handle.h b/dnn/src/rocm/handle.h new file mode 100644 index 00000000..dbd0a2cd --- /dev/null +++ b/dnn/src/rocm/handle.h @@ -0,0 +1,125 @@ +/** + * \file dnn/src/rocm/handle.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#pragma once +#include "megcore_rocm.h" +#include "megdnn/basic_types.h" +#include "megdnn/handle.h" +#include "megdnn/oprs/general.h" + +#include "src/common/handle_impl.h" +#include "src/common/utils.h" +#include "src/rocm/miopen_with_check.h" + +#include +#include +#include +#include + +namespace megdnn { +namespace rocm { + +class HandleImpl : public HandleImplHelper { +public: + HandleImpl(megcoreComputingHandle_t computing_handle); + ~HandleImpl() noexcept; + + size_t alignment_requirement() const override; + + bool check_cross_dev_copy_constraint(const TensorLayout& src) override; + + const hipDeviceProp_t& device_prop() const { return m_device_prop; } + + template + std::unique_ptr create_operator(); + + const megcore::ROCMContext& megcore_context() const { + return m_megcore_context; + } + + bool enable_miopen_algo_search() const { + return megcore::ROCMContext::enable_miopen_algo_search(); + } + + void enable_miopen_algo_search(bool enable_algo_search) { + megcore::ROCMContext::enable_miopen_algo_search(enable_algo_search); + } + + int device_id() const { return m_device_id; } + + hipStream_t stream() const { return megcore_context().stream; } + miopenHandle_t miopen_handle() { return m_miopen_handle; } + rocblas_handle get_rocblas_handle() { return m_rocblas_handle; } + dt_float32* zero_device() { return &m_const_scalars->f32[0]; } + dt_float32* one_device() { return &m_const_scalars->f32[1]; } +#if !MEGDNN_DISABLE_FLOAT16 + __half* zero_device_h() { return &m_const_scalars->f16[0].hip_x; } + __half* one_device_h() { return &m_const_scalars->f16[1].hip_x; } +#endif + dt_int32* zero_device_i32() { return &m_const_scalars->i32[0]; } + dt_int32* one_device_i32() { return &m_const_scalars->i32[1]; } + + //! global matmul opr + MatrixMul* matmul_opr() override final { + return get_helper_opr(this); + } + + //! global matmul opr with first operand transposed + MatrixMul* matmul_aT_opr() override final { + return get_helper_opr(this, {true, false}); + } + + //! global matmul opr with second operand transposed + MatrixMul* matmul_bT_opr() override final { + return get_helper_opr(this, {false, true}); + } + + //! global relayout opr + Relayout* relayout_opr() override final { + return get_helper_opr(this); + } + + BatchedMatrixMulForward* batched_matrix_mul() { + return get_helper_opr(this); + } + +private: + int m_device_id; + //! MegDNN handle does not manage the lifetime of HIP stream. 
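+    //! [annotation, not in the original patch] the stream lives in the
+    //! megcore::ROCMContext owned by the computing handle (see
+    //! megcore_rocm.h); this class only borrows it, which is presumably why
+    //! ~HandleImpl() destroys the MIOpen/rocBLAS handles but never calls
+    //! hipStreamDestroy.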
+ megcore::ROCMContext m_megcore_context; + + miopenHandle_t m_miopen_handle; + rocblas_handle m_rocblas_handle; + + hipDeviceProp_t m_device_prop; + + struct ConstScalars { +#if !MEGDNN_DISABLE_FLOAT16 + union FP16 { + __half hip_x; + dt_float16 megdnn_x; + FP16() {} + }; + static_assert(sizeof(FP16) == 2, "bad FP16 size"); + FP16 f16[2]; +#endif + dt_float32 f32[2]; + dt_int32 i32[2]; + void init(); + }; + + //! device ptr to const scalars + ConstScalars* m_const_scalars; +}; + +} // namespace rocm +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/indexing_multi_axis_vec/kern.h.hip b/dnn/src/rocm/indexing_multi_axis_vec/kern.h.hip new file mode 100644 index 00000000..bee63e22 --- /dev/null +++ b/dnn/src/rocm/indexing_multi_axis_vec/kern.h.hip @@ -0,0 +1,95 @@ +/** + * \file src/rocm/indexing_multi_axis_vec/kern.h.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + */ + +#pragma once + +#include "megdnn/arch.h" +#include "src/rocm/int_fastdiv.h.hip" +#include "src/rocm/error_info.h.hip" + +namespace megdnn { +namespace rocm { +namespace indexing_multi_axis_vec { + + //! AxisIndexer equiv in kernel + struct KAxisIndexer { + int stride; + const int *ptr; + }; + + //! param for gen_offset_base + template + struct GenOffsetBaseParam { + uint32_t size; //!< number of outputs; also size of each index + int *output; //!< output ptr + KAxisIndexer indexer[nidx]; + uint32_t data_shape[nidx]; + int data_stride[nidx]; + + void* error_tracker; + megcore::AsyncErrorInfo* error_info; + }; + + //! tensor layout for fast offset computing + template + struct FastLayout { + int stride[ndim]; +#ifdef WIN32 + Uint32Fastdiv shape[ndim]; +#else + Uint32Fastdiv shape[ndim - 1]; +#endif + }; + + //! param for apply_opr + template + struct ApplyOprParam { + uint32_t tot_size; //!< total output size + + //! offset array generated by gen_offset_base for first output axis + const int *offset_base; + ctype *data, *value; + + int idx_axis; + + int value_stride; + + //! iterate on value, with strides from corresponding axes on data + FastLayout value_ly_on_data; + }; + + //! generate offset bases for first axis in the output + template + void gen_offset_base(const GenOffsetBaseParam ¶m, + hipStream_t stream); + + struct OprAtomicIncr { +#if MEGDNN_CC_CUDA + template + __device__ static void apply(ctype &data, ctype value) { + atomicAdd(&data, value); + } +#endif + }; + + /*! + * \brief forward kernel: copy data to value + * \tparam ndim numer of axes except axis_0 in data, + * range from 0 to max_ndim - 1 + */ + template + void apply_opr(const ApplyOprParam ¶m, + hipStream_t stream); + +} // namespace indexing_multi_axis_vec +} // namespace rocm +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen + diff --git a/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_fwd.cpp.hip b/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_fwd.cpp.hip new file mode 100644 index 00000000..7de02bfc --- /dev/null +++ b/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_fwd.cpp.hip @@ -0,0 +1,17 @@ +/** + * \file src/rocm/indexing_multi_axis_vec/kern_apply_opr_fwd.cpp.hip + * + * This file is part of MegDNN, a deep neural network run-time library + * developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
diff --git a/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_fwd.cpp.hip b/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_fwd.cpp.hip
new file mode 100644
index 00000000..7de02bfc
--- /dev/null
+++ b/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_fwd.cpp.hip
@@ -0,0 +1,17 @@
+/**
+ * \file src/rocm/indexing_multi_axis_vec/kern_apply_opr_fwd.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "hip_header.h"
+#include "src/common/indexing_multi_axis_vec_kdef.h"
+#define KERN_APPLY_OPR_OPR ::megdnn::indexing_multi_axis_vec_kdef::OprFwd
+#include "./kern_apply_opr_impl.hipinl"
+
+// vim: ft=cuda syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_impl.hipinl b/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_impl.hipinl
new file mode 100644
index 00000000..54112725
--- /dev/null
+++ b/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_impl.hipinl
@@ -0,0 +1,83 @@
+/**
+ * \file src/rocm/indexing_multi_axis_vec/kern_apply_opr_impl.hipinl
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+
+#ifndef KERN_APPLY_OPR_OPR
+#error "must define KERN_APPLY_OPR_OPR"
+#endif
+
+#include "src/rocm/utils.h.hip"
+#include "./kern.h.hip"
+#include "megdnn/internal/defs.h"
+#include "megdnn/dtype.h"
+
+using namespace megdnn;
+using namespace rocm;
+using namespace indexing_multi_axis_vec;
+
+namespace {
+    template <typename ctype, int ndim, class Opr>
+    __global__ void kapply_opr(ApplyOprParam<ctype, ndim> param) {
+        uint32_t oidx = threadIdx.x + blockDim.x * blockIdx.x;
+        if (oidx < param.tot_size) {
+            int offset = 0, coidx = oidx;
+            int all_ax_idx[ndim];
+#pragma unroll
+            for (int i = ndim - 1; i >= 0; -- i) {
+                int next_coidx, ax_idx;
+                if (i) {
+                    next_coidx = coidx / param.value_ly_on_data.shape[i - 1];
+                    ax_idx =
+                        coidx -
+                        (next_coidx *
+                         param.value_ly_on_data.shape[i - 1].divisor());
+                    coidx = next_coidx;
+                } else {
+                    ax_idx = coidx;
+                }
+                offset += param.value_ly_on_data.stride[i] * ax_idx;
+                all_ax_idx[i] = ax_idx;
+            }
+            offset += param.offset_base[all_ax_idx[param.idx_axis]];
+            Opr::apply(
+                    param.data[offset],
+                    param.value[oidx * param.value_stride]);
+        }
+    }
+}
+
+template <typename ctype, int ndim, class Opr>
+void indexing_multi_axis_vec::apply_opr(
+        const ApplyOprParam<ctype, ndim> &param, hipStream_t stream) {
+    void (*kptr)(ApplyOprParam<ctype, ndim>) = kapply_opr<ctype, ndim, Opr>;
+    int bsize = 256;
+    hipLaunchKernelGGL(kptr,
+            DIVUP(param.tot_size, bsize), bsize, 0, stream,
+            param);
+}
+
+namespace megdnn {
+namespace rocm {
+namespace indexing_multi_axis_vec {
+
+#define INST(_ndim, _ctype) \
+    template void apply_opr<_ctype, _ndim, KERN_APPLY_OPR_OPR> \
+        (const ApplyOprParam<_ctype, _ndim>&, hipStream_t);
+#define cb(_dtype) \
+    MEGDNN_FOREACH_TENSOR_NDIM(INST, DTypeTrait<_dtype>::ctype)
+    MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+#undef cb
+#undef INST
+
+} // namespace indexing_multi_axis_vec
+} // namespace rocm
+} // namespace megdnn
+
+// vim: ft=cuda syntax=cpp.doxygen
+
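Note: the INST/cb macro pair above instantiates apply_opr for every (dtype, ndim) combination supported by MegDNN. For illustration, a single expansion under OprFwd reads roughly:

    // Hypothetical single expansion of the instantiation macros above,
    // for _ctype = dt_float32, _ndim = 3:
    template void megdnn::rocm::indexing_multi_axis_vec::apply_opr<
            megdnn::dt_float32, 3, megdnn::indexing_multi_axis_vec_kdef::OprFwd>(
            const ApplyOprParam<megdnn::dt_float32, 3>&, hipStream_t);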
diff --git a/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_incr.cpp.hip b/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_incr.cpp.hip
new file mode 100644
index 00000000..88419db7
--- /dev/null
+++ b/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_incr.cpp.hip
@@ -0,0 +1,41 @@
+/**
+ * \file src/rocm/indexing_multi_axis_vec/kern_apply_opr_incr.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "hip_header.h"
+#include "megdnn/dtype.h"
+
+//! dtypes without a hardware atomicAdd get stub overloads that trap and
+//! crash the kernel if they are ever reached at runtime
+#if !MEGDNN_DISABLE_FLOAT16
+__device__ void atomicAdd(megdnn::dt_float16 *, megdnn::dt_float16) {
+    asm("s_trap 2;");
+    ((int*)0)[0] = 1;
+}
+#endif
+
+__device__ void atomicAdd(megdnn::dt_int8 *, megdnn::dt_int8) {
+    asm("s_trap 2;");
+    ((int*)0)[0] = 1;
+}
+
+__device__ void atomicAdd(megdnn::dt_uint8 *, megdnn::dt_uint8) {
+    asm("s_trap 2;");
+    ((int*)0)[0] = 1;
+}
+
+__device__ void atomicAdd(megdnn::dt_int16 *, megdnn::dt_int16) {
+    asm("s_trap 2;");
+    ((int*)0)[0] = 1;
+}
+
+#define KERN_APPLY_OPR_OPR \
+    ::megdnn::rocm::indexing_multi_axis_vec::OprAtomicIncr
+#include "./kern_apply_opr_impl.hipinl"
+
+// vim: ft=cuda syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_set.cpp.hip b/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_set.cpp.hip
new file mode 100644
index 00000000..c7e75ec1
--- /dev/null
+++ b/dnn/src/rocm/indexing_multi_axis_vec/kern_apply_opr_set.cpp.hip
@@ -0,0 +1,17 @@
+/**
+ * \file src/rocm/indexing_multi_axis_vec/kern_apply_opr_set.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "hip_header.h"
+#include "src/common/indexing_multi_axis_vec_kdef.h"
+#define KERN_APPLY_OPR_OPR ::megdnn::indexing_multi_axis_vec_kdef::OprSet
+#include "./kern_apply_opr_impl.hipinl"
+
+// vim: ft=cuda syntax=cpp.doxygen
+
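Note: the three thin .cpp.hip wrappers above share one kernel body through the KERN_APPLY_OPR_OPR macro: each translation unit picks an apply policy, then includes the common .hipinl, which instantiates the kernels for that policy. The pattern, reduced to a skeleton (names illustrative, not part of the diff):

    // A policy supplies a static apply(); the shared .hipinl does the rest.
    struct OprExample {
        template <typename ctype>
        __device__ static void apply(ctype& data, ctype value) { data = value; }
    };
    // wrapper TU:
    //   #define KERN_APPLY_OPR_OPR OprExample
    //   #include "./kern_apply_opr_impl.hipinl"  // instantiates kernels for OprExample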
diff --git a/dnn/src/rocm/indexing_multi_axis_vec/kern_gen_offset_base.cpp.hip b/dnn/src/rocm/indexing_multi_axis_vec/kern_gen_offset_base.cpp.hip
new file mode 100644
index 00000000..e15181f4
--- /dev/null
+++ b/dnn/src/rocm/indexing_multi_axis_vec/kern_gen_offset_base.cpp.hip
@@ -0,0 +1,71 @@
+/**
+ * \file src/rocm/indexing_multi_axis_vec/kern_gen_offset_base.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "hip_header.h"
+#include "./kern.h.hip"
+#include "megdnn/internal/defs.h"
+#include "src/rocm/utils.h.hip"
+
+using namespace megdnn;
+using namespace rocm;
+using namespace indexing_multi_axis_vec;
+
+namespace {
+    template <int nidx>
+    __global__ void kgen_offset_base(GenOffsetBaseParam<nidx> param) {
+        int oidx = threadIdx.x + blockDim.x * blockIdx.x;
+        if (oidx < param.size) {
+            int offset = 0;
+#pragma unroll
+            for (int i = 0; i < nidx; ++ i) {
+                int data_idx = param.indexer[i].ptr[
+                    param.indexer[i].stride * oidx];
+                data_idx += (data_idx < 0 ? param.data_shape[i] : 0);
+                // cast to uint32 to handle both negative and overflow
+                if (static_cast<uint32_t>(data_idx) >= param.data_shape[i]) {
+                    set_async_error_info(param.error_info, param.error_tracker,
+                            "invalid advanced indexing: "
+                            "indexer=%d idx=%d shape=%d",
+                            i, data_idx, param.data_shape[i]);
+                    data_idx = 0;
+                }
+                offset += data_idx * param.data_stride[i];
+            }
+            param.output[oidx] = offset;
+        }
+    }
+}
+
+namespace megdnn {
+namespace rocm {
+namespace indexing_multi_axis_vec {
+
+#define INST(_n) \
+    template void gen_offset_base<_n>( \
+            const GenOffsetBaseParam<_n> &, hipStream_t);
+    MEGDNN_FOREACH_TENSOR_NDIM(INST)
+#undef INST
+
+} // namespace indexing_multi_axis_vec
+} // namespace rocm
+} // namespace megdnn
+
+template <int nidx>
+void indexing_multi_axis_vec::gen_offset_base(
+        const GenOffsetBaseParam<nidx> &param, hipStream_t stream) {
+    void (*kptr)(GenOffsetBaseParam<nidx>) = kgen_offset_base<nidx>;
+    int bsize = 256;
+    hipLaunchKernelGGL(kptr,
+            DIVUP(param.size, bsize), bsize, 0, stream,
+            param);
+}
+
+// vim: ft=cuda syntax=cpp.doxygen
+
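Note: the bounds check in kgen_offset_base uses a single unsigned comparison to catch both out-of-range and still-negative indices after Python-style wraparound. A host-side mirror of the trick:

    #include <cassert>
    #include <cstdint>

    // -1 maps to axis_size - 1; anything that remains out of [0, axis_size)
    // after the wraparound is caught by one unsigned comparison.
    int normalize_index(int idx, uint32_t axis_size) {
        idx += (idx < 0 ? axis_size : 0);
        if (static_cast<uint32_t>(idx) >= axis_size)
            return -1;  // the kernel reports an async error here instead
        return idx;
    }

    int main() {
        assert(normalize_index(-1, 4) == 3);
        assert(normalize_index(3, 4) == 3);
        assert(normalize_index(4, 4) == -1);   // overflow
        assert(normalize_index(-5, 4) == -1);  // still negative after wraparound
    }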
diff --git a/dnn/src/rocm/indexing_multi_axis_vec/opr_impl.cpp b/dnn/src/rocm/indexing_multi_axis_vec/opr_impl.cpp
new file mode 100644
index 00000000..89f3c381
--- /dev/null
+++ b/dnn/src/rocm/indexing_multi_axis_vec/opr_impl.cpp
@@ -0,0 +1,212 @@
+/**
+ * \file dnn/src/rocm/indexing_multi_axis_vec/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "src/rocm/utils.h"
+#include "./opr_impl.h"
+#include "./kern.h.hip"
+
+#include "src/common/indexing_multi_axis_vec_kdef.h"
+
+// (template parameter lists and arguments in this file were lost in
+// extraction; restored from kern.h.hip and the kdef opr types)
+using namespace megdnn;
+using namespace rocm;
+using namespace indexing_multi_axis_vec;
+
+namespace {
+    class ExecImplHelper {
+        template <int nidx>
+        void dispatch_gen_offset_base_nidx();
+
+        void dispatch_gen_offset_base();
+    protected:
+        using IndexDesc = IndexingMultiAxisVec::IndexDesc;
+        using ExecInfo = IndexingMultiAxisVec::ExecInfo;
+
+        hipStream_t m_stream;
+        const TensorND * const m_data;
+        const TensorND * const m_value;
+        const IndexDesc * const m_index;
+        const ExecInfo* const m_exec_info;
+        int * const m_offset_base;
+        TensorLayout m_value_layout_on_data;
+        size_t m_idx_axis;
+        int m_value_stride;
+
+    public:
+        ExecImplHelper(const TensorND &data, const TensorND &value,
+                const IndexDesc &index, const Workspace &workspace,
+                const ExecInfo &exec_info, hipStream_t stream);
+    };
+
+    template <class Opr>
+    class ExecImpl : public ExecImplHelper {
+        void dispatch_exec();
+
+        template <typename ctype>
+        void dispatch_exec_ctype();
+
+        template <typename ctype, int ndim>
+        void dispatch_exec_ctype_ndim();
+
+    public:
+        using ExecImplHelper::ExecImplHelper;
+
+        void operator() () {
+            dispatch_exec();
+            after_kernel_launch();
+        }
+    };
+} // anonymous namespace
+
+ExecImplHelper::ExecImplHelper(const TensorND &data, const TensorND &value,
+        const IndexDesc &index, const Workspace &workspace,
+        const ExecInfo &exec_info, hipStream_t stream):
+    m_stream{stream}, m_data{&data}, m_value{&value}, m_index{&index},
+    m_exec_info{&exec_info}, m_offset_base{workspace.ptr<int>()}
+{
+    safe_size_in_kern(data.layout.total_nr_elems());
+    dispatch_gen_offset_base();
+
+    std::tie(m_value_layout_on_data, m_idx_axis) =
+        IndexingMultiAxisVec::get_value_iter_optimized_layout(
+                data.layout, value.layout, index, exec_info.idx_axis);
+    m_value_stride = exec_info.value_stride;
+}
+
+template <int nidx>
+void ExecImplHelper::dispatch_gen_offset_base_nidx() {
+    GenOffsetBaseParam<nidx> param;
+    param.size = m_value->layout.shape[m_exec_info->idx_axis];
+    param.output = m_offset_base;
+    param.error_tracker = m_exec_info->error_tracker;
+    param.error_info = m_exec_info->error_info;
+    for (int i = 0; i < nidx; ++ i) {
+        auto &&dst = param.indexer[i];
+        auto &&src = m_index->operator[](i);
+        megdnn_assert(src.vec.layout.ndim == 1);
+        dst.stride = src.vec.layout.stride[0];
+        if (src.vec.layout.shape[0] == 1) {
+            dst.stride = 0;
+        }
+        dst.ptr = src.vec.ptr<int>();
+        param.data_shape[i] = m_data->layout.shape[src.axis];
+        param.data_stride[i] = m_data->layout.stride[src.axis];
+    }
+    gen_offset_base(param, m_stream);
+}
+
+void ExecImplHelper::dispatch_gen_offset_base() {
+    switch(m_index->size()) {
+#define cb(_n) case _n: return dispatch_gen_offset_base_nidx<_n>();
+        MEGDNN_FOREACH_TENSOR_NDIM(cb)
+#undef cb
+    }
+    megdnn_throw("bad index size");
+}
+
+template <class Opr>
+void ExecImpl<Opr>::dispatch_exec() {
+    switch (m_data->layout.dtype.enumv()) {
+#define cb(_dtype) \
+        case DTypeTrait<_dtype>::enumv: \
+            return dispatch_exec_ctype<DTypeTrait<_dtype>::ctype>();
+        MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+#undef cb
+        default:
+            megdnn_throw("bad dtype");
+    }
+}
+
+template <class Opr>
+template <typename ctype>
+void ExecImpl<Opr>::dispatch_exec_ctype() {
+    switch (m_value_layout_on_data.ndim) {
+#define cb(_n) \
+        case _n: return dispatch_exec_ctype_ndim<ctype, _n>();
+        MEGDNN_FOREACH_TENSOR_NDIM(cb)
+#undef cb
+        default:
+            megdnn_throw("bad data ndim");
+    }
+}
+
+template <class Opr>
+template <typename ctype, int ndim>
+void ExecImpl<Opr>::dispatch_exec_ctype_ndim() {
+    ApplyOprParam<ctype, ndim> param;
+    param.tot_size = safe_size_in_kern(m_value->layout.total_nr_elems());
+    param.offset_base = m_offset_base;
+    param.data = m_data->ptr<ctype>();
+    param.value = m_value->ptr<ctype>();
+    param.idx_axis = m_idx_axis;
+    param.value_stride = m_value_stride;
+    for (int i = 0; i < ndim; ++ i) {
+        param.value_ly_on_data.stride[i] = m_value_layout_on_data.stride[i];
+        if (i) {
+            param.value_ly_on_data.shape[i - 1] =
+                m_value_layout_on_data.shape[i];
+        }
+    }
+    apply_opr<ctype, ndim, Opr>(param, m_stream);
+}
+
+size_t IndexingMultiAxisVecImpl::get_workspace_in_bytes(size_t dst_idx_size) {
+    return dst_idx_size * sizeof(int);
+}
+
+void IndexingMultiAxisVecImpl::exec(
+        _megdnn_tensor_in src, const IndexDesc &index,
+        _megdnn_tensor_out dst,
+        _megdnn_workspace workspace) {
+    auto info = check_exec(src.layout, index, dst.layout, workspace.size);
+    info.error_tracker = m_error_tracker;
+    info.error_info = async_error_info(handle());
+    ExecImpl<indexing_multi_axis_vec_kdef::OprFwd>{
+        src, dst, index, workspace, info, hip_stream(handle())}();
+}
+
+size_t IndexingSetMultiAxisVecImpl::get_workspace_in_bytes(
+        size_t value_idx_size) {
+    return value_idx_size * sizeof(int);
+}
+
+void IndexingSetMultiAxisVecImpl::exec(
+        _megdnn_tensor_inout data, _megdnn_tensor_in value,
+        const IndexDesc &index, _megdnn_workspace workspace) {
+    auto info = check_exec(data.layout, value.layout, index, workspace.size);
+    info.error_tracker = m_error_tracker;
+    info.error_info = async_error_info(handle());
+    ExecImpl<indexing_multi_axis_vec_kdef::OprSet>{
+        data, value, index, workspace, info, hip_stream(handle())}();
+}
+
+size_t IndexingIncrMultiAxisVecImpl::get_workspace_in_bytes(
+        size_t value_idx_size) {
+    return value_idx_size * sizeof(int);
+}
+
+void IndexingIncrMultiAxisVecImpl::exec(
+        _megdnn_tensor_inout data, _megdnn_tensor_in value,
+        const IndexDesc &index, _megdnn_workspace workspace) {
+    MEGDNN_INC_FLOAT16(
+            megdnn_assert(data.layout.dtype != dtype::Float16(),
+                "float16 incr on hip currently not supported"));
+    auto info = check_exec(data.layout, value.layout, index, workspace.size);
+    info.error_tracker = m_error_tracker;
+    info.error_info = async_error_info(handle());
+    ExecImpl<OprAtomicIncr>{data, value, index, workspace, info,
+        hip_stream(handle())}();
+}
+
+// vim: syntax=cpp.doxygen
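Note: dispatch_exec narrows two runtime values (the dtype enum and the optimized layout's ndim) into compile-time template parameters via nested switch/macro expansion. One branch of the expanded chain, spelled out for clarity:

    // Hypothetical single expanded branch, for Float32 data with
    // value_ly_on_data.ndim == 2:
    //
    //   case DTypeTrait<dtype::Float32>::enumv:
    //       return dispatch_exec_ctype<dt_float32>();
    //   ...
    //   case 2: return dispatch_exec_ctype_ndim<dt_float32, 2>();
    //   ...
    //   apply_opr<dt_float32, 2, Opr>(param, m_stream);
    //   // which launches kapply_opr<dt_float32, 2, Opr>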
diff --git a/dnn/src/rocm/indexing_multi_axis_vec/opr_impl.h b/dnn/src/rocm/indexing_multi_axis_vec/opr_impl.h
new file mode 100644
index 00000000..0c67f9a2
--- /dev/null
+++ b/dnn/src/rocm/indexing_multi_axis_vec/opr_impl.h
@@ -0,0 +1,73 @@
+/**
+ * \file dnn/src/rocm/indexing_multi_axis_vec/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "megdnn/oprs.h"
+
+namespace megdnn {
+namespace rocm {
+
+    class IndexingMultiAxisVecImpl final: public IndexingMultiAxisVec {
+        void* m_error_tracker = nullptr;
+
+        public:
+            using IndexingMultiAxisVec::IndexingMultiAxisVec;
+
+            size_t get_workspace_in_bytes(size_t dst_idx_size) override;
+
+            void exec(_megdnn_tensor_in src, const IndexDesc &index,
+                    _megdnn_tensor_out dst,
+                    _megdnn_workspace workspace) override;
+
+            void set_error_tracker(void* tracker) override {
+                m_error_tracker = tracker;
+            }
+    };
+
+    class IndexingSetMultiAxisVecImpl final: public IndexingSetMultiAxisVec {
+        void* m_error_tracker = nullptr;
+
+        public:
+            using IndexingSetMultiAxisVec::IndexingSetMultiAxisVec;
+
+            size_t get_workspace_in_bytes(size_t dst_idx_size) override;
+
+            void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value,
+                    const IndexDesc &index,
+                    _megdnn_workspace workspace) override;
+
+            void set_error_tracker(void* tracker) override {
+                m_error_tracker = tracker;
+            }
+    };
+
+    class IndexingIncrMultiAxisVecImpl final: public IndexingIncrMultiAxisVec {
+        void* m_error_tracker = nullptr;
+
+        public:
+            using IndexingIncrMultiAxisVec::IndexingIncrMultiAxisVec;
+
+            size_t get_workspace_in_bytes(size_t dst_idx_size) override;
+
+            void exec(_megdnn_tensor_inout data, _megdnn_tensor_in value,
+                    const IndexDesc &index,
+                    _megdnn_workspace workspace) override;
+
+            void set_error_tracker(void* tracker) override {
+                m_error_tracker = tracker;
+            }
+    };
+} // namespace rocm
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/indexing_one_hot/indexing_one_hot.cpp.hip b/dnn/src/rocm/indexing_one_hot/indexing_one_hot.cpp.hip
new file mode 100644
index 00000000..769c38be
--- /dev/null
+++ b/dnn/src/rocm/indexing_one_hot/indexing_one_hot.cpp.hip
@@ -0,0 +1,34 @@
+/**
+ * \file src/rocm/indexing_one_hot/indexing_one_hot.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2016 Megvii Inc. All rights reserved.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "./indexing_one_hot.h.hip"
+#include "src/rocm/elemwise_helper.h.hip"
+
+namespace megdnn {
+namespace rocm {
+
+// (template arguments in the typedefs below were lost in extraction;
+// restored from the OpGet/OpSet definitions in indexing_one_hot.h.hip)
+#define cb(_dt) \
+    typedef indexing_one_hot::OpGet<DTypeTrait<dtype::_dt>::ctype, dt_int32> \
+        OpGet##_dt; \
+    typedef indexing_one_hot::OpSet<DTypeTrait<dtype::_dt>::ctype, dt_int32> \
+        OpSet##_dt; \
+    INST_RUN_ELEMWISE(OpGet##_dt, void, 0); \
+    INST_RUN_ELEMWISE(OpSet##_dt, void, 0);
+
+    MEGDNN_FOREACH_DTYPE_NAME(cb)
+    MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb)
+
+#undef cb
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: ft=cpp syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/indexing_one_hot/indexing_one_hot.h.hip b/dnn/src/rocm/indexing_one_hot/indexing_one_hot.h.hip
new file mode 100644
index 00000000..1358d7f7
--- /dev/null
+++ b/dnn/src/rocm/indexing_one_hot/indexing_one_hot.h.hip
@@ -0,0 +1,75 @@
+/**
+ * \file src/rocm/indexing_one_hot/indexing_one_hot.h.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+
+#pragma once
+
+#include "src/rocm/error_info.h.hip"
+#include "src/rocm/int_fastdiv.h.hip"
+
+namespace megdnn {
+namespace rocm {
+namespace indexing_one_hot {
+
+struct KernParam {
+    //! stride[axis], also prod(shape[axis+1:ndim])
+    Uint32Fastdiv shape_lo;
+    //! stride[axis-1]
+    uint32_t stride_hi;
+
+    //! max value that the user-provided index array may contain
+    uint32_t max_mid_index;
+    void* error_tracker;
+    AsyncErrorInfo* error_info;
+
+    template <typename idx_type>
+    __device__ uint32_t get_idx(uint32_t offset, const idx_type* idx) const {
+        uint32_t idx0, idx1, idx2;
+        idx0 = offset / shape_lo;
+        idx2 = offset - idx0 * shape_lo.divisor();
+        idx1 = idx[offset];
+        if (idx1 >= max_mid_index) {
+            set_async_error_info(error_info, error_tracker,
+                    "invalid IndexingOneHot: "
+                    "offset=%d idx0=%d indexer=%d idx2=%d",
+                    offset, idx0, idx1, idx2);
+            idx1 = 0;
+        }
+        return idx0 * stride_hi + idx1 * shape_lo.divisor() + idx2;
+    }
+};
+
+template <typename data_type, typename idx_type>
+struct OpGet {
+    const data_type* m_src;
+    const idx_type* m_idx;
+    data_type* m_dst;
+    KernParam m_param;
+
+    __device__ void operator()(uint32_t offset) {
+        m_dst[offset] = m_src[m_param.get_idx(offset, m_idx)];
+    }
+};
+
+template <typename data_type, typename idx_type>
+struct OpSet {
+    data_type* m_data;
+    const idx_type* m_idx;
+    const data_type* m_sub;
+    KernParam m_param;
+
+    __device__ void operator()(uint32_t offset) {
+        m_data[m_param.get_idx(offset, m_idx)] = m_sub[offset];
+    }
+};
+
+} // namespace indexing_one_hot
+} // namespace rocm
+} // namespace megdnn
+
+// vim: ft=cpp syntax=cpp.doxygen
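Note: KernParam::get_idx splits a flat output offset into "everything above the axis" (idx0) and "everything below" (idx2), then splices the looked-up index in between. A worked host-side example, assuming a source of shape (2, 3, 4) and axis = 1 (so shape_lo = 4, stride_hi = 12, max_mid_index = 3):

    #include <cassert>
    #include <cstdint>

    // Host mirror of KernParam::get_idx for src shape (2, 3, 4), axis = 1.
    uint32_t get_idx(uint32_t offset, uint32_t mid_idx) {
        const uint32_t shape_lo = 4, stride_hi = 12;
        uint32_t idx0 = offset / shape_lo;       // Uint32Fastdiv in the kernel
        uint32_t idx2 = offset - idx0 * shape_lo;
        return idx0 * stride_hi + mid_idx * shape_lo + idx2;
    }

    int main() {
        // dst offset 6 = (n=1, w=2); picking mid index 2 selects src[1][2][2]
        assert(get_idx(6, 2) == 1 * 12 + 2 * 4 + 2);
    }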
diff --git a/dnn/src/rocm/indexing_one_hot/opr_impl.cpp b/dnn/src/rocm/indexing_one_hot/opr_impl.cpp
new file mode 100644
index 00000000..f64efebe
--- /dev/null
+++ b/dnn/src/rocm/indexing_one_hot/opr_impl.cpp
@@ -0,0 +1,91 @@
+/**
+ * \file dnn/src/rocm/indexing_one_hot/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "./opr_impl.h"
+#include "src/rocm/indexing_one_hot/indexing_one_hot.h.hip"
+
+#include "src/rocm/utils.h"
+#include "src/rocm/elemwise_helper.h.hip"
+
+using namespace megdnn;
+using namespace rocm;
+using namespace indexing_one_hot;
+
+namespace {
+
+    KernParam make_kern_param(const TensorLayout &layout, size_t axis) {
+        KernParam ret;
+        memset(&ret, 0, sizeof(ret));
+        ret.shape_lo = layout.stride[axis];
+        ret.stride_hi = axis > 0 ? layout.stride[axis - 1] : 1;
+        ret.max_mid_index = layout[axis];
+        return ret;
+    }
+
+} // anonymous namespace
+
+// (template arguments in the cb macros below were lost in extraction and
+// have been restored from the INST_RUN_ELEMWISE instantiations)
+void IndexingOneHotForwardImpl::exec(
+        _megdnn_tensor_in src, _megdnn_tensor_in index,
+        _megdnn_tensor_out dst, _megdnn_workspace workspace) {
+    check_exec(src.layout, index.layout, dst.layout, workspace.size);
+    ElemwiseOpParamN<0> ele_param{dst.layout.total_nr_elems()};
+    auto kern_param = make_kern_param(src.layout, m_param.axis);
+    auto stream = hip_stream(handle());
+    kern_param.error_tracker = m_error_tracker;
+    kern_param.error_info = async_error_info(handle());
+
+#define cb(_dt) \
+    case DTypeTrait<_dt>::enumv: { \
+        using ctype = DTypeTrait<_dt>::ctype; \
+        using Op = OpGet<DTypeTrait<_dt>::ctype, dt_int32>; \
+        Op op{src.ptr<ctype>(), index.ptr<dt_int32>(), dst.ptr<ctype>(), \
+              kern_param}; \
+        return run_elemwise<Op, void>(ele_param, stream, op); \
+    }
+    switch (src.layout.dtype.enumv()) {
+        MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+        default:
+            megdnn_throw(megdnn_mangle("bad dtype"));
+    }
+#undef cb
+}
+
+void IndexingSetOneHotForwardImpl::exec(
+        _megdnn_tensor_inout data, _megdnn_tensor_in index,
+        _megdnn_tensor_in sub, _megdnn_workspace workspace) {
+    check_exec(data.layout, index.layout, sub.layout, workspace.size);
+
+    ElemwiseOpParamN<0> ele_param{sub.layout.total_nr_elems()};
+    auto kern_param = make_kern_param(data.layout, m_param.axis);
+    auto stream = hip_stream(handle());
+    kern_param.error_tracker = m_error_tracker;
+    kern_param.error_info = async_error_info(handle());
+
+#define cb(_dt) \
+    case DTypeTrait<_dt>::enumv: { \
+        using ctype = DTypeTrait<_dt>::ctype; \
+        using Op = OpSet<DTypeTrait<_dt>::ctype, dt_int32>; \
+        Op op{data.ptr<ctype>(), index.ptr<dt_int32>(), sub.ptr<ctype>(), \
+              kern_param}; \
+        return run_elemwise<Op, void>(ele_param, stream, op); \
+    }
+    switch (data.layout.dtype.enumv()) {
+        MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+        default:
+            megdnn_throw(megdnn_mangle("bad dtype"));
+    }
+#undef cb
+}
+
+// vim: syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/indexing_one_hot/opr_impl.h b/dnn/src/rocm/indexing_one_hot/opr_impl.h
new file mode 100644
index 00000000..d01daa3b
--- /dev/null
+++ b/dnn/src/rocm/indexing_one_hot/opr_impl.h
@@ -0,0 +1,57 @@
+/**
+ * \file dnn/src/rocm/indexing_one_hot/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "megdnn/oprs.h"
+
+namespace megdnn {
+namespace rocm {
+
+class IndexingOneHotForwardImpl final: public IndexingOneHotForward {
+    void* m_error_tracker = nullptr;
+    public:
+        using IndexingOneHotForward::IndexingOneHotForward;
+        void exec(_megdnn_tensor_in src, _megdnn_tensor_in index,
+                _megdnn_tensor_out dst, _megdnn_workspace workspace) override;
+        size_t get_workspace_in_bytes(const TensorLayout &,
+                const TensorLayout &,
+                const TensorLayout &) override {
+            return 0;
+        }
+
+        void set_error_tracker(void* tracker) override {
+            m_error_tracker = tracker;
+        }
+};
+
+class IndexingSetOneHotForwardImpl final: public IndexingSetOneHotForward {
+    void* m_error_tracker = nullptr;
+    public:
+        using IndexingSetOneHotForward::IndexingSetOneHotForward;
+        void exec(_megdnn_tensor_inout data, _megdnn_tensor_in index,
+                _megdnn_tensor_in sub, _megdnn_workspace workspace) override;
+        size_t get_workspace_in_bytes(const TensorLayout &,
+                const TensorLayout &,
+                const TensorLayout &) override {
+            return 0;
+        }
+
+        void set_error_tracker(void* tracker) override {
+            m_error_tracker = tracker;
+        }
+};
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/int_fastdiv.cpp b/dnn/src/rocm/int_fastdiv.cpp
new file mode 100644
index 00000000..fd26ad30
--- /dev/null
+++ b/dnn/src/rocm/int_fastdiv.cpp
@@ -0,0 +1,63 @@
+/**
+ * \file dnn/src/rocm/int_fastdiv.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include <cstring>  // (header name lost in extraction; restored for memset)
+#include "src/rocm/int_fastdiv.h.hip"
+
+namespace megdnn {
+namespace rocm {
+
+Uint32Fastdiv::Uint32Fastdiv() {
+    memset(this, 0, sizeof(Uint32Fastdiv));
+}
+
+Uint32Fastdiv& Uint32Fastdiv::operator=(uint32_t d) {
+    megdnn_assert(d);
+    m_divisor = d;
+    MEGDNN_CONSTEXPR uint32_t MAX_U32 = ~0u;
+    m_inc_dividend = 0;
+    m_divisor_is_not_1 = ~0u;
+    if (!(d & (d - 1))) {
+        // power of 2
+        m_mul = 1u << 31;
+        int p = 0;
+        while ((1u << p) < d)
+            ++p;
+        megdnn_assert((1u << p) == d);
+        m_shift = p ? p - 1 : 0;
+        if (d == 1)
+            m_divisor_is_not_1 = 0;
+        return *this;
+    }
+    auto n_bound = uint64_t(d / 2 + 1) * MAX_U32;
+    uint32_t shift = 32;
+    while ((1ull << shift) < n_bound)
+        ++shift;
+    uint64_t mdst = 1ull << shift;
+    int64_t delta = d - mdst % d;
+    m_mul = mdst / d + 1;
+    if ((uint64_t)delta > d / 2) {
+        delta -= d;
+        --m_mul;
+        m_inc_dividend = 1;
+    }
+    megdnn_assert((uint64_t)m_mul * d == mdst + delta);
+    megdnn_assert((uint64_t)std::abs(delta) * MAX_U32 < mdst);
+    m_shift = shift - 32;
+    return *this;
+}
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
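Note: operator= above computes a magic multiplier and shift such that x / d == ((x + inc) * m_mul) >> (32 + m_shift) for every dividend up to MAX_DIVIDEND. A quick brute-force check of that identity on the host, with the constants the algorithm produces for d = 7 worked out by hand (an illustrative sketch, not part of the diff):

    #include <cassert>
    #include <cstdint>

    // For d = 7 the algorithm above yields shift = 34, mdst = 2^34,
    // delta = 5 -> -2, so m_mul = 2^34/7 = 2454267026, inc = 1, m_shift = 2.
    int main() {
        const uint32_t m_mul = 2454267026u, shift = 2, inc = 1, d = 7;
        for (uint64_t x = 0; x < 1000000; ++x) {
            uint32_t hi32 = (uint32_t)(((x + inc) * m_mul) >> 32);
            assert((hi32 >> shift) == x / d);  // multiply-shift == true division
        }
    }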
diff --git a/dnn/src/rocm/int_fastdiv.h.hip b/dnn/src/rocm/int_fastdiv.h.hip
new file mode 100644
index 00000000..47ac27e8
--- /dev/null
+++ b/dnn/src/rocm/int_fastdiv.h.hip
@@ -0,0 +1,184 @@
+/**
+ * \file src/rocm/int_fastdiv.h.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \brief fast integer division for constant divisor
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+
+#pragma once
+
+#include "src/common/utils.cuh"
+#include "hip_header.h"
+
+// (the two standard headers originally included here were lost in
+// extraction; <stdint.h>, needed for the fixed-width types below, is
+// restored; the other is not recoverable from usage)
+#include <stdint.h>
+
+namespace megdnn {
+namespace rocm {
+
+/*!
+ * \brief fast division for uint32
+ */
+class Uint32Fastdiv {
+    uint32_t m_mul, m_divisor, m_divisor_is_not_1, m_inc_dividend, m_shift;
+
+public:
+    Uint32Fastdiv();
+
+    Uint32Fastdiv(uint32_t d) { operator=(d); }
+
+    //! set the divisor to be d
+    Uint32Fastdiv& operator=(uint32_t d);
+
+    //! caller must ensure that dividend would not exceed this number
+    static MEGDNN_CONSTEXPR uint32_t MAX_DIVIDEND = ~0u - 1;
+
+    __device__ __forceinline__ uint32_t divisor() const { return m_divisor; }
+
+    __device__ __forceinline__ uint32_t divide(uint32_t dividend) const {
+        uint32_t ans_for_one = dividend & ~m_divisor_is_not_1,
+                 dfix = dividend + m_inc_dividend,
+#if MEGDNN_CC_CUDA
+                 hi32 = __umulhi(dfix, m_mul),
+#else
+                 hi32 = ((uint64_t)dfix * m_mul) >> 32,
+#endif
+                 ans = hi32 >> m_shift;
+
+        return (ans & m_divisor_is_not_1) | ans_for_one;
+    }
+};
+
+static __forceinline__ __device__ uint32_t operator/(uint32_t a,
+                                                     const Uint32Fastdiv& d) {
+    return d.divide(a);
+}
+
+static __forceinline__ __device__ uint32_t operator%(uint32_t a,
+                                                     const Uint32Fastdiv& d) {
+    return a - d.divisor() * d.divide(a);
+}
+
+/*!
+ * \brief maintain (a + k * x) / b and (a + k * x) % b for x >= 0
+ * \tparam need_quotient whether quotient need to be maintained
+ */
+template <bool need_quotient>
+class StridedDivSeq;
+
+template <>
+class StridedDivSeq<false> {
+    Uint32Fastdiv m_b;
+
+    //! k % b
+    uint32_t m_kr;
+
+    //! current (a + k * x) % b
+    uint32_t m_r;
+
+public:
+    void host_init(uint32_t k, uint32_t b) {
+        m_b = b;
+        m_kr = k % b;
+    }
+
+    //! init to k == 0
+    __device__ __forceinline__ void device_init(uint32_t a) { m_r = a % m_b; }
+
+    //! perform x += 1
+    __device__ __forceinline__ void next() {
+        uint32_t b = m_b.divisor(), r1 = m_r + m_kr, carry_mask = (r1 < b) - 1;
+        m_r = r1 - (b & carry_mask);
+    }
+
+    //! current remainder
+    __device__ __forceinline__ uint32_t r() const { return m_r; }
+};
+
+template <>
+class StridedDivSeq<true> {
+    Uint32Fastdiv m_b;
+
+    //! k / b, k % b
+    uint32_t m_kq, m_kr;
+
+    //! current (a + k * x) / b and (a + k * x) % b
+    uint32_t m_q, m_r;
+
+public:
+    void host_init(uint32_t k, uint32_t b) {
+        m_b = b;
+        m_kq = k / b;
+        m_kr = k % b;
+    }
+
+    //! init to k == 0
+    __device__ __forceinline__ void device_init(uint32_t a) {
+        //! fix operator/() defined but not used error
+        m_q = a / m_b;
+        m_r = a - m_b.divisor() * m_q;
+    }
+
+    //! perform x += 1
+    __device__ __forceinline__ void next() {
+        uint32_t b = m_b.divisor(), r1 = m_r + m_kr, carry_mask = (r1 < b) - 1;
+        m_q += m_kq + (r1 >= b);
+        m_r = r1 - (b & carry_mask);
+    }
+
+    //! current quotient
+    __device__ __forceinline__ uint32_t q() const { return m_q; }
+
+    //! current remainder
+    __device__ __forceinline__ uint32_t r() const { return m_r; }
+};
+
+/*!
+ * \brief maintain (a + k * x) / b % c for x >= 0
+ */
+class StridedDivSeq2 {
+    Uint32Fastdiv m_b, m_c;
+
+    //! k / b, k % b, k / b % c
+    uint32_t m_qkb, m_rkb, m_rkbc;
+
+    //! current (a + k * x) % b and (a + k * x) / b % c
+    uint32_t m_cur_rkb, m_cur_ans;
+
+public:
+    void host_init(uint32_t k, uint32_t b, uint32_t c) {
+        m_b = b;
+        m_c = c;
+        m_qkb = k / b;
+        m_rkb = k % b;
+        m_rkbc = m_qkb % c;
+    }
+
+    //! init to k == 0
+    __device__ __forceinline__ void device_init(uint32_t a) {
+        uint32_t q = m_b.divide(a);
+        m_cur_rkb = a - m_b.divisor() * q;
+        m_cur_ans = q % m_c;
+    }
+
+    //! perform x += 1
+    __device__ __forceinline__ void next() {
+        uint32_t b = m_b.divisor(), c = m_c.divisor(), rkb = m_cur_rkb + m_rkb,
+                 carry0 = (rkb < b) - 1,
+                 next_ans = m_cur_ans + m_rkbc + (rkb >= b),
+                 carry1 = (next_ans < c) - 1;
+        m_cur_rkb = rkb - (b & carry0);
+        m_cur_ans = next_ans - (c & carry1);
+    }
+
+    __device__ __forceinline__ uint32_t get() const { return m_cur_ans; }
+};
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
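Note: StridedDivSeq replaces a per-element division inside kernel loops with one add and a conditional subtract per step. An illustrative usage sketch (comment form, since the real use is device-side):

    // Walk x = 0, 1, 2, ... while keeping (a + k*x) / b and (a + k*x) % b
    // up to date without dividing in the loop:
    //
    //   StridedDivSeq<true> seq;
    //   seq.host_init(k, b);        // precompute k/b, k%b and the fastdiv magic
    //   ...
    //   seq.device_init(a);         // per-thread: start at x == 0
    //   for (uint32_t x = 0; x < n; ++x) {
    //       use(seq.q(), seq.r());  // == (a + k*x) / b, (a + k*x) % b
    //       seq.next();             // x += 1: add + carry-correct, no division
    //   }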
diff --git a/dnn/src/rocm/linspace/linspace.cpp.hip b/dnn/src/rocm/linspace/linspace.cpp.hip
new file mode 100644
index 00000000..5094f015
--- /dev/null
+++ b/dnn/src/rocm/linspace/linspace.cpp.hip
@@ -0,0 +1,51 @@
+/**
+ * \file src/rocm/linspace/linspace.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "./linspace.h.hip"
+#include "src/rocm/utils.h.hip"
+#include "megdnn/dtype.h"
+
+namespace {
+
+template <typename T>
+__global__ void kernel(T *dst, double start, double step, uint32_t n)
+{
+    uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i < n) {
+        dst[i] = T(start + step*i);
+    }
+}
+
+} // anonymous namespace
+
+namespace megdnn {
+namespace rocm {
+namespace linspace {
+
+template <typename T>
+void exec_internal(T *dst, double start, double step, size_t n,
+        hipStream_t stream)
+{
+    uint32_t threads = NR_THREADS;
+    uint32_t blocks = DIVUP(n, threads);
+    hipLaunchKernelGGL((kernel<T>),
+            dim3(blocks), dim3(threads), 0, stream,
+            dst, start, step, n);
+    after_kernel_launch();
+}
+
+#define INST(T) template void exec_internal<T>(T *dst, \
+        double start, double step, size_t n, hipStream_t stream);
+#define cb(DType) INST(typename DTypeTrait<DType>::ctype)
+MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+
+} // namespace linspace
+} // namespace rocm
+} // namespace megdnn
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/dnn/src/rocm/linspace/linspace.h.hip b/dnn/src/rocm/linspace/linspace.h.hip
new file mode 100644
index 00000000..3d16e6cf
--- /dev/null
+++ b/dnn/src/rocm/linspace/linspace.h.hip
@@ -0,0 +1,22 @@
+/**
+ * \file src/rocm/linspace/linspace.h.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+#pragma once
+
+#include "hip_header.h"
+
+namespace megdnn {
+namespace rocm {
+namespace linspace {
+
+template <typename T>
+void exec_internal(T *dst, double start, double step, size_t n,
+        hipStream_t stream);
+
+} // namespace linspace
+} // namespace rocm
+} // namespace megdnn
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/dnn/src/rocm/linspace/opr_impl.cpp b/dnn/src/rocm/linspace/opr_impl.cpp
new file mode 100644
index 00000000..1e7b42ec
--- /dev/null
+++ b/dnn/src/rocm/linspace/opr_impl.cpp
@@ -0,0 +1,39 @@
+/**
+ * \file dnn/src/rocm/linspace/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "src/rocm/utils.h"
+#include "./opr_impl.h"
+#include "src/rocm/linspace/linspace.h.hip"
+
+namespace megdnn {
+namespace rocm {
+
+void LinspaceImpl::exec(_megdnn_tensor_out dst, _megdnn_workspace workspace)
+{
+    check_exec(dst.layout, workspace.size);
+    auto stream = hip_stream(handle());
+    auto n = dst.layout.total_nr_elems();
+    auto step = (param().stop - param().start) /
+        std::max(static_cast<double>(param().endpoint ? n-1 : n), 1.0);
+#define cb(dt) \
+    if (dst.layout.dtype == dt()) { \
+        using ctype = typename DTypeTrait<dt>::ctype; \
+        linspace::exec_internal<ctype>(dst.ptr<ctype>(), \
+                param().start, step, n, \
+                stream); \
+    }
+    MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+#undef cb
+}
+
+} // namespace rocm
+} // namespace megdnn
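Note: the step computation folds the endpoint flag and the degenerate n <= 1 case into a single expression. A worked host-side check (illustrative, plain C++):

    #include <algorithm>
    #include <cassert>

    // Mirror of LinspaceImpl's step computation.
    double linspace_step(double start, double stop, size_t n, bool endpoint) {
        return (stop - start) /
               std::max(static_cast<double>(endpoint ? n - 1 : n), 1.0);
    }

    int main() {
        assert(linspace_step(0.0, 1.0, 5, true) == 0.25);  // 0, .25, .5, .75, 1
        assert(linspace_step(0.0, 1.0, 5, false) == 0.2);  // 0, .2, .4, .6, .8
        assert(linspace_step(0.0, 1.0, 1, true) == 1.0);   // max(..., 1.0) avoids /0
    }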
diff --git a/dnn/src/rocm/linspace/opr_impl.h b/dnn/src/rocm/linspace/opr_impl.h
new file mode 100644
index 00000000..8b5d0786
--- /dev/null
+++ b/dnn/src/rocm/linspace/opr_impl.h
@@ -0,0 +1,28 @@
+/**
+ * \file dnn/src/rocm/linspace/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+#include "megdnn/oprs.h"
+
+namespace megdnn {
+namespace rocm {
+
+class LinspaceImpl final: public Linspace {
+    public:
+        using Linspace::Linspace;
+        void exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) override;
+        size_t get_workspace_in_bytes(const TensorLayout &) override {
+            return 0;
+        }
+};
+
+} // namespace rocm
+} // namespace megdnn
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/dnn/src/rocm/matrix_mul/opr_impl.cpp b/dnn/src/rocm/matrix_mul/opr_impl.cpp
new file mode 100644
index 00000000..e34ad53a
--- /dev/null
+++ b/dnn/src/rocm/matrix_mul/opr_impl.cpp
@@ -0,0 +1,159 @@
+/**
+ * \file dnn/src/rocm/matrix_mul/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "src/rocm/matrix_mul/opr_impl.h"
+
+#include "src/rocm/utils.h"
+#include "src/rocm/handle.h"
+
+namespace megdnn {
+namespace rocm {
+
+void MatrixMulForwardImpl::exec(_megdnn_tensor_in A,
+                                _megdnn_tensor_in B,
+                                _megdnn_tensor_out C,
+                                _megdnn_workspace workspace)
+{
+    check_exec(A.layout, B.layout, C.layout, workspace.size);
+
+    auto m = C.layout.shape[0], n = C.layout.shape[1];
+    auto k = A.layout.shape[param().transposeA ? 0 : 1];
+    auto handle = concrete_handle(this->handle());
+    auto rocblas_handle_ = handle->get_rocblas_handle();
+
+    auto sgemm = [&]() {
+        auto zero = handle->zero_device();
+        auto one = handle->one_device();
+        rocblas_check(rocblas_sgemm(
+                rocblas_handle_,
+                param().transposeB ? rocblas_operation_transpose
+                                   : rocblas_operation_none,
+                param().transposeA ? rocblas_operation_transpose
+                                   : rocblas_operation_none,
+                n, m, k, one, B.ptr<dt_float32>(), B.layout.stride[0],
+                A.ptr<dt_float32>(), A.layout.stride[0], zero,
+                C.ptr<dt_float32>(), C.layout.stride[0]));
+    };
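Note: rocBLAS, like BLAS, is column-major while MegDNN tensors are row-major; every gemm call in this file therefore passes (n, m, k) and swaps the A/B operands. The identity behind it, as a comment:

    // Column-major trick used by all gemm calls in this file: for row-major
    // A (m x k), B (k x n), C (m x n),
    //     C_rm = A_rm * B_rm   <=>   C_cm^T = B_cm^T * A_cm^T,
    // so calling the column-major API as gemm(n, m, k, B, ldb, A, lda, C, ldc)
    // writes C directly in row-major order; the leading dimensions are the
    // row-major strides.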
+#if !MEGDNN_DISABLE_FLOAT16
+    //! used for FLOAT_IO16xC32, not tested
+    auto gemm_ex = [&]() {
+        auto zero = handle->zero_device();
+        auto one = handle->one_device();
+        //! these two arguments are reserved for future use, see
+        //! https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/library/src/blas_ex/rocblas_gemm_ex.cpp
+        int32_t solution_index = 0;
+        uint32_t flags = 1;
+        size_t ws_size = 0;
+        auto gemm_ex_err = rocblas_gemm_ex(
+                rocblas_handle_,
+                param().transposeB ? rocblas_operation_transpose
+                                   : rocblas_operation_none,
+                param().transposeA ? rocblas_operation_transpose
+                                   : rocblas_operation_none,
+                n, m, k, one, B.raw_ptr, rocblas_datatype_f16_r,
+                B.layout.stride[0], A.raw_ptr, rocblas_datatype_f16_r,
+                A.layout.stride[0], zero, C.raw_ptr, rocblas_datatype_f16_r,
+                C.layout.stride[0], C.raw_ptr, rocblas_datatype_f16_r,
+                C.layout.stride[0], rocblas_datatype_f32_r,
+                rocblas_gemm_algo_standard, solution_index, flags, &ws_size,
+                nullptr);
+        rocblas_check(gemm_ex_err);
+    };
+
+    auto hgemm = [&]() {
+        auto one_half = handle->one_device_h();
+        auto zero_half = handle->zero_device_h();
+        // (cast targets below were lost in extraction; restored as
+        // rocblas_half* to match the rocblas_hgemm signature)
+        auto hgemm_err = rocblas_hgemm(
+                rocblas_handle_,
+                param().transposeB ? rocblas_operation_transpose
+                                   : rocblas_operation_none,
+                param().transposeA ? rocblas_operation_transpose
+                                   : rocblas_operation_none,
+                n, m, k, reinterpret_cast<rocblas_half*>(one_half),
+                static_cast<rocblas_half*>(B.raw_ptr), B.layout.stride[0],
+                static_cast<rocblas_half*>(A.raw_ptr), A.layout.stride[0],
+                reinterpret_cast<rocblas_half*>(zero_half),
+                static_cast<rocblas_half*>(C.raw_ptr), C.layout.stride[0]);
+        rocblas_check(hgemm_err);
+    };
+#endif
+
+    if (param().compute_mode == Param::ComputeMode::DEFAULT) {
+        if (A.layout.dtype == dtype::Float32()) {
+            sgemm();
+        }
+#if !MEGDNN_DISABLE_FLOAT16
+        else {
+            megdnn_assert(A.layout.dtype == dtype::Float16(),
+                          "invalid matmul data type");
+            hgemm();
+        }
+#endif
+    }
+#if !MEGDNN_DISABLE_FLOAT16
+    else if (param().compute_mode == Param::ComputeMode::FLOAT32) {
+        megdnn_assert(B.layout.dtype == dtype::Float16() &&
+                              C.layout.dtype == dtype::Float16() &&
+                              A.layout.dtype == dtype::Float16(),
+                      "DataType::FLOAT_IO16xC32 is only supported when the "
+                      "dtypes of A, B and C are all Float16");
+        gemm_ex();
+    }
+#endif
+    else if (A.layout.dtype == dtype::Int8() &&
+             B.layout.dtype == dtype::Int8() &&
+             C.layout.dtype == dtype::Int32()) {
+        //! see
+        //! https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/library/src/blas_ex/rocblas_gemm_ex.cpp:470
+        bool rocblas_int8x8x32_valid = true;
+        rocblas_int8x8x32_valid &= (k % 4 == 0);
+        rocblas_int8x8x32_valid &=
+                (!param().transposeB || B.layout.stride[0] % 4 == 0);
+        rocblas_int8x8x32_valid &=
+                (!param().transposeA || A.layout.stride[0] % 4 == 0);
+        megdnn_assert(rocblas_int8x8x32_valid,
+                      "rocblas int8x8x32 matmul requires K to be a multiple "
+                      "of 4, and LDB/LDA to be multiples of 4 when B/A is "
+                      "transposed; got: %zu, is_trans_b = %d, %zu, "
+                      "is_trans_a = %d, %zu",
+                      k, param().transposeB, B.layout.stride[0],
+                      param().transposeA, A.layout.stride[0]);
+        int32_t solution_index = 0;
+        uint32_t flags = 1;
+        size_t ws_size = 0;
+        auto zero = handle->zero_device_i32();
+        auto one = handle->one_device_i32();
+        rocblas_check(rocblas_gemm_ex(
+                rocblas_handle_,
+                param().transposeB ? rocblas_operation_transpose
+                                   : rocblas_operation_none,
+                param().transposeA ? rocblas_operation_transpose
+                                   : rocblas_operation_none,
+                n, m, k, one, B.raw_ptr, rocblas_datatype_i8_r,
+                B.layout.stride[0], A.raw_ptr, rocblas_datatype_i8_r,
+                A.layout.stride[0], zero, C.raw_ptr, rocblas_datatype_i32_r,
+                C.layout.stride[0], C.raw_ptr, rocblas_datatype_i32_r,
+                C.layout.stride[0], rocblas_datatype_i32_r,
+                rocblas_gemm_algo_standard, solution_index, flags, &ws_size,
+                nullptr));
+    } else {
+        megdnn_assert((A.layout.dtype == dtype::Int8() &&
+                       B.layout.dtype == dtype::Int8() &&
+                       C.layout.dtype == dtype::Int16()),
+                      "invalid matmul data type");
+        megdnn_throw("rocm matmul does not support INT8x8x16 now");
+    }
+}
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/matrix_mul/opr_impl.h b/dnn/src/rocm/matrix_mul/opr_impl.h
new file mode 100644
index 00000000..5d8abad4
--- /dev/null
+++ b/dnn/src/rocm/matrix_mul/opr_impl.h
@@ -0,0 +1,51 @@
+/**
+ * \file dnn/src/rocm/matrix_mul/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+#include "megdnn/oprs.h"
+
+namespace megdnn {
+namespace rocm {
+
+class MatrixMulForwardImpl : public MatrixMulForward {
+public:
+    using MatrixMulForward::MatrixMulForward;
+    void exec(_megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C,
+              _megdnn_workspace workspace) override;
+    size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
+                                  const TensorLayout&) override {
+        return 0;
+    }
+
+    bool is_thread_safe() const override { return true; }
+
+private:
+    std::vector<Algorithm*> get_all_algorithms(
+            const TensorLayout& /*A*/, const TensorLayout& /*B*/,
+            const TensorLayout& /*C*/) override {
+        return {};
+    }
+
+    Algorithm* get_algorithm_heuristic(const TensorLayout& /*A*/,
+                                       const TensorLayout& /*B*/,
+                                       const TensorLayout& /*C*/,
+                                       size_t /*workspace_limit_in_bytes*/,
+                                       bool /*reproducible*/) override {
+        return nullptr;
+    }
+
+    const char* get_algorithm_set_name() const override {
+        return "ROCM MATMUL";
+    }
+};
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/megcore/computing_context.hpp b/dnn/src/rocm/megcore/computing_context.hpp
new file mode 100644
index 00000000..273fc56e
--- /dev/null
+++ b/dnn/src/rocm/megcore/computing_context.hpp
@@ -0,0 +1,18 @@
+/**
+ * \file dnn/src/rocm/megcore/computing_context.hpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+
+#include "src/common/megcore/common/computing_context.hpp"
+#include <memory>  // (header name lost in extraction; restored for std::unique_ptr)
+
+namespace megcore {
+std::unique_ptr<ComputingContext> make_rocm_computing_context(
+        megcoreDeviceHandle_t dev_handle, unsigned int flags);
+}
diff --git a/dnn/src/rocm/megcore/device_context.hpp b/dnn/src/rocm/megcore/device_context.hpp
new file mode 100644
index 00000000..068e018a
--- /dev/null
+++ b/dnn/src/rocm/megcore/device_context.hpp
@@ -0,0 +1,18 @@
+/**
+ * \file dnn/src/rocm/megcore/device_context.hpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+
+#include "src/common/megcore/common/device_context.hpp"
+#include <memory>  // (header name lost in extraction; restored for std::unique_ptr)
+
+namespace megcore {
+std::unique_ptr<DeviceContext> make_rocm_device_context(
+        int deviceID, unsigned int flags);
+}
diff --git a/dnn/src/rocm/megcore/public_api/computing.cpp b/dnn/src/rocm/megcore/public_api/computing.cpp
new file mode 100644
index 00000000..6bc19579
--- /dev/null
+++ b/dnn/src/rocm/megcore/public_api/computing.cpp
@@ -0,0 +1,61 @@
+/**
+ * \file dnn/src/rocm/megcore/public_api/computing.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "megcore_rocm.h"
+
+#include "src/common/utils.h"
+#include "src/common/megcore/public_api/computing.hpp"
+#include "../rocm_computing_context.hpp"
+
+using namespace megcore;
+
+megcoreStatus_t megcore::createComputingHandleWithROCMContext(
+        megcoreComputingHandle_t *compHandle,
+        megcoreDeviceHandle_t devHandle,
+        unsigned int flags,
+        const ROCMContext& ctx)
+{
+    auto content = megdnn::make_unique<rocm::ROCMComputingContext>(
+            devHandle, flags, ctx);
+    auto &H = *compHandle;
+    H = new megcoreComputingContext;
+    H->content = std::move(content);
+    return megcoreSuccess;
+}
+
+megcoreStatus_t megcore::getROCMContext(megcoreComputingHandle_t handle,
+                                        ROCMContext* ctx)
+{
+    auto &&H = handle;
+    megdnn_assert(H);
+    megcoreDeviceHandle_t dev_handle = H->content->dev_handle();
+    megcorePlatform_t platform;
+    megcoreGetPlatform(dev_handle, &platform);
+    megdnn_assert(platform == megcorePlatformROCM);
+    auto context = static_cast<rocm::ROCMComputingContext*>(
+            H->content.get());
+    *ctx = context->context();
+    return megcoreSuccess;
+}
+
+std::atomic_bool megcore::ROCMContext::sm_miopen_algo_search{false};
+megcoreStatus_t megcore::enableMIOpenAlgoSearch(bool enable_algo_search) {
+    megcore::ROCMContext::enable_miopen_algo_search(enable_algo_search);
+    return megcoreSuccess;
+}
+
+megcoreStatus_t megcore::getMIOpenAlgoSearchStatus(bool* algo_search_enabled) {
+    *algo_search_enabled = megcore::ROCMContext::enable_miopen_algo_search();
+    return megcoreSuccess;
+}
+
+// vim: syntax=cpp.doxygen
+
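Note: createComputingHandleWithROCMContext is the hook that lets an application run MegDNN work on a HIP stream it already owns; a non-null ctx.stream means the computing context neither creates nor destroys the stream. A minimal sketch, assuming a megcore device handle created elsewhere:

    #include "megcore_rocm.h"

    // Attach MegDNN to an externally owned HIP stream (illustrative).
    void attach_to_existing_stream(megcoreDeviceHandle_t dev, hipStream_t stream) {
        megcore::ROCMContext ctx;
        ctx.stream = stream;  // non-null: context will not own the stream

        megcoreComputingHandle_t comp;
        megcore::createComputingHandleWithROCMContext(&comp, dev, 0, ctx);

        // ... build a megdnn handle on comp and launch operators ...

        megcoreDestroyComputingHandle(comp);  // leaves `stream` alive
    }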
diff --git a/dnn/src/rocm/megcore/rocm_computing_context.cpp b/dnn/src/rocm/megcore/rocm_computing_context.cpp
new file mode 100644
index 00000000..e65ee35d
--- /dev/null
+++ b/dnn/src/rocm/megcore/rocm_computing_context.cpp
@@ -0,0 +1,81 @@
+/**
+ * \file dnn/src/rocm/megcore/rocm_computing_context.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "megcore.h"
+
+#include "src/common/utils.h"
+#include "src/rocm/utils.h"
+#include "./computing_context.hpp"
+
+#include "./rocm_computing_context.hpp"
+
+using namespace megcore;
+using namespace rocm;
+
+std::unique_ptr<ComputingContext> megcore::make_rocm_computing_context(
+        megcoreDeviceHandle_t dev_handle, unsigned int flags) {
+    return std::make_unique<ROCMComputingContext>(dev_handle, flags);
+}
+
+ROCMComputingContext::ROCMComputingContext(megcoreDeviceHandle_t dev_handle,
+        unsigned int flags, const ROCMContext& ctx):
+    ComputingContext(dev_handle, flags),
+    own_stream_{ctx.stream == nullptr},
+    context_{ctx}
+{
+    megcorePlatform_t platform;
+    megcoreGetPlatform(dev_handle, &platform);
+    megdnn_assert(platform == megcorePlatformROCM);
+    if (own_stream_) {
+        hip_check(hipStreamCreateWithFlags(&context_.stream,
+                    hipStreamNonBlocking));
+    }
+}
+
+ROCMComputingContext::~ROCMComputingContext()
+{
+    if (own_stream_) {
+        hip_check(hipStreamDestroy(context_.stream));
+    }
+}
+
+void ROCMComputingContext::memcpy(void *dst, const void *src,
+        size_t size_in_bytes, megcoreMemcpyKind_t kind)
+{
+    hipMemcpyKind hip_kind;
+    switch (kind) {
+        case megcoreMemcpyDeviceToHost:
+            hip_kind = hipMemcpyDeviceToHost;
+            break;
+        case megcoreMemcpyHostToDevice:
+            hip_kind = hipMemcpyHostToDevice;
+            break;
+        case megcoreMemcpyDeviceToDevice:
+            hip_kind = hipMemcpyDeviceToDevice;
+            break;
+        default:
+            megdnn_throw("bad hip memcpy kind");
+    }
+    hip_check(hipMemcpyAsync(dst, src, size_in_bytes, hip_kind,
+                context_.stream));
+}
+
+void ROCMComputingContext::memset(void *dst, int value, size_t size_in_bytes)
+{
+    hip_check(hipMemsetAsync(dst, value, size_in_bytes, context_.stream));
+}
+
+void ROCMComputingContext::synchronize()
+{
+    hip_check(hipStreamSynchronize(context_.stream));
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/megcore/rocm_computing_context.hpp b/dnn/src/rocm/megcore/rocm_computing_context.hpp
new file mode 100644
index 00000000..db34ab13
--- /dev/null
+++ b/dnn/src/rocm/megcore/rocm_computing_context.hpp
@@ -0,0 +1,41 @@
+/**
+ * \file dnn/src/rocm/megcore/rocm_computing_context.hpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+
+#include "src/common/megcore/common/computing_context.hpp"
+#include "megcore_rocm.h"
+
+namespace megcore {
+namespace rocm {
+
+class ROCMComputingContext final : public ComputingContext {
+public:
+    ROCMComputingContext(megcoreDeviceHandle_t dev_handle, unsigned int flags,
+                         const ROCMContext& ctx = {});
+    ~ROCMComputingContext();
+
+    void memcpy(void* dst, const void* src, size_t size_in_bytes,
+                megcoreMemcpyKind_t kind) override;
+    void memset(void* dst, int value, size_t size_in_bytes) override;
+    void synchronize() override;
+
+    const ROCMContext& context() const { return context_; }
+    hipStream_t stream() const { return context().stream; }
+
+private:
+    bool own_stream_;
+    ROCMContext context_;
+};
+
+} // namespace rocm
+} // namespace megcore
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/megcore/rocm_device_context.cpp b/dnn/src/rocm/megcore/rocm_device_context.cpp
new file mode 100644
index 00000000..c12baf04
--- /dev/null
+++ b/dnn/src/rocm/megcore/rocm_device_context.cpp
@@ -0,0 +1,71 @@
+/**
+ * \file dnn/src/rocm/megcore/rocm_device_context.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "megcore.h"
+#include "src/common/utils.h"
+#include "src/rocm/utils.h"
+#include "./device_context.hpp"
+
+#include "./rocm_device_context.hpp"
+
+//! HIP_VERSION_MAJOR, HIP_VERSION_MINOR and HIP_VERSION_PATCH are defined
+//! when compiling with hipcc
+
+using namespace megcore;
+using namespace rocm;
+
+std::unique_ptr<DeviceContext> megcore::make_rocm_device_context(
+        int deviceID, unsigned int flags) {
+    return std::make_unique<ROCMDeviceContext>(deviceID, flags);
+}
+
+ROCMDeviceContext::ROCMDeviceContext(int device_id, unsigned int flags):
+    DeviceContext(megcorePlatformROCM, device_id, flags)
+{
+    int version;
+    hip_check(hipRuntimeGetVersion(&version));
+    int id = device_id;
+    if (id < 0) {
+        hip_check(hipGetDevice(&id));
+    }
+    hip_check(hipGetDeviceProperties(&prop_, id));
+}
+
+ROCMDeviceContext::~ROCMDeviceContext() noexcept = default;
+
+size_t ROCMDeviceContext::mem_alignment_in_bytes() const noexcept {
+    return 1u;
+#if 0
+    return std::max(prop_.textureAlignment, prop_.texturePitchAlignment);
+#endif
+}
+
+void ROCMDeviceContext::activate()
+{
+    int id = device_id();
+    if (id >= 0) {
+        hip_check(hipSetDevice(id));
+    }
+}
+
+void *ROCMDeviceContext::malloc(size_t size_in_bytes)
+{
+    void *ptr;
+    hip_check(hipMalloc(&ptr, size_in_bytes));
+    return ptr;
+}
+
+void ROCMDeviceContext::free(void *ptr)
+{
+    hip_check(hipFree(ptr));
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/megcore/rocm_device_context.hpp b/dnn/src/rocm/megcore/rocm_device_context.hpp
new file mode 100644
index 00000000..3ff4586f
--- /dev/null
+++ b/dnn/src/rocm/megcore/rocm_device_context.hpp
@@ -0,0 +1,35 @@
+/**
+ * \file dnn/src/rocm/megcore/rocm_device_context.hpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+
+#include "src/common/megcore/common/device_context.hpp"
+
+namespace megcore {
+namespace rocm {
+
+class ROCMDeviceContext: public DeviceContext {
+    public:
+        ROCMDeviceContext(int device_id, unsigned int flags);
+        ~ROCMDeviceContext() noexcept;
+
+        size_t mem_alignment_in_bytes() const noexcept override;
+
+        void activate() override;
+        void *malloc(size_t size_in_bytes) override;
+        void free(void *ptr) override;
+    private:
+        hipDeviceProp_t prop_;
+};
+
+} // namespace rocm
+} // namespace megcore
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/miopen_with_check.h b/dnn/src/rocm/miopen_with_check.h
new file mode 100644
index 00000000..ad2e84fc
--- /dev/null
+++ b/dnn/src/rocm/miopen_with_check.h
@@ -0,0 +1,24 @@
+/**
+ * \file dnn/src/rocm/miopen_with_check.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#ifndef __HIP_PLATFORM_HCC__
+#define __HIP_PLATFORM_HCC__
+#endif
+
+// (the two header names below were lost in extraction; restored from usage:
+// the HIP runtime API and the MIOpen header this wrapper exists to guard)
+#include <hip/hip_runtime.h>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#include <miopen/miopen.h>
+#pragma GCC diagnostic pop
+
diff --git a/dnn/src/rocm/miopen_wrapper.cpp b/dnn/src/rocm/miopen_wrapper.cpp
new file mode 100644
index 00000000..3a48a466
--- /dev/null
+++ b/dnn/src/rocm/miopen_wrapper.cpp
@@ -0,0 +1,155 @@
+/**
+ * \file dnn/src/rocm/miopen_wrapper.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "src/rocm/miopen_wrapper.h"
+
+#include "src/common/utils.h"
+#include "src/rocm/utils.h"
+
+namespace {
+
+using namespace megdnn;
+
+miopenDataType_t to_miopen_dtype(DType type,
+        const param::Convolution::Format format = {}) {
+    MEGDNN_MARK_USED_VAR(format);
+    //! TODO: check quantized dtype handling
+    switch (type.enumv()) {
+        case DTypeEnum::Float32:
+            return miopenFloat;
+#if !MEGDNN_DISABLE_FLOAT16
+        case DTypeEnum::Float16:
+            return miopenHalf;
+#endif
+        case DTypeEnum::Int32:
+        case DTypeEnum::QuantizedS32:
+            return miopenInt32;
+        case DTypeEnum::QuantizedS8:
+        case DTypeEnum::Int8:
+            return miopenInt8;
+        default:
+            megdnn_throw(
+                    megdnn_mangle("dtype must be float16/float32/int8/int32"));
+    }
+}
+} // namespace
+
+namespace megdnn {
+namespace rocm {
+
+TensorDesc::TensorDesc() {
+    miopen_check(miopenCreateTensorDescriptor(&desc));
+}
+
+TensorDesc::~TensorDesc() {
+    miopen_check(miopenDestroyTensorDescriptor(desc));
+}
+
+void TensorDesc::set(const TensorLayout& layout,
+                     const param::Convolution::Format format) {
+    megdnn_assert(format == param::Convolution::Format::NCHW,
+                  "for now, MIOpen only supports the NCHW format");
+    megdnn_assert_eq_size_t(layout.ndim, 4_z);
+    int n = layout[0];
+    int c = layout[1];
+    int h = layout[2];
+    int w = layout[3];
+    miopen_check(miopenSet4dTensorDescriptor(
+            desc, to_miopen_dtype(layout.dtype), n, c, h, w));
+}
+
+ConvDesc::ConvDesc() {
+    miopen_check(miopenCreateConvolutionDescriptor(&desc));
+}
+
+ConvDesc::~ConvDesc() {
+    miopen_check(miopenDestroyConvolutionDescriptor(desc));
+}
+
+void ConvDesc::set(const param::Convolution& param, const size_t nr_group,
+                   const bool is_depthwise) {
+    miopenConvolutionMode_t mode;
+    if (param.mode == param::Convolution::Mode::CROSS_CORRELATION) {
+        mode = miopenConvolution;
+        if (param.sparse == param::Convolution::Sparse::GROUP) {
+            mode = is_depthwise ? miopenDepthwise : miopenGroupConv;
+        }
+    } else {
+        megdnn_throw(megdnn_mangle(
+                "for now, MIOpen does not support non-xcorr convolution"));
+    }
+
+    miopen_check(miopenInitConvolutionDescriptor(
+            desc, mode, param.pad_h, param.pad_w, param.stride_h,
+            param.stride_w, param.dilate_h, param.dilate_w));
+    if (mode == miopenGroupConv || mode == miopenDepthwise) {
+        miopen_check(miopenSetConvolutionGroupCount(desc, nr_group));
+    }
+    //! MIOpen does not support setting a compute_type, so mixed-precision
+    //! training is not supported
+}
+
+PoolingDesc::PoolingDesc() {
+    miopen_check(miopenCreatePoolingDescriptor(&desc));
+}
+
+PoolingDesc::~PoolingDesc() {
+    miopen_check(miopenDestroyPoolingDescriptor(desc));
+}
+
+void PoolingDesc::set(const param::Pooling& param) {
+    miopenPoolingMode_t mode;
+    switch (param.mode) {
+        case param::Pooling::Mode::MAX:
+            mode = miopenPoolingMax;
+            break;
+        case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
+            mode = miopenPoolingAverage;
+            break;
+        default:
+            megdnn_throw(megdnn_mangle("Unsupported pooling mode for miopen"));
+    }
+    miopen_check(miopenSet2dPoolingDescriptor(
+            desc, mode, param.window_h, param.window_w, param.pad_h,
+            param.pad_w, param.stride_h, param.stride_w));
+}
+
+LRNDesc::LRNDesc() {
+    miopen_check(miopenCreateLRNDescriptor(&desc));
+}
+
+LRNDesc::~LRNDesc() {
+    miopen_check(miopenDestroyLRNDescriptor(desc));
+}
+
+void LRNDesc::set(const param::LRN& param) {
+    MEGDNN_MARK_USED_VAR(param);
+    //! TODO: MIOpen has two LRN modes, miopenLRNWithinChannel and
+    //! miopenLRNCrossChannel; need to check what these modes mean.
+}
+}
+
+BNParamDesc::BNParamDesc() {
+    miopen_check(miopenCreateTensorDescriptor(&desc));
+}
+
+void BNParamDesc::set(const miopenTensorDescriptor_t xDesc,
+                      miopenBatchNormMode_t mode) {
+    miopen_check(miopenDeriveBNTensorDescriptor(desc, xDesc, mode));
+}
+
+BNParamDesc::~BNParamDesc() {
+    miopen_check(miopenDestroyTensorDescriptor(desc));
+}
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/miopen_wrapper.h b/dnn/src/rocm/miopen_wrapper.h
new file mode 100644
index 00000000..f571bedd
--- /dev/null
+++ b/dnn/src/rocm/miopen_wrapper.h
@@ -0,0 +1,70 @@
+/**
+ * \file dnn/src/rocm/miopen_wrapper.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+
+#include "megdnn/basic_types.h"
+#include "megdnn/oprs/nn.h"
+#include "src/rocm/miopen_with_check.h"
+
+namespace megdnn {
+namespace rocm {
+
+class TensorDesc {
+public:
+    TensorDesc();
+    //! default layout is NCHW
+    void set(const TensorLayout& layout,
+             const param::Convolution::Format =
+                     param::Convolution::Format::NCHW);
+    ~TensorDesc();
+    miopenTensorDescriptor_t desc;
+};
+
+class ConvDesc {
+public:
+    ConvDesc();
+    //! we need more information to determine depthwise convolution
+    void set(const param::Convolution& param, const size_t nr_group,
+             const bool is_depthwise = false);
+    ~ConvDesc();
+    miopenConvolutionDescriptor_t desc;
+};
+
+class PoolingDesc {
+public:
+    PoolingDesc();
+    void set(const param::Pooling& param);
+    ~PoolingDesc();
+    miopenPoolingDescriptor_t desc;
+};
+
+class LRNDesc {
+public:
+    LRNDesc();
+    void set(const param::LRN& param);
+    ~LRNDesc();
+    miopenLRNDescriptor_t desc;
+};
+
+class BNParamDesc {
+public:
+    BNParamDesc();
+    void set(const miopenTensorDescriptor_t xDesc, miopenBatchNormMode_t mode);
+    ~BNParamDesc();
+    miopenTensorDescriptor_t desc;
+};
+
+// for now, MIOpen does not support 3D convolution
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/pooling/opr_impl.cpp b/dnn/src/rocm/pooling/opr_impl.cpp
new file mode 100644
index 00000000..7724ea40
--- /dev/null
+++ b/dnn/src/rocm/pooling/opr_impl.cpp
@@ -0,0 +1,90 @@
+/**
+ * \file dnn/src/rocm/pooling/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
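+
+//! ROCm pooling: forward and backward are thin wrappers around MIOpen's
+//! pooling API; the descriptors are (re)initialized on every call via
+//! setup_descs()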
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "src/rocm/pooling/opr_impl.h"
+
+#include "src/rocm/utils.h"
+
+namespace megdnn {
+namespace rocm {
+
+void PoolingForwardImpl::setup_descs(const TensorLayout& src,
+                                     const TensorLayout& dst) {
+    src_desc.set(src, param().format);
+    dst_desc.set(dst, param().format);
+    pooling_desc.set(this->param());
+}
+
+void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
+                              _megdnn_workspace workspace) {
+    check_exec(src.layout, dst.layout, workspace.size);
+    auto handle = miopen_handle(this->handle());
+    setup_descs(src.layout, dst.layout);
+    dt_float32 alpha = 1.0f, beta = 0.0f;
+    miopen_check(miopenPoolingForward(handle, pooling_desc.desc, &alpha,
+                                      src_desc.desc, src.raw_ptr, &beta,
+                                      dst_desc.desc, dst.raw_ptr, false,
+                                      nullptr, 0_z));
+}
+
+void PoolingBackwardImpl::setup_descs(const TensorLayout& src,
+                                      const TensorLayout& dst,
+                                      const TensorLayout& diff,
+                                      const TensorLayout& grad) {
+    src_desc.set(src);
+    dst_desc.set(dst);
+    diff_desc.set(diff);
+    grad_desc.set(grad);
+    pooling_desc.set(this->param());
+}
+
+void PoolingBackwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst,
+                               _megdnn_tensor_in diff, _megdnn_tensor_out grad,
+                               _megdnn_workspace workspace) {
+    check_exec(src.layout, dst.layout, diff.layout, grad.layout,
+               workspace.size);
+    auto handle = miopen_handle(this->handle());
+    setup_descs(src.layout, dst.layout, diff.layout, grad.layout);
+    float alpha = 1.0f, beta = 0.0f;
+    if (param().mode == param::Pooling::Mode::MAX) {
+        //! FIXME: for max pooling, the backward opr needs the indices computed
+        //! by the forward opr, which are stored in the workspace. We have to
+        //! recompute the indices by calling miopenPoolingForward again.
+        miopen_check(miopenPoolingForward(handle, pooling_desc.desc, &alpha,
+                                          src_desc.desc, src.raw_ptr, &beta,
+                                          dst_desc.desc, dst.raw_ptr, true,
+                                          workspace.raw_ptr, workspace.size));
+    }
+    miopen_check(miopenPoolingBackward(
+            handle, pooling_desc.desc, &alpha, dst_desc.desc, dst.raw_ptr,
+            diff_desc.desc, diff.raw_ptr, src_desc.desc, src.raw_ptr, &beta,
+            grad_desc.desc, grad.raw_ptr, workspace.raw_ptr));
+}
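+
+//! note: the workspace whose size is queried below is the same buffer that
+//! the MAX-mode branch of exec() above fills with pooling indices via the
+//! trailing arguments of miopenPoolingForward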
+
+size_t PoolingBackwardImpl::get_workspace_in_bytes(const TensorLayout& src,
+                                                   const TensorLayout& dst,
+                                                   const TensorLayout& diff,
+                                                   const TensorLayout& grad) {
+    setup_descs(src, dst, diff, grad);
+    size_t ws_size = 0_z;
+    miopenPoolingGetWorkSpaceSize(dst_desc.desc, &ws_size);
+    return ws_size;
+}
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/pooling/opr_impl.h b/dnn/src/rocm/pooling/opr_impl.h
new file mode 100644
index 00000000..71958575
--- /dev/null
+++ b/dnn/src/rocm/pooling/opr_impl.h
@@ -0,0 +1,59 @@
+/**
+ * \file dnn/src/rocm/pooling/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#pragma once
+#include "megdnn/oprs.h"
+
+#include "src/rocm/miopen_wrapper.h"
+
+namespace megdnn {
+namespace rocm {
+
+class PoolingForwardImpl final: public PoolingForward {
+    public:
+        using PoolingForward::PoolingForward;
+        void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
+                  _megdnn_workspace workspace) override;
+        size_t get_workspace_in_bytes(const TensorLayout&,
+                                      const TensorLayout&) override {
+            return 0;
+        }
+    private:
+        TensorDesc src_desc, dst_desc;
+        PoolingDesc pooling_desc;
+        void setup_descs(const TensorLayout& src, const TensorLayout& dst);
+};
+
+class PoolingBackwardImpl final: public PoolingBackward {
+    public:
+        using PoolingBackward::PoolingBackward;
+        void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst,
+                  _megdnn_tensor_in diff, _megdnn_tensor_out grad,
+                  _megdnn_workspace workspace) override;
+        size_t get_workspace_in_bytes(const TensorLayout& src,
+                                      const TensorLayout& dst,
+                                      const TensorLayout& diff,
+                                      const TensorLayout& grad) override;
+    private:
+        TensorDesc src_desc, dst_desc, diff_desc, grad_desc;
+        PoolingDesc pooling_desc;
+        void setup_descs(const TensorLayout& src, const TensorLayout& dst,
+                         const TensorLayout& diff, const TensorLayout& grad);
+};
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/powc/opr_impl.cpp b/dnn/src/rocm/powc/opr_impl.cpp
new file mode 100644
index 00000000..742c11c2
--- /dev/null
+++ b/dnn/src/rocm/powc/opr_impl.cpp
@@ -0,0 +1,26 @@
+/**
+ * \file dnn/src/rocm/powc/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "./opr_impl.h"
+#include "src/rocm/powc/powc.h.hip"
+
+#include "src/rocm/utils.h"
+
+using namespace megdnn;
+using namespace rocm;
+
+void PowCImpl::do_exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
+                       const float* exp_f, const int* exp_i) {
+    powc_kern(dst, src, exp_f, exp_i, hip_stream(handle()));
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/powc/opr_impl.h b/dnn/src/rocm/powc/opr_impl.h
new file mode 100644
index 00000000..c7671fc5
--- /dev/null
+++ b/dnn/src/rocm/powc/opr_impl.h
@@ -0,0 +1,29 @@
+/**
+ * \file dnn/src/rocm/powc/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
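+
+//! PowC computes x^c elementwise for a constant scalar exponent c, which the
+//! caller passes as either a float (exp_f) or an int (exp_i); the HIP kernels
+//! live in powc.cpp.hip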
+#pragma once
+
+#include "megdnn/oprs/general.h"
+
+namespace megdnn {
+namespace rocm {
+
+class PowCImpl final : public PowC {
+public:
+    using PowC::PowC;
+    void do_exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
+                 const float* exp_f, const int* exp_i) override;
+};
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: syntax=cpp.doxygen
+
diff --git a/dnn/src/rocm/powc/powc.cpp.hip b/dnn/src/rocm/powc/powc.cpp.hip
new file mode 100644
index 00000000..1b84d581
--- /dev/null
+++ b/dnn/src/rocm/powc/powc.cpp.hip
@@ -0,0 +1,229 @@
+/**
+ * \file src/rocm/powc/powc.cpp.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+#include "src/rocm/powc/powc.h.hip"
+#include "megdnn/dtype.h"
+#include "src/rocm/elemwise_helper.h.hip"
+
+#include <cmath>
+#include <limits>
+
+namespace megdnn {
+namespace rocm {
+// use a named namespace (not an anonymous one) to avoid name conflicts while
+// keeping the kernel names readable
+namespace hip_kern {
+
+template <int n>
+struct PowCIntSmall;
+
+template <>
+struct PowCIntSmall<0> {
+    template <typename T>
+    static __device__ __forceinline__ T apply(T) {
+        return static_cast<T>(1);
+    }
+};
+template <>
+struct PowCIntSmall<1> {
+    template <typename T>
+    static __device__ __forceinline__ T apply(T x) {
+        return x;
+    }
+};
+template <>
+struct PowCIntSmall<2> {
+    template <typename T>
+    static __device__ __forceinline__ T apply(T x) {
+        return x * x;
+    }
+};
+template <>
+struct PowCIntSmall<3> {
+    template <typename T>
+    static __device__ __forceinline__ T apply(T x) {
+        return x * x * x;
+    }
+};
+template <>
+struct PowCIntSmall<4> {
+    template <typename T>
+    static __device__ __forceinline__ T apply(T x) {
+        x = x * x;
+        return x * x;
+    }
+};
+//! negative exponents: x^n == (1/x)^(-n)
+template <int n>
+struct PowCIntSmall {
+    template <typename T>
+    static __device__ __forceinline__ T apply(T x) {
+        return PowCIntSmall<-n>::apply(static_cast<T>(1) / x);
+    }
+};
+
+template <typename T>
+struct PowCIntOdd {
+    T exp;
+
+    __device__ __forceinline__ T apply(T x) {
+        return static_cast<T>(copysignf(powf(fabsf(x), exp), x));
+    }
+};
+
+template <typename T>
+struct PowCIntEven {
+    T exp;
+
+    __device__ __forceinline__ T apply(T x) {
+        return static_cast<T>(powf(fabsf(x), exp));
+    }
+};
+
+struct PowCFloatSqrt {
+    template <typename T>
+    static __device__ __forceinline__ T apply(T x) {
+        return static_cast<T>(sqrtf(x));
+    }
+};
+
+struct PowCFloatCbrt {
+    template <typename T>
+    static __device__ __forceinline__ T apply(T x) {
+        return static_cast<T>(cbrtf(x));
+    }
+};
+
+struct PowCFloatRSqrt {
+    template <typename T>
+    static __device__ __forceinline__ T apply(T x) {
+        return static_cast<T>(rsqrtf(x));
+    }
+};
+
+struct PowCFloatRCbrt {
+    template <typename T>
+    static __device__ __forceinline__ T apply(T x) {
+        return static_cast<T>(rcbrtf(x));
+    }
+};
+
+template <typename T>
+struct PowCFloat {
+    T exp;
+
+    __device__ __forceinline__ T apply(T x) {
+        return static_cast<T>(powf(x, exp));
+    }
+};
+
+template <typename T, typename PowOp>
+struct PowCOp {
+    T* dest;
+    PowOp pow_op;
+
+    __device__ __forceinline__ void operator()(uint32_t idx, T src) {
+        dest[idx] = pow_op.apply(src);
+    }
+};
+
+} // namespace hip_kern
+
+namespace {
+
+template <typename T, typename PowOp>
+void invoke(const TensorND& dest, const TensorND& src, PowOp pow_op,
+            hipStream_t stream) {
+    ElemwiseOpParamN<1> param;
+    param[0] = src;
+    param.init_from_given_tensor();
+    typedef hip_kern::PowCOp<T, PowOp> Op;
+    Op op;
+    op.dest = dest.ptr<T>();
+    op.pow_op = pow_op;
+    run_elemwise<Op, T, 1>(param, stream, op);
+}
+
+bool feq(float a, float b) {
+    return std::abs(a - b) < std::numeric_limits<float>::epsilon();
+}
+
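+//! feq() compares within float epsilon so that dispatch_op() below can detect
+//! the special exponents 0.5, 1/3, -0.5 and -1/3 and route them to the
+//! cheaper sqrtf/cbrtf/rsqrtf/rcbrtf functors instead of a generic powf
+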
+template <typename T>
+void dispatch_op(const TensorND& dest, const TensorND& src, const float* exp_f,
+                 const int* exp_i, hipStream_t stream) {
+#define CALL(_op) invoke<T>(dest, src, _op, stream)
+    if (exp_f) {
+        float exp = *exp_f;
+#define CALL_IF(_v, _op)    \
+    do {                    \
+        if (feq(exp, _v)) { \
+            CALL(_op);      \
+            return;         \
+        }                   \
+    } while (0)
+        CALL_IF(.5f, hip_kern::PowCFloatSqrt());
+        CALL_IF(1.f / 3.f, hip_kern::PowCFloatCbrt());
+        CALL_IF(-.5f, hip_kern::PowCFloatRSqrt());
+        CALL_IF(-1.f / 3.f, hip_kern::PowCFloatRCbrt());
+
+        hip_kern::PowCFloat<T> op;
+        op.exp = exp;
+        CALL(op);
+        return;
+#undef CALL_IF
+    }
+
+    int exp = *exp_i;
+    switch (exp) {
+#define CASE(v)                            \
+    case v:                                \
+        CALL(hip_kern::PowCIntSmall<v>()); \
+        return
+        CASE(0);
+        CASE(1);
+        CASE(2);
+        CASE(3);
+        CASE(4);
+        CASE(-1);
+        CASE(-2);
+        CASE(-3);
+        CASE(-4);
+#undef CASE
+    }
+    //! odd exponents must preserve the sign of the input
+    if (exp & 1) {
+        hip_kern::PowCIntOdd<T> op;
+        op.exp = exp;
+        CALL(op);
+    } else {
+        hip_kern::PowCIntEven<T> op;
+        op.exp = exp;
+        CALL(op);
+    }
+#undef CALL
+}
+} // anonymous namespace
+
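+//! host-side entry point: dispatch on the source dtype here and on the
+//! exponent inside dispatch_op(), so each (dtype, pow-op) pair gets its own
+//! elementwise kernel instantiation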
+void powc_kern(const TensorND& dest, const TensorND& src, const float* exp_f,
+               const int* exp_i, hipStream_t stream) {
+    switch (src.layout.dtype.enumv().ev) {
+#define cb(dt)                                                              \
+    case DTypeTrait<dt>::enumv:                                             \
+        return dispatch_op<DTypeTrait<dt>::ctype>(dest, src, exp_f, exp_i,  \
+                                                  stream);
+        MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
+#undef cb
+        default:
+            megdnn_throw("unsupported dtype for PowC");
+    }
+}
+} // namespace rocm
+} // namespace megdnn
+
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/rocm/powc/powc.h.hip b/dnn/src/rocm/powc/powc.h.hip
new file mode 100644
index 00000000..654e1a25
--- /dev/null
+++ b/dnn/src/rocm/powc/powc.h.hip
@@ -0,0 +1,23 @@
+/**
+ * \file src/rocm/powc/powc.h.hip
+ *
+ * This file is part of MegDNN, a deep neural network run-time library
+ * developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ */
+
+#include "hip_header.h"
+#include "megdnn/basic_types.h"
+#include "src/rocm/utils.h.hip"
+
+namespace megdnn {
+namespace rocm {
+
+void powc_kern(const TensorND& dest, const TensorND& src, const float* exp_f,
+               const int* exp_i, hipStream_t stream);
+
+} // namespace rocm
+} // namespace megdnn
+
+// vim: ft=cpp syntax=cpp.doxygen
diff --git a/dnn/src/rocm/reduce/opr_impl.cpp b/dnn/src/rocm/reduce/opr_impl.cpp
new file mode 100644
index 00000000..e241e6c0
--- /dev/null
+++ b/dnn/src/rocm/reduce/opr_impl.cpp
@@ -0,0 +1,186 @@
+/**
+ * \file dnn/src/rocm/reduce/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "hcc_detail/hcc_defs_prologue.h"
+
+#include "src/rocm/reduce/opr_impl.h"
+#include "src/rocm/reduce_helper.h.hip"
+
+#include "src/rocm/handle.h"
+#include "src/rocm/utils.h"
+
+#include "src/common/reduce_helper.h"
+
+namespace {
+
+using namespace megdnn;
+using namespace rocm;
+
+template