diff --git a/dnn/src/cuda/utils.cuh b/dnn/src/cuda/utils.cuh
index 3c6caa06..c59a8f4a 100644
--- a/dnn/src/cuda/utils.cuh
+++ b/dnn/src/cuda/utils.cuh
@@ -83,6 +83,12 @@
         cuda_check(cudaGetLastError()); \
     } while (0)
 
+#if MEGDNN_TEGRA_X2
+//! TX2 only has 256 CUDA cores
+#define NR_THREADS   256
+#define NR_THREADS_X 32
+#define NR_THREADS_Y 8
+#else
 #if MEGDNN_THREADS_512
 #define NR_THREADS   512
 #define NR_THREADS_X 32
@@ -92,6 +98,7 @@
 #define NR_THREADS_X 32
 #define NR_THREADS_Y 32
 #endif
+#endif
 
 #define DIVUP(x, y)   (((x) + (y)-1) / (y))
 #define ROUNDUP(x, y) (DIVUP(x, y) * (y))
diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp
index 4429e4b0..48a40986 100644
--- a/dnn/test/cuda/convolution.cpp
+++ b/dnn/test/cuda/convolution.cpp
@@ -22,6 +22,8 @@
 #include "test/cuda/fixture.h"
 #include "test/cuda/utils.h"
 
+#include <cudnn.h>
+
 #define V1(x) #x
 #define V(x)  V1(x)
 #define CUDNN_VERSION_STRING \
@@ -161,23 +163,6 @@ TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
     }
 }
 
-TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
-    using namespace convolution;
-    std::vector<TestArg> args = get_1x1_args();
-    Benchmarker<ConvolutionForward> marker(handle_cuda());
-    NormalRNG default_rng;
-    for (auto&& arg : args) {
-        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
-        UniformFloatRNG rng(scale, 2 * scale);
-        marker.set_dtype(0, dtype::Float32())
-                .set_dtype(1, dtype::Float32())
-                .set_rng(0, &default_rng)
-                .set_rng(1, &default_rng)
-                .set_param(arg.param)
-                .execs({arg.src, arg.filter, {}});
-    }
-}
-
 TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
     using namespace convolution;
     std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
@@ -767,6 +752,23 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
 }
 
 #if MEGDNN_WITH_BENCHMARK
+// Runs the float32 ConvolutionForward benchmarker on every shape returned by
+// get_1x1_args(); inputs 0 and 1 are both filled from the same NormalRNG.
+TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
+    using namespace convolution;
+    std::vector<TestArg> args = get_1x1_args();
+    Benchmarker<ConvolutionForward> marker(handle_cuda());
+    NormalRNG default_rng;
+    for (auto&& arg : args) {
+        marker.set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_rng(0, &default_rng)
+                .set_rng(1, &default_rng)
+                .set_param(arg.param)
+                .execs({arg.src, arg.filter, {}});
+    }
+}
+
 TEST_F(CUDA, CONV_FWD_BENCHMARK) {
     auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
                    size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0,
diff --git a/dnn/test/cuda/flip.cpp b/dnn/test/cuda/flip.cpp
index 118cd274..fffb947e 100644
--- a/dnn/test/cuda/flip.cpp
+++ b/dnn/test/cuda/flip.cpp
@@ -44,6 +44,7 @@ TEST_F(CUDA, FLIP) {
     }
 }
 
+#if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, FLIP_BENCHMARK) {
     auto run = [&](const TensorShapeArray& shapes) {
         Benchmarker<Flip> benchmarker(handle_cuda());
@@ -75,6 +76,7 @@ TEST_F(CUDA, FLIP_BENCHMARK) {
 
     run(shapes);
 }
+#endif
 
 }  // namespace test
 }  // namespace megdnn
diff --git a/dnn/test/cuda/images2neibs.cpp b/dnn/test/cuda/images2neibs.cpp
index 2830f9c2..b67b0f8a 100644
--- a/dnn/test/cuda/images2neibs.cpp
+++ b/dnn/test/cuda/images2neibs.cpp
@@ -14,6 +14,7 @@
 #include "test/common/images2neibs.h"
 #include "test/common/rng.h"
 #include "test/cuda/benchmark.h"
+#include "test/cuda/utils.h"
 
 namespace megdnn {
 namespace test {
@@ -44,6 +45,7 @@ TEST_F(CUDA, BENCHMARK_IMAGES2NEIBS_FORWARD) {
 #endif
 
 TEST_F(CUDA, IMAGES2NEIBS_BACKWARD) {
+    require_compute_capability(6, 1);
     UniformFloatRNG rng(0, 1);
     auto args = images2neibs::get_args();
     for (auto&& arg : args) {
diff --git a/dnn/test/cuda/indexing_one_hot.cpp b/dnn/test/cuda/indexing_one_hot.cpp
index aa888dc4..779819b4 100644
--- a/dnn/test/cuda/indexing_one_hot.cpp
+++ b/dnn/test/cuda/indexing_one_hot.cpp
@@ -39,6 +39,11 @@ TEST_F(CUDA_ERROR_INFO, INDEXING_ONE_HOT) {
     ASSERT_TRUE(failed);
 }
 
+TEST_F(CUDA, INDEXING_SET_ONE_HOT) {
+    run_indexing_set_one_hot_test(handle_cuda());  // helper defined outside this diff
+}
+
+#if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) {
     Benchmarker<IndexingOneHot> bench{handle_cuda()};
     bench.set_times(1);
@@ -53,9 +58,6 @@ TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) {
     printf("bandwidth: %.2fGiB/s\n",
            A * B * D * sizeof(float) / 1024.0 / 1024 / 1024 / time);
 }
-
-TEST_F(CUDA, INDEXING_SET_ONE_HOT) {
-    run_indexing_set_one_hot_test(handle_cuda());
-}
+#endif
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/dnn/test/cuda/matrix_mul.cpp b/dnn/test/cuda/matrix_mul.cpp
index 1d48a35a..0a1c44c7 100644
--- a/dnn/test/cuda/matrix_mul.cpp
+++ b/dnn/test/cuda/matrix_mul.cpp
@@ -14,13 +14,12 @@
 #include "test/common/benchmarker.h"
 #include "test/common/checker.h"
 #include "test/common/matrix_mul.h"
+#include "test/cuda/utils.h"
 
 #if defined(cuda_check)
 #undef cuda_check
 #endif
-#include "test/cuda/utils.h"
-
-#include <cuda.h>
+#include "src/cuda/utils.h"
 
 namespace megdnn {
 namespace test {
@@ -47,13 +46,7 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
 }
 
 TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
-    if (cuda::current_device_prop().major < 7 ||
-        (cuda::current_device_prop().major == 7 &&
-         cuda::current_device_prop().minor < 5)) {
-        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
-               "doesn't support\n");
-        return;
-    }
+    require_compute_capability(7, 5);
     Checker<MatrixMul> checker(handle_cuda(), false);
     using Param = MatrixMul::Param;
     Param param;
@@ -65,21 +58,15 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
     checker.exec({{256, 256}, {256, 256}, {256, 256}});
     auto args = matrix_mul::get_matmul_args();
     for (auto arg : args) {
-        size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8,
-               k = DIVUP(arg.k, 32) * 32;
+        size_t m = (arg.m + 7) / 8 * 8, n = (arg.n + 7) / 8 * 8,
+               k = (arg.k + 31) / 32 * 32;
         checker.exec({{m, k}, {n, k}, {m, n}});
     }
 }
 
 #if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
-    if (cuda::current_device_prop().major < 7 ||
-        (cuda::current_device_prop().major == 7 &&
-         cuda::current_device_prop().minor < 5)) {
-        printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
-               "device doesn't support\n");
-        return;
-    }
+    require_compute_capability(7, 5);
     Benchmarker<MatrixMul> bencher(handle_cuda());
     using Param = MatrixMul::Param;
     Param param;
@@ -102,14 +89,7 @@ TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
 }
 
 TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
-    if (cuda::current_device_prop().major < 7 ||
-        (cuda::current_device_prop().major == 7 &&
-         cuda::current_device_prop().minor < 5)) {
-        printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
-               "current "
-               "device doesn't support\n");
-        return;
-    }
+    require_compute_capability(7, 5);
     Benchmarker<MatrixMul> bencher(handle_cuda());
     using Param = MatrixMul::Param;
     Param param;
diff --git a/dnn/test/cuda/padding.cpp b/dnn/test/cuda/padding.cpp
index 1ae5ea22..7969706d 100644
--- a/dnn/test/cuda/padding.cpp
+++ b/dnn/test/cuda/padding.cpp
@@ -188,8 +188,7 @@ TEST_F(CUDA, PADDING_REPLICATE2) {
                                                              6, 7, 7, 8, 9, 9, 9, 9})});
 }
 
-// #if MEGDNN_WITH_BENCHMARK
-
+#if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) {
     using Param = Padding::Param;
 
@@ -240,5 +239,4 @@ TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) {
         run(shapes, param);
     }
 }
-
-// #endif
\ No newline at end of file
+#endif
diff --git a/dnn/test/cuda/rotate.cpp b/dnn/test/cuda/rotate.cpp
index def9d424..142caa15 100644
--- a/dnn/test/cuda/rotate.cpp
+++ b/dnn/test/cuda/rotate.cpp
@@ -40,6 +40,7 @@ TEST_F(CUDA, ROTATE) {
     }
 }
 
+#if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_ROTATE) {
     auto run = [&](const TensorShapeArray& shapes) {
         Benchmarker<Rotate> benchmarker(handle_cuda());
@@ -74,6 +75,7 @@ TEST_F(CUDA, BENCHMARK_ROTATE) {
 
     run(shapes);
 }
+#endif
 
 }  // namespace rotate
 }  // namespace test
diff --git a/dnn/test/cuda/sliding_window_transpose.cpp b/dnn/test/cuda/sliding_window_transpose.cpp
index 9621dd14..74f78bd2 100644
--- a/dnn/test/cuda/sliding_window_transpose.cpp
+++ b/dnn/test/cuda/sliding_window_transpose.cpp
@@ -42,18 +42,6 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_FORWARD) {
     }
 }
 
-#if MEGDNN_WITH_BENCHMARK
-TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) {
-    auto args = sliding_window_transpose::get_benchmark_args();
-    for (auto&& arg : args) {
-        CUBenchmarker<SlidingWindowTransposeForward> bencher(handle_cuda());
-        bencher.set_param(arg.param)
-                .set_dtype(0, dtype::Float32())
-                .exec(TensorShapeArray{arg.ishape, {}});
-    }
-}
-#endif
-
 TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) {
     UniformFloatRNG rng(0, 1);
     auto args = sliding_window_transpose::get_args();
@@ -78,6 +66,18 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) {
     }
 }
 
+#if MEGDNN_WITH_BENCHMARK
+TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) {
+    // Each benchmark case gets its own CUBenchmarker on the CUDA handle.
+    for (auto&& bench_arg : sliding_window_transpose::get_benchmark_args()) {
+        CUBenchmarker<SlidingWindowTransposeForward> cu_bench(handle_cuda());
+        cu_bench.set_param(bench_arg.param)
+                .set_dtype(0, dtype::Float32())
+                .exec(TensorShapeArray{bench_arg.ishape, {}});
+    }
+}
+#endif
+
 }  // namespace test
 }  // namespace megdnn
 
diff --git a/dnn/test/cuda/type_cvt.cpp b/dnn/test/cuda/type_cvt.cpp
index 6cd7e2a9..14feae90 100644
--- a/dnn/test/cuda/type_cvt.cpp
+++ b/dnn/test/cuda/type_cvt.cpp
@@ -33,25 +33,6 @@ TEST_F(CUDA, TYPE_CVT) {
         }
 }
 
-TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) {
-    const size_t RUNS = 3;
-
-    auto run = [&](TensorLayout src, TensorLayout dst) {
-        Benchmarker<TypeCvt> benchmarker(handle_cuda());
-        auto&& layout = src;
-        benchmarker.set_times(RUNS);
-        dst.init_contiguous_stride();
-        auto used = benchmarker.execl({src, dst});
-        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
-               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
-                       (1024 * 1024 * 1024));
-    };
-
-    TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32()),
-            dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32());
-    run(src, dst);
-}
-
 TEST_F(CUDA, QUANTIZED_TYPECVT) {
     UniformIntRNG int_rng{-66, 66};
     Checker<TypeCvt> checker(handle_cuda());
@@ -162,6 +143,25 @@ TEST_F(CUDA, TYPE_CVT_BFLOAT16) {
 }
 
 #if MEGDNN_WITH_BENCHMARK
+TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) {
+    const size_t RUNS = 3;
+
+    // Time RUNS conversions from a strided source into a contiguous destination.
+    auto run = [&](TensorLayout src, TensorLayout dst) {
+        dst.init_contiguous_stride();
+        Benchmarker<TypeCvt> benchmarker(handle_cuda());
+        benchmarker.set_times(RUNS);
+        auto used = benchmarker.execl({src, dst});
+        auto nr_bytes = 2 * src.total_nr_elems() * src.dtype.size() * RUNS;
+        printf("layout: %s bandwith: %f gbps/s\n", src.to_string().c_str(),
+               nr_bytes / used * 1000 / (1024 * 1024 * 1024));
+    };
+
+    TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32()),
+            dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32());
+    run(src, dst);
+}
+
 TEST_F(CUDA, BENCHMARK_TYPE_CVT) {
     UniformIntRNG rng{-128, 127};
     auto run = [&](TensorLayout src, TensorLayout dst) {