diff --git a/dnn/src/aarch64/conv_bias/fp16/algos.cpp b/dnn/src/aarch64/conv_bias/fp16/algos.cpp
index 399211f0..002b1006 100644
--- a/dnn/src/aarch64/conv_bias/fp16/algos.cpp
+++ b/dnn/src/aarch64/conv_bias/fp16/algos.cpp
@@ -12,8 +12,8 @@
 
 #include "src/aarch64/conv_bias/fp16/algos.h"
 #include "src/aarch64/conv_bias/fp16/stride2_kern.h"
-#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
 #include "src/arm_common/conv_bias/postprocess_helper.h"
+#include "src/fallback/conv_bias/direct/multi_thread_common.h"
 
 using namespace megdnn;
 using namespace aarch64;
@@ -43,7 +43,7 @@ size_t ConvBiasImpl::AlgoF16DirectStride2::get_workspace(
         const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_aarch64_conv_bias_stride2_conv2357_fp16, 0, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto wbundle = arm_common::MultithreadDirectConvCommon<
+        auto wbundle = fallback::MultithreadDirectConvCommon<
                 dt_float16, __fp16>::get_bundle_stride(param, large_group);
         return wbundle.total_size_in_bytes();
     }
@@ -83,7 +83,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
         conv = fp16::conv_stride2::do_conv_7x7_stride2;
     }
 
-    WorkspaceBundle bundle = arm_common::MultithreadDirectConvCommon<
+    WorkspaceBundle bundle = fallback::MultithreadDirectConvCommon<
             dt_float16, __fp16>::get_bundle_stride(param, large_group);
     SmallVector<NCBKern> ret_kerns;
 
@@ -98,13 +98,13 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
             size_t OC = fm.ocpg;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
-                arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
                         copy_padding_kern_stride(
                                 bundle, kern_param, ncb_index,
                                 {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
                         do_conv_kern_stride(
                                 bundle, kern_param, ncb_index, conv,
                                 {ncb_index.thread_id, 0, oc});
@@ -116,7 +116,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
                     copy_padding_kern_stride(
                             bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
@@ -125,7 +125,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>::
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
                     do_conv_kern_stride(
                             bundle, kern_param, ncb_index, conv, ncb_index.ndrange_id);
         };
diff --git a/dnn/src/aarch64/conv_bias/fp32/algos.cpp b/dnn/src/aarch64/conv_bias/fp32/algos.cpp
index a5d3358c..ae6bf20a 100644
--- a/dnn/src/aarch64/conv_bias/fp32/algos.cpp
+++ b/dnn/src/aarch64/conv_bias/fp32/algos.cpp
@@ -11,9 +11,9 @@
 
 #include "src/aarch64/conv_bias/fp32/algos.h"
 #include "src/aarch64/conv_bias/fp32/stride2_kern.h"
-#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
 #include "src/arm_common/conv_bias/postprocess_helper.h"
 #include "src/fallback/conv_bias/common.h"
+#include "src/fallback/conv_bias/direct/multi_thread_common.h"
 
 #include "midout.h"
 
@@ -42,8 +42,9 @@ size_t ConvBiasImpl::AlgoF32DirectStride2::get_workspace(
         const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_aarch64_conv_bias_stride2_conv2357_fp32, 0, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto wbundle = arm_common::MultithreadDirectConvCommon<
-                float, float>::get_bundle_stride(param, large_group);
+        auto wbundle =
+                fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+                        param, large_group);
         return wbundle.total_size_in_bytes();
     }
     MIDOUT_END();
@@ -82,7 +83,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
     }
 
     WorkspaceBundle bundle =
-            arm_common::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                     param, large_group);
     SmallVector<NCBKern> ret_kerns;
 
@@ -97,13 +98,13 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
             size_t OC = fm.ocpg;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
-                arm_common::MultithreadDirectConvCommon<float, float>::
+                fallback::MultithreadDirectConvCommon<float, float>::
                         copy_padding_kern_stride(
                                 bundle, kern_param, ncb_index,
                                 {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                arm_common::MultithreadDirectConvCommon<float, float>::
+                fallback::MultithreadDirectConvCommon<float, float>::
                         do_conv_kern_stride(
                                 bundle, kern_param, ncb_index, conv,
                                 {ncb_index.thread_id, 0, oc});
@@ -115,7 +116,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            arm_common::MultithreadDirectConvCommon<float, float>::
+            fallback::MultithreadDirectConvCommon<float, float>::
                     copy_padding_kern_stride(
                             bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
@@ -124,7 +125,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            arm_common::MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
                     bundle, kern_param, ncb_index, conv, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({do_conv, {group, N, OC}});
diff --git a/dnn/src/arm_common/conv_bias/f16/algos.cpp b/dnn/src/arm_common/conv_bias/f16/algos.cpp
index d26912aa..f66c3f2b 100644
--- a/dnn/src/arm_common/conv_bias/f16/algos.cpp
+++ b/dnn/src/arm_common/conv_bias/f16/algos.cpp
@@ -10,7 +10,6 @@
  */
 
 #include "src/arm_common/conv_bias/f16/algos.h"
-#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
 #include "src/arm_common/conv_bias/f16/direct.h"
 #include "src/arm_common/conv_bias/f16/do_conv_stride1.h"
 #include "src/arm_common/conv_bias/f16/strategy.h"
@@ -18,6 +17,7 @@
 #include "src/arm_common/conv_bias/postprocess_helper.h"
 #include "src/common/opr_delegate.h"
 #include "src/fallback/conv_bias/common.h"
+#include "src/fallback/conv_bias/direct/multi_thread_common.h"
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include "midout.h"
 MIDOUT_DECL(megdnn_arm_common_winograd_fp16)
@@ -187,8 +187,9 @@ bool ConvBiasImpl::AlgoF16Direct::usable(
 size_t ConvBiasImpl::AlgoF16Direct::get_workspace(const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp16_kimpl, 0, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto wbundle = MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
-                param, large_group);
+        auto wbundle =
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
+                        param, large_group);
         return wbundle.total_size_in_bytes();
     }
     MIDOUT_END();
@@ -204,7 +205,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
     size_t group = fm.group;
     bool large_group = group >= param.nr_threads;
     WorkspaceBundle bundle =
-            MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle(
                     param, large_group);
     SmallVector<NCBKern> ret_kerns;
     //! When group >= nr_threads, treat it as large_group, each thread process
@@ -220,17 +221,20 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
             bundle.set(kern_param.workspace_ptr);
             if (fm.should_flip) {
                 for (size_t oc = 0; oc < OC; oc++) {
-                    MultithreadDirectConvCommon<dt_float16, __fp16>::weight_flip_kern(
-                            bundle, kern_param, ncb_index,
-                            {ncb_index.thread_id, 0, oc});
+                    fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                            weight_flip_kern(
+                                    bundle, kern_param, ncb_index,
+                                    {ncb_index.thread_id, 0, oc});
                 }
             }
             for (size_t ic = 0; ic < IC; ic++) {
-                MultithreadDirectConvCommon<dt_float16, __fp16>::copy_padding_kern(
-                        bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic});
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                        copy_padding_kern(
+                                bundle, kern_param, ncb_index,
+                                {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
                         bundle, kern_param, ncb_index, fp16::conv_bias::kern_direct_f16,
                         {ncb_index.thread_id, 0, oc});
             }
@@ -242,8 +246,9 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
                                        const NCBKernParam& kern_param,
                                        const NCBKernIndex& ncb_index) mutable {
                 bundle.set(kern_param.workspace_ptr);
-                MultithreadDirectConvCommon<dt_float16, __fp16>::weight_flip_kern(
-                        bundle, kern_param, ncb_index, ncb_index.ndrange_id);
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                        weight_flip_kern(
+                                bundle, kern_param, ncb_index, ncb_index.ndrange_id);
             };
             ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
         }
@@ -251,15 +256,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<dt_float16, __fp16>::copy_padding_kern(
-                    bundle, kern_param, ncb_index, ncb_index.ndrange_id);
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                    copy_padding_kern(
+                            bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle](
                                const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern(
                     bundle, kern_param, ncb_index, fp16::conv_bias::kern_direct_f16,
                     ncb_index.ndrange_id);
         };
@@ -324,9 +330,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride1::get_kimpl
     }
     SWITCH_KERN();
 
-    WorkspaceBundle bundle =
-            MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle_stride(
-                    param, large_group);
+    WorkspaceBundle bundle = fallback::MultithreadDirectConvCommon<
+            dt_float16, __fp16>::get_bundle_stride(param, large_group);
     SmallVector<NCBKern> ret_kerns;
     //! When group >= nr_threads, treat it as large_group, each thread process
     //! one group for better performance
@@ -340,15 +345,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride1::get_kimpl
             size_t OC = fm.ocpg;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
-                MultithreadDirectConvCommon<dt_float16, __fp16>::
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
                         copy_padding_kern_stride(
                                 bundle, kern_param, ncb_index,
                                 {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern_stride(
-                        bundle, kern_param, ncb_index, conv_kern_function,
-                        {ncb_index.thread_id, 0, oc});
+                fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                        do_conv_kern_stride(
+                                bundle, kern_param, ncb_index, conv_kern_function,
+                                {ncb_index.thread_id, 0, oc});
             }
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
@@ -357,17 +363,19 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride1::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<dt_float16, __fp16>::copy_padding_kern_stride(
-                    bundle, kern_param, ncb_index, ncb_index.ndrange_id);
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                    copy_padding_kern_stride(
+                            bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv_kern_function](
                                const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<dt_float16, __fp16>::do_conv_kern_stride(
-                    bundle, kern_param, ncb_index, conv_kern_function,
-                    ncb_index.ndrange_id);
+            fallback::MultithreadDirectConvCommon<dt_float16, __fp16>::
+                    do_conv_kern_stride(
+                            bundle, kern_param, ncb_index, conv_kern_function,
+                            ncb_index.ndrange_id);
         };
         ret_kerns.push_back({do_conv, {group, N, OC}});
     }
@@ -378,9 +386,8 @@ size_t ConvBiasImpl::AlgoF16DirectStride1::get_workspace(
         const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp16_kimpl, 1, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto bundle =
-                MultithreadDirectConvCommon<dt_float16, __fp16>::get_bundle_stride(
-                        param, large_group);
+        auto bundle = fallback::MultithreadDirectConvCommon<
+                dt_float16, __fp16>::get_bundle_stride(param, large_group);
         return bundle.total_size_in_bytes();
     }
     MIDOUT_END();
diff --git a/dnn/src/arm_common/conv_bias/fp32/algos.cpp b/dnn/src/arm_common/conv_bias/fp32/algos.cpp
index ce886905..e65869fa 100644
--- a/dnn/src/arm_common/conv_bias/fp32/algos.cpp
+++ b/dnn/src/arm_common/conv_bias/fp32/algos.cpp
@@ -11,7 +11,6 @@
  */
 
 #include "src/arm_common/conv_bias/fp32/algos.h"
-#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
 #include "src/arm_common/conv_bias/fp32/direct.h"
 #include "src/arm_common/conv_bias/fp32/do_conv_stride1.h"
 #include "src/arm_common/conv_bias/fp32/do_conv_stride2.h"
@@ -20,6 +19,7 @@
 #include "src/arm_common/conv_bias/postprocess_helper.h"
 #include "src/common/opr_delegate.h"
 #include "src/fallback/conv_bias/common.h"
+#include "src/fallback/conv_bias/direct/multi_thread_common.h"
 
 #include "midout.h"
 
@@ -343,7 +343,7 @@ bool ConvBiasImpl::AlgoF32Direct::usable(
 size_t ConvBiasImpl::AlgoF32Direct::get_workspace(const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto wbundle = MultithreadDirectConvCommon<float, float>::get_bundle(
+        auto wbundle = fallback::MultithreadDirectConvCommon<float, float>::get_bundle(
                 param, large_group);
         return wbundle.total_size_in_bytes();
     }
@@ -359,7 +359,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
     size_t group = fm.group;
     bool large_group = group >= param.nr_threads;
     WorkspaceBundle bundle =
-            MultithreadDirectConvCommon<float, float>::get_bundle(param, large_group);
+            fallback::MultithreadDirectConvCommon<float, float>::get_bundle(
+                    param, large_group);
     SmallVector<NCBKern> ret_kerns;
     //! When group >= nr_threads, treat it as large_group, each thread process
     //! one group for better performance
@@ -374,17 +375,18 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
             bundle.set(kern_param.workspace_ptr);
             if (fm.should_flip) {
                 for (size_t oc = 0; oc < OC; oc++) {
-                    MultithreadDirectConvCommon<float, float>::weight_flip_kern(
-                            bundle, kern_param, ncb_index,
-                            {ncb_index.thread_id, 0, oc});
+                    fallback::MultithreadDirectConvCommon<float, float>::
+                            weight_flip_kern(
+                                    bundle, kern_param, ncb_index,
+                                    {ncb_index.thread_id, 0, oc});
                 }
             }
             for (size_t ic = 0; ic < IC; ic++) {
-                MultithreadDirectConvCommon<float, float>::copy_padding_kern(
+                fallback::MultithreadDirectConvCommon<float, float>::copy_padding_kern(
                         bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                MultithreadDirectConvCommon<float, float>::do_conv_kern(
+                fallback::MultithreadDirectConvCommon<float, float>::do_conv_kern(
                         bundle, kern_param, ncb_index, fp32::conv_bias::kern_direct,
                         {ncb_index.thread_id, 0, oc});
             }
@@ -396,7 +398,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
                                        const NCBKernParam& kern_param,
                                        const NCBKernIndex& ncb_index) mutable {
                 bundle.set(kern_param.workspace_ptr);
-                MultithreadDirectConvCommon<float, float>::weight_flip_kern(
+                fallback::MultithreadDirectConvCommon<float, float>::weight_flip_kern(
                         bundle, kern_param, ncb_index, ncb_index.ndrange_id);
             };
             ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
@@ -405,7 +407,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::copy_padding_kern(
+            fallback::MultithreadDirectConvCommon<float, float>::copy_padding_kern(
                     bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
@@ -413,7 +415,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
                                const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::do_conv_kern(
+            fallback::MultithreadDirectConvCommon<float, float>::do_conv_kern(
                     bundle, kern_param, ncb_index, fp32::conv_bias::kern_direct,
                     ncb_index.ndrange_id);
         };
@@ -452,8 +454,9 @@ size_t ConvBiasImpl::AlgoF32DirectStride1::get_workspace(
         const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto bundle = MultithreadDirectConvCommon<float, float>::get_bundle_stride(
-                param, large_group);
+        auto bundle =
+                fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+                        param, large_group);
         return bundle.total_size_in_bytes();
     }
     MIDOUT_END();
@@ -492,7 +495,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::get_kimpl
     SWITCH_KERN_STR1();
 
     WorkspaceBundle bundle =
-            MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                     param, large_group);
     SmallVector<NCBKern> ret_kerns;
     //! When group >= nr_threads, treat it as large_group, each thread process
@@ -507,13 +510,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::get_kimpl
             size_t OC = fm.ocpg;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
-                MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
-                        bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic});
+                fallback::MultithreadDirectConvCommon<float, float>::
+                        copy_padding_kern_stride(
+                                bundle, kern_param, ncb_index,
+                                {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
-                        bundle, kern_param, ncb_index, conv_kern_function,
-                        {ncb_index.thread_id, 0, oc});
+                fallback::MultithreadDirectConvCommon<float, float>::
+                        do_conv_kern_stride(
+                                bundle, kern_param, ncb_index, conv_kern_function,
+                                {ncb_index.thread_id, 0, oc});
             }
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
@@ -522,15 +528,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
-                    bundle, kern_param, ncb_index, ncb_index.ndrange_id);
+            fallback::MultithreadDirectConvCommon<float, float>::
+                    copy_padding_kern_stride(
+                            bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv_kern_function](
                                const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
                     bundle, kern_param, ncb_index, conv_kern_function,
                     ncb_index.ndrange_id);
         };
@@ -570,8 +577,9 @@ size_t ConvBiasImpl::AlgoF32DirectStride2::get_workspace(
         const NCBKernSizeParam& param) const {
     MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 1) {
         bool large_group = param.filter_meta.group >= param.nr_threads;
-        auto bundle = MultithreadDirectConvCommon<float, float>::get_bundle_stride(
-                param, large_group);
+        auto bundle =
+                fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+                        param, large_group);
         return bundle.total_size_in_bytes();
     }
     MIDOUT_END();
@@ -609,7 +617,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
     SWITCH_KERN_STR2();
 
     WorkspaceBundle bundle =
-            MultithreadDirectConvCommon<float, float>::get_bundle_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                     param, large_group);
     SmallVector<NCBKern> ret_kerns;
     //! When group >= nr_threads, treat it as large_group, each thread process
@@ -624,13 +632,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
             size_t OC = fm.ocpg;
             bundle.set(kern_param.workspace_ptr);
             for (size_t ic = 0; ic < IC; ic++) {
-                MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
-                        bundle, kern_param, ncb_index, {ncb_index.thread_id, 0, ic});
+                fallback::MultithreadDirectConvCommon<float, float>::
+                        copy_padding_kern_stride(
+                                bundle, kern_param, ncb_index,
+                                {ncb_index.thread_id, 0, ic});
             }
             for (size_t oc = 0; oc < OC; oc++) {
-                MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
-                        bundle, kern_param, ncb_index, conv_kern_function,
-                        {ncb_index.thread_id, 0, oc});
+                fallback::MultithreadDirectConvCommon<float, float>::
+                        do_conv_kern_stride(
+                                bundle, kern_param, ncb_index, conv_kern_function,
+                                {ncb_index.thread_id, 0, oc});
             }
         };
         ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
@@ -639,15 +650,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
                                   const NCBKernParam& kern_param,
                                   const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
-                    bundle, kern_param, ncb_index, ncb_index.ndrange_id);
+            fallback::MultithreadDirectConvCommon<float, float>::
+                    copy_padding_kern_stride(
+                            bundle, kern_param, ncb_index, ncb_index.ndrange_id);
         };
         ret_kerns.push_back({copy_padding, {group, N, IC}});
         auto do_conv = [bundle, conv_kern_function](
                                const NCBKernParam& kern_param,
                                const NCBKernIndex& ncb_index) mutable {
             bundle.set(kern_param.workspace_ptr);
-            MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
+            fallback::MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
                     bundle, kern_param, ncb_index, conv_kern_function,
                     ncb_index.ndrange_id);
         };
diff --git a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp b/dnn/src/fallback/conv_bias/direct/multi_thread_common.cpp
similarity index 97%
rename from dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp
rename to dnn/src/fallback/conv_bias/direct/multi_thread_common.cpp
index ebf43f6e..fc51510e 100644
--- a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp
+++ b/dnn/src/fallback/conv_bias/direct/multi_thread_common.cpp
@@ -1,5 +1,5 @@
 /**
- * \file dnn/src/arm_common/conv_bias/direct/multi_thread_common.cpp
+ * \file dnn/src/fallback/conv_bias/direct/multi_thread_common.cpp
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
  * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -9,12 +9,14 @@
  * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */
 
-#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
-#include "src/arm_common/conv_bias/postprocess_helper.h"
+#include "multi_thread_common.h"
 #include "src/fallback/matrix_mul/opr_impl.h"
 
 using namespace megdnn;
-using namespace arm_common;
+using namespace fallback;
+#if MEGDNN_X86
+using namespace x86;
+#endif
 
 namespace {
 bool need_dst_copy(const megdnn::fallback::ConvBiasImpl::NCBKernSizeParam& param) {
@@ -354,8 +356,8 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern_stride(
             kern_param.nonlineMode, kern_param.bias_type, kern_param.dst_type, 1_z, 1_z,
             OH, OW);
 };
-template class megdnn::arm_common::MultithreadDirectConvCommon<float, float>;
+template class megdnn::fallback::MultithreadDirectConvCommon<float, float>;
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class megdnn::arm_common::MultithreadDirectConvCommon<dt_float16, __fp16>;
+template class megdnn::fallback::MultithreadDirectConvCommon<dt_float16, __fp16>;
 #endif
 // vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.h b/dnn/src/fallback/conv_bias/direct/multi_thread_common.h
similarity index 84%
rename from dnn/src/arm_common/conv_bias/direct/multi_thread_common.h
rename to dnn/src/fallback/conv_bias/direct/multi_thread_common.h
index 09d47fe5..4303c408 100644
--- a/dnn/src/arm_common/conv_bias/direct/multi_thread_common.h
+++ b/dnn/src/fallback/conv_bias/direct/multi_thread_common.h
@@ -1,5 +1,5 @@
 /**
- * \file dnn/src/arm_common/conv_bias/direct/multi_thread_common.h
+ * \file dnn/src/fallback/conv_bias/direct/multi_thread_common.h
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
  * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -10,11 +10,20 @@
  */
 #pragma once
 
-#include "src/arm_common/conv_bias/opr_impl.h"
+#include "src/fallback/conv_bias/opr_impl.h"
 #include "src/fallback/matrix_mul/opr_impl.h"
 
+#if MEGDNN_X86
+#include "src/x86/conv_bias/postprocess_helper.h"
+#elif (MEGDNN_ARMV7 || MEGDNN_AARCH64)
+#include "src/arm_common/conv_bias/postprocess_helper.h"
+#else
+//! TODO: optimize common postprocess_helper with general intrinsic
+#include "src/common/postprocess_helper.h"
+#endif
+
 namespace megdnn {
-namespace arm_common {
+namespace fallback {
 
 template <typename io_ctype, typename compute_ctype>
 class MultithreadDirectConvCommon {
@@ -53,7 +62,7 @@ public:
             const CpuNDRange& workspace_ids);
 };
 
-} // namespace arm_common
+} // namespace fallback
 } // namespace megdnn
 
 // vim: syntax=cpp.doxygen
diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h
index 3dc3efe5..8af5c231 100644
--- a/dnn/src/fallback/general_intrinsic/gi_float.h
+++ b/dnn/src/fallback/general_intrinsic/gi_float.h
@@ -42,7 +42,7 @@ GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_castsi128_ps(Vector);
 #else
-    return (GI_FLOAT32_t)In;
+    return (GI_FLOAT32_t)Vector;
 #endif
 }
 
@@ -53,7 +53,7 @@ GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_castsi128_ps(Vector);
 #else
-    return (GI_FLOAT32_t)In;
+    return (GI_FLOAT32_t)Vector;
 #endif
 }
diff --git a/dnn/src/fallback/general_intrinsic/gi_int.h b/dnn/src/fallback/general_intrinsic/gi_int.h
index b2d95b56..97181862 100644
--- a/dnn/src/fallback/general_intrinsic/gi_int.h
+++ b/dnn/src/fallback/general_intrinsic/gi_int.h
@@ -1,5 +1,5 @@
 /**
- * \file dnn/src/fallback/general_intrinsic/gi_float.h
+ * \file dnn/src/fallback/general_intrinsic/gi_int.h
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
  * Copyright (c) 2014-2022 Megvii Inc. All rights reserved.