diff --git a/dnn/src/cuda/conv_bias/cudnn_conv.cpp b/dnn/src/cuda/conv_bias/cudnn_conv.cpp
index 1be71d81..51d86929 100644
--- a/dnn/src/cuda/conv_bias/cudnn_conv.cpp
+++ b/dnn/src/cuda/conv_bias/cudnn_conv.cpp
@@ -31,6 +31,15 @@ bool ConvBiasForwardImpl::AlgoCUDNNConv::is_available(
         }
     }
 
+    // FIXME: cudnn cannot handle the case when the initial value of dst tensor
+    // contains nan and beta is zero, because the result of 0.f * nan is still
+    // nan
+    if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8 &&
+        args.dst_layout->dtype.enumv() == DTypeEnum::Float32 &&
+        args.opr->param().format == param::ConvBias::Format::NCHW) {
+        return false;
+    }
+
     auto dst_layout = *args.dst_layout;
     if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
         dst_layout.dtype = DType();
diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp
index fce69d5a..674dd629 100644
--- a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp
+++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp
@@ -57,6 +57,15 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available(
     }
 #endif
 
+    // FIXME: cudnn cannot handle the case when the initial value of dst tensor
+    // contains nan and beta is zero, because the result of 0.f * nan is still
+    // nan
+    if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8 &&
+        args.dst_layout->dtype.enumv() == DTypeEnum::Float32 &&
+        param.format == param::ConvBias::Format::NCHW) {
+        return false;
+    }
+
     //! FIXME: conv kernel of cudnn for NCHW4_NCHW tensor format causes illegal
     //! memory access errors, so we have to disable this kernel here.
     if (param.format == param::ConvBias::Format::NCHW4_NCHW ||
diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp
index 4cb259d1..7d4241f3 100644
--- a/src/gopt/impl/tensor_reformat.cpp
+++ b/src/gopt/impl/tensor_reformat.cpp
@@ -1619,6 +1619,8 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter() {
             megdnn::param::Convolution::Format::NCHW4;
     megdnn::param::ConvBias::Format conv_bias_format =
             megdnn::param::ConvBias::Format::NCHW4;
+    megdnn::param::ConvBias::Format conv_bias_format_nchw4_nchw =
+            megdnn::param::ConvBias::Format::NCHW4_NCHW;
     megdnn::param::BatchConvBias::Format batch_conv_bias_format =
             megdnn::param::BatchConvBias::Format::NCHW4;
     RelayoutMode src_to_nchw4_mode = RelayoutMode::NCHW_TO_NCHW4;
@@ -1821,6 +1823,7 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter() {
         return new_opr;
     };
     auto replace_conv_bias_opr = [trans_nchw4, conv_bias_format,
+                                  conv_bias_format_nchw4_nchw,
                                   src_to_nchw4_mode](
                                          OperatorNodeBase* opr,
                                          const VarNodeArray& new_inp) {
@@ -1851,19 +1854,27 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter() {
         conv_bias_filter = new_filter.node();
         // format: NCHW --> NCHW4
         auto new_param = conv_bias_opr.param();
-        new_param.format = conv_bias_format;
+        if (conv_bias_opr.output().size() > 0 &&
+            conv_bias_opr.output(0)->dtype().enumv() == DTypeEnum::Float32) {
+            new_param.format = conv_bias_format_nchw4_nchw;
+        } else {
+            new_param.format = conv_bias_format;
+        }
         if (new_inp.size() == 2) {
             auto new_conv_bias_opr = opr::ConvBias::make(
                     conv_bias_src, conv_bias_filter, new_param,
                     conv_bias_opr.execution_policy(), conv_bias_opr.config());
             OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
-            mgb_assert(new_conv_bias_opr.shape().ndim == 5,
-                       "The conv_bias dst dim is not trans to nchw4");
+            mgb_assert(
+                    new_conv_bias_opr.node()->dtype().enumv() == DTypeEnum::Float32 ||
+                            new_conv_bias_opr.shape().ndim == 5,
+                    "The conv_bias dst dim is not trans to nchw4");
             return new_opr;
         }
-        // bias: NCHW --> NCHW4
+        // bias: NCHW --> NCHW4 when bias_dtype is not Float32
         VarNode* conv_bias_bias = new_inp[2];
-        if (new_inp[2]->shape().ndim == 4) {
+        if (new_inp[2]->dtype().enumv() != DTypeEnum::Float32 &&
+            new_inp[2]->shape().ndim == 4) {
             auto new_bias =
                     RelayoutPlaceholder::make(new_inp[2], src_to_nchw4_mode);
             conv_bias_bias = new_bias.node();
@@ -1873,13 +1884,16 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter() {
                     conv_bias_src, conv_bias_filter, conv_bias_bias, new_param,
                     conv_bias_opr.execution_policy(), conv_bias_opr.config());
             OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
-            mgb_assert(new_conv_bias_opr.shape().ndim == 5,
-                       "The conv_bias dst dim is not trans to nchw4");
+            mgb_assert(
+                    new_conv_bias_opr.node()->dtype().enumv() == DTypeEnum::Float32 ||
+                            new_conv_bias_opr.shape().ndim == 5,
+                    "The conv_bias dst dim is not trans to nchw4");
             return new_opr;
         }
-        // z_inp: NCHW --> NCHW4
+        // z_inp: NCHW --> NCHW4 when bias_dtype is not Float32
         VarNode* z_inp = new_inp[3];
-        if (new_inp[3]->shape().ndim == 4) {
+        if (new_inp[3]->dtype().enumv() != DTypeEnum::Float32 &&
+            new_inp[3]->shape().ndim == 4) {
             auto new_z =
                     RelayoutPlaceholder::make(new_inp[3], src_to_nchw4_mode);
             z_inp = new_z.node();
@@ -1889,8 +1903,10 @@ std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter() {
                     new_param, conv_bias_opr.execution_policy(),
                     conv_bias_opr.config());
             OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
-            mgb_assert(new_conv_bias_opr.shape().ndim == 5,
-                       "The conv_bias dst dim is not trans to nchw4");
+            mgb_assert(
+                    new_conv_bias_opr.node()->dtype().enumv() == DTypeEnum::Float32 ||
+                            new_conv_bias_opr.shape().ndim == 5,
+                    "The conv_bias dst dim is not trans to nchw4");
             return new_opr;
         };
     auto replace_elemwise_opr = [=](OperatorNodeBase* opr,
diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp
index 9be80442..af6ef436 100644
--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -3088,6 +3088,88 @@ TEST(TestGoptInference, ConvertFormatNCHW4GPU) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }
 
+TEST(TestGoptInference, ConvertFormatNCHW4FloatGPU) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    cn.activate();
+    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(6, 1);
+
+    HostTensorGenerator<> gen;
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+
+    auto mkvar = [&](const char* name, const TensorShape& shp,
+                     const DType& dtype) {
+        return opr::TypeCvt::make(
+                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
+                dtype);
+    };
+
+    auto mkcvar = [&](const char* name, const TensorShape& shp,
+                      const DType& dtype) {
+        return opr::TypeCvt::make(
+                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
+                        .rename(name),
+                dtype);
+    };
+
+    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(1.2f));
+    opr::ConvBias::Param param_conv_bias;
+    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
+    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
+
+    // conv1, with bias
+    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(1.3f)),
+         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::Float32());
+    auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias, {},
+                                     OperatorNodeConfig{dtype::Float32()});
+
+    // conv2, with bias and z
+    auto w2 = mkcvar("w2", {8, 4, 3, 3}, dtype::QuantizedS8(1.3f)),
+         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::Float32()),
+         z2 = mkcvar("z2", {2, 8, 16, 16}, dtype::Float32());
+    auto conv2 = opr::ConvBias::make(x, w2, b2, z2, param_conv_bias, {},
+                                     OperatorNodeConfig{dtype::Float32()});
+
+    // conv3, relu
+    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
+    auto w3 = mkcvar("w3", {8, 4, 3, 3}, dtype::QuantizedS8(1.3f)),
+         b3 = mkcvar("b3", {1, 8, 1, 1}, dtype::Float32()),
+         z3 = mkcvar("z3", {2, 8, 16, 16}, dtype::Float32());
+    auto conv3 = opr::ConvBias::make(x, w3, b3, z3, param_conv_bias, {},
+                                     OperatorNodeConfig{dtype::Float32()});
+
+    auto y = conv1 + conv2 + conv3;
+
+    SymbolVar y_opt;
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_nchw4();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
+    }
+
+    bool succ = true;
+    auto cb = [&succ](cg::OperatorNodeBase* opr) {
+        if (opr->same_type<opr::ConvBias>()) {
+            auto& conv_bias = opr->cast_final_safe<opr::ConvBias>();
+            if (conv_bias.param().format !=
+                opr::ConvBias::Param::Format::NCHW4_NCHW) {
+                succ = false;
+            }
+        }
+    };
+
+    cg::DepOprIter{cb}.add(y_opt);
+    ASSERT_TRUE(succ);
+
+    HostTensorND host_y, host_y_opt;
+    auto func = graph->compile({make_callback_copy(y, host_y),
+                                make_callback_copy(y_opt, host_y_opt)});
+    func->execute();
+
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
+}
+
 #endif
 
 TEST(TestGoptInference, ConvertFormatNCHW4NonConvOpr) {
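
[Note appended for review context; not part of the patch.] The FIXME added to both cudnn algos refers to the formula the cudnnConvolutionForward-style APIs compute, dst = alpha * conv(src, filter) + beta * dst, combined with the IEEE-754 rule that 0.f * NaN is NaN: a beta of zero does not reliably erase stale NaNs already sitting in the dst buffer if that term is evaluated as a real multiplication. A minimal host-side sketch of the arithmetic the comment describes (plain C++, no cudnn; all names here are illustrative only):

    #include <cmath>
    #include <cstdio>

    int main() {
        float alpha = 1.f, beta = 0.f;
        float conv_result = 42.f;         // stands in for conv(src, filter)
        float stale_dst = std::nanf("");  // stands in for uninitialized dst memory

        // If beta * dst is actually multiplied through, the NaN survives
        // even though beta == 0, because IEEE-754 defines 0.f * NaN == NaN.
        float dst = alpha * conv_result + beta * stale_dst;
        std::printf("%f\n", dst);         // prints "nan", not "42.000000"
        return 0;
    }

The gopt side of the patch sidesteps the same case: when a QuantizedS8 conv_bias produces a Float32 output, replace_conv_bias_opr now selects Format::NCHW4_NCHW (NCHW4 input, NCHW output) instead of plain NCHW4, and accordingly skips the NCHW_TO_NCHW4 relayout for the Float32 bias and z inputs, which stay in NCHW. For reference, NCHW4 as used here packs channels in blocks of four, i.e. a [N, C/4, H, W, 4] layout; a sketch of the offset arithmetic under that assumption (C divisible by 4):

    #include <cstddef>

    // Offset of element (n, c, h, w) in a plain NCHW tensor.
    std::size_t nchw_offset(std::size_t n, std::size_t c, std::size_t h,
                            std::size_t w, std::size_t C, std::size_t H,
                            std::size_t W) {
        return ((n * C + c) * H + h) * W + w;
    }

    // Offset of the same element in NCHW4, laid out as [N, C/4, H, W, 4]:
    // the channel index splits into a block index c / 4 and an inner index
    // c % 4, with the inner index as the fastest-varying dimension.
    std::size_t nchw4_offset(std::size_t n, std::size_t c, std::size_t h,
                             std::size_t w, std::size_t C, std::size_t H,
                             std::size_t W) {
        return (((n * (C / 4) + c / 4) * H + h) * W + w) * 4 + c % 4;
    }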