From 17371e79b9e2acfbf09b855b7f835c45dfd7ed8e Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Sat, 4 Sep 2021 13:44:56 +0800
Subject: [PATCH] fix(dnn/reduce): fix incorrect reduce_mean O16xC32 result for
 large tensors

GitOrigin-RevId: ebf03d814a893efca9dd9e09bb58001c22093fd4
---
 dnn/src/common/reduce_helper.h |  4 ++--
 dnn/test/cuda/reduce.cpp       | 10 ++++++++++
 dnn/test/fallback/reduce.cpp   |  9 +++++++++
 dnn/test/rocm/reduce.cpp       | 10 ++++++++++
 4 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/dnn/src/common/reduce_helper.h b/dnn/src/common/reduce_helper.h
index 08d4fc47..45da96e3 100644
--- a/dnn/src/common/reduce_helper.h
+++ b/dnn/src/common/reduce_helper.h
@@ -48,10 +48,10 @@ struct MeanOp {
     src_ctype* src;
     dst_ctype* dst;
     const size_t B;
-
+    
     MEGDNN_HOST MEGDNN_DEVICE wtype read(uint32_t idx) { return src[idx]; }
     MEGDNN_HOST MEGDNN_DEVICE void write(uint32_t idx, wtype val) {
-        dst[idx] = val / static_cast<dst_ctype>(B);
+        dst[idx] = val / static_cast<wtype>(B);
     }
     static MEGDNN_HOST MEGDNN_DEVICE wtype apply(wtype lhs, wtype rhs) {
         return lhs + rhs;
diff --git a/dnn/test/cuda/reduce.cpp b/dnn/test/cuda/reduce.cpp
index 7815225a..0f0c2dc8 100644
--- a/dnn/test/cuda/reduce.cpp
+++ b/dnn/test/cuda/reduce.cpp
@@ -103,6 +103,16 @@ TEST_F(CUDA, REDUCE) {
                 .set_param(param)
                 .execs({{1, 4194304, 1}, {1, 1, 1}});
     }
+
+    {
+        // Regression: MEAN over a large axis (65536) with FLOAT_O16xC32.
+        Reduce::Param param{Mode::MEAN, 1,
+                            Reduce::Param::DataType::FLOAT_O16xC32};
+        checker.set_dtype(0, dtype::Float16())
+                .set_dtype(1, dtype::Float16())
+                .set_param(param)
+                .execs({{1, 65536, 5}, {1, 1, 5}});
+    }
 }
 
 // vim: syntax=cpp.doxygen
diff --git a/dnn/test/fallback/reduce.cpp b/dnn/test/fallback/reduce.cpp
index 303b58de..a0c65962 100644
--- a/dnn/test/fallback/reduce.cpp
+++ b/dnn/test/fallback/reduce.cpp
@@ -74,6 +74,15 @@ TEST_F(FALLBACK, REDUCE) {
                 Config config(param, dtype, shape);
                 configs.push_back(config);
             }
+    
+    {
+        // Regression: MEAN over a large axis (65536) with FLOAT_O16xC32.
+        TensorShape shape{1, 65536, 5};
+        Param param(Mode::MEAN, 1, DataType::FLOAT_O16xC32);
+        Config config(param, dtype::Float16(), shape);
+        configs.push_back(config);
+    }
+    
     for (auto&& config : configs) {
         auto&& dtype = config.dtype;
         auto&& param = config.param;
diff --git a/dnn/test/rocm/reduce.cpp b/dnn/test/rocm/reduce.cpp
index 4893aa15..0d22e106 100644
--- a/dnn/test/rocm/reduce.cpp
+++ b/dnn/test/rocm/reduce.cpp
@@ -103,6 +103,16 @@ TEST_F(ROCM, REDUCE) {
                 .set_param(param)
                 .execs({{1, 4194304, 1}, {1, 1, 1}});
     }
+
+    {
+        // Regression: MEAN over a large axis (65536) with FLOAT_O16xC32.
+        Reduce::Param param{Mode::MEAN, 1,
+                            Reduce::Param::DataType::FLOAT_O16xC32};
+        checker.set_dtype(0, dtype::Float16())
+                .set_dtype(1, dtype::Float16())
+                .set_param(param)
+                .execs({{1, 65536, 5}, {1, 1, 5}});
+    }
 #endif
 }