/**
 * \file dnn/src/cambricon/checksum/checksum_kernel_union1.mlu
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "checksum.mlu.h"
#include "cnsccl.h"
#include "mlu.h"

#define CLUSTER_DIM 1
#define CORE_DIM 4
#define STRIDE 1024

/**
 * \brief Position-weighted 32-bit checksum of a uint32_t buffer.
 *
 * Accumulates src[k] * (k + 1) for every k in [0, nr_elems) using uint32_t
 * (wrapping) arithmetic and stores the total in dst[0].
 *
 * Work split: the input is cut into chunks of STRIDE elements. The task with
 * index taskId handles chunk taskId, then chunk taskId + taskDim, and so on.
 * Each task publishes its running total in a __mlu_shared__ array; after
 * __sync_cluster() task 0 adds the slots together and writes the result.
 *
 * \param dst       output; only dst[0] is written, and only by task 0
 * \param src       input elements, staged chunk by chunk via GDRAM2NRAM copies
 * \param nr_elems  number of elements in src; a value <= 0 yields dst[0] == 0
 *
 * NOTE(review): the shared array has CLUSTER_DIM * CORE_DIM (= 4) slots while
 * the final reduction iterates over taskDim. Presumably the kernel is always
 * launched with taskDim <= 4 -- confirm against the launch site.
 */
__mlu_entry__ void checksum_kernel_union1(uint32_t* dst, uint32_t* src, int nr_elems) {
    __nram__ uint32_t acc = 0;
    __nram__ uint32_t chunk[STRIDE];
    const uint32_t NR_SLOTS = CLUSTER_DIM * CORE_DIM;
    __mlu_shared__ uint32_t per_task[NR_SLOTS];

    const int chunk_len = STRIDE;
    // Distance between two consecutive chunks handled by the same task.
    const int sweep_len = taskDim * chunk_len;

    int chunk_begin = taskId * chunk_len;
    while (chunk_begin < nr_elems) {
        // The last chunk may be shorter than chunk_len.
        const int remaining = nr_elems - chunk_begin;
        const int count = remaining < chunk_len ? remaining : chunk_len;

        __memcpy(chunk, src + chunk_begin, count * sizeof(uint32_t), GDRAM2NRAM);

        // Element at global index g is weighted by g + 1.
        const uint32_t first_weight = (uint32_t)chunk_begin + 1u;
        for (int k = 0; k < count; ++k) {
            acc += chunk[k] * (first_weight + (uint32_t)k);
        }

        chunk_begin += sweep_len;
    }

    // Publish this task's share, then wait until every task has done so.
    per_task[taskId] = acc;
    __sync_cluster();

    // Only task 0 performs the final reduction and the store to dst.
    if (taskId != 0) {
        return;
    }

    uint32_t total = 0;
    for (int t = 0; t < taskDim; ++t) {
        total += per_task[t];
    }
    dst[0] = total;
}

#undef CLUSTER_DIM
#undef CORE_DIM
#undef STRIDE

// vim: ft=cpp syntax=cpp.doxygen