You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

checksum_kernel_union1.mlu 1.8 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. /**
  2. * \file dnn/src/cambricon/checksum/checksum_kernel_union1.mlu
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "checksum.mlu.h"
  12. #include "cnsccl.h"
  13. #include "mlu.h"
  14. #define CLUSTER_DIM 1
  15. #define CORE_DIM 4
  16. #define STRIDE 1024
  17. __mlu_entry__ void checksum_kernel_union1(uint32_t* dst, uint32_t* src,
  18. int nr_elems) {
  19. __nram__ uint32_t sum = 0;
  20. __nram__ uint32_t val[STRIDE];
  21. const uint32_t TASK_DIM = CLUSTER_DIM * CORE_DIM;
  22. __mlu_shared__ uint32_t partial_sum[TASK_DIM];
  23. int task_stride = STRIDE;
  24. int start_offset = taskId * task_stride;
  25. int global_stride = taskDim * task_stride;
  26. for (int task_offset = start_offset; task_offset < nr_elems;
  27. task_offset += global_stride) {
  28. int end_offset = task_offset + task_stride;
  29. end_offset = end_offset > nr_elems ? nr_elems : end_offset;
  30. int copy_elems = end_offset - task_offset;
  31. __memcpy(val, src + task_offset, copy_elems * sizeof(uint32_t),
  32. GDRAM2NRAM);
  33. for (int i = 0; i < copy_elems; i++) {
  34. sum = sum + val[i] * (task_offset + i + 1);
  35. }
  36. }
  37. partial_sum[taskId] = sum;
  38. __sync_cluster();
  39. if (taskId == 0) {
  40. uint32_t res = 0;
  41. for (int i = 0; i < taskDim; i++) {
  42. res += partial_sum[i];
  43. }
  44. dst[0] = res;
  45. }
  46. }
  47. #undef CLUSTER_DIM
  48. #undef CORE_DIM
  49. #undef STRIDE
  50. // vim: ft=cpp syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台