You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

l2_cache_optimize.h 4.1 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef INC_FRAMEWORK_COMMON_L2_CACHE_OPTIMIZE_H_
  17. #define INC_FRAMEWORK_COMMON_L2_CACHE_OPTIMIZE_H_
  18. #include <stdint.h>
  19. #include <algorithm>
  20. #include <functional>
  21. #include <string>
  22. #include <vector>
  23. #include "common/types.h"
  24. #include "common/util.h"
  25. #include "graph/compute_graph.h"
  26. using std::vector;
  27. namespace ge {
// Size of RC memory alignment: 2 MiB (2 * 1024 * 1024 bytes).
// Blocks are aligned to this boundary by L2CacheOptimize::Align below.
constexpr size_t ALIGN_SIZE = 2097152;
// Default RC value used when no specific access count is known
// (also returned by Measure() when either operand is zero).
constexpr uint32_t RC_VALUE_DEFAULT = 1;
// Upper bound for any RC value.
constexpr uint32_t RC_VALUE_MAX = 32;
// RC data type classification: determines how the RC (access-count) value of a
// memory block is obtained, based on the kind of data the block holds.
// NOTE(review): enumerator order/values must not change — they are presumably
// shared with FE-generated data; confirm before reordering.
enum RCType {
  RC_DEFAULT,      // Such as temporary workspace memory of operator, variable (including global and local variable)
  RC_HCOM,         // Output of gradient aggregation, RC value should be set to 0
  RC_L2LOSS,       // Parameter of L2 loss operator, RC value should be set to 0
  RC_INPUTOUTPUT,  // Input and output tensor of operator, RC value is returned by FE calculation
  RC_WEIGHTS,      // The weight, fp16, RC value used by FP/BP operator should be set to 1 or the actual access numbers
  RC_DW,           // The gradient data DW output by BP operator; RC value
                   // should be set to 1 or the actual access numbers
  RC_ARGS          // Args of FlowTable, actual access numbers
};
// Kind of memory slot a block occupies from its owning operator's point of view.
enum MemType { INPUT_TENSOR, OUTPUT_TENSOR, WEIGHT, WORKSPACE };
// Memory usage information <node, type, index>: identifies the tensor slot of
// the graph node that a memory block belongs to.
struct NodeInfo {
  string nodeName;  // Name of the owning graph node.
  MemType memType;  // Slot category on that node (input/output/weight/workspace).
  size_t index;     // Index within the slot category (e.g. which input/output).
};
// Memory block RC value: one contiguous region of device memory together with
// its RC classification and the node slot it belongs to.
struct RCMemoryBlock {
  RCType type;        // RC type (how rcCount was/should be determined)
  size_t blockSize;   // memory block size in bytes
  size_t headOffset;  // Start offset from base address
  size_t tailOffset;  // End offset from base address
  uint32_t rcCount;   // RC value (access count; capped by RC_VALUE_MAX per the constants above — confirm in .cc)
  NodeInfo nodeInfo;  // Input/output index of the node object to which this RC block belongs
};
// L2Cache optimizer: walks a compiled graph, collects per-memory-block RC
// (access-count) information produced by FE, then merges and aligns the blocks
// so L2 cache behavior can be tuned. Method bodies (except Measure) live in
// the corresponding .cc file; signatures here must stay in sync with it.
class L2CacheOptimize {
 public:
  // Holds a reference to the graph; the graph must outlive this object.
  explicit L2CacheOptimize(ge::ComputeGraphPtr &graph);
  ~L2CacheOptimize();

  // Collect the information for L2Cache memory optimization.
  // NOTE(review): "Gath" is presumably short for "Gather" — name kept as-is
  // for API compatibility with existing callers.
  Status Gath();

 private:
  ge::ComputeGraphPtr graph_;  // Graph under analysis.

  // Collected RC block information lists.
  vector<RCMemoryBlock> weightRCs;  // Blocks holding weights.
  vector<RCMemoryBlock> opRCs;      // Blocks holding operator tensors/workspaces.

  // Extract RC information generated by FE from the compiled graph.
  // NOTE(review): "Retirve" is presumably a typo for "Retrieve" — kept as-is
  // to match the out-of-line definition.
  void RetirveRCinfo();
  // For blocks with duplicate (overlapping) addresses, take the greatest
  // common divisor of their RC values (via Measure below).
  void Merge(vector<RCMemoryBlock> &blocks);
  // Align the RC block boundaries to the 2M (ALIGN_SIZE) address boundary.
  void Align(vector<RCMemoryBlock> &blocks);
  // Weights of the l2loss operator and outputs of gradient aggregation get
  // their RC value set to 0 (see RC_L2LOSS / RC_HCOM above).
  void HandleOutputZeroRC(RCType type, ge::NodePtr node, vector<int64_t> &outputList, vector<RCMemoryBlock> &blocks);
  // Process the RC of an operator's input tensors.
  void HandOPInput(ge::NodePtr node, vector<int64_t> &inputList, vector<RCMemoryBlock> &blocks);
  // Process the RC of an operator's output tensors.
  void HandOPoutput(ge::NodePtr node, vector<int64_t> &outputList, vector<RCMemoryBlock> &blocks);

  // Greatest common divisor of x and y (Euclid's algorithm), e.g. (4,6) -> 2.
  // Returns RC_VALUE_DEFAULT (1) when either operand is 0, so callers never
  // get a zero RC value back from this helper.
  uint32_t Measure(uint32_t x, uint32_t y) {
    if (x == 0 || y == 0) return RC_VALUE_DEFAULT;
    uint32_t z = y;  // holds the last non-zero remainder (or y if y | x)
    while (x % y != 0) {
      z = x % y;
      x = y;
      y = z;
    }
    return z;
  }

  // Spatial relations between two blocks (defined in the .cc file).
  // NOTE(review): semantics inferred from names — presumably containment,
  // partial overlap, and adjacency of [headOffset, tailOffset) ranges; confirm
  // against the definitions before relying on these descriptions.
  bool Contain(const RCMemoryBlock &l_block, const RCMemoryBlock &r_block);
  bool Cross(const RCMemoryBlock &l_block, const RCMemoryBlock &r_block);
  bool Connect(const RCMemoryBlock &l_block, const RCMemoryBlock &r_block);
};
  98. } // namespace ge
  99. #endif // INC_FRAMEWORK_COMMON_L2_CACHE_OPTIMIZE_H_

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示