You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

grampus.go 6.4 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. package grampus
  2. import (
  3. "encoding/json"
  4. "strings"
  5. "code.gitea.io/gitea/modules/setting"
  6. "code.gitea.io/gitea/models"
  7. "code.gitea.io/gitea/modules/context"
  8. "code.gitea.io/gitea/modules/log"
  9. "code.gitea.io/gitea/modules/notification"
  10. "code.gitea.io/gitea/modules/timeutil"
  11. )
  12. const (
  13. JobPath = "job/"
  14. ProcessorTypeNPU = "npu.huawei.com/NPU"
  15. ProcessorTypeGPU = "nvidia.com/gpu"
  16. GpuWorkDir = "/tmp/"
  17. NpuWorkDir = "/cache/"
  18. CommandPrepareScript = ";mkdir -p output;mkdir -p code;mkdir -p dataset;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/script_for_grampus/archive/master.zip;" +
  19. "echo \"finish loading script\";unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_obs downloader_for_minio uploader_for_minio;"
  20. //CommandPrepareScript = "pwd;cd /cache;mkdir -p output;mkdir -p code;mkdir -p dataset;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/script_for_grampus/archive/master.zip;" +
  21. // "echo \"finish loading script\";unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_obs downloader_for_minio uploader_for_minio;"
  22. CodeArchiveName = "master.zip"
  23. )
  24. var (
  25. poolInfos *models.PoolInfos
  26. FlavorInfos *models.FlavorInfos
  27. ImageInfos *models.ImageInfosModelArts
  28. SpecialPools *models.SpecialPools
  29. )
  30. type GenerateTrainJobReq struct {
  31. JobName string
  32. Command string
  33. ResourceSpecId string
  34. ImageUrl string //与image_id二选一,都有的情况下优先image_url
  35. ImageId string
  36. DisplayJobName string
  37. Uuid string
  38. Description string
  39. CodeObsPath string
  40. BootFile string
  41. BootFileUrl string
  42. DataUrl string
  43. TrainUrl string
  44. WorkServerNumber int
  45. EngineID int64
  46. CommitID string
  47. IsLatestVersion string
  48. BranchName string
  49. PreVersionId int64
  50. PreVersionName string
  51. FlavorName string
  52. VersionCount int
  53. EngineName string
  54. TotalVersionCount int
  55. ComputeResource string
  56. ProcessType string
  57. DatasetName string
  58. Params string
  59. }
  60. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  61. createTime := timeutil.TimeStampNow()
  62. centerID, centerName := getCentersParamter(ctx, req)
  63. jobResult, err := createJob(models.CreateGrampusJobRequest{
  64. Name: req.JobName,
  65. Tasks: []models.GrampusTasks{
  66. {
  67. Name: req.JobName,
  68. Command: req.Command,
  69. ResourceSpecId: req.ResourceSpecId,
  70. ImageId: req.ImageId,
  71. ImageUrl: req.ImageUrl,
  72. CenterID: centerID,
  73. CenterName: centerName,
  74. ReplicaNum: 1,
  75. },
  76. },
  77. })
  78. if err != nil {
  79. log.Error("createJob failed: %v", err.Error())
  80. return err
  81. }
  82. jobID := jobResult.JobInfo.JobID
  83. err = models.CreateCloudbrain(&models.Cloudbrain{
  84. Status: TransTrainJobStatus(jobResult.JobInfo.Status),
  85. UserID: ctx.User.ID,
  86. RepoID: ctx.Repo.Repository.ID,
  87. JobID: jobID,
  88. JobName: req.JobName,
  89. DisplayJobName: req.DisplayJobName,
  90. JobType: string(models.JobTypeTrain),
  91. Type: models.TypeC2Net,
  92. Uuid: req.Uuid,
  93. DatasetName: req.DatasetName,
  94. CommitID: req.CommitID,
  95. IsLatestVersion: req.IsLatestVersion,
  96. ComputeResource: req.ComputeResource,
  97. ImageID: req.ImageId,
  98. TrainUrl: req.TrainUrl,
  99. BranchName: req.BranchName,
  100. Parameters: req.Params,
  101. BootFile: req.BootFile,
  102. DataUrl: req.DataUrl,
  103. FlavorCode: req.ResourceSpecId,
  104. Description: req.Description,
  105. WorkServerNumber: req.WorkServerNumber,
  106. FlavorName: req.FlavorName,
  107. EngineName: req.EngineName,
  108. VersionCount: req.VersionCount,
  109. TotalVersionCount: req.TotalVersionCount,
  110. CreatedUnix: createTime,
  111. UpdatedUnix: createTime,
  112. })
  113. if err != nil {
  114. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
  115. return err
  116. }
  117. var actionType models.ActionType
  118. if req.ComputeResource == models.NPUResource {
  119. actionType = models.ActionCreateGrampusNPUTrainTask
  120. } else if req.ComputeResource == models.GPUResource {
  121. actionType = models.ActionCreateGrampusGPUTrainTask
  122. }
  123. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType)
  124. return nil
  125. }
  126. func getCentersParamter(ctx *context.Context, req *GenerateTrainJobReq) ([]string, []string) {
  127. var centerID []string
  128. var centerName []string
  129. includeCenters := make(map[string]string)
  130. excludeCenters := make(map[string]string)
  131. if SpecialPools != nil {
  132. for _, pool := range SpecialPools.Pools {
  133. if !pool.IsExclusive && strings.Contains(req.ComputeResource, pool.Type) {
  134. org, _ := models.GetOrgByName(pool.Org)
  135. if org != nil {
  136. isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID)
  137. if isOrgMember {
  138. for _, info := range pool.Pool {
  139. includeCenters[info.Queue] = info.Value
  140. }
  141. } else {
  142. for _, info := range pool.Pool {
  143. excludeCenters[info.Queue] = info.Value
  144. }
  145. }
  146. }
  147. }
  148. }
  149. }
  150. if len(includeCenters) > 0 {
  151. //如果有专属资源池,根据专属资源池指定智算中心
  152. for k, v := range includeCenters {
  153. centerID = append(centerID, k)
  154. centerName = append(centerName, v)
  155. }
  156. } else if len(excludeCenters) > 0 {
  157. //否则,有要排除的中心,先获取所有中心,删除其中的排除中心,得到指定的智算中心
  158. allCenters := make(map[string]string)
  159. specs, err := GetResourceSpecs(req.ProcessType)
  160. if err == nil {
  161. for _, info := range specs.Infos {
  162. for _, center := range info.Centers {
  163. allCenters[center.ID] = center.Name
  164. }
  165. }
  166. }
  167. for k, _ := range excludeCenters {
  168. delete(allCenters, k)
  169. }
  170. for k, v := range allCenters {
  171. centerID = append(centerID, k)
  172. centerName = append(centerName, v)
  173. }
  174. }
  175. return centerID, centerName
  176. }
  177. func TransTrainJobStatus(status string) string {
  178. if status == models.GrampusStatusPending {
  179. status = models.GrampusStatusWaiting
  180. }
  181. return strings.ToUpper(status)
  182. }
  183. func InitSpecialPool() {
  184. if SpecialPools == nil && setting.Grampus.SpecialPools != "" {
  185. json.Unmarshal([]byte(setting.Grampus.SpecialPools), &SpecialPools)
  186. }
  187. }