You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

grampus.go 4.4 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. package grampus
  2. import (
  3. "code.gitea.io/gitea/models"
  4. "code.gitea.io/gitea/modules/context"
  5. "code.gitea.io/gitea/modules/log"
  6. "code.gitea.io/gitea/modules/notification"
  7. "code.gitea.io/gitea/modules/timeutil"
  8. "strings"
  9. )
  10. const (
  11. JobPath = "job/"
  12. ProcessorTypeNPU = "npu.huawei.com/NPU"
  13. ProcessorTypeGPU = "nvidia.com/gpu"
  14. CommandPrepareScript = "pwd;cd /cache;mkdir -p output;mkdir -p code;mkdir -p dataset;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/script_for_grampus/archive/master.zip;" +
  15. "echo \"finish loading script\";unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_obs downloader_for_minio uploader_for_minio;"
  16. //CommandPrepareScript = "bash;pwd;apt-get -y update;apt-get -y upgrade;apt-get -y install wget;apt-get -y install unzip;" +
  17. // "cd /tmp;mkdir -p output;mkdir -p code;mkdir -p dataset;wget -q https://git.openi.org.cn/OpenIOSSG/script_for_grampus/archive/master.zip;" +
  18. // "unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_obs downloader_for_minio uploader_for_minio;"
  19. CodeArchiveName = "master.zip"
  20. )
  21. var (
  22. poolInfos *models.PoolInfos
  23. FlavorInfos *models.FlavorInfos
  24. ImageInfos *models.ImageInfosModelArts
  25. )
  26. type GenerateTrainJobReq struct {
  27. JobName string
  28. Command string
  29. ResourceSpecId string
  30. ImageUrl string //与image_id二选一,都有的情况下优先image_url
  31. ImageId string
  32. DisplayJobName string
  33. Uuid string
  34. Description string
  35. CodeObsPath string
  36. BootFile string
  37. BootFileUrl string
  38. DataUrl string
  39. TrainUrl string
  40. WorkServerNumber int
  41. EngineID int64
  42. CommitID string
  43. IsLatestVersion string
  44. BranchName string
  45. PreVersionId int64
  46. PreVersionName string
  47. FlavorName string
  48. VersionCount int
  49. EngineName string
  50. TotalVersionCount int
  51. ComputeResource string
  52. DatasetName string
  53. Params string
  54. }
  55. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  56. createTime := timeutil.TimeStampNow()
  57. jobResult, err := createJob(models.CreateGrampusJobRequest{
  58. Name: req.JobName,
  59. Tasks: []models.GrampusTasks{
  60. {
  61. Name: req.JobName,
  62. Command: req.Command,
  63. ResourceSpecId: req.ResourceSpecId,
  64. ImageId: req.ImageId,
  65. ImageUrl: req.ImageUrl,
  66. ReplicaNum: 1,
  67. },
  68. },
  69. })
  70. if err != nil {
  71. log.Error("createJob failed: %v", err.Error())
  72. return err
  73. }
  74. jobID := jobResult.JobInfo.JobID
  75. err = models.CreateCloudbrain(&models.Cloudbrain{
  76. Status: TransTrainJobStatus(jobResult.JobInfo.Status),
  77. UserID: ctx.User.ID,
  78. RepoID: ctx.Repo.Repository.ID,
  79. JobID: jobID,
  80. JobName: req.JobName,
  81. DisplayJobName: req.DisplayJobName,
  82. JobType: string(models.JobTypeTrain),
  83. Type: models.TypeC2Net,
  84. Uuid: req.Uuid,
  85. DatasetName: req.DatasetName,
  86. CommitID: req.CommitID,
  87. IsLatestVersion: req.IsLatestVersion,
  88. ComputeResource: req.ComputeResource,
  89. ImageID: req.ImageId,
  90. TrainUrl: req.TrainUrl,
  91. BranchName: req.BranchName,
  92. Parameters: req.Params,
  93. BootFile: req.BootFile,
  94. DataUrl: req.DataUrl,
  95. FlavorCode: req.ResourceSpecId,
  96. Description: req.Description,
  97. WorkServerNumber: req.WorkServerNumber,
  98. FlavorName: req.FlavorName,
  99. EngineName: req.EngineName,
  100. VersionCount: req.VersionCount,
  101. TotalVersionCount: req.TotalVersionCount,
  102. CreatedUnix: createTime,
  103. UpdatedUnix: createTime,
  104. })
  105. if err != nil {
  106. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
  107. return err
  108. }
  109. var actionType models.ActionType
  110. if req.ComputeResource == models.NPUResource {
  111. actionType = models.ActionCreateGrampusNPUTrainTask
  112. } else if req.ComputeResource == models.GPUResource {
  113. actionType = models.ActionCreateGrampusGPUTrainTask
  114. }
  115. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType)
  116. return nil
  117. }
  118. func TransTrainJobStatus(status string) string {
  119. if status == models.GrampusStatusPending {
  120. status = models.GrampusStatusWaiting
  121. }
  122. return strings.ToUpper(status)
  123. }