You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sync_status.go 5.0 kB

2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. package cloudbrainTask
  2. import (
  3. "code.gitea.io/gitea/models"
  4. "code.gitea.io/gitea/modules/cloudbrain"
  5. "code.gitea.io/gitea/modules/grampus"
  6. "code.gitea.io/gitea/modules/log"
  7. "code.gitea.io/gitea/modules/modelarts"
  8. "code.gitea.io/gitea/modules/modelarts_cd"
  9. "code.gitea.io/gitea/modules/notification"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. "net/http"
  13. "strconv"
  14. )
  // Per-task probe counters, keyed by task ID: how many times the task's
  // notebook URL answered successfully, and how many times it did not.
  var (
  	noteBookOKMap   = make(map[int64]int, 20)
  	noteBookFailMap = make(map[int64]int, 20)
  )

  // A task's notebook is considered browsable once its URL has been fetched
  // successfully successfulCount times; maxSuccessfulCount is the higher
  // threshold applied while no failed probe has been recorded for the task.
  const (
  	successfulCount    = 3
  	maxSuccessfulCount = 10
  )
  20. func SyncCloudBrainOneStatus(task *models.Cloudbrain) (*models.Cloudbrain, error) {
  21. jobResult, err := cloudbrain.GetJob(task.JobID)
  22. if err != nil {
  23. log.Error("GetJob failed:", err)
  24. return task, err
  25. }
  26. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  27. if err != nil {
  28. log.Error("ConvertToJobResultPayload failed:", err)
  29. return task, err
  30. }
  31. oldStatus := task.Status
  32. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  33. taskRoles := result.TaskRoles
  34. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  35. task.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
  36. task.ContainerID = taskRes.TaskStatuses[0].ContainerID
  37. }
  38. if (result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobRunning)) ||
  39. task.Status == string(models.JobRunning) || (result.JobStatus.State == string(models.JobRunning) && isNoteBookReady(task)) {
  40. models.ParseAndSetDurationFromCloudBrainOne(result, task)
  41. task.Status = result.JobStatus.State
  42. if oldStatus != task.Status {
  43. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  44. }
  45. err = models.UpdateJob(task)
  46. if err != nil {
  47. log.Error("UpdateJob failed:", err)
  48. return task, err
  49. }
  50. }
  51. return task, nil
  52. }
  53. func SyncGrampusNotebookStatus(job *models.Cloudbrain) (*models.Cloudbrain, error) {
  54. result, err := grampus.GetNotebookJob(job.JobID)
  55. if err != nil {
  56. log.Error("GetJob(%s) failed:%v", job.JobName, err)
  57. return job, err
  58. }
  59. if job.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  60. job.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  61. }
  62. oldStatus := job.Status
  63. job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  64. job.Duration = result.JobInfo.RunSec
  65. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  66. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  67. job.EndTime = job.StartTime.Add(job.Duration)
  68. }
  69. job.CorrectCreateUnix()
  70. if len(job.AiCenter) == 0 {
  71. if len(result.JobInfo.Tasks) > 0 {
  72. if len(result.JobInfo.Tasks[0].CenterID) > 0 && len(result.JobInfo.Tasks[0].CenterName) > 0 {
  73. job.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  74. }
  75. }
  76. }
  77. if job.Status != models.GrampusStatusWaiting {
  78. if oldStatus != job.Status {
  79. notification.NotifyChangeCloudbrainStatus(job, oldStatus)
  80. }
  81. if job.ComputeResource == models.NPUResource {
  82. job.TrainUrl = result.JobInfo.Tasks[0].CodeUrl
  83. job.DataUrl = result.JobInfo.Tasks[0].DataUrl
  84. }
  85. err = models.UpdateJob(job)
  86. if err != nil {
  87. log.Error("UpdateJob failed:", err)
  88. return nil, err
  89. }
  90. }
  91. return job, nil
  92. }
  93. func isNoteBookReady(task *models.Cloudbrain) bool {
  94. if task.JobType != string(models.JobTypeDebug) {
  95. return true
  96. }
  97. noteBookUrl := setting.DebugServerHost + "jpylab_" + task.JobID + "_" + task.SubTaskName
  98. res,err := http.Get(noteBookUrl)
  99. if err != nil {
  100. return false
  101. }
  102. log.Info("notebook success count:"+strconv.Itoa(noteBookOKMap[task.ID])+",fail count:"+strconv.Itoa(noteBookFailMap[task.ID]))
  103. if res.StatusCode == http.StatusOK {
  104. count := noteBookOKMap[task.ID]
  105. if count==0{ //如果是第一次成功,把失败数重置为0
  106. noteBookFailMap[task.ID]=0
  107. }
  108. if count < successfulCount-1 || (noteBookFailMap[task.ID]==0 && count < maxSuccessfulCount-1) {
  109. noteBookOKMap[task.ID] = count + 1
  110. return false
  111. } else {
  112. log.Info("notebook success count:"+strconv.Itoa(count)+",fail count:"+strconv.Itoa(noteBookFailMap[task.ID]))
  113. delete(noteBookOKMap, task.ID)
  114. delete(noteBookFailMap, task.ID)
  115. return true
  116. }
  117. }else{
  118. noteBookFailMap[task.ID]+=1
  119. }
  120. return false
  121. }
  122. func StopDebugJob(task *models.Cloudbrain) error {
  123. param := models.NotebookAction{
  124. Action: models.ActionStop,
  125. }
  126. var err error = nil
  127. if task.JobType == string(models.JobTypeDebug) {
  128. if task.Type == models.TypeCloudBrainOne {
  129. return cloudbrain.StopJob(task.JobID)
  130. } else if task.Type == models.TypeCloudBrainTwo {
  131. _, err = modelarts.ManageNotebook2(task.JobID, param)
  132. } else if task.Type == models.TypeCDCenter {
  133. _, err = modelarts_cd.ManageNotebook(task.JobID, param)
  134. } else if task.Type == models.TypeC2Net {
  135. _, err = grampus.StopJob(task.JobID, task.JobType)
  136. }
  137. }
  138. return err
  139. }