Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/3165 Reviewed-by: lewis <747342561@qq.com>fix-3163
@@ -16,6 +16,8 @@ import ( | |||||
"strings" | "strings" | ||||
"time" | "time" | ||||
"code.gitea.io/gitea/services/cloudbrain/cloudbrainTask" | |||||
"code.gitea.io/gitea/modules/notification" | "code.gitea.io/gitea/modules/notification" | ||||
"code.gitea.io/gitea/modules/setting" | "code.gitea.io/gitea/modules/setting" | ||||
@@ -81,47 +83,22 @@ func GetCloudbrainTask(ctx *context.APIContext) { | |||||
"JobDuration": job.TrainJobDuration, | "JobDuration": job.TrainJobDuration, | ||||
}) | }) | ||||
} else { | } else { | ||||
jobResult, err := cloudbrain.GetJob(job.JobID) | |||||
if err != nil { | |||||
ctx.NotFound(err) | |||||
log.Error("GetJob failed:", err) | |||||
return | |||||
} | |||||
result, _ := models.ConvertToJobResultPayload(jobResult.Payload) | |||||
jobAfter, err := cloudbrainTask.SyncCloudBrainOneStatus(job) | |||||
if err != nil { | if err != nil { | ||||
ctx.NotFound(err) | ctx.NotFound(err) | ||||
log.Error("ConvertToJobResultPayload failed:", err) | |||||
log.Error("Sync cloud brain one status failed:", err) | |||||
return | return | ||||
} | } | ||||
oldStatus := job.Status | |||||
job.Status = result.JobStatus.State | |||||
taskRoles := result.TaskRoles | |||||
taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) | |||||
if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) { | |||||
job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP | |||||
job.ContainerID = taskRes.TaskStatuses[0].ContainerID | |||||
job.Status = taskRes.TaskStatuses[0].State | |||||
} | |||||
if result.JobStatus.State != string(models.JobWaiting) { | |||||
models.ParseAndSetDurationFromCloudBrainOne(result, job) | |||||
if oldStatus != job.Status { | |||||
notification.NotifyChangeCloudbrainStatus(job, oldStatus) | |||||
} | |||||
err = models.UpdateJob(job) | |||||
if err != nil { | |||||
log.Error("UpdateJob failed:", err) | |||||
} | |||||
} | |||||
ctx.JSON(http.StatusOK, map[string]interface{}{ | ctx.JSON(http.StatusOK, map[string]interface{}{ | ||||
"ID": ID, | "ID": ID, | ||||
"JobName": result.Config.JobName, | |||||
"JobStatus": result.JobStatus.State, | |||||
"SubState": result.JobStatus.SubState, | |||||
"CreatedTime": time.Unix(result.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05"), | |||||
"CompletedTime": time.Unix(result.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05"), | |||||
"JobDuration": job.TrainJobDuration, | |||||
"JobName": jobAfter.JobName, | |||||
"JobStatus": jobAfter.Status, | |||||
"SubState": "", | |||||
"CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"), | |||||
"CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"), | |||||
"JobDuration": jobAfter.TrainJobDuration, | |||||
}) | }) | ||||
} | } | ||||
} | } | ||||
@@ -12,6 +12,8 @@ import ( | |||||
"strconv" | "strconv" | ||||
"strings" | "strings" | ||||
"code.gitea.io/gitea/services/cloudbrain/cloudbrainTask" | |||||
"code.gitea.io/gitea/modules/urfs_client/urchin" | "code.gitea.io/gitea/modules/urfs_client/urchin" | ||||
"code.gitea.io/gitea/modules/notification" | "code.gitea.io/gitea/modules/notification" | ||||
@@ -20,7 +22,6 @@ import ( | |||||
"code.gitea.io/gitea/modules/setting" | "code.gitea.io/gitea/modules/setting" | ||||
"code.gitea.io/gitea/models" | "code.gitea.io/gitea/models" | ||||
"code.gitea.io/gitea/modules/cloudbrain" | |||||
"code.gitea.io/gitea/modules/context" | "code.gitea.io/gitea/modules/context" | ||||
"code.gitea.io/gitea/modules/log" | "code.gitea.io/gitea/modules/log" | ||||
"code.gitea.io/gitea/modules/modelarts" | "code.gitea.io/gitea/modules/modelarts" | ||||
@@ -109,39 +110,11 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) { | |||||
} | } | ||||
if job.Type == models.TypeCloudBrainOne { | if job.Type == models.TypeCloudBrainOne { | ||||
jobResult, err := cloudbrain.GetJob(job.JobID) | |||||
if err != nil { | |||||
ctx.NotFound(err) | |||||
log.Error("GetJob failed:", err) | |||||
return | |||||
} | |||||
result, err := models.ConvertToJobResultPayload(jobResult.Payload) | |||||
job, err = cloudbrainTask.SyncCloudBrainOneStatus(job) | |||||
if err != nil { | if err != nil { | ||||
ctx.NotFound(err) | ctx.NotFound(err) | ||||
log.Error("ConvertToJobResultPayload failed:", err) | |||||
return | return | ||||
} | } | ||||
oldStatus := job.Status | |||||
job.Status = result.JobStatus.State | |||||
if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) { | |||||
taskRoles := result.TaskRoles | |||||
taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) | |||||
job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP | |||||
job.ContainerID = taskRes.TaskStatuses[0].ContainerID | |||||
job.Status = taskRes.TaskStatuses[0].State | |||||
} | |||||
if result.JobStatus.State != string(models.JobWaiting) { | |||||
models.ParseAndSetDurationFromCloudBrainOne(result, job) | |||||
if oldStatus != job.Status { | |||||
notification.NotifyChangeCloudbrainStatus(job, oldStatus) | |||||
} | |||||
err = models.UpdateJob(job) | |||||
if err != nil { | |||||
log.Error("UpdateJob failed:", err) | |||||
} | |||||
} | |||||
} else if job.Type == models.TypeCloudBrainTwo { | } else if job.Type == models.TypeCloudBrainTwo { | ||||
err := modelarts.HandleTrainJobInfo(job) | err := modelarts.HandleTrainJobInfo(job) | ||||
if err != nil { | if err != nil { | ||||
@@ -1845,59 +1845,37 @@ func SyncCloudbrainStatus() { | |||||
continue | continue | ||||
} | } | ||||
if task.Type == models.TypeCloudBrainOne { | if task.Type == models.TypeCloudBrainOne { | ||||
result, err := cloudbrain.GetJob(task.JobID) | |||||
task, err = cloudbrainTask.SyncCloudBrainOneStatus(task) | |||||
if err != nil { | if err != nil { | ||||
log.Error("GetJob(%s) failed:%v", task.JobName, err) | |||||
log.Error("Sync cloud brain one (%s) failed:%v", task.JobName, err) | |||||
continue | continue | ||||
} | } | ||||
if result != nil { | |||||
jobRes, _ := models.ConvertToJobResultPayload(result.Payload) | |||||
taskRoles := jobRes.TaskRoles | |||||
taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) | |||||
oldStatus := task.Status | |||||
task.Status = taskRes.TaskStatuses[0].State | |||||
if task.Status != string(models.JobWaiting) { | |||||
models.ParseAndSetDurationFromCloudBrainOne(jobRes, task) | |||||
if task.Status != string(models.JobWaiting) { | |||||
if task.Duration >= setting.MaxDuration && task.JobType == string(models.JobTypeDebug) { | |||||
log.Info("begin to stop job(%s), because of the duration", task.DisplayJobName) | |||||
err = cloudbrain.StopJob(task.JobID) | |||||
if err != nil { | |||||
log.Error("StopJob(%s) failed:%v", task.DisplayJobName, err) | |||||
continue | |||||
} | |||||
oldStatus := task.Status | |||||
task.Status = string(models.JobStopped) | |||||
if task.EndTime == 0 { | |||||
task.EndTime = timeutil.TimeStampNow() | |||||
} | |||||
task.ComputeAndSetDuration() | |||||
if oldStatus != task.Status { | if oldStatus != task.Status { | ||||
notification.NotifyChangeCloudbrainStatus(task, oldStatus) | notification.NotifyChangeCloudbrainStatus(task, oldStatus) | ||||
} | } | ||||
err = models.UpdateJob(task) | err = models.UpdateJob(task) | ||||
if err != nil { | if err != nil { | ||||
log.Error("UpdateJob(%s) failed:%v", task.JobName, err) | |||||
} | |||||
var maxDuration int64 | |||||
if task.JobType == string(models.JobTypeBenchmark) { | |||||
maxDuration = setting.BenchmarkMaxDuration | |||||
} else if task.JobType == string(models.JobTypeSnn4imagenet) || task.JobType == string(models.JobTypeBrainScore) { | |||||
maxDuration = setting.ModelBenchmarkMaxDuration | |||||
} else { | |||||
maxDuration = setting.MaxDuration | |||||
} | |||||
if task.Duration >= maxDuration && task.JobType != string(models.JobTypeTrain) { | |||||
log.Info("begin to stop job(%s), because of the duration", task.DisplayJobName) | |||||
err = cloudbrain.StopJob(task.JobID) | |||||
if err != nil { | |||||
log.Error("StopJob(%s) failed:%v", task.DisplayJobName, err) | |||||
continue | |||||
} | |||||
task.Status = string(models.JobStopped) | |||||
if task.EndTime == 0 { | |||||
task.EndTime = timeutil.TimeStampNow() | |||||
} | |||||
task.ComputeAndSetDuration() | |||||
if oldStatus != task.Status { | |||||
notification.NotifyChangeCloudbrainStatus(task, oldStatus) | |||||
} | |||||
err = models.UpdateJob(task) | |||||
if err != nil { | |||||
log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) | |||||
continue | |||||
} | |||||
log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) | |||||
continue | |||||
} | } | ||||
} | } | ||||
} | } | ||||
} else if task.Type == models.TypeCloudBrainTwo { | } else if task.Type == models.TypeCloudBrainTwo { | ||||
if task.JobType == string(models.JobTypeDebug) { | if task.JobType == string(models.JobTypeDebug) { | ||||
@@ -0,0 +1,83 @@ | |||||
package cloudbrainTask | |||||
import ( | |||||
"net/http" | |||||
"code.gitea.io/gitea/models" | |||||
"code.gitea.io/gitea/modules/cloudbrain" | |||||
"code.gitea.io/gitea/modules/httplib" | |||||
"code.gitea.io/gitea/modules/log" | |||||
"code.gitea.io/gitea/modules/notification" | |||||
"code.gitea.io/gitea/modules/setting" | |||||
) | |||||
var noteBookOKMap = make(map[int64]int, 20) | |||||
//if a task notebook url can get two times, the notebook can browser. | |||||
const successfulCount = 2 | |||||
func SyncCloudBrainOneStatus(task *models.Cloudbrain) (*models.Cloudbrain, error) { | |||||
jobResult, err := cloudbrain.GetJob(task.JobID) | |||||
if err != nil { | |||||
log.Error("GetJob failed:", err) | |||||
return task, err | |||||
} | |||||
result, err := models.ConvertToJobResultPayload(jobResult.Payload) | |||||
if err != nil { | |||||
log.Error("ConvertToJobResultPayload failed:", err) | |||||
return task, err | |||||
} | |||||
oldStatus := task.Status | |||||
if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) { | |||||
taskRoles := result.TaskRoles | |||||
taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) | |||||
task.ContainerIp = taskRes.TaskStatuses[0].ContainerIP | |||||
task.ContainerID = taskRes.TaskStatuses[0].ContainerID | |||||
} | |||||
if (result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobRunning)) || | |||||
task.Status == string(models.JobRunning) || (result.JobStatus.State == string(models.JobRunning) && isNoteBookReady(task)) { | |||||
models.ParseAndSetDurationFromCloudBrainOne(result, task) | |||||
task.Status = result.JobStatus.State | |||||
if oldStatus != task.Status { | |||||
notification.NotifyChangeCloudbrainStatus(task, oldStatus) | |||||
} | |||||
err = models.UpdateJob(task) | |||||
if err != nil { | |||||
log.Error("UpdateJob failed:", err) | |||||
return task, err | |||||
} | |||||
} | |||||
return task, nil | |||||
} | |||||
func isNoteBookReady(task *models.Cloudbrain) bool { | |||||
if task.JobType != string(models.JobTypeDebug) { | |||||
return true | |||||
} | |||||
noteBookUrl := setting.DebugServerHost + "jpylab_" + task.JobID + "_" + task.SubTaskName | |||||
r := httplib.Get(noteBookUrl) | |||||
res, err := r.Response() | |||||
if err != nil { | |||||
return false | |||||
} | |||||
if res.StatusCode == http.StatusOK { | |||||
count := noteBookOKMap[task.ID] | |||||
if count < successfulCount-1 { | |||||
noteBookOKMap[task.ID] = count + 1 | |||||
return false | |||||
} else { | |||||
delete(noteBookOKMap, task.ID) | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} |