From 791ef137ce54cf16ad23863309d320d9c0fee88f Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Tue, 30 Nov 2021 19:42:38 +0800 Subject: [PATCH 1/6] cb_limit --- models/cloudbrain.go | 53 ++++++++++++++++++++++++++++++++++++++++++++++ routers/repo/cloudbrain.go | 17 ++++++++++++++- routers/repo/modelarts.go | 32 +++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 2 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index bb1241247..7b196a321 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -31,6 +31,7 @@ const ( JobTypeBrainScore JobType = "BRAINSCORE" JobTypeTrain JobType = "TRAIN" + //notebook ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中 ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中 ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" //创建失败 @@ -46,6 +47,30 @@ const ( ModelArtsDeleted ModelArtsJobStatus = "DELETED" //已删除 ModelArtsResizing ModelArtsJobStatus = "RESIZING" //规格变更中 ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" //规格变更失败 + + //trainjob + ModelArtsTrainJobUnknown ModelArtsJobStatus = "UNKNOWN" //作业状态未知 + ModelArtsTrainJobInit ModelArtsJobStatus = "INIT" //作业初始化状态 + ModelArtsTrainJobImageCreating ModelArtsJobStatus = "IMAGE_CREATING" //作业镜像正在创建 + ModelArtsTrainJobImageFailed ModelArtsJobStatus = "IMAGE_FAILED" //作业镜像创建失败 + ModelArtsTrainJobSubmitTrying ModelArtsJobStatus = "SUBMIT_TRYING" //作业正在提交 + ModelArtsTrainJobSubmitFailed ModelArtsJobStatus = "SUBMIT_FAILED" //作业提交失败 + ModelArtsTrainJobDeleteFailed ModelArtsJobStatus = "DELETE_FAILED" //作业删除失败 + ModelArtsTrainJobWaiting ModelArtsJobStatus = "WAITING" //作业正在排队中 + ModelArtsTrainJobRunning ModelArtsJobStatus = "RUNNING" //作业正在运行中 + ModelArtsTrainJobKilling ModelArtsJobStatus = "KILLING" //作业正在取消 + ModelArtsTrainJobCompleted ModelArtsJobStatus = "COMPLETED" //作业已经完成 + ModelArtsTrainJobFailed ModelArtsJobStatus = "FAILED" //作业运行失败 + ModelArtsTrainJobKilled ModelArtsJobStatus = "KILLED" //作业取消成功 + ModelArtsTrainJobCanceled ModelArtsJobStatus = "CANCELED" //作业取消 + ModelArtsTrainJobLost ModelArtsJobStatus = "LOST" //作业丢失 + ModelArtsTrainJobScaling ModelArtsJobStatus = "SCALING" //作业正在扩容 + ModelArtsTrainJobSubmitModelFailed ModelArtsJobStatus = "SUBMIT_MODEL_FAILED" //提交模型失败 + ModelArtsTrainJobDeployServiceFailed ModelArtsJobStatus = "DEPLOY_SERVICE_FAILED" //部署服务失败 + ModelArtsTrainJobCheckInit ModelArtsJobStatus = "CHECK_INIT" //审核作业初始化 + ModelArtsTrainJobCheckRunning ModelArtsJobStatus = "CHECK_RUNNING" //审核作业正在运行中 + ModelArtsTrainJobCheckRunningCompleted ModelArtsJobStatus = "CHECK_RUNNING_COMPLETED" //审核作业已经完成 + ModelArtsTrainJobCheckFailed ModelArtsJobStatus = "CHECK_FAILED" //审核作业失败 ) type Cloudbrain struct { @@ -1091,3 +1116,31 @@ func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool { } return false } + +func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) { + cloudbrains := make([]*Cloudbrain, 0, 10) + return cloudbrains, x. + NotIn("status", + JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted, + ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed, + ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed). + Limit(100). + Find(&cloudbrains) +} + +func GetCloudbrainCountByUserID(userID int64) (int, error) { + count, err := x.In("status", JobWaiting, JobRunning).And("job_type = ? and user_id = ? and type = ?", JobTypeDebug, userID, TypeCloudBrainOne).Count(new(Cloudbrain)) + return int(count), err +} + +func GetCloudbrainNotebookCountByUserID(userID int64) (int, error) { + count, err := x.In("status", ModelArtsCreateQueue, ModelArtsCreating, ModelArtsStarting, ModelArtsReadyToStart, ModelArtsResizing, ModelArtsStartQueuing, ModelArtsRunning, ModelArtsRestarting). + And("job_type = ? and user_id = ? and type = ?", JobTypeDebug, userID, TypeCloudBrainTwo).Count(new(Cloudbrain)) + return int(count), err +} + +func GetCloudbrainTrainJobCountByUserID(userID int64) (int, error) { + count, err := x.In("status", ModelArtsTrainJobInit, ModelArtsTrainJobImageCreating, ModelArtsTrainJobSubmitTrying, ModelArtsTrainJobWaiting, ModelArtsTrainJobRunning, ModelArtsTrainJobScaling, ModelArtsTrainJobCheckInit, ModelArtsTrainJobCheckRunning, ModelArtsTrainJobCheckRunningCompleted). + And("job_type = ? and user_id = ? and type = ?", JobTypeTrain, userID, TypeCloudBrainTwo).Count(new(Cloudbrain)) + return int(count), err +} diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 28f3a0184..d704ee0d9 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -216,7 +216,22 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { return } - _, err := models.GetCloudbrainByName(jobName) + count, err := models.GetCloudbrainCountByUserID(ctx.User.ID) + if err != nil { + log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"]) + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("system error", tplCloudBrainNew, &form) + return + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplCloudBrainNew, &form) + return + } + } + + _, err = models.GetCloudbrainByName(jobName) if err == nil { log.Error("the job name did already exist", ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 069a1a0b5..263bba85c 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -116,7 +116,22 @@ func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) description := form.Description flavor := form.Flavor - err := modelarts.GenerateTask(ctx, jobName, uuid, description, flavor) + count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID) + if err != nil { + log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"]) + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form) + return + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form) + return + } + } + + err = modelarts.GenerateTask(ctx, jobName, uuid, description, flavor) if err != nil { ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form) return @@ -854,6 +869,21 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) + if err != nil { + log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("system error", tplModelArtsTrainJobVersionNew, &form) + return + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobVersionNew, &form) + return + } + } + latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(jobID, modelarts.IsLatestVersion) if err != nil { ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err) From 97e6e6804a7205671386bea8c11184b9afbe1702 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Wed, 1 Dec 2021 09:53:09 +0800 Subject: [PATCH 2/6] mod --- routers/repo/modelarts.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 263bba85c..48978b2e5 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -699,6 +699,21 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) VersionCount := modelarts.VersionCount EngineName := form.EngineName + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) + if err != nil { + log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) + ErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) + return + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + ErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form) + return + } + } + if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) ErrorNewDataPrepare(ctx, form) @@ -872,13 +887,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) - cloudBrainNewDataPrepare(ctx) + VersionErrorDataPrepare(ctx, form) ctx.RenderWithErr("system error", tplModelArtsTrainJobVersionNew, &form) return } else { if count >= 1 { log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) - cloudBrainNewDataPrepare(ctx) + VersionErrorDataPrepare(ctx, form) ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobVersionNew, &form) return } From 158dc25c342efaebb41c2a043fb7a3bd69eeebdf Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Wed, 1 Dec 2021 17:12:16 +0800 Subject: [PATCH 3/6] fix issue --- templates/repo/modelarts/trainjob/show.tmpl | 7 +++++-- templates/repo/modelarts/trainjob/version_new.tmpl | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/templates/repo/modelarts/trainjob/show.tmpl b/templates/repo/modelarts/trainjob/show.tmpl index a3fdddf38..cd85057de 100755 --- a/templates/repo/modelarts/trainjob/show.tmpl +++ b/templates/repo/modelarts/trainjob/show.tmpl @@ -474,6 +474,7 @@ td, th {