diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 73dcb150c..a2e71f8b2 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -43,9 +43,10 @@ const ( ) var ( - ResourceSpecs *models.ResourceSpecs - TrainResourceSpecs *models.ResourceSpecs - SpecialPools *models.SpecialPools + ResourceSpecs *models.ResourceSpecs + TrainResourceSpecs *models.ResourceSpecs + InferenceResourceSpecs *models.ResourceSpecs + SpecialPools *models.SpecialPools ) type GenerateCloudBrainTaskReq struct { @@ -222,7 +223,7 @@ func AdminOrImageCreaterRight(ctx *context.Context) { func GenerateTask(req GenerateCloudBrainTaskReq) error { var resourceSpec *models.ResourceSpec var versionCount int - if req.JobType == string(models.JobTypeTrain) || req.JobType == string(models.JobTypeInference) { + if req.JobType == string(models.JobTypeTrain) { versionCount = 1 if TrainResourceSpecs == nil { json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) @@ -233,6 +234,17 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { break } } + } else if req.JobType == string(models.JobTypeInference) { + if InferenceResourceSpecs == nil { + json.Unmarshal([]byte(setting.InferenceResourceSpecs), &InferenceResourceSpecs) + } + for _, spec := range InferenceResourceSpecs.ResourceSpec { + if req.ResourceSpecId == spec.Id { + resourceSpec = spec + break + } + } + } else { if ResourceSpecs == nil { json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) @@ -247,21 +259,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { } //如果没有匹配到spec信息,尝试从专属资源池获取 if resourceSpec == nil && SpecialPools != nil { - for _, specialPool := range SpecialPools.Pools { - if resourceSpec != nil { - break - } - if specialPool.ResourceSpec != nil { - if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) { - for _, spec := range specialPool.ResourceSpec { - if req.ResourceSpecId == spec.Id { - resourceSpec = spec - break - } - } - } - } - } + resourceSpec = geMatchResourceSpec(req.JobType, req.GpuQueue, req.ResourceSpecId) } if resourceSpec == nil { @@ -452,6 +450,11 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e } } + //如果没有匹配到spec信息,尝试从专属资源池获取 + if resourceSpec == nil && SpecialPools != nil { + resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId) + } + if resourceSpec == nil { log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) return errors.New("no such resourceSpec") @@ -590,6 +593,23 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e return nil } +func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec { + + for _, specialPool := range SpecialPools.Pools { + + if specialPool.ResourceSpec != nil { + if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) { + for _, spec := range specialPool.ResourceSpec { + if resourceSpecId == spec.Id { + return spec + } + } + } + } + } + return nil +} + func DelCloudBrainJob(jobId string) string { task, err := models.GetCloudbrainByJobID(jobId) if err != nil { diff --git a/modules/setting/setting.go b/modules/setting/setting.go index d206ed9a3..14d968b9e 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -453,20 +453,22 @@ var ( DecompressOBSTaskName string //cloudbrain config - CBAuthUser string - CBAuthPassword string - RestServerHost string - JobPath string - CBCodePathPrefix string - JobType string - GpuTypes string - SpecialPools string - DebugServerHost string - ResourceSpecs string - MaxDuration int64 - TrainGpuTypes string - TrainResourceSpecs string - MaxDatasetNum int + CBAuthUser string + CBAuthPassword string + RestServerHost string + JobPath string + CBCodePathPrefix string + JobType string + GpuTypes string + SpecialPools string + DebugServerHost string + ResourceSpecs string + MaxDuration int64 + TrainGpuTypes string + TrainResourceSpecs string + InferenceGpuTypes string + InferenceResourceSpecs string + MaxDatasetNum int //benchmark config IsBenchmarkEnabled bool @@ -1312,6 +1314,8 @@ func NewContext() { MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") + InferenceGpuTypes = sec.Key("INFERENCE_GPU_TYPES").MustString("") + InferenceResourceSpecs = sec.Key("INFERENCE_RESOURCE_SPECS").MustString("") SpecialPools = sec.Key("SPECIAL_POOL").MustString("") MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 6af613649..5276b233e 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -59,6 +59,7 @@ var ( benchmarkGpuInfos *models.GpuInfos benchmarkResourceSpecs *models.ResourceSpecs trainGpuInfos *models.GpuInfos + inferenceGpuInfos *models.GpuInfos ) const BENCHMARK_TYPE_CODE = "repo.cloudbrain.benchmark.types" @@ -130,6 +131,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { } ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo + if inferenceGpuInfos == nil { + json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) + } + ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo + if benchmarkGpuInfos == nil { json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) } @@ -150,6 +156,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { } ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec + if cloudbrain.InferenceResourceSpecs == nil { + json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) + } + ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec + if cloudbrain.SpecialPools != nil { var debugGpuTypes []*models.GpuInfo var trainGpuTypes []*models.GpuInfo @@ -547,7 +558,18 @@ func CloudBrainRestart(ctx *context.Context) { for _, resourceType := range gpuInfos.GpuInfo { if resourceType.Queue == task.GpuQueue { hasSameResource = true - continue + break + } + } + if !hasSameResource && cloudbrain.SpecialPools != nil { + + for _, specialPool := range cloudbrain.SpecialPools.Pools { + cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug)) + for _, pool := range specialPool.Pool { + if pool.Queue == task.GpuQueue { + hasSameResource = true + } + } } } @@ -610,7 +632,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo var task *models.Cloudbrain var err error - if jobType == models.JobTypeTrain { + if jobType == models.JobTypeTrain || jobType == models.JobTypeInference { task, err = models.GetCloudbrainByJobID(ctx.Params(":jobid")) } else { task, err = models.GetCloudbrainByIDWithDeleted(ctx.Params(":id")) @@ -641,6 +663,18 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB } } + } else if task.JobType == string(models.JobTypeInference) { + if cloudbrain.InferenceResourceSpecs == nil { + json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) + } + for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + ctx.Data["GpuNum"] = tmp.GpuNum + ctx.Data["CpuNum"] = tmp.CpuNum + ctx.Data["MemMiB"] = tmp.MemMiB + ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB + } + } } else { if cloudbrain.ResourceSpecs == nil { json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) @@ -669,6 +703,15 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo ctx.Data["resource_type"] = resourceType.Value } } + } else if task.JobType == string(models.JobTypeInference) { + if inferenceGpuInfos == nil { + json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) + } + for _, resourceType := range inferenceGpuInfos.GpuInfo { + if resourceType.Queue == jobRes.Config.GpuType { + ctx.Data["resource_type"] = resourceType.Value + } + } } else if cloudbrain.IsBenchmarkJob(task.JobType) { if benchmarkGpuInfos == nil { json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) @@ -2472,7 +2515,7 @@ func InferenceCloudBrainJobNew(ctx *context.Context) { } func InferenceCloudBrainJobShow(ctx *context.Context) { - cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeTrain) + cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeInference) } func DownloadInferenceResultFile(ctx *context.Context) { diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 780f0db51..dffbab89a 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -119,7 +119,7 @@ func MustEnableModelArts(ctx *context.Context) { func NotebookNew(ctx *context.Context) { notebookNewDataPrepare(ctx) - waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeDebug) + waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount ctx.HTML(200, tplModelArtsNotebookNew) } @@ -631,7 +631,7 @@ func TrainJobNew(ctx *context.Context) { ctx.ServerError("get new train-job info failed", err) return } - waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain) + waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount ctx.HTML(200, tplModelArtsTrainJobNew) } @@ -785,7 +785,7 @@ func TrainJobNewVersion(ctx *context.Context) { ctx.ServerError("get new train-job info failed", err) return } - waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain) + waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount ctx.HTML(200, tplModelArtsTrainJobVersionNew) } @@ -2057,7 +2057,7 @@ func InferenceJobIndex(ctx *context.Context) { PageSize: setting.UI.IssuePagingNum, }, RepoID: repo.ID, - Type: models.TypeCloudBrainAll, + Type: ctx.QueryInt("type"), JobTypes: jobTypes, }) if err != nil { @@ -2100,7 +2100,7 @@ func InferenceJobNew(ctx *context.Context) { ctx.ServerError("get new inference-job info failed", err) return } - waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeInference) + waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount ctx.HTML(200, tplModelArtsInferenceJobNew) }