@@ -43,9 +43,10 @@ const ( | |||
) | |||
var ( | |||
ResourceSpecs *models.ResourceSpecs | |||
TrainResourceSpecs *models.ResourceSpecs | |||
SpecialPools *models.SpecialPools | |||
ResourceSpecs *models.ResourceSpecs | |||
TrainResourceSpecs *models.ResourceSpecs | |||
InferenceResourceSpecs *models.ResourceSpecs | |||
SpecialPools *models.SpecialPools | |||
) | |||
type GenerateCloudBrainTaskReq struct { | |||
@@ -222,7 +223,7 @@ func AdminOrImageCreaterRight(ctx *context.Context) { | |||
func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||
var resourceSpec *models.ResourceSpec | |||
var versionCount int | |||
if req.JobType == string(models.JobTypeTrain) || req.JobType == string(models.JobTypeInference) { | |||
if req.JobType == string(models.JobTypeTrain) { | |||
versionCount = 1 | |||
if TrainResourceSpecs == nil { | |||
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) | |||
@@ -233,6 +234,17 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||
break | |||
} | |||
} | |||
} else if req.JobType == string(models.JobTypeInference) { | |||
if InferenceResourceSpecs == nil { | |||
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &InferenceResourceSpecs) | |||
} | |||
for _, spec := range InferenceResourceSpecs.ResourceSpec { | |||
if req.ResourceSpecId == spec.Id { | |||
resourceSpec = spec | |||
break | |||
} | |||
} | |||
} else { | |||
if ResourceSpecs == nil { | |||
json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) | |||
@@ -247,21 +259,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||
} | |||
//如果没有匹配到spec信息,尝试从专属资源池获取 | |||
if resourceSpec == nil && SpecialPools != nil { | |||
for _, specialPool := range SpecialPools.Pools { | |||
if resourceSpec != nil { | |||
break | |||
} | |||
if specialPool.ResourceSpec != nil { | |||
if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) { | |||
for _, spec := range specialPool.ResourceSpec { | |||
if req.ResourceSpecId == spec.Id { | |||
resourceSpec = spec | |||
break | |||
} | |||
} | |||
} | |||
} | |||
} | |||
resourceSpec = geMatchResourceSpec(req.JobType, req.GpuQueue, req.ResourceSpecId) | |||
} | |||
if resourceSpec == nil { | |||
@@ -452,6 +450,11 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||
} | |||
} | |||
//如果没有匹配到spec信息,尝试从专属资源池获取 | |||
if resourceSpec == nil && SpecialPools != nil { | |||
resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId) | |||
} | |||
if resourceSpec == nil { | |||
log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) | |||
return errors.New("no such resourceSpec") | |||
@@ -590,6 +593,23 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||
return nil | |||
} | |||
func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec { | |||
for _, specialPool := range SpecialPools.Pools { | |||
if specialPool.ResourceSpec != nil { | |||
if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) { | |||
for _, spec := range specialPool.ResourceSpec { | |||
if resourceSpecId == spec.Id { | |||
return spec | |||
} | |||
} | |||
} | |||
} | |||
} | |||
return nil | |||
} | |||
func DelCloudBrainJob(jobId string) string { | |||
task, err := models.GetCloudbrainByJobID(jobId) | |||
if err != nil { | |||
@@ -453,20 +453,22 @@ var ( | |||
DecompressOBSTaskName string | |||
//cloudbrain config | |||
CBAuthUser string | |||
CBAuthPassword string | |||
RestServerHost string | |||
JobPath string | |||
CBCodePathPrefix string | |||
JobType string | |||
GpuTypes string | |||
SpecialPools string | |||
DebugServerHost string | |||
ResourceSpecs string | |||
MaxDuration int64 | |||
TrainGpuTypes string | |||
TrainResourceSpecs string | |||
MaxDatasetNum int | |||
CBAuthUser string | |||
CBAuthPassword string | |||
RestServerHost string | |||
JobPath string | |||
CBCodePathPrefix string | |||
JobType string | |||
GpuTypes string | |||
SpecialPools string | |||
DebugServerHost string | |||
ResourceSpecs string | |||
MaxDuration int64 | |||
TrainGpuTypes string | |||
TrainResourceSpecs string | |||
InferenceGpuTypes string | |||
InferenceResourceSpecs string | |||
MaxDatasetNum int | |||
//benchmark config | |||
IsBenchmarkEnabled bool | |||
@@ -1312,6 +1314,8 @@ func NewContext() { | |||
MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) | |||
TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") | |||
TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") | |||
InferenceGpuTypes = sec.Key("INFERENCE_GPU_TYPES").MustString("") | |||
InferenceResourceSpecs = sec.Key("INFERENCE_RESOURCE_SPECS").MustString("") | |||
SpecialPools = sec.Key("SPECIAL_POOL").MustString("") | |||
MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) | |||
@@ -59,6 +59,7 @@ var ( | |||
benchmarkGpuInfos *models.GpuInfos | |||
benchmarkResourceSpecs *models.ResourceSpecs | |||
trainGpuInfos *models.GpuInfos | |||
inferenceGpuInfos *models.GpuInfos | |||
) | |||
const BENCHMARK_TYPE_CODE = "repo.cloudbrain.benchmark.types" | |||
@@ -130,6 +131,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||
} | |||
ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo | |||
if inferenceGpuInfos == nil { | |||
json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) | |||
} | |||
ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo | |||
if benchmarkGpuInfos == nil { | |||
json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | |||
} | |||
@@ -150,6 +156,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||
} | |||
ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec | |||
if cloudbrain.InferenceResourceSpecs == nil { | |||
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) | |||
} | |||
ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec | |||
if cloudbrain.SpecialPools != nil { | |||
var debugGpuTypes []*models.GpuInfo | |||
var trainGpuTypes []*models.GpuInfo | |||
@@ -547,7 +558,18 @@ func CloudBrainRestart(ctx *context.Context) { | |||
for _, resourceType := range gpuInfos.GpuInfo { | |||
if resourceType.Queue == task.GpuQueue { | |||
hasSameResource = true | |||
continue | |||
break | |||
} | |||
} | |||
if !hasSameResource && cloudbrain.SpecialPools != nil { | |||
for _, specialPool := range cloudbrain.SpecialPools.Pools { | |||
cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug)) | |||
for _, pool := range specialPool.Pool { | |||
if pool.Queue == task.GpuQueue { | |||
hasSameResource = true | |||
} | |||
} | |||
} | |||
} | |||
@@ -610,7 +632,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||
var task *models.Cloudbrain | |||
var err error | |||
if jobType == models.JobTypeTrain { | |||
if jobType == models.JobTypeTrain || jobType == models.JobTypeInference { | |||
task, err = models.GetCloudbrainByJobID(ctx.Params(":jobid")) | |||
} else { | |||
task, err = models.GetCloudbrainByIDWithDeleted(ctx.Params(":id")) | |||
@@ -641,6 +663,18 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||
ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB | |||
} | |||
} | |||
} else if task.JobType == string(models.JobTypeInference) { | |||
if cloudbrain.InferenceResourceSpecs == nil { | |||
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) | |||
} | |||
for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { | |||
if tmp.Id == task.ResourceSpecId { | |||
ctx.Data["GpuNum"] = tmp.GpuNum | |||
ctx.Data["CpuNum"] = tmp.CpuNum | |||
ctx.Data["MemMiB"] = tmp.MemMiB | |||
ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB | |||
} | |||
} | |||
} else { | |||
if cloudbrain.ResourceSpecs == nil { | |||
json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) | |||
@@ -669,6 +703,15 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||
ctx.Data["resource_type"] = resourceType.Value | |||
} | |||
} | |||
} else if task.JobType == string(models.JobTypeInference) { | |||
if inferenceGpuInfos == nil { | |||
json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) | |||
} | |||
for _, resourceType := range inferenceGpuInfos.GpuInfo { | |||
if resourceType.Queue == jobRes.Config.GpuType { | |||
ctx.Data["resource_type"] = resourceType.Value | |||
} | |||
} | |||
} else if cloudbrain.IsBenchmarkJob(task.JobType) { | |||
if benchmarkGpuInfos == nil { | |||
json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | |||
@@ -2472,7 +2515,7 @@ func InferenceCloudBrainJobNew(ctx *context.Context) { | |||
} | |||
func InferenceCloudBrainJobShow(ctx *context.Context) { | |||
cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeTrain) | |||
cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeInference) | |||
} | |||
func DownloadInferenceResultFile(ctx *context.Context) { | |||
@@ -119,7 +119,7 @@ func MustEnableModelArts(ctx *context.Context) { | |||
func NotebookNew(ctx *context.Context) { | |||
notebookNewDataPrepare(ctx) | |||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeDebug) | |||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||
ctx.Data["WaitCount"] = waitCount | |||
ctx.HTML(200, tplModelArtsNotebookNew) | |||
} | |||
@@ -631,7 +631,7 @@ func TrainJobNew(ctx *context.Context) { | |||
ctx.ServerError("get new train-job info failed", err) | |||
return | |||
} | |||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain) | |||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||
ctx.Data["WaitCount"] = waitCount | |||
ctx.HTML(200, tplModelArtsTrainJobNew) | |||
} | |||
@@ -785,7 +785,7 @@ func TrainJobNewVersion(ctx *context.Context) { | |||
ctx.ServerError("get new train-job info failed", err) | |||
return | |||
} | |||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain) | |||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||
ctx.Data["WaitCount"] = waitCount | |||
ctx.HTML(200, tplModelArtsTrainJobVersionNew) | |||
} | |||
@@ -2057,7 +2057,7 @@ func InferenceJobIndex(ctx *context.Context) { | |||
PageSize: setting.UI.IssuePagingNum, | |||
}, | |||
RepoID: repo.ID, | |||
Type: models.TypeCloudBrainAll, | |||
Type: ctx.QueryInt("type"), | |||
JobTypes: jobTypes, | |||
}) | |||
if err != nil { | |||
@@ -2100,7 +2100,7 @@ func InferenceJobNew(ctx *context.Context) { | |||
ctx.ServerError("get new inference-job info failed", err) | |||
return | |||
} | |||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeInference) | |||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||
ctx.Data["WaitCount"] = waitCount | |||
ctx.HTML(200, tplModelArtsInferenceJobNew) | |||
} | |||