@@ -43,9 +43,10 @@ const ( | |||||
) | ) | ||||
var ( | var ( | ||||
ResourceSpecs *models.ResourceSpecs | |||||
TrainResourceSpecs *models.ResourceSpecs | |||||
SpecialPools *models.SpecialPools | |||||
ResourceSpecs *models.ResourceSpecs | |||||
TrainResourceSpecs *models.ResourceSpecs | |||||
InferenceResourceSpecs *models.ResourceSpecs | |||||
SpecialPools *models.SpecialPools | |||||
) | ) | ||||
type GenerateCloudBrainTaskReq struct { | type GenerateCloudBrainTaskReq struct { | ||||
@@ -222,7 +223,7 @@ func AdminOrImageCreaterRight(ctx *context.Context) { | |||||
func GenerateTask(req GenerateCloudBrainTaskReq) error { | func GenerateTask(req GenerateCloudBrainTaskReq) error { | ||||
var resourceSpec *models.ResourceSpec | var resourceSpec *models.ResourceSpec | ||||
var versionCount int | var versionCount int | ||||
if req.JobType == string(models.JobTypeTrain) || req.JobType == string(models.JobTypeInference) { | |||||
if req.JobType == string(models.JobTypeTrain) { | |||||
versionCount = 1 | versionCount = 1 | ||||
if TrainResourceSpecs == nil { | if TrainResourceSpecs == nil { | ||||
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) | json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) | ||||
@@ -233,6 +234,17 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||||
break | break | ||||
} | } | ||||
} | } | ||||
} else if req.JobType == string(models.JobTypeInference) { | |||||
if InferenceResourceSpecs == nil { | |||||
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &InferenceResourceSpecs) | |||||
} | |||||
for _, spec := range InferenceResourceSpecs.ResourceSpec { | |||||
if req.ResourceSpecId == spec.Id { | |||||
resourceSpec = spec | |||||
break | |||||
} | |||||
} | |||||
} else { | } else { | ||||
if ResourceSpecs == nil { | if ResourceSpecs == nil { | ||||
json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) | json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) | ||||
@@ -247,21 +259,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||||
} | } | ||||
//如果没有匹配到spec信息,尝试从专属资源池获取 | //如果没有匹配到spec信息,尝试从专属资源池获取 | ||||
if resourceSpec == nil && SpecialPools != nil { | if resourceSpec == nil && SpecialPools != nil { | ||||
for _, specialPool := range SpecialPools.Pools { | |||||
if resourceSpec != nil { | |||||
break | |||||
} | |||||
if specialPool.ResourceSpec != nil { | |||||
if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) { | |||||
for _, spec := range specialPool.ResourceSpec { | |||||
if req.ResourceSpecId == spec.Id { | |||||
resourceSpec = spec | |||||
break | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
resourceSpec = geMatchResourceSpec(req.JobType, req.GpuQueue, req.ResourceSpecId) | |||||
} | } | ||||
if resourceSpec == nil { | if resourceSpec == nil { | ||||
@@ -452,6 +450,11 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||||
} | } | ||||
} | } | ||||
//如果没有匹配到spec信息,尝试从专属资源池获取 | |||||
if resourceSpec == nil && SpecialPools != nil { | |||||
resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId) | |||||
} | |||||
if resourceSpec == nil { | if resourceSpec == nil { | ||||
log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) | log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) | ||||
return errors.New("no such resourceSpec") | return errors.New("no such resourceSpec") | ||||
@@ -590,6 +593,23 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||||
return nil | return nil | ||||
} | } | ||||
func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec { | |||||
for _, specialPool := range SpecialPools.Pools { | |||||
if specialPool.ResourceSpec != nil { | |||||
if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) { | |||||
for _, spec := range specialPool.ResourceSpec { | |||||
if resourceSpecId == spec.Id { | |||||
return spec | |||||
} | |||||
} | |||||
} | |||||
} | |||||
} | |||||
return nil | |||||
} | |||||
func DelCloudBrainJob(jobId string) string { | func DelCloudBrainJob(jobId string) string { | ||||
task, err := models.GetCloudbrainByJobID(jobId) | task, err := models.GetCloudbrainByJobID(jobId) | ||||
if err != nil { | if err != nil { | ||||
@@ -453,20 +453,22 @@ var ( | |||||
DecompressOBSTaskName string | DecompressOBSTaskName string | ||||
//cloudbrain config | //cloudbrain config | ||||
CBAuthUser string | |||||
CBAuthPassword string | |||||
RestServerHost string | |||||
JobPath string | |||||
CBCodePathPrefix string | |||||
JobType string | |||||
GpuTypes string | |||||
SpecialPools string | |||||
DebugServerHost string | |||||
ResourceSpecs string | |||||
MaxDuration int64 | |||||
TrainGpuTypes string | |||||
TrainResourceSpecs string | |||||
MaxDatasetNum int | |||||
CBAuthUser string | |||||
CBAuthPassword string | |||||
RestServerHost string | |||||
JobPath string | |||||
CBCodePathPrefix string | |||||
JobType string | |||||
GpuTypes string | |||||
SpecialPools string | |||||
DebugServerHost string | |||||
ResourceSpecs string | |||||
MaxDuration int64 | |||||
TrainGpuTypes string | |||||
TrainResourceSpecs string | |||||
InferenceGpuTypes string | |||||
InferenceResourceSpecs string | |||||
MaxDatasetNum int | |||||
//benchmark config | //benchmark config | ||||
IsBenchmarkEnabled bool | IsBenchmarkEnabled bool | ||||
@@ -1312,6 +1314,8 @@ func NewContext() { | |||||
MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) | MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) | ||||
TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") | TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") | ||||
TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") | TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") | ||||
InferenceGpuTypes = sec.Key("INFERENCE_GPU_TYPES").MustString("") | |||||
InferenceResourceSpecs = sec.Key("INFERENCE_RESOURCE_SPECS").MustString("") | |||||
SpecialPools = sec.Key("SPECIAL_POOL").MustString("") | SpecialPools = sec.Key("SPECIAL_POOL").MustString("") | ||||
MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) | MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) | ||||
@@ -59,6 +59,7 @@ var ( | |||||
benchmarkGpuInfos *models.GpuInfos | benchmarkGpuInfos *models.GpuInfos | ||||
benchmarkResourceSpecs *models.ResourceSpecs | benchmarkResourceSpecs *models.ResourceSpecs | ||||
trainGpuInfos *models.GpuInfos | trainGpuInfos *models.GpuInfos | ||||
inferenceGpuInfos *models.GpuInfos | |||||
) | ) | ||||
const BENCHMARK_TYPE_CODE = "repo.cloudbrain.benchmark.types" | const BENCHMARK_TYPE_CODE = "repo.cloudbrain.benchmark.types" | ||||
@@ -130,6 +131,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||||
} | } | ||||
ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo | ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo | ||||
if inferenceGpuInfos == nil { | |||||
json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) | |||||
} | |||||
ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo | |||||
if benchmarkGpuInfos == nil { | if benchmarkGpuInfos == nil { | ||||
json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | ||||
} | } | ||||
@@ -150,6 +156,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||||
} | } | ||||
ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec | ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec | ||||
if cloudbrain.InferenceResourceSpecs == nil { | |||||
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) | |||||
} | |||||
ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec | |||||
if cloudbrain.SpecialPools != nil { | if cloudbrain.SpecialPools != nil { | ||||
var debugGpuTypes []*models.GpuInfo | var debugGpuTypes []*models.GpuInfo | ||||
var trainGpuTypes []*models.GpuInfo | var trainGpuTypes []*models.GpuInfo | ||||
@@ -547,7 +558,18 @@ func CloudBrainRestart(ctx *context.Context) { | |||||
for _, resourceType := range gpuInfos.GpuInfo { | for _, resourceType := range gpuInfos.GpuInfo { | ||||
if resourceType.Queue == task.GpuQueue { | if resourceType.Queue == task.GpuQueue { | ||||
hasSameResource = true | hasSameResource = true | ||||
continue | |||||
break | |||||
} | |||||
} | |||||
if !hasSameResource && cloudbrain.SpecialPools != nil { | |||||
for _, specialPool := range cloudbrain.SpecialPools.Pools { | |||||
cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug)) | |||||
for _, pool := range specialPool.Pool { | |||||
if pool.Queue == task.GpuQueue { | |||||
hasSameResource = true | |||||
} | |||||
} | |||||
} | } | ||||
} | } | ||||
@@ -610,7 +632,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||||
var task *models.Cloudbrain | var task *models.Cloudbrain | ||||
var err error | var err error | ||||
if jobType == models.JobTypeTrain { | |||||
if jobType == models.JobTypeTrain || jobType == models.JobTypeInference { | |||||
task, err = models.GetCloudbrainByJobID(ctx.Params(":jobid")) | task, err = models.GetCloudbrainByJobID(ctx.Params(":jobid")) | ||||
} else { | } else { | ||||
task, err = models.GetCloudbrainByIDWithDeleted(ctx.Params(":id")) | task, err = models.GetCloudbrainByIDWithDeleted(ctx.Params(":id")) | ||||
@@ -641,6 +663,18 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||||
ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB | ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB | ||||
} | } | ||||
} | } | ||||
} else if task.JobType == string(models.JobTypeInference) { | |||||
if cloudbrain.InferenceResourceSpecs == nil { | |||||
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) | |||||
} | |||||
for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { | |||||
if tmp.Id == task.ResourceSpecId { | |||||
ctx.Data["GpuNum"] = tmp.GpuNum | |||||
ctx.Data["CpuNum"] = tmp.CpuNum | |||||
ctx.Data["MemMiB"] = tmp.MemMiB | |||||
ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB | |||||
} | |||||
} | |||||
} else { | } else { | ||||
if cloudbrain.ResourceSpecs == nil { | if cloudbrain.ResourceSpecs == nil { | ||||
json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) | json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) | ||||
@@ -669,6 +703,15 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||||
ctx.Data["resource_type"] = resourceType.Value | ctx.Data["resource_type"] = resourceType.Value | ||||
} | } | ||||
} | } | ||||
} else if task.JobType == string(models.JobTypeInference) { | |||||
if inferenceGpuInfos == nil { | |||||
json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) | |||||
} | |||||
for _, resourceType := range inferenceGpuInfos.GpuInfo { | |||||
if resourceType.Queue == jobRes.Config.GpuType { | |||||
ctx.Data["resource_type"] = resourceType.Value | |||||
} | |||||
} | |||||
} else if cloudbrain.IsBenchmarkJob(task.JobType) { | } else if cloudbrain.IsBenchmarkJob(task.JobType) { | ||||
if benchmarkGpuInfos == nil { | if benchmarkGpuInfos == nil { | ||||
json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | ||||
@@ -2472,7 +2515,7 @@ func InferenceCloudBrainJobNew(ctx *context.Context) { | |||||
} | } | ||||
func InferenceCloudBrainJobShow(ctx *context.Context) { | func InferenceCloudBrainJobShow(ctx *context.Context) { | ||||
cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeTrain) | |||||
cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeInference) | |||||
} | } | ||||
func DownloadInferenceResultFile(ctx *context.Context) { | func DownloadInferenceResultFile(ctx *context.Context) { | ||||
@@ -119,7 +119,7 @@ func MustEnableModelArts(ctx *context.Context) { | |||||
func NotebookNew(ctx *context.Context) { | func NotebookNew(ctx *context.Context) { | ||||
notebookNewDataPrepare(ctx) | notebookNewDataPrepare(ctx) | ||||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeDebug) | |||||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||||
ctx.Data["WaitCount"] = waitCount | ctx.Data["WaitCount"] = waitCount | ||||
ctx.HTML(200, tplModelArtsNotebookNew) | ctx.HTML(200, tplModelArtsNotebookNew) | ||||
} | } | ||||
@@ -631,7 +631,7 @@ func TrainJobNew(ctx *context.Context) { | |||||
ctx.ServerError("get new train-job info failed", err) | ctx.ServerError("get new train-job info failed", err) | ||||
return | return | ||||
} | } | ||||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain) | |||||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||||
ctx.Data["WaitCount"] = waitCount | ctx.Data["WaitCount"] = waitCount | ||||
ctx.HTML(200, tplModelArtsTrainJobNew) | ctx.HTML(200, tplModelArtsTrainJobNew) | ||||
} | } | ||||
@@ -785,7 +785,7 @@ func TrainJobNewVersion(ctx *context.Context) { | |||||
ctx.ServerError("get new train-job info failed", err) | ctx.ServerError("get new train-job info failed", err) | ||||
return | return | ||||
} | } | ||||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain) | |||||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||||
ctx.Data["WaitCount"] = waitCount | ctx.Data["WaitCount"] = waitCount | ||||
ctx.HTML(200, tplModelArtsTrainJobVersionNew) | ctx.HTML(200, tplModelArtsTrainJobVersionNew) | ||||
} | } | ||||
@@ -2057,7 +2057,7 @@ func InferenceJobIndex(ctx *context.Context) { | |||||
PageSize: setting.UI.IssuePagingNum, | PageSize: setting.UI.IssuePagingNum, | ||||
}, | }, | ||||
RepoID: repo.ID, | RepoID: repo.ID, | ||||
Type: models.TypeCloudBrainAll, | |||||
Type: ctx.QueryInt("type"), | |||||
JobTypes: jobTypes, | JobTypes: jobTypes, | ||||
}) | }) | ||||
if err != nil { | if err != nil { | ||||
@@ -2100,7 +2100,7 @@ func InferenceJobNew(ctx *context.Context) { | |||||
ctx.ServerError("get new inference-job info failed", err) | ctx.ServerError("get new inference-job info failed", err) | ||||
return | return | ||||
} | } | ||||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeInference) | |||||
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||||
ctx.Data["WaitCount"] = waitCount | ctx.Data["WaitCount"] = waitCount | ||||
ctx.HTML(200, tplModelArtsInferenceJobNew) | ctx.HTML(200, tplModelArtsInferenceJobNew) | ||||
} | } | ||||