Browse Source

Merge branch 'gpu-inference' of https://git.openi.org.cn/OpenI/aiforge into gpu-inference

pull/2458/head
zhoupzh 2 years ago
parent
commit
812044dcf1
4 changed files with 108 additions and 41 deletions
  1. +39
    -19
      modules/cloudbrain/cloudbrain.go
  2. +18
    -14
      modules/setting/setting.go
  3. +46
    -3
      routers/repo/cloudbrain.go
  4. +5
    -5
      routers/repo/modelarts.go

+ 39
- 19
modules/cloudbrain/cloudbrain.go View File

@@ -43,9 +43,10 @@ const (
)

var (
ResourceSpecs *models.ResourceSpecs
TrainResourceSpecs *models.ResourceSpecs
SpecialPools *models.SpecialPools
ResourceSpecs *models.ResourceSpecs
TrainResourceSpecs *models.ResourceSpecs
InferenceResourceSpecs *models.ResourceSpecs
SpecialPools *models.SpecialPools
)

type GenerateCloudBrainTaskReq struct {
@@ -222,7 +223,7 @@ func AdminOrImageCreaterRight(ctx *context.Context) {
func GenerateTask(req GenerateCloudBrainTaskReq) error {
var resourceSpec *models.ResourceSpec
var versionCount int
if req.JobType == string(models.JobTypeTrain) || req.JobType == string(models.JobTypeInference) {
if req.JobType == string(models.JobTypeTrain) {
versionCount = 1
if TrainResourceSpecs == nil {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
@@ -233,6 +234,17 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error {
break
}
}
} else if req.JobType == string(models.JobTypeInference) {
if InferenceResourceSpecs == nil {
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &InferenceResourceSpecs)
}
for _, spec := range InferenceResourceSpecs.ResourceSpec {
if req.ResourceSpecId == spec.Id {
resourceSpec = spec
break
}
}

} else {
if ResourceSpecs == nil {
json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
@@ -247,21 +259,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error {
}
//如果没有匹配到spec信息,尝试从专属资源池获取
if resourceSpec == nil && SpecialPools != nil {
for _, specialPool := range SpecialPools.Pools {
if resourceSpec != nil {
break
}
if specialPool.ResourceSpec != nil {
if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) {
for _, spec := range specialPool.ResourceSpec {
if req.ResourceSpecId == spec.Id {
resourceSpec = spec
break
}
}
}
}
}
resourceSpec = geMatchResourceSpec(req.JobType, req.GpuQueue, req.ResourceSpecId)
}

if resourceSpec == nil {
@@ -452,6 +450,11 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e
}
}

//如果没有匹配到spec信息,尝试从专属资源池获取
if resourceSpec == nil && SpecialPools != nil {
resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId)
}

if resourceSpec == nil {
log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"])
return errors.New("no such resourceSpec")
@@ -590,6 +593,23 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e
return nil
}

// geMatchResourceSpec looks up a resource spec in the configured special
// (dedicated) resource pools. It returns the spec whose Id equals
// resourceSpecId, considering only pools that serve the given jobType and
// whose queue list contains gpuQueue. It returns nil when nothing matches.
func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec {
	// Callers normally guard on SpecialPools != nil before calling, but
	// check here too so the helper is safe to call unconditionally.
	if SpecialPools == nil {
		return nil
	}
	for _, specialPool := range SpecialPools.Pools {
		if specialPool.ResourceSpec == nil {
			continue
		}
		// Skip pools that do not handle this job type or do not own the queue.
		if !IsElementExist(specialPool.JobType, jobType) || !IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) {
			continue
		}
		for _, spec := range specialPool.ResourceSpec {
			if spec.Id == resourceSpecId {
				return spec
			}
		}
	}
	return nil
}

func DelCloudBrainJob(jobId string) string {
task, err := models.GetCloudbrainByJobID(jobId)
if err != nil {


+ 18
- 14
modules/setting/setting.go View File

@@ -453,20 +453,22 @@ var (
DecompressOBSTaskName string

//cloudbrain config
CBAuthUser string
CBAuthPassword string
RestServerHost string
JobPath string
CBCodePathPrefix string
JobType string
GpuTypes string
SpecialPools string
DebugServerHost string
ResourceSpecs string
MaxDuration int64
TrainGpuTypes string
TrainResourceSpecs string
MaxDatasetNum int
CBAuthUser string
CBAuthPassword string
RestServerHost string
JobPath string
CBCodePathPrefix string
JobType string
GpuTypes string
SpecialPools string
DebugServerHost string
ResourceSpecs string
MaxDuration int64
TrainGpuTypes string
TrainResourceSpecs string
InferenceGpuTypes string
InferenceResourceSpecs string
MaxDatasetNum int

//benchmark config
IsBenchmarkEnabled bool
@@ -1312,6 +1314,8 @@ func NewContext() {
MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400)
TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("")
TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("")
InferenceGpuTypes = sec.Key("INFERENCE_GPU_TYPES").MustString("")
InferenceResourceSpecs = sec.Key("INFERENCE_RESOURCE_SPECS").MustString("")
SpecialPools = sec.Key("SPECIAL_POOL").MustString("")
MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5)


+ 46
- 3
routers/repo/cloudbrain.go View File

@@ -59,6 +59,7 @@ var (
benchmarkGpuInfos *models.GpuInfos
benchmarkResourceSpecs *models.ResourceSpecs
trainGpuInfos *models.GpuInfos
inferenceGpuInfos *models.GpuInfos
)

const BENCHMARK_TYPE_CODE = "repo.cloudbrain.benchmark.types"
@@ -130,6 +131,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {
}
ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo

if inferenceGpuInfos == nil {
json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos)
}
ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo

if benchmarkGpuInfos == nil {
json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos)
}
@@ -150,6 +156,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {
}
ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec

if cloudbrain.InferenceResourceSpecs == nil {
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs)
}
ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec

if cloudbrain.SpecialPools != nil {
var debugGpuTypes []*models.GpuInfo
var trainGpuTypes []*models.GpuInfo
@@ -547,7 +558,18 @@ func CloudBrainRestart(ctx *context.Context) {
for _, resourceType := range gpuInfos.GpuInfo {
if resourceType.Queue == task.GpuQueue {
hasSameResource = true
continue
break
}
}
if !hasSameResource && cloudbrain.SpecialPools != nil {

for _, specialPool := range cloudbrain.SpecialPools.Pools {
cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug))
for _, pool := range specialPool.Pool {
if pool.Queue == task.GpuQueue {
hasSameResource = true
}
}
}
}

@@ -610,7 +632,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo

var task *models.Cloudbrain
var err error
if jobType == models.JobTypeTrain {
if jobType == models.JobTypeTrain || jobType == models.JobTypeInference {
task, err = models.GetCloudbrainByJobID(ctx.Params(":jobid"))
} else {
task, err = models.GetCloudbrainByIDWithDeleted(ctx.Params(":id"))
@@ -641,6 +663,18 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB
}
}
} else if task.JobType == string(models.JobTypeInference) {
if cloudbrain.InferenceResourceSpecs == nil {
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs)
}
for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec {
if tmp.Id == task.ResourceSpecId {
ctx.Data["GpuNum"] = tmp.GpuNum
ctx.Data["CpuNum"] = tmp.CpuNum
ctx.Data["MemMiB"] = tmp.MemMiB
ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB
}
}
} else {
if cloudbrain.ResourceSpecs == nil {
json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs)
@@ -669,6 +703,15 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
ctx.Data["resource_type"] = resourceType.Value
}
}
} else if task.JobType == string(models.JobTypeInference) {
if inferenceGpuInfos == nil {
json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos)
}
for _, resourceType := range inferenceGpuInfos.GpuInfo {
if resourceType.Queue == jobRes.Config.GpuType {
ctx.Data["resource_type"] = resourceType.Value
}
}
} else if cloudbrain.IsBenchmarkJob(task.JobType) {
if benchmarkGpuInfos == nil {
json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos)
@@ -2472,7 +2515,7 @@ func InferenceCloudBrainJobNew(ctx *context.Context) {
}

// InferenceCloudBrainJobShow renders the detail page for a CloudBrain
// inference job. It delegates to cloudBrainShow with JobTypeInference so the
// task is looked up by job ID (the same path train jobs use) rather than by
// numeric record ID.
func InferenceCloudBrainJobShow(ctx *context.Context) {
	cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeInference)
}

func DownloadInferenceResultFile(ctx *context.Context) {


+ 5
- 5
routers/repo/modelarts.go View File

@@ -119,7 +119,7 @@ func MustEnableModelArts(ctx *context.Context) {

func NotebookNew(ctx *context.Context) {
notebookNewDataPrepare(ctx)
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeDebug)
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
ctx.Data["WaitCount"] = waitCount
ctx.HTML(200, tplModelArtsNotebookNew)
}
@@ -631,7 +631,7 @@ func TrainJobNew(ctx *context.Context) {
ctx.ServerError("get new train-job info failed", err)
return
}
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain)
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
ctx.Data["WaitCount"] = waitCount
ctx.HTML(200, tplModelArtsTrainJobNew)
}
@@ -785,7 +785,7 @@ func TrainJobNewVersion(ctx *context.Context) {
ctx.ServerError("get new train-job info failed", err)
return
}
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain)
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
ctx.Data["WaitCount"] = waitCount
ctx.HTML(200, tplModelArtsTrainJobVersionNew)
}
@@ -2057,7 +2057,7 @@ func InferenceJobIndex(ctx *context.Context) {
PageSize: setting.UI.IssuePagingNum,
},
RepoID: repo.ID,
Type: models.TypeCloudBrainAll,
Type: ctx.QueryInt("type"),
JobTypes: jobTypes,
})
if err != nil {
@@ -2100,7 +2100,7 @@ func InferenceJobNew(ctx *context.Context) {
ctx.ServerError("get new inference-job info failed", err)
return
}
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeInference)
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
ctx.Data["WaitCount"] = waitCount
ctx.HTML(200, tplModelArtsInferenceJobNew)
}


Loading…
Cancel
Save