diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 8e1b94a97..ea1826de0 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -570,11 +570,12 @@ type SpecialPools struct { Pools []*SpecialPool `json:"pools"` } type SpecialPool struct { - Org string `json:"org"` - Type string `json:"type"` - IsExclusive bool `json:"isExclusive"` - Pool []*GpuInfo `json:"pool"` - JobType []string `json:"jobType"` + Org string `json:"org"` + Type string `json:"type"` + IsExclusive bool `json:"isExclusive"` + Pool []*GpuInfo `json:"pool"` + JobType []string `json:"jobType"` + ResourceSpec []*ResourceSpec `json:"resourceSpecs"` } type ImageInfosModelArts struct { diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 430304dd5..5d6948acb 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -17,7 +17,7 @@ import ( ) const ( - Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` + //Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` CodeMountPath = "/code" @@ -42,6 +42,7 @@ const ( var ( ResourceSpecs *models.ResourceSpecs TrainResourceSpecs *models.ResourceSpecs + SpecialPools *models.SpecialPools ) type GenerateCloudBrainTaskReq struct { @@ -70,6 +71,11 @@ type GenerateCloudBrainTaskReq struct { ResourceSpecId int } +func GetCloudbrainDebugCommand() string { + var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;/usr/local/bin/python /usr/local/bin/jupyter-lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" ` + return command +} + func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool { if !ctx.IsSigned { return false @@ -222,6 +228,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { for _, spec := range TrainResourceSpecs.ResourceSpec { if req.ResourceSpecId == spec.Id { resourceSpec = spec + break } } } else { @@ -231,10 +238,29 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { for _, spec := range ResourceSpecs.ResourceSpec { if req.ResourceSpecId == spec.Id { resourceSpec = spec + break } } } + //如果没有匹配到spec信息,尝试从专属资源池获取 + if resourceSpec == nil && SpecialPools != nil { + for _, specialPool := range SpecialPools.Pools { + if resourceSpec != nil { + break + } + if specialPool.ResourceSpec != nil { + if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) { + for _, spec := range specialPool.ResourceSpec { + if req.ResourceSpecId == spec.Id { + resourceSpec = spec + break + } + } + } + } + } + } if resourceSpec == nil { log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"]) @@ -486,7 +512,7 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e GPUNumber: resourceSpec.GpuNum, MemoryMB: resourceSpec.MemMiB, ShmMB: resourceSpec.ShareMemMiB, - Command: Command, + Command: GetCloudbrainDebugCommand(),//Command, NeedIBDevice: false, IsMainRole: false, UseNNI: false, @@ -538,3 +564,39 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e return nil } + +func InitSpecialPool() { + if SpecialPools == nil && setting.SpecialPools != "" { + json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools) + } +} + +func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool { + if resourceSpecs == nil || len(resourceSpecs) == 0 { + return true + } + for _, v := range resourceSpecs { + if v.Id == resourceSpecId { + return true + } + } + return false +} + +func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool { + for _, v := range pool { + if v.Queue == queue { + return true + } + } + return false +} + +func IsElementExist(s []string, str string) bool { + for _, v := range s { + if v == str { + return true + } + } + return false +} diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 506e31ba3..e9aafa3d3 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -460,12 +460,15 @@ var ( CBCodePathPrefix string JobType string GpuTypes string + SpecialPools string DebugServerHost string ResourceSpecs string MaxDuration int64 TrainGpuTypes string TrainResourceSpecs string MaxDatasetNum int + CullIdleTimeout string + CullInterval string //benchmark config IsBenchmarkEnabled bool @@ -1311,7 +1314,11 @@ func NewContext() { MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") + SpecialPools = sec.Key("SPECIAL_POOL").MustString("") + MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) + CullIdleTimeout = sec.Key("CULL_IDLE_TIMEOUT").MustString("900") + CullInterval = sec.Key("CULL_INTERVAL").MustString("60") sec = Cfg.Section("benchmark") IsBenchmarkEnabled = sec.Key("ENABLED").MustBool(false) diff --git a/modules/templates/helper.go b/modules/templates/helper.go index dbb9354aa..857e365f8 100755 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -18,6 +18,7 @@ import ( "path/filepath" "regexp" "runtime" + "strconv" "strings" texttmpl "text/template" "time" @@ -327,6 +328,7 @@ func NewFuncMap() []template.FuncMap { }, "GetRefType": GetRefType, "GetRefName": GetRefName, + "MB2GB": MB2GB, }} } @@ -785,3 +787,14 @@ func GetRefName(ref string) string { reg := regexp.MustCompile(REF_TYPE_PATTERN) return reg.ReplaceAllString(ref, "") } + +func MB2GB(size int64) string { + s := strconv.FormatFloat(float64(size)/float64(1024), 'f', 2, 64) + for strings.HasSuffix(s, "0") { + s = strings.TrimSuffix(s, "0") + } + if strings.HasSuffix(s, ".") { + s = strings.TrimSuffix(s, ".") + } + return s +} diff --git a/routers/api/v1/repo/cloudbrain_dashboard.go b/routers/api/v1/repo/cloudbrain_dashboard.go index cc125c97f..eb86a8293 100755 --- a/routers/api/v1/repo/cloudbrain_dashboard.go +++ b/routers/api/v1/repo/cloudbrain_dashboard.go @@ -752,10 +752,26 @@ func GetCloudbrainsDetailData(ctx *context.Context) { taskDetail.RepoAlias = ciTasks[i].Repo.OwnerName + "/" + ciTasks[i].Repo.Alias } if ciTasks[i].Cloudbrain.Status == string(models.JobWaiting) { - WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() - taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) - if WaitTimeInt < 0 { - taskDetail.WaitTime = "00:00:00" + if ciTasks[i].Cloudbrain.DeletedAt != nilTime { + WaitTimeInt := ciTasks[i].Cloudbrain.UpdatedUnix.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() + taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) + if WaitTimeInt < 0 { + taskDetail.WaitTime = "00:00:00" + } + } else { + if ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 { + WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() + taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) + if WaitTimeInt < 0 { + taskDetail.WaitTime = "00:00:00" + } + } else { + WaitTimeInt := ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() + taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) + if WaitTimeInt < 0 { + taskDetail.WaitTime = "00:00:00" + } + } } } else if ciTasks[i].Cloudbrain.Status == string(models.JobStopped) && ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 { WaitTimeInt := ciTasks[i].Cloudbrain.EndTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index 2a0ce19db..751e240bf 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -7,8 +7,10 @@ package repo import ( "code.gitea.io/gitea/modules/grampus" + "code.gitea.io/gitea/modules/setting" "encoding/json" "net/http" + "path" "strconv" "strings" @@ -263,39 +265,49 @@ func TrainJobGetLog(ctx *context.APIContext) { return } - resultLogFile, result, err := trainJobGetLogContent(jobID, versionName, baseLine, order, lines_int) + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) + return + } + resultLogFile, result, err := trainJobGetLogContent(jobID, task.VersionID, baseLine, order, lines_int) if err != nil { log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error()) // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) return } + prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job" + _, err = storage.GetObsLogFileName(prefix) + var canLogDownload bool + if err != nil { + canLogDownload = false + } else { + canLogDownload = true + } + ctx.Data["log_file_name"] = resultLogFile.LogFileList[0] ctx.JSON(http.StatusOK, map[string]interface{}{ - "JobID": jobID, - "LogFileName": resultLogFile.LogFileList[0], - "StartLine": result.StartLine, - "EndLine": result.EndLine, - "Content": result.Content, - "Lines": result.Lines, + "JobID": jobID, + "LogFileName": resultLogFile.LogFileList[0], + "StartLine": result.StartLine, + "EndLine": result.EndLine, + "Content": result.Content, + "Lines": result.Lines, + "CanLogDownload": canLogDownload, }) } -func trainJobGetLogContent(jobID string, versionName string, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) { - task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) - if err != nil { - log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) - return nil, nil, err - } +func trainJobGetLogContent(jobID string, versionID int64, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) { - resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10)) + resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(versionID, 10)) if err != nil { log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error()) return nil, nil, err } - result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, resultLogFile.LogFileList[0], order, lines) + result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(versionID, 10), baseLine, resultLogFile.LogFileList[0], order, lines) if err != nil { log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error()) return nil, nil, err diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 5a3d0a6f8..4eb810d21 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -2,7 +2,6 @@ package repo import ( "bufio" - "code.gitea.io/gitea/modules/grampus" "encoding/json" "errors" "fmt" @@ -16,6 +15,8 @@ import ( "time" "unicode/utf8" + "code.gitea.io/gitea/modules/grampus" + "code.gitea.io/gitea/modules/timeutil" "github.com/unknwon/i18n" @@ -135,7 +136,7 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { } ctx.Data["attachments"] = attachs - ctx.Data["command"] = cloudbrain.Command + ctx.Data["command"] = cloudbrain.GetCloudbrainDebugCommand() ctx.Data["code_path"] = cloudbrain.CodeMountPath ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath ctx.Data["model_path"] = cloudbrain.ModelMountPath @@ -149,6 +150,8 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType + cloudbrain.InitSpecialPool() + if gpuInfos == nil { json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) } @@ -178,6 +181,45 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) } ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec + + if cloudbrain.SpecialPools != nil { + var debugGpuTypes []*models.GpuInfo + var trainGpuTypes []*models.GpuInfo + + for _, pool := range cloudbrain.SpecialPools.Pools { + org, _ := models.GetOrgByName(pool.Org) + if org != nil { + isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID) + if isOrgMember { + for _, jobType := range pool.JobType { + if jobType == string(models.JobTypeDebug) { + debugGpuTypes = append(debugGpuTypes, pool.Pool...) + if pool.ResourceSpec != nil { + ctx.Data["resource_specs"] = pool.ResourceSpec + } + } else if jobType == string(models.JobTypeTrain) { + trainGpuTypes = append(trainGpuTypes, pool.Pool...) + if pool.ResourceSpec != nil { + ctx.Data["train_resource_specs"] = pool.ResourceSpec + } + } + } + break + } + } + + } + + if len(debugGpuTypes) > 0 { + ctx.Data["gpu_types"] = debugGpuTypes + } + + if len(trainGpuTypes) > 0 { + ctx.Data["train_gpu_types"] = trainGpuTypes + } + + } + ctx.Data["params"] = "" ctx.Data["branchName"] = ctx.Repo.BranchName @@ -217,6 +259,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { repo := ctx.Repo.Repository tpl := tplCloudBrainNew + if jobType == string(models.JobTypeTrain) { + tpl = tplCloudBrainTrainJobNew + } + tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) if err == nil { if len(tasks) != 0 { @@ -269,7 +315,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { return } - command := cloudbrain.Command + command := cloudbrain.GetCloudbrainDebugCommand() if jobType == string(models.JobTypeTrain) { tpl = tplCloudBrainTrainJobNew commandTrain, err := getTrainJobCommand(form) @@ -282,6 +328,14 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { command = commandTrain } + errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId) + + if errStr != "" { + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr(errStr, tpl, &form) + return + } + if branchName == "" { branchName = cloudbrain.DefaultBranchName } @@ -334,6 +388,42 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { } } +/** + 检查用户传输的参数是否符合专属资源池 +*/ +func checkCloudBrainSpecialPool(ctx *context.Context, jobType string, queue string, resourceSpecId int) string { + if cloudbrain.SpecialPools != nil { + + var isInPoolOrg = false + var matchSpecialPool = false + + for _, specialPool := range cloudbrain.SpecialPools.Pools { + + if cloudbrain.IsElementExist(specialPool.JobType, jobType) && cloudbrain.IsQueueInSpecialtPool(specialPool.Pool, queue) { + if cloudbrain.IsResourceSpecInSpecialPool(specialPool.ResourceSpec, resourceSpecId) { + matchSpecialPool = true + org, _ := models.GetOrgByName(specialPool.Org) + if org != nil { + isInPoolOrg, _ = models.IsOrganizationMember(org.ID, ctx.User.ID) + if isInPoolOrg { + break //传入参数,和专属资源池匹配上了,检查通过 + } + } + } + + } + + } + //资源池有匹配上,但是用户不在相应的组织中,返回错误信息。界面已经过滤了选择,界面操作不会到这个逻辑 + if matchSpecialPool && !isInPoolOrg { + return ctx.Tr("repo.grampus.no_operate_right") + } + + } + //没有匹配到资源池或者没有设置专属资源池,检查通过; 获取和资源池完全匹配检查通过 + return "" +} + func CloudBrainRestart(ctx *context.Context) { var ID = ctx.Params(":id") var resultCode = "0" @@ -573,7 +663,9 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo if task.TrainJobDuration == "" { if task.Duration == 0 { var duration int64 - if task.Status == string(models.JobRunning) { + if task.Status == string(models.JobWaiting) { + duration = 0 + } else if task.Status == string(models.JobRunning) { duration = time.Now().Unix() - int64(task.CreatedUnix) } else { duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix) @@ -2094,7 +2186,7 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) repo := ctx.Repo.Repository tpl := tplCloudBrainBenchmarkNew - command := cloudbrain.Command + command := cloudbrain.GetCloudbrainDebugCommand() tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) if err == nil { diff --git a/templates/repo/grampus/trainjob/show.tmpl b/templates/repo/grampus/trainjob/show.tmpl index 579e83693..18c9cc427 100755 --- a/templates/repo/grampus/trainjob/show.tmpl +++ b/templates/repo/grampus/trainjob/show.tmpl @@ -480,8 +480,13 @@
-
+ {{if eq .ComputeResource "CPU/GPU"}} +
+ + {{$.i18n.Tr "repo.file_limit_100"}} +
+ {{end}} diff --git a/templates/repo/modelarts/trainjob/show.tmpl b/templates/repo/modelarts/trainjob/show.tmpl index bd542d912..9c2a91921 100755 --- a/templates/repo/modelarts/trainjob/show.tmpl +++ b/templates/repo/modelarts/trainjob/show.tmpl @@ -488,7 +488,7 @@
{{$.i18n.Tr "repo.modelarts.download_log"}} diff --git a/templates/repo/modelarts/trainjob/version_new.tmpl b/templates/repo/modelarts/trainjob/version_new.tmpl index b7fcd36ad..886469d4c 100644 --- a/templates/repo/modelarts/trainjob/version_new.tmpl +++ b/templates/repo/modelarts/trainjob/version_new.tmpl @@ -446,24 +446,6 @@ ] }, - work_server_number: { - identifier : 'work_server_number', - rules: [ - { - type : 'integer[1..25]', - prompt : '计算节点需要在1-25之间,请您键入正确的值' - } - ] - }, - run_para_list:{ - identifier : 'run_para_list', - rules: [ - { - type: 'maxLength[255]', - prompt : '所有字符最长不超过255个字符。' - } - ] - }, }, }) @@ -512,24 +494,6 @@ ] }, - work_server_number: { - identifier : 'work_server_number', - rules: [ - { - type : 'integer[1..25]', - prompt : '计算节点需要在1-25之间,请您键入正确的值' - } - ] - }, - run_para_list:{ - identifier : 'run_para_list', - rules: [ - { - type: 'maxLength[255]', - prompt : '所有字符最长不超过255个字符。' - } - ] - }, }, onSuccess: function(){ // $('.ui.page.dimmer').dimmer('show')