Browse Source

Merge remote-tracking branch 'origin/V20220718' into zouap

pull/2467/head
zouap 2 years ago
parent
commit
036214e88d
10 changed files with 241 additions and 69 deletions
  1. +6
    -5
      models/cloudbrain.go
  2. +64
    -2
      modules/cloudbrain/cloudbrain.go
  3. +7
    -0
      modules/setting/setting.go
  4. +13
    -0
      modules/templates/helper.go
  5. +20
    -4
      routers/api/v1/repo/cloudbrain_dashboard.go
  6. +27
    -15
      routers/api/v1/repo/modelarts.go
  7. +97
    -5
      routers/repo/cloudbrain.go
  8. +6
    -1
      templates/repo/grampus/trainjob/show.tmpl
  9. +1
    -1
      templates/repo/modelarts/trainjob/show.tmpl
  10. +0
    -36
      templates/repo/modelarts/trainjob/version_new.tmpl

+ 6
- 5
models/cloudbrain.go View File

@@ -570,11 +570,12 @@ type SpecialPools struct {
Pools []*SpecialPool `json:"pools"`
}
type SpecialPool struct {
Org string `json:"org"`
Type string `json:"type"`
IsExclusive bool `json:"isExclusive"`
Pool []*GpuInfo `json:"pool"`
JobType []string `json:"jobType"`
Org string `json:"org"`
Type string `json:"type"`
IsExclusive bool `json:"isExclusive"`
Pool []*GpuInfo `json:"pool"`
JobType []string `json:"jobType"`
ResourceSpec []*ResourceSpec `json:"resourceSpecs"`
}

type ImageInfosModelArts struct {


+ 64
- 2
modules/cloudbrain/cloudbrain.go View File

@@ -17,7 +17,7 @@ import (
)

const (
Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
//Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
//CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"`
CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"`
CodeMountPath = "/code"
@@ -42,6 +42,7 @@ const (
var (
ResourceSpecs *models.ResourceSpecs
TrainResourceSpecs *models.ResourceSpecs
SpecialPools *models.SpecialPools
)

type GenerateCloudBrainTaskReq struct {
@@ -70,6 +71,11 @@ type GenerateCloudBrainTaskReq struct {
ResourceSpecId int
}

func GetCloudbrainDebugCommand() string {
var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;/usr/local/bin/python /usr/local/bin/jupyter-lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" `
return command
}

func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
if !ctx.IsSigned {
return false
@@ -222,6 +228,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error {
for _, spec := range TrainResourceSpecs.ResourceSpec {
if req.ResourceSpecId == spec.Id {
resourceSpec = spec
break
}
}
} else {
@@ -231,10 +238,29 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error {
for _, spec := range ResourceSpecs.ResourceSpec {
if req.ResourceSpecId == spec.Id {
resourceSpec = spec
break
}
}

}
//如果没有匹配到spec信息,尝试从专属资源池获取
if resourceSpec == nil && SpecialPools != nil {
for _, specialPool := range SpecialPools.Pools {
if resourceSpec != nil {
break
}
if specialPool.ResourceSpec != nil {
if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) {
for _, spec := range specialPool.ResourceSpec {
if req.ResourceSpecId == spec.Id {
resourceSpec = spec
break
}
}
}
}
}
}

if resourceSpec == nil {
log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"])
@@ -486,7 +512,7 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e
GPUNumber: resourceSpec.GpuNum,
MemoryMB: resourceSpec.MemMiB,
ShmMB: resourceSpec.ShareMemMiB,
Command: Command,
Command: GetCloudbrainDebugCommand(),//Command,
NeedIBDevice: false,
IsMainRole: false,
UseNNI: false,
@@ -538,3 +564,39 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e

return nil
}

func InitSpecialPool() {
if SpecialPools == nil && setting.SpecialPools != "" {
json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools)
}
}

func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool {
if resourceSpecs == nil || len(resourceSpecs) == 0 {
return true
}
for _, v := range resourceSpecs {
if v.Id == resourceSpecId {
return true
}
}
return false
}

func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool {
for _, v := range pool {
if v.Queue == queue {
return true
}
}
return false
}

func IsElementExist(s []string, str string) bool {
for _, v := range s {
if v == str {
return true
}
}
return false
}

+ 7
- 0
modules/setting/setting.go View File

@@ -460,12 +460,15 @@ var (
CBCodePathPrefix string
JobType string
GpuTypes string
SpecialPools string
DebugServerHost string
ResourceSpecs string
MaxDuration int64
TrainGpuTypes string
TrainResourceSpecs string
MaxDatasetNum int
CullIdleTimeout string
CullInterval string

//benchmark config
IsBenchmarkEnabled bool
@@ -1311,7 +1314,11 @@ func NewContext() {
MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400)
TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("")
TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("")
SpecialPools = sec.Key("SPECIAL_POOL").MustString("")
MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5)
CullIdleTimeout = sec.Key("CULL_IDLE_TIMEOUT").MustString("900")
CullInterval = sec.Key("CULL_INTERVAL").MustString("60")

sec = Cfg.Section("benchmark")
IsBenchmarkEnabled = sec.Key("ENABLED").MustBool(false)


+ 13
- 0
modules/templates/helper.go View File

@@ -18,6 +18,7 @@ import (
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
texttmpl "text/template"
"time"
@@ -327,6 +328,7 @@ func NewFuncMap() []template.FuncMap {
},
"GetRefType": GetRefType,
"GetRefName": GetRefName,
"MB2GB": MB2GB,
}}
}

@@ -785,3 +787,14 @@ func GetRefName(ref string) string {
reg := regexp.MustCompile(REF_TYPE_PATTERN)
return reg.ReplaceAllString(ref, "")
}

func MB2GB(size int64) string {
s := strconv.FormatFloat(float64(size)/float64(1024), 'f', 2, 64)
for strings.HasSuffix(s, "0") {
s = strings.TrimSuffix(s, "0")
}
if strings.HasSuffix(s, ".") {
s = strings.TrimSuffix(s, ".")
}
return s
}

+ 20
- 4
routers/api/v1/repo/cloudbrain_dashboard.go View File

@@ -752,10 +752,26 @@ func GetCloudbrainsDetailData(ctx *context.Context) {
taskDetail.RepoAlias = ciTasks[i].Repo.OwnerName + "/" + ciTasks[i].Repo.Alias
}
if ciTasks[i].Cloudbrain.Status == string(models.JobWaiting) {
WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()
taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt)
if WaitTimeInt < 0 {
taskDetail.WaitTime = "00:00:00"
if ciTasks[i].Cloudbrain.DeletedAt != nilTime {
WaitTimeInt := ciTasks[i].Cloudbrain.UpdatedUnix.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()
taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt)
if WaitTimeInt < 0 {
taskDetail.WaitTime = "00:00:00"
}
} else {
if ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 {
WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()
taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt)
if WaitTimeInt < 0 {
taskDetail.WaitTime = "00:00:00"
}
} else {
WaitTimeInt := ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()
taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt)
if WaitTimeInt < 0 {
taskDetail.WaitTime = "00:00:00"
}
}
}
} else if ciTasks[i].Cloudbrain.Status == string(models.JobStopped) && ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 {
WaitTimeInt := ciTasks[i].Cloudbrain.EndTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()


+ 27
- 15
routers/api/v1/repo/modelarts.go View File

@@ -7,8 +7,10 @@ package repo

import (
"code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/setting"
"encoding/json"
"net/http"
"path"
"strconv"
"strings"

@@ -263,39 +265,49 @@ func TrainJobGetLog(ctx *context.APIContext) {
return
}

resultLogFile, result, err := trainJobGetLogContent(jobID, versionName, baseLine, order, lines_int)
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
return
}
resultLogFile, result, err := trainJobGetLogContent(jobID, task.VersionID, baseLine, order, lines_int)
if err != nil {
log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error())
// ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
return
}

prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job"
_, err = storage.GetObsLogFileName(prefix)
var canLogDownload bool
if err != nil {
canLogDownload = false
} else {
canLogDownload = true
}

ctx.Data["log_file_name"] = resultLogFile.LogFileList[0]

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"LogFileName": resultLogFile.LogFileList[0],
"StartLine": result.StartLine,
"EndLine": result.EndLine,
"Content": result.Content,
"Lines": result.Lines,
"JobID": jobID,
"LogFileName": resultLogFile.LogFileList[0],
"StartLine": result.StartLine,
"EndLine": result.EndLine,
"Content": result.Content,
"Lines": result.Lines,
"CanLogDownload": canLogDownload,
})
}

func trainJobGetLogContent(jobID string, versionName string, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
return nil, nil, err
}
func trainJobGetLogContent(jobID string, versionID int64, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {

resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(versionID, 10))
if err != nil {
log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
return nil, nil, err
}

result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, resultLogFile.LogFileList[0], order, lines)
result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(versionID, 10), baseLine, resultLogFile.LogFileList[0], order, lines)
if err != nil {
log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
return nil, nil, err


+ 97
- 5
routers/repo/cloudbrain.go View File

@@ -2,7 +2,6 @@ package repo

import (
"bufio"
"code.gitea.io/gitea/modules/grampus"
"encoding/json"
"errors"
"fmt"
@@ -16,6 +15,8 @@ import (
"time"
"unicode/utf8"

"code.gitea.io/gitea/modules/grampus"

"code.gitea.io/gitea/modules/timeutil"
"github.com/unknwon/i18n"

@@ -135,7 +136,7 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {
}

ctx.Data["attachments"] = attachs
ctx.Data["command"] = cloudbrain.Command
ctx.Data["command"] = cloudbrain.GetCloudbrainDebugCommand()
ctx.Data["code_path"] = cloudbrain.CodeMountPath
ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
ctx.Data["model_path"] = cloudbrain.ModelMountPath
@@ -149,6 +150,8 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {

ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType

cloudbrain.InitSpecialPool()

if gpuInfos == nil {
json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos)
}
@@ -178,6 +181,45 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
}
ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec

if cloudbrain.SpecialPools != nil {
var debugGpuTypes []*models.GpuInfo
var trainGpuTypes []*models.GpuInfo

for _, pool := range cloudbrain.SpecialPools.Pools {
org, _ := models.GetOrgByName(pool.Org)
if org != nil {
isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID)
if isOrgMember {
for _, jobType := range pool.JobType {
if jobType == string(models.JobTypeDebug) {
debugGpuTypes = append(debugGpuTypes, pool.Pool...)
if pool.ResourceSpec != nil {
ctx.Data["resource_specs"] = pool.ResourceSpec
}
} else if jobType == string(models.JobTypeTrain) {
trainGpuTypes = append(trainGpuTypes, pool.Pool...)
if pool.ResourceSpec != nil {
ctx.Data["train_resource_specs"] = pool.ResourceSpec
}
}
}
break
}
}

}

if len(debugGpuTypes) > 0 {
ctx.Data["gpu_types"] = debugGpuTypes
}

if len(trainGpuTypes) > 0 {
ctx.Data["train_gpu_types"] = trainGpuTypes
}

}

ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName

@@ -217,6 +259,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
repo := ctx.Repo.Repository
tpl := tplCloudBrainNew

if jobType == string(models.JobTypeTrain) {
tpl = tplCloudBrainTrainJobNew
}

tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName)
if err == nil {
if len(tasks) != 0 {
@@ -269,7 +315,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
return
}

command := cloudbrain.Command
command := cloudbrain.GetCloudbrainDebugCommand()
if jobType == string(models.JobTypeTrain) {
tpl = tplCloudBrainTrainJobNew
commandTrain, err := getTrainJobCommand(form)
@@ -282,6 +328,14 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
command = commandTrain
}

errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId)

if errStr != "" {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(errStr, tpl, &form)
return
}

if branchName == "" {
branchName = cloudbrain.DefaultBranchName
}
@@ -334,6 +388,42 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
}
}

/**
检查用户传输的参数是否符合专属资源池
*/
func checkCloudBrainSpecialPool(ctx *context.Context, jobType string, queue string, resourceSpecId int) string {
if cloudbrain.SpecialPools != nil {

var isInPoolOrg = false
var matchSpecialPool = false

for _, specialPool := range cloudbrain.SpecialPools.Pools {

if cloudbrain.IsElementExist(specialPool.JobType, jobType) && cloudbrain.IsQueueInSpecialtPool(specialPool.Pool, queue) {
if cloudbrain.IsResourceSpecInSpecialPool(specialPool.ResourceSpec, resourceSpecId) {
matchSpecialPool = true
org, _ := models.GetOrgByName(specialPool.Org)
if org != nil {
isInPoolOrg, _ = models.IsOrganizationMember(org.ID, ctx.User.ID)
if isInPoolOrg {
break //传入参数,和专属资源池匹配上了,检查通过
}
}
}

}

}
//资源池有匹配上,但是用户不在相应的组织中,返回错误信息。界面已经过滤了选择,界面操作不会到这个逻辑
if matchSpecialPool && !isInPoolOrg {
return ctx.Tr("repo.grampus.no_operate_right")
}

}
//没有匹配到资源池或者没有设置专属资源池,检查通过; 获取和资源池完全匹配检查通过
return ""
}

func CloudBrainRestart(ctx *context.Context) {
var ID = ctx.Params(":id")
var resultCode = "0"
@@ -573,7 +663,9 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
if task.TrainJobDuration == "" {
if task.Duration == 0 {
var duration int64
if task.Status == string(models.JobRunning) {
if task.Status == string(models.JobWaiting) {
duration = 0
} else if task.Status == string(models.JobRunning) {
duration = time.Now().Unix() - int64(task.CreatedUnix)
} else {
duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)
@@ -2094,7 +2186,7 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
repo := ctx.Repo.Repository

tpl := tplCloudBrainBenchmarkNew
command := cloudbrain.Command
command := cloudbrain.GetCloudbrainDebugCommand()

tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName)
if err == nil {


+ 6
- 1
templates/repo/grampus/trainjob/show.tmpl View File

@@ -480,8 +480,13 @@

</div>
<div id="dir_list{{.VersionName}}">

</div>
{{if eq .ComputeResource "CPU/GPU"}}
<div style="display:flex;align-items: center;justify-content: end;color: #f2711c;">
<i class="ri-error-warning-line" style="margin-right:0.5rem;"></i>
<span>{{$.i18n.Tr "repo.file_limit_100"}}</span>
</div>
{{end}}
</div>

</div>


+ 1
- 1
templates/repo/modelarts/trainjob/show.tmpl View File

@@ -488,7 +488,7 @@
<div class="ui tab" data-tab="second{{$k}}">
<div>
<a id="{{.VersionName}}-log-down"
class='{{if and (.CanModify) (eq .Status "KILLED" "FAILED" "START_FAILED" "STOPPED" "COMPLETED") }}ti-download-file{{else}}disabled{{end}}'
class='{{if and ($.CanLogDownload) (eq .Status "KILLED" "FAILED" "START_FAILED" "STOPPED" "COMPLETED") }}ti-download-file{{else}}disabled{{end}}'
href="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/download_log_file?version_name={{.VersionName}}">
<i class="ri-download-cloud-2-line"></i>
<span style="margin-left: 0.3rem;">{{$.i18n.Tr "repo.modelarts.download_log"}}</span>


+ 0
- 36
templates/repo/modelarts/trainjob/version_new.tmpl View File

@@ -446,24 +446,6 @@
]

},
work_server_number: {
identifier : 'work_server_number',
rules: [
{
type : 'integer[1..25]',
prompt : '计算节点需要在1-25之间,请您键入正确的值'
}
]
},
run_para_list:{
identifier : 'run_para_list',
rules: [
{
type: 'maxLength[255]',
prompt : '所有字符最长不超过255个字符。'
}
]
},
},
})

@@ -512,24 +494,6 @@
]

},
work_server_number: {
identifier : 'work_server_number',
rules: [
{
type : 'integer[1..25]',
prompt : '计算节点需要在1-25之间,请您键入正确的值'
}
]
},
run_para_list:{
identifier : 'run_para_list',
rules: [
{
type: 'maxLength[255]',
prompt : '所有字符最长不超过255个字符。'
}
]
},
},
onSuccess: function(){
// $('.ui.page.dimmer').dimmer('show')


Loading…
Cancel
Save