Browse Source

Merge pull request 'fix-bug' (#948) from liuzx_trainjob into V20211115

Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/948
Reviewed-by: zhoupzh <zhoupzh@pcl.ac.cn>
pull/952/head
zhoupzh 3 years ago
parent
commit
22aa29905b
6 changed files with 124 additions and 43 deletions
  1. +1
    -1
      models/cloudbrain.go
  2. +2
    -2
      modules/storage/obs.go
  3. +1
    -2
      routers/api/v1/repo/modelarts.go
  4. +118
    -36
      routers/repo/modelarts.go
  5. +1
    -1
      templates/repo/modelarts/notebook/show.tmpl
  6. +1
    -1
      templates/repo/modelarts/trainjob/index.tmpl

+ 1
- 1
models/cloudbrain.go View File

@@ -87,7 +87,7 @@ type Cloudbrain struct {
LogUrl string //日志输出的obs路径
PreVersionId int64 //父版本的版本id
FlavorCode string //modelarts上的规格id
Description string //描述
Description string `xorm:"varchar(256)"` //描述
WorkServerNumber int //节点数
FlavorName string //规格名称
EngineName string //引擎名称


+ 2
- 2
modules/storage/obs.go View File

@@ -176,10 +176,10 @@ func ObsModelDownload(JobName string, fileName string) (io.ReadCloser, error) {
}
}

func GetObsListObject(jobName, parentDir string) ([]FileInfo, error) {
func GetObsListObject(jobName, parentDir string, versionOutputPath string) ([]FileInfo, error) {
input := &obs.ListObjectsInput{}
input.Bucket = setting.Bucket
input.Prefix = strings.TrimPrefix(path.Join(setting.TrainJobModelPath, jobName, setting.OutPutPath, parentDir), "/")
input.Prefix = strings.TrimPrefix(path.Join(setting.TrainJobModelPath, jobName, setting.OutPutPath, versionOutputPath, parentDir), "/")
strPrefix := strings.Split(input.Prefix, "/")
output, err := ObsCli.ListObjects(input)
fileInfos := make([]FileInfo, 0)


+ 1
- 2
routers/api/v1/repo/modelarts.go View File

@@ -303,8 +303,7 @@ func ModelList(ctx *context.APIContext) {
return
}
VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(task.TotalVersionCount)
parentDir = VersionOutputPath + "/" + parentDir
models, err := storage.GetObsListObject(task.JobName, parentDir)
models, err := storage.GetObsListObject(task.JobName, parentDir, VersionOutputPath)
if err != nil {
log.Info("get TrainJobListModel failed:", err)
ctx.ServerError("GetObsListObject:", err)


+ 118
- 36
routers/repo/modelarts.go View File

@@ -477,6 +477,94 @@ func trainJobNewVersionDataPrepare(ctx *context.Context) error {
return nil
}

func ErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
ctx.Data["PageIsCloudBrain"] = true
var jobID = ctx.Params(":jobid")
// var versionName = ctx.Params(":version-name")
var versionName = ctx.Query("version_name")

task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
return err
}

t := time.Now()
var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
ctx.Data["job_name"] = task.JobName

attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
if err != nil {
ctx.ServerError("GetAllUserAttachments failed:", err)
return err
}
ctx.Data["attachments"] = attachs

var resourcePools modelarts.ResourcePool
if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["resource_pools"] = resourcePools.Info

var engines modelarts.Engine
if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["engines"] = engines.Info

var versionInfos modelarts.VersionInfo
if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["engine_versions"] = versionInfos.Version

var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["flavor_infos"] = flavorInfos.Info

var Parameters modelarts.Parameters
if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["params"] = Parameters.Parameter

outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
ctx.Data["train_url"] = outputObsPath

Branches, err := ctx.Repo.GitRepo.GetBranches()
if err != nil {
ctx.ServerError("GetBranches error:", err)
return err
}
ctx.Data["branches"] = Branches
ctx.Data["description"] = form.Description
ctx.Data["dataset_name"] = task.DatasetName
ctx.Data["work_server_number"] = form.WorkServerNumber
ctx.Data["flavor_name"] = form.FlavorName
ctx.Data["engine_name"] = form.EngineName
ctx.Data["flavor_code"] = task.FlavorCode
ctx.Data["engine_id"] = task.EngineID

ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["branch_name"] = form.BranchName
configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
if err != nil {
ctx.ServerError("getConfigList failed:", err)
return err
}
ctx.Data["config_list"] = configList.ParaConfigs

return nil
}

func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
ctx.Data["PageIsTrainJob"] = true
VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(modelarts.TotalVersionCount)
@@ -492,6 +580,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
isSaveParam := form.IsSaveParam
repo := ctx.Repo.Repository
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
// codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
@@ -529,13 +618,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
}); err != nil {
log.Error("创建任务失败,服务器超时!: %s (%v)", repo.FullName(), err)
trainJobNewDataPrepare(ctx)

ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
trainJobNewDataPrepare(ctx)
ctx.RenderWithErr("创建任务失败,服务器超时!", tplModelArtsTrainJobNew, &form)
return
}
@@ -555,7 +637,9 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
return
}

// parentDir := VersionOutputPath + "/"
if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
// if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
trainJobNewDataPrepare(ctx)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
@@ -651,6 +735,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
TotalVersionCount: modelarts.TotalVersionCount,
}

//将params转换Parameters.Parameter,出错时返回给前端
var Parameters modelarts.Parameters
if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return
}

err = modelarts.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
@@ -658,7 +749,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.Data["params"] = Parameters.Parameter
ctx.Data["branch_name"] = branch_name
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
return
@@ -689,7 +780,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
isSaveParam := form.IsSaveParam
repo := ctx.Repo.Repository
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
@@ -701,16 +792,16 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ

if err := paramCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
trainJobNewVersionDataPrepare(ctx)
ErrorDataPrepare(ctx, form)
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}

attach, err := models.GetAttachmentByUUID(uuid)
if err != nil {
log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
return
}
// attach, err := models.GetAttachmentByUUID(uuid)
// if err != nil {
// log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
// return
// }

//todo: del the codeLocalPath
_, err = ioutil.ReadDir(codeLocalPath)
@@ -724,13 +815,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
Branch: branch_name,
}); err != nil {
log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)

ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
ErrorDataPrepare(ctx, form)
ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form)
return
}
@@ -738,21 +823,23 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
//todo: upload code (send to file_server todo this work?)
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)
ErrorDataPrepare(ctx, form)
ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
return
}

if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)
ErrorDataPrepare(ctx, form)
ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
return
}

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
parentDir := VersionOutputPath + "/"
// if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)
ErrorDataPrepare(ctx, form)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form)
return
}
@@ -772,7 +859,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
err := json.Unmarshal([]byte(params), &parameters)
if err != nil {
log.Error("Failed to Unmarshal params: %s (%v)", params, err)
trainJobNewVersionDataPrepare(ctx)
ErrorDataPrepare(ctx, form)
ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
return
}
@@ -791,7 +878,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
if isSaveParam == "on" {
if form.ParameterTemplateName == "" {
log.Error("ParameterTemplateName is empty")
trainJobNewVersionDataPrepare(ctx)
ErrorDataPrepare(ctx, form)
ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
return
}
@@ -815,7 +902,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ

if err != nil {
log.Error("Failed to CreateTrainJobConfig: %v", err)
trainJobNewVersionDataPrepare(ctx)
ErrorDataPrepare(ctx, form)
ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}
@@ -862,15 +949,11 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
trainJobNewVersionDataPrepare(ctx)
ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ErrorDataPrepare(ctx, form)
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job/" + jobID)
// ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

@@ -963,7 +1046,6 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {

func TrainJobShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true

var jobID = ctx.Params(":jobid")

repo := ctx.Repo.Repository


+ 1
- 1
templates/repo/modelarts/notebook/show.tmpl View File

@@ -11,7 +11,7 @@
{{.i18n.Tr "repo.cloudbrain"}}
</a>
<div class="divider"> / </div>
<a class="section" href="{{.RepoLink}}/cloudbrain">
<a class="section" href="{{.RepoLink}}/modelarts/notebook">
{{$.i18n.Tr "repo.modelarts.notebook"}}
</a>
<div class="divider"> / </div>


+ 1
- 1
templates/repo/modelarts/trainjob/index.tmpl View File

@@ -328,7 +328,7 @@
</div>
<!-- 版本数量 -->
<div class="one wide column text center padding0">
<span>{{.VersionCount}} </span>
<span style="font-size: 12px;">{{.VersionCount}} </span>
</div>
<!-- 任务状态 -->
<div class="two wide column padding0" style="padding-left: 2.2rem !important;">


Loading…
Cancel
Save