Browse Source

Merge pull request 'check bootfile' (#2672) from fix-2559 into V20220815

Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/2672
Reviewed-by: lewis <747342561@qq.com>
pull/2695/head
lewis 2 years ago
parent
commit
51dc66c7de
5 changed files with 71 additions and 12 deletions
  1. +1
    -0
      options/locale/locale_en-US.ini
  2. +1
    -0
      options/locale/locale_zh-CN.ini
  3. +17
    -0
      routers/repo/cloudbrain.go
  4. +16
    -0
      routers/repo/grampus.go
  5. +36
    -12
      routers/repo/modelarts.go

+ 1
- 0
options/locale/locale_en-US.ini View File

@@ -1073,6 +1073,7 @@ cloudbrain_operate = Operate
cloudbrain_status_createtime = Status/Createtime
cloudbrain_status_runtime = Running Time
cloudbrain_jobname_err=Name must start with a lowercase letter or number,can include lowercase letter,number,_ and -,can not end with _, and can be up to 36 characters long.
cloudbrain_bootfile_err=The bootfile does not exist in the repository
cloudbrain_query_fail=Failed to query cloudbrain information.
cloudbrain.mirror_tag = Mirror Tag
cloudbrain.mirror_description = Mirror Description


+ 1
- 0
options/locale/locale_zh-CN.ini View File

@@ -1076,6 +1076,7 @@ cloudbrain_operate=操作
cloudbrain_status_createtime=状态/创建时间
cloudbrain_status_runtime = 运行时长
cloudbrain_jobname_err=只能以小写字母或数字开头且只包含小写字母、数字、_和-,不能以_结尾,最长36个字符。
cloudbrain_bootfile_err=仓库中不存在启动文件
cloudbrain_query_fail=查询云脑任务失败。
cloudbrain.mirror_tag = 镜像标签
cloudbrain.mirror_description = 镜像描述


+ 17
- 0
routers/repo/cloudbrain.go View File

@@ -239,6 +239,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
resourceSpecId := form.ResourceSpecId
branchName := form.BranchName
bootFile := strings.TrimSpace(form.BootFile)
repo := ctx.Repo.Repository
tpl := tplCloudBrainNew

@@ -305,6 +306,13 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {

command := cloudbrain.GetCloudbrainDebugCommand()
if jobType == string(models.JobTypeTrain) {
bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
return
}
tpl = tplCloudBrainTrainJobNew
commandTrain, err := getTrainJobCommand(form)
if err != nil {
@@ -413,6 +421,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
resourceSpecId := form.ResourceSpecId
branchName := form.BranchName
bootFile := strings.TrimSpace(form.BootFile)
labelName := form.LabelName
repo := ctx.Repo.Repository

@@ -450,6 +459,14 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
return
}

bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
return
}

count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, jobType)
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])


+ 16
- 0
routers/repo/grampus.go View File

@@ -215,6 +215,14 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
return
}

bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplGrampusTrainJobGPUNew, &form)
return
}

errStr := checkSpecialPool(ctx, "GPU")
if errStr != "" {
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
@@ -399,6 +407,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
return
}

bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplGrampusTrainJobNPUNew, &form)
return
}

errStr := checkSpecialPool(ctx, "NPU")
if errStr != "" {
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)


+ 36
- 12
routers/repo/modelarts.go View File

@@ -1089,7 +1089,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
// dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branch_name := form.BranchName
branchName := form.BranchName
isLatestVersion := modelarts.IsLatestVersion
FlavorName := form.FlavorName
VersionCount := modelarts.VersionCount
@@ -1117,6 +1117,14 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
return
}

bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err)
trainJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsTrainJobNew, &form)
return
}

errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
if errStr != "" {
trainJobErrorNewDataPrepare(ctx, form)
@@ -1148,9 +1156,9 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
}

gitRepo, _ := git.OpenRepository(repo.RepoPath())
commitID, _ := gitRepo.GetBranchCommitID(branch_name)
commitID, _ := gitRepo.GetBranchCommitID(branchName)

if err := downloadCode(repo, codeLocalPath, branch_name); err != nil {
if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err)
trainJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobNew, &form)
@@ -1292,7 +1300,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
Parameters: param,
CommitID: commitID,
IsLatestVersion: isLatestVersion,
BranchName: branch_name,
BranchName: branchName,
Params: form.Params,
FlavorName: FlavorName,
EngineName: EngineName,
@@ -1394,7 +1402,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
// dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branch_name := form.BranchName
branchName := form.BranchName
PreVersionName := form.VersionName
FlavorName := form.FlavorName
EngineName := form.EngineName
@@ -1414,6 +1422,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
return
}

bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err)
versionErrorDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsTrainJobVersionNew, &form)
return
}

errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
if errStr != "" {
versionErrorDataPrepare(ctx, form)
@@ -1428,8 +1444,8 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
}

gitRepo, _ := git.OpenRepository(repo.RepoPath())
commitID, _ := gitRepo.GetBranchCommitID(branch_name)
if err := downloadCode(repo, codeLocalPath, branch_name); err != nil {
commitID, _ := gitRepo.GetBranchCommitID(branchName)
if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
log.Error("Failed git clone repo to local(!: %s (%v)", repo.FullName(), err)
versionErrorDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobVersionNew, &form)
@@ -1582,7 +1598,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
Parameters: param,
PreVersionId: task.VersionID,
CommitID: commitID,
BranchName: branch_name,
BranchName: branchName,
FlavorName: FlavorName,
EngineName: EngineName,
PreVersionName: PreVersionName,
@@ -2025,7 +2041,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branch_name := form.BranchName
branchName := form.BranchName
FlavorName := form.FlavorName
EngineName := form.EngineName
LabelName := form.LabelName
@@ -2060,6 +2076,14 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
return
}

bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err)
inferenceJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsInferenceJobNew, &form)
return
}

//Determine whether the task name of the task in the project is duplicated
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeInference), displayJobName)
if err == nil {
@@ -2092,9 +2116,9 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
}

gitRepo, _ := git.OpenRepository(repo.RepoPath())
commitID, _ := gitRepo.GetBranchCommitID(branch_name)
commitID, _ := gitRepo.GetBranchCommitID(branchName)

if err := downloadCode(repo, codeLocalPath, branch_name); err != nil {
if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
inferenceJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsInferenceJobNew, &form)
@@ -2178,7 +2202,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
Uuid: uuid,
Parameters: param, //modelarts train parameters
CommitID: commitID,
BranchName: branch_name,
BranchName: branchName,
Params: form.Params,
FlavorName: FlavorName,
EngineName: EngineName,


Loading…
Cancel
Save