Browse Source

Merge remote-tracking branch 'origin/liuzx_trainjob' into zouap

pull/1036/head
zouap 3 years ago
parent
commit
a8e3c130fd
4 changed files with 208 additions and 155 deletions
  1. +14
    -2
      models/cloudbrain.go
  2. +1
    -0
      modules/auth/modelarts.go
  3. +88
    -67
      modules/modelarts/modelarts.go
  4. +105
    -86
      routers/repo/modelarts.go

+ 14
- 2
models/cloudbrain.go View File

@@ -69,8 +69,8 @@ type Cloudbrain struct {
CanDel bool `xorm:"-"`
Type int `xorm:"INDEX DEFAULT 0"`

VersionID int64 `xorm:"INDEX DEFAULT 0"`
VersionName string
VersionID int64 `xorm:"INDEX DEFAULT 0"`
VersionName string `xorm:"INDEX"`
Uuid string
DatasetName string
VersionCount int64 `xorm:"INDEX DEFAULT 1"`
@@ -80,6 +80,18 @@ type Cloudbrain struct {
ComputeResource string
EngineID int64

TrainUrl string
BranchName string
Parameters string
BootFile string
DataUrl string
LogUrl string
PreVersionId int64
FlavorCode string
Description string
WorkServerNumber int
FlavorName string

User *User `xorm:"-"`
Repo *Repository `xorm:"-"`
}


+ 1
- 0
modules/auth/modelarts.go View File

@@ -40,6 +40,7 @@ type CreateModelArtsTrainJobForm struct {
PrameterDescription string `form:"parameter_description"`
BranchName string `form:"branch_name" binding:"Required"`
VersionName string `form:"version_name" binding:"Required"`
FlavorName string `form:"flavor_name" binding:"Required"`
}

func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {


+ 88
- 67
modules/modelarts/modelarts.go View File

@@ -35,19 +35,20 @@ const (
// "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
// "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
// "]}"
CodePath = "/code/"
OutputPath = "/output/"
LogPath = "/log/"
JobPath = "/job/"
OrderDesc = "desc" //向下查询
OrderAsc = "asc" //向上查询
Lines = 20
TrainUrl = "train_url"
DataUrl = "data_url"
PerPage = 10
IsLatestVersion = "1"
NotLatestVersion = "0"
ComputeResource = "NPU"
CodePath = "/code/"
OutputPath = "/output/"
LogPath = "/log/"
JobPath = "/job/"
OrderDesc = "desc" //向下查询
OrderAsc = "asc" //向上查询
Lines = 20
TrainUrl = "train_url"
DataUrl = "data_url"
PerPage = 10
IsLatestVersion = "1"
NotLatestVersion = "0"
ComputeResource = "NPU"
InitFatherVersionName = "V0001"

SortByCreateTime = "create_time"
ConfigTypeCustom = "custom"
@@ -59,21 +60,25 @@ var (
)

type GenerateTrainJobReq struct {
JobName string
Uuid string
Description string
CodeObsPath string
BootFile string
DataUrl string
TrainUrl string
FlavorCode string
LogUrl string
PoolID string
WorkServerNumber int
EngineID int64
Parameters []models.Parameter
CommitID string
IsLatestVersion string
JobName string
Uuid string
Description string
CodeObsPath string
BootFile string
DataUrl string
TrainUrl string
FlavorCode string
LogUrl string
PoolID string
WorkServerNumber int
EngineID int64
Parameters []models.Parameter
CommitID string
IsLatestVersion string
Params string
BranchName string
FatherVersionName string
FlavorName string
}

type GenerateTrainJobVersionReq struct {
@@ -90,8 +95,11 @@ type GenerateTrainJobVersionReq struct {
WorkServerNumber int
EngineID int64
Parameters []models.Parameter
Params string
PreVersionId int64
CommitID string
BranchName string
FlavorName string
}

type VersionInfo struct {
@@ -193,7 +201,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error
return nil
}

func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobresult *models.CreateTrainJobResult, err error) {
func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
jobResult, err := createTrainJob(models.CreateTrainJobParams{
JobName: req.JobName,
Description: req.Description,
@@ -215,42 +223,53 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobresult
})
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return nil, err
return err
}

attach, err := models.GetAttachmentByUUID(req.Uuid)
if err != nil {
log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
return nil, err
return err
}

err = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobName: req.JobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: attach.Name,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
ComputeResource: ComputeResource,
EngineID: req.EngineID,
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobName: req.JobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: attach.Name,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
ComputeResource: ComputeResource,
EngineID: req.EngineID,
FatherVersionName: req.FatherVersionName,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,
Parameters: req.Params,
BootFile: req.BootFile,
DataUrl: req.DataUrl,
LogUrl: req.LogUrl,
FlavorCode: req.FlavorCode,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
})

if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
return nil, err
return err
}

return jobResult, nil
return nil
}

func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string, fatherVersionName string) (jobresult *models.CreateTrainJobResult, err error) {
func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string, fatherVersionName string) (err error) {
jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{
Description: req.Description,
Config: models.TrainJobVersionConfig{
@@ -271,13 +290,13 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR
}, jobId)
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return nil, err
return err
}

attach, err := models.GetAttachmentByUUID(req.Uuid)
if err != nil {
log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
return nil, err
return err
}

err = models.CreateCloudbrain(&models.Cloudbrain{
@@ -296,10 +315,21 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR
FatherVersionName: fatherVersionName,
ComputeResource: ComputeResource,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,
Parameters: req.Params,
BootFile: req.BootFile,
DataUrl: req.DataUrl,
LogUrl: req.LogUrl,
PreVersionId: req.PreVersionId,
FlavorCode: req.FlavorCode,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
})
if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
return nil, err
return err
}

repo := ctx.Repo.Repository
@@ -319,38 +349,29 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
return nil, err
return err
}

//将训练任务的上一版本的isLatestVersion设置为"0"
latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(strconv.FormatInt(jobResult.JobID, 10), IsLatestVersion)
if err != nil {
ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
return nil, err
return err
}
err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), latestTask.VersionName, VersionListCount, NotLatestVersion)
if err != nil {
ctx.ServerError("UpdateJobVersionCount failed", err)
return nil, err
return err
}

// lastVersionNum := jobResult.VersionName[1:]
// lastVersionNumToInt64, err := strconv.ParseInt(lastVersionNum, 10, 64)
// if err != nil {
// ctx.ServerError("lastVersionNumToInt64 faild:", err)
// return nil
// }
// lastVersionName := "V" + strconv.FormatInt(lastVersionNumToInt64-1, 10)
//将训练任务的本版本的isLatestVersion设置为"0"

//将当前版本的isLatestVersion和任务数量更新
//将当前版本的isLatestVersion设置为"1"和任务数量更新
err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), jobResult.VersionName, VersionListCount, IsLatestVersion)
if err != nil {
ctx.ServerError("UpdateJobVersionCount failed", err)
return nil, err
return err
}

return jobResult, err
return err
}

func TransTrainJobStatus(status int) string {


+ 105
- 86
routers/repo/modelarts.go View File

@@ -620,12 +620,17 @@ func TrainJobNewVersion(ctx *context.Context) {
func trainJobNewVersionDataPrepare(ctx *context.Context) error {
ctx.Data["PageIsCloudBrain"] = true
var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("versionName")
jobID = "19373"
var versionName = ctx.Query("version_name")

task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
return err
}

t := time.Now()
var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
ctx.Data["job_name"] = jobName
ctx.Data["job_name"] = task.JobName

attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
if err != nil {
@@ -670,10 +675,14 @@ func trainJobNewVersionDataPrepare(ctx *context.Context) error {
ctx.ServerError("GetBranches error:", err)
return err
}
ctx.Data["Branches"] = Branches
ctx.Data["BranchesCount"] = len(Branches)
ctx.Data["jobID"] = jobID
ctx.Data["versionName"] = versionName
ctx.Data["branches"] = Branches
ctx.Data["branch_name"] = task.BranchName
ctx.Data["description"] = task.Description
ctx.Data["boot_file"] = task.BootFile
ctx.Data["dataset_name"] = task.DatasetName
ctx.Data["params"] = task.Parameters
ctx.Data["work_server_number"] = task.WorkServerNumber
ctx.Data["flavor_name"] = task.FlavorName

configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
if err != nil {
@@ -705,6 +714,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branch_name := form.BranchName
isLatestVersion := modelarts.IsLatestVersion
FlavorName := form.FlavorName

if err := paramCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
@@ -833,24 +843,28 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
}

req := &modelarts.GenerateTrainJobReq{
JobName: jobName,
DataUrl: dataPath,
Description: description,
CodeObsPath: codeObsPath,
BootFile: codeObsPath + bootFile,
TrainUrl: outputObsPath,
FlavorCode: flavorCode,
WorkServerNumber: workServerNumber,
EngineID: int64(engineID),
LogUrl: logObsPath,
PoolID: poolID,
Uuid: uuid,
Parameters: parameters.Parameter,
CommitID: commitID,
IsLatestVersion: isLatestVersion,
}

jobResult, err := modelarts.GenerateTrainJob(ctx, req)
JobName: jobName,
DataUrl: dataPath,
Description: description,
CodeObsPath: codeObsPath,
BootFile: codeObsPath + bootFile,
TrainUrl: outputObsPath,
FlavorCode: flavorCode,
WorkServerNumber: workServerNumber,
EngineID: int64(engineID),
LogUrl: logObsPath,
PoolID: poolID,
Uuid: uuid,
Parameters: parameters.Parameter,
CommitID: commitID,
IsLatestVersion: isLatestVersion,
BranchName: branch_name,
Params: form.Params,
FatherVersionName: modelarts.InitFatherVersionName,
FlavorName: FlavorName,
}

err = modelarts.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
trainJobNewDataPrepare(ctx)
@@ -862,34 +876,34 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
return
}
// 保存openi创建训练任务界面的参数
err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{

JobName: req.JobName,
JobID: strconv.FormatInt(jobResult.JobID, 10),
VersionName: jobResult.VersionName,
ResourcePools: form.PoolID,
EngineVersions: form.EngineID,
FlavorInfos: form.Flavor,
TrainUrl: outputObsPath,
BootFile: form.BootFile,
Uuid: form.Attachment,
DatasetName: attach.Name,
Params: form.Params,
BranchName: branch_name,
})
// // 保存openi创建训练任务界面的参数
// err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{

if err != nil {
log.Error("CreateTrainjobConfigDetail failed:%v", err.Error())
trainJobNewVersionDataPrepare(ctx)
ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}
// JobName: req.JobName,
// JobID: strconv.FormatInt(jobResult.JobID, 10),
// VersionName: jobResult.VersionName,
// ResourcePools: form.PoolID,
// EngineVersions: form.EngineID,
// FlavorInfos: form.Flavor,
// TrainUrl: outputObsPath,
// BootFile: form.BootFile,
// Uuid: form.Attachment,
// DatasetName: attach.Name,
// Params: form.Params,
// BranchName: branch_name,
// })

// if err != nil {
// log.Error("CreateTrainjobConfigDetail failed:%v", err.Error())
// trainJobNewVersionDataPrepare(ctx)
// ctx.Data["bootFile"] = form.BootFile
// ctx.Data["uuid"] = form.Attachment
// ctx.Data["datasetName"] = attach.Name
// ctx.Data["params"] = form.Params
// ctx.Data["branch_name"] = branch_name
// ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
// return
// }
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

@@ -918,6 +932,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branch_name := form.BranchName
fatherVersionName := form.VersionName
FlavorName := form.FlavorName

if err := paramCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
@@ -1063,11 +1078,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
LogUrl: logObsPath,
PoolID: poolID,
Uuid: uuid,
Parameters: parameters.Parameter,
Params: form.Params,
PreVersionId: task.VersionID,
CommitID: commitID,
BranchName: branch_name,
FlavorName: FlavorName,
}
jobResult, err := modelarts.GenerateTrainJobVersion(ctx, req, jobID, fatherVersionName)
err = modelarts.GenerateTrainJobVersion(ctx, req, jobID, fatherVersionName)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
trainJobNewVersionDataPrepare(ctx)
@@ -1079,33 +1096,33 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
return
}
// 保存openi创建训练任务界面的参数
err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{
JobName: req.JobName,
JobID: strconv.FormatInt(jobResult.JobID, 10),
VersionName: jobResult.VersionName,
ResourcePools: form.PoolID,
EngineVersions: form.EngineID,
FlavorInfos: form.Flavor,
TrainUrl: outputObsPath,
BootFile: form.BootFile,
Uuid: form.Attachment,
DatasetName: attach.Name,
Params: form.Params,
BranchName: branch_name,
})
// err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{
// JobName: req.JobName,
// JobID: strconv.FormatInt(jobResult.JobID, 10),
// VersionName: jobResult.VersionName,
// ResourcePools: form.PoolID,
// EngineVersions: form.EngineID,
// FlavorInfos: form.Flavor,
// TrainUrl: outputObsPath,
// BootFile: form.BootFile,
// Uuid: form.Attachment,
// DatasetName: attach.Name,
// Params: form.Params,
// BranchName: branch_name,
// })

if err != nil {
log.Error("CreateTrainjobConfigDetail failed:%v", err.Error())
trainJobNewVersionDataPrepare(ctx)
ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}
// if err != nil {
// log.Error("CreateTrainjobConfigDetail failed:%v", err.Error())
// trainJobNewVersionDataPrepare(ctx)
// ctx.Data["bootFile"] = form.BootFile
// ctx.Data["uuid"] = form.Attachment
// ctx.Data["datasetName"] = attach.Name
// ctx.Data["params"] = form.Params
// ctx.Data["branch_name"] = branch_name
// ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
// return
// }
// ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}
@@ -1387,18 +1404,18 @@ func TrainJobStop(ctx *context.Context) {

func TrainJobVersionDel(ctx *context.Context) {
var jobID = ctx.Params(":jobid")
var versionName = ctx.Params(":versionName")
var versionName = ctx.Query(":versionName")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
return
}

_, err = modelarts.DelTrainJob(jobID)
if err != nil {
log.Error("DelTrainJob(%s) failed:%v", task.JobName, err.Error())
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
return
}

@@ -1408,12 +1425,13 @@ func TrainJobVersionDel(ctx *context.Context) {
return
}

ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
// ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

func TrainJobVersionStop(ctx *context.Context) {
var jobID = ctx.Params(":jobid")
var versionName = ctx.Params(":versionName")
var versionName = ctx.Query(":versionName")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
@@ -1428,7 +1446,8 @@ func TrainJobVersionStop(ctx *context.Context) {
return
}

ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
// ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

func canUserCreateTrainJob(uid int64) (bool, error) {


Loading…
Cancel
Save