Browse Source

Merge pull request '训练作业优化' (#631) from download-cb2-model into V20211101

Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/631
Reviewed-by: ychao_1983 <ychao_1983@sina.com>
pull/635/head
ychao_1983 3 years ago
parent
commit
731f1bc213
7 changed files with 19 additions and 50 deletions
  1. +2
    -2
      models/cloudbrain.go
  2. +3
    -3
      modules/modelarts/modelarts.go
  3. +2
    -0
      routers/api/v1/repo/modelarts.go
  4. +5
    -40
      routers/repo/modelarts.go
  5. +1
    -1
      templates/repo/modelarts/index.tmpl
  6. +2
    -2
      templates/repo/modelarts/notebook/index.tmpl
  7. +4
    -2
      templates/repo/modelarts/trainjob/index.tmpl

+ 2
- 2
models/cloudbrain.go View File

@@ -60,7 +60,7 @@ type Cloudbrain struct {
ContainerIp string
CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
- Duration int `xorm:"INDEX duration"`
+ Duration int64 `xorm:"INDEX duration"`
TrainJobDuration string
DeletedAt time.Time `xorm:"deleted"`
CanDebug bool `xorm:"-"`
@@ -933,7 +933,7 @@ func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err erro
return
}

- func SetTrainJobStatusByJobID(jobID string, status string, duration int, trainjobduration string) (err error) {
+ func SetTrainJobStatusByJobID(jobID string, status string, duration int64, trainjobduration string) (err error) {
cb := &Cloudbrain{JobID: jobID, Status: string(status), Duration: duration, TrainJobDuration: trainjobduration}
_, err = x.Cols("status", "duration", "train_job_duration").Where("cloudbrain.job_id=?", jobID).Update(cb)
return


+ 3
- 3
modules/modelarts/modelarts.go View File

@@ -221,7 +221,7 @@ func TransTrainJobStatus(status int) string {
case 0:
return "UNKNOWN"
case 1:
- return "CREATING"
+ return "INIT"
case 2:
return "IMAGE_CREATING"
case 3:
@@ -237,13 +237,13 @@ func TransTrainJobStatus(status int) string {
case 8:
return "RUNNING"
case 9:
- return "STOPPED"
+ return "KILLING"
case 10:
return "COMPLETED"
case 11:
return "FAILED"
case 12:
- return "STOPPED"
+ return "KILLED"
case 13:
return "CANCELED"
case 14:


+ 2
- 0
routers/api/v1/repo/modelarts.go View File

@@ -64,6 +64,8 @@ func GetModelArtsTrainJob(ctx *context.APIContext) {
}

job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
+ job.Duration = result.Duration
+ job.TrainJobDuration = result.TrainJobDuration
err = models.UpdateJob(job)
if err != nil {
log.Error("UpdateJob failed:", err)


+ 5
- 40
routers/repo/modelarts.go View File

@@ -506,43 +506,7 @@ func TrainJobIndex(ctx *context.Context) {
page = 1
}

- tasks, _, err := models.Cloudbrains(&models.CloudbrainsOptions{
- ListOptions: models.ListOptions{
- Page: page,
- PageSize: setting.UI.IssuePagingNum,
- },
- RepoID: repo.ID,
- Type: models.TypeCloudBrainTrainJob,
- })
- if err != nil {
- ctx.ServerError("Cloudbrain", err)
- return
- }
-
- for i := range tasks {
- TrainJobDetail, err := modelarts.GetTrainJob(tasks[i].Cloudbrain.JobID, strconv.FormatInt(tasks[i].Cloudbrain.VersionID, 10))
- if TrainJobDetail != nil {
- TrainJobDetail.CreateTime = time.Unix(int64(TrainJobDetail.LongCreateTime/1000), 0).Format("2006-01-02 15:04:05")
- if TrainJobDetail.Duration != 0 {
- TrainJobDetail.TrainJobDuration = addZero(TrainJobDetail.Duration/3600000) + ":" + addZero(TrainJobDetail.Duration%3600000/60000) + ":" + addZero(TrainJobDetail.Duration%60000/1000)
-
- } else {
- TrainJobDetail.TrainJobDuration = "00:00:00"
- }
- }
- if err != nil {
- log.Error("GetJob(%s) failed:%v", tasks[i].Cloudbrain.JobID, err.Error())
- return
- }
- err = models.SetTrainJobStatusByJobID(tasks[i].Cloudbrain.JobID, modelarts.TransTrainJobStatus(TrainJobDetail.IntStatus), int(TrainJobDetail.Duration), string(TrainJobDetail.TrainJobDuration))
- // err = models.UpdateJob(tasks[i].Cloudbrain)
- if err != nil {
- ctx.ServerError("UpdateJob failed", err)
- return
- }
- }
-
- trainTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
+ tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
ListOptions: models.ListOptions{
Page: page,
PageSize: setting.UI.IssuePagingNum,
@@ -560,7 +524,7 @@ func TrainJobIndex(ctx *context.Context) {
ctx.Data["Page"] = pager

ctx.Data["PageIsCloudBrain"] = true
- ctx.Data["Tasks"] = trainTasks
+ ctx.Data["Tasks"] = tasks
ctx.HTML(200, tplModelArtsTrainJobIndex)
}

@@ -901,12 +865,13 @@ func TrainJobShow(ctx *context.Context) {
} else {
result.TrainJobDuration = "00:00:00"
}
- err = models.SetTrainJobStatusByJobID(jobID, modelarts.TransTrainJobStatus(result.IntStatus), int(result.Duration), string(result.TrainJobDuration))
+ result.Status = modelarts.TransTrainJobStatus(result.IntStatus)
+ err = models.SetTrainJobStatusByJobID(jobID, result.Status, result.Duration, string(result.TrainJobDuration))
if err != nil {
ctx.ServerError("UpdateJob failed", err)
return
}
- result.Status = modelarts.TransTrainJobStatus(result.IntStatus)
result.DatasetName = attach.Name
}



+ 1
- 1
templates/repo/modelarts/index.tmpl View File

@@ -415,7 +415,7 @@
$(".job-status").each((index, job) => {
const jobID = job.dataset.jobid;
const repoPath = job.dataset.repopath;
- if (job.textContent.trim() == 'STOPPED') {
+ if (job.textContent.trim() == 'STOPPED' || job.textContent.trim() == 'START_FAILED' || job.textContent.trim() == 'CREATE_FAILED') {
return
}



+ 2
- 2
templates/repo/modelarts/notebook/index.tmpl View File

@@ -423,12 +423,12 @@

// 加载任务状态
var timeid = window.setInterval(loadJobStatus, 15000);
- // $(document).ready(loadJobStatus);
+ $(document).ready(loadJobStatus);
function loadJobStatus() {
$(".job-status").each((index, job) => {
const jobID = job.dataset.jobid;
const repoPath = job.dataset.repopath;
- if (job.textContent.trim() == 'STOPPED') {
+ if (job.textContent.trim() == 'STOPPED' || job.textContent.trim() == 'START_FAILED' || job.textContent.trim() == 'CREATE_FAILED') {
return
}



+ 4
- 2
templates/repo/modelarts/trainjob/index.tmpl View File

@@ -459,12 +459,14 @@

// 加载任务状态
var timeid = window.setInterval(loadJobStatus, 15000);
- // $(document).ready(loadJobStatus);
+ $(document).ready(loadJobStatus);
function loadJobStatus() {
$(".job-status").each((index, job) => {
const jobID = job.dataset.jobid;
const repoPath = job.dataset.repopath;
- if (job.textContent.trim() == 'STOPPED') {
+ if (job.textContent.trim() == 'IMAGE_FAILED' || job.textContent.trim() == 'SUBMIT_FAILED' || job.textContent.trim() == 'DELETE_FAILED'
+ || job.textContent.trim() == 'KILLED' || job.textContent.trim() == 'COMPLETED' || job.textContent.trim() == 'FAILED'
+ || job.textContent.trim() == 'CANCELED' || job.textContent.trim() == 'LOST') {
return
}



Loading…
Cancel
Save