Browse Source

Merge pull request '智算训练任务系统调度失败将错误原因放日志中' (#3463) from fix-3339 into V20221228

Reviewed-on: https://openi.pcl.ac.cn/OpenI/aiforge/pulls/3463
Reviewed-by: zouap <zouap@pcl.ac.cn>
pull/3473/head
zouap 2 years ago
parent
commit
f7156e6bc9
4 changed files with 22 additions and 6 deletions
  1. +2
    -1
      models/cloudbrain.go
  2. +0
    -1
      modules/grampus/resty.go
  3. +3
    -4
      modules/modelarts/modelarts.go
  4. +17
    -0
      routers/repo/grampus.go

+ 2
- 1
models/cloudbrain.go View File

@@ -1566,7 +1566,8 @@ type CreateGrampusJobResponse struct {

// GetGrampusJobResponse is the decoded reply for a single Grampus job query.
// It embeds GrampusResult and adds the job details plus a diagnostics string.
type GetGrampusJobResponse struct {
GrampusResult
// JobInfo is decoded from the "otJob" JSON key.
// NOTE(review): this line appears twice because the surrounding text is a
// rendered diff (old line followed by the re-aligned new line); the real
// struct presumably declares JobInfo only once — confirm against the file.
JobInfo GrampusJobInfo `json:"otJob"`
JobInfo GrampusJobInfo `json:"otJob"`
// ExitDiagnostics is decoded from the "exitDiagnostics" JSON key. The
// GrampusGetLog hunk in this commit appends it to the log content when the
// job status translates to models.GrampusStatusFailed.
ExitDiagnostics string `json:"exitDiagnostics"`
}

type GrampusNotebookResponse struct {


+ 0
- 1
modules/grampus/resty.go View File

@@ -198,7 +198,6 @@ sendjob:
SetAuthToken(TOKEN).
SetResult(&result).
Get(HOST + urlTrainJob + "/" + jobID)

if err != nil {
return nil, fmt.Errorf("resty GetJob: %v", err)
}


+ 3
- 4
modules/modelarts/modelarts.go View File

@@ -22,9 +22,9 @@ import (
const (
//notebook

storageTypeOBS = "obs"
autoStopDuration = 4 * 60 * 60
AutoStopDurationMs = 4 * 60 * 60 * 1000
storageTypeOBS = "obs"
autoStopDuration = 4 * 60 * 60
AutoStopDurationMs = 4 * 60 * 60 * 1000

CodePath = "/code/"
OutputPath = "/output/"
@@ -168,7 +168,6 @@ type OrgMultiNode struct {
Node []int `json:"node"`
}


type Parameters struct {
Parameter []struct {
Label string `json:"label"`


+ 17
- 0
routers/repo/grampus.go View File

@@ -1359,6 +1359,23 @@ func GrampusGetLog(ctx *context.Context) {
})
return
}
result, err := grampus.GetJob(jobID)
if err != nil {
log.Error("GetJob(%s) failed:%v", job.JobName, err)
ctx.JSON(http.StatusOK, map[string]interface{}{
"JobName": job.JobName,
"Content": content,
"CanLogDownload": false,
})
return
}
if result != nil {
job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
if job.Status == models.GrampusStatusFailed {
content = content + "\n" + result.ExitDiagnostics
}
}

canLogDownload := err == nil && job.IsUserHasRight(ctx.User)
ctx.JSON(http.StatusOK, map[string]interface{}{
"JobName": job.JobName,


Loading…
Cancel
Save