Browse Source

train-job log

pull/625/head
lewis 4 years ago
parent
commit
dcbe2adf03
5 changed files with 141 additions and 4 deletions
  1. +17
    -0
      models/cloudbrain.go
  2. +2
    -0
      modules/modelarts/modelarts.go
  3. +92
    -4
      modules/modelarts/resty.go
  4. +29
    -0
      routers/repo/modelarts.go
  5. +1
    -0
      routers/routes/routes.go

+ 17
- 0
models/cloudbrain.go View File

@@ -649,6 +649,23 @@ type GetTrainJobResult struct {
NasShareAddr string `json:"nas_share_addr"`
}

// GetTrainJobLogResult is the JSON payload decoded from a train-job log query.
type GetTrainJobLogResult struct {
// ErrorCode and ErrorMsg hold the "error_code" and "error_msg" fields of the response.
ErrorCode string `json:"error_code"`
ErrorMsg string `json:"error_msg"`
// IsSuccess mirrors the "is_success" flag of the response.
IsSuccess bool `json:"is_success"`
// Content is presumably the returned log text — TODO confirm against the API.
Content string `json:"content"`
// Lines is presumably the number of log lines in Content — TODO confirm.
Lines int `json:"lines"`
// NOTE(review): StartLine and EndLine are strings, not ints; presumably they
// are opaque line cursors for follow-up queries — confirm against the API.
StartLine string `json:"start_line"`
EndLine string `json:"end_line"`
}

// GetTrainJobLogFileNamesResult is the JSON payload decoded from a query for
// the log file names of a train job.
type GetTrainJobLogFileNamesResult struct {
// ErrorCode and ErrorMsg hold the "error_code" and "error_msg" fields of the response.
ErrorCode string `json:"error_code"`
ErrorMsg string `json:"error_msg"`
// IsSuccess mirrors the "is_success" flag of the response.
IsSuccess bool `json:"is_success"`
// LogFileList holds the "log_file_list" entries; it may be empty, so check
// its length before indexing.
LogFileList []string `json:"log_file_list"`
}

func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
sess := x.NewSession()
defer sess.Close()


+ 2
- 0
modules/modelarts/modelarts.go View File

@@ -43,6 +43,8 @@ const (
OutputPath = "/output/"
LogPath = "/log/"
JobPath = "/job/"
OrderDesc = "desc"
OrderAsc = "asc"
)

type GenerateTrainJobReq struct {


+ 92
- 4
modules/modelarts/resty.go View File

@@ -1,15 +1,15 @@
package modelarts

import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"crypto/tls"
"encoding/json"
"fmt"
"net/http"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/setting"
"github.com/go-resty/resty/v2"
"net/http"
"strconv"
)

var (
@@ -509,3 +509,91 @@ sendjob:

return &result, nil
}

// GetTrainJobLog queries the "aom-log" endpoint for the log of one version of
// a train job.
//
// baseLine, logFile, order and lines are forwarded as the "base_line",
// "log_file", "order" and "lines" query parameters. When the first response is
// 401 Unauthorized the token is refreshed and the request is sent once more.
//
// A transport failure yields (nil, error). A non-200 status or a response whose
// is_success flag is false yields the (possibly partially filled) result
// together with a non-nil error.
func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobLogResult

	query := map[string]string{
		"base_line": baseLine,
		"lines":     strconv.Itoa(lines),
		"log_file":  logFile,
		"order":     order,
	}

	// attempt counts the requests already sent; only the first 401 is retried.
	for attempt := 0; ; attempt++ {
		resp, err := client.R().
			SetQueryParams(query).
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log")
		if err != nil {
			return nil, fmt.Errorf("resty GetTrainJobLog: %v", err)
		}

		status := resp.StatusCode()
		if status == http.StatusUnauthorized && attempt < 1 {
			// The return value is deliberately dropped: if the refresh did not
			// help, the retried request fails again and is reported below.
			_ = getToken()
			continue
		}

		if status != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(resp.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", resp.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", resp.String(), err.Error())
			}
			log.Error("GetTrainJobLog failed(%d):%s(%s)", status, apiErr.ErrorCode, apiErr.ErrorMsg)
			return &result, fmt.Errorf("GetTrainJobLog failed(%d):%s(%s)", status, apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetTrainJobLog(%s) failed", jobID)
			return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg)
		}

		return &result, nil
	}
}

// GetTrainJobLogFileNames queries the "log/file-names" endpoint for the log
// file names of one version of a train job.
//
// When the first response is 401 Unauthorized the token is refreshed and the
// request is sent once more.
//
// A transport failure yields (nil, error). A non-200 status or a response whose
// is_success flag is false yields the (possibly partially filled) result
// together with a non-nil error.
func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobLogFileNamesResult

	// attempt counts the requests already sent; only the first 401 is retried.
	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names")
		if err != nil {
			return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err)
		}

		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			// The return value is deliberately dropped: if the refresh did not
			// help, the retried request fails again and is reported below.
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var temp models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			// Messages name this function (not GetTrainJobLog) so failures can
			// be attributed to the right request.
			log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetTrainJobLogFileNames(%s) failed", jobID)
			return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg)
		}

		return &result, nil
	}
}

+ 29
- 0
routers/repo/modelarts.go View File

@@ -554,3 +554,32 @@ func TrainJobShow(ctx *context.Context) {
ctx.Data["result"] = result
ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

// TrainJobGetLog handles the train-job "/log" route. It looks up the task by
// the ":jobid" route parameter, fetches the job's log file names, requests 20
// lines of the first listed file in descending order, and renders the
// train-job page with the result stored under ctx.Data["log"].
//
// Any failure, including an empty log file list, is logged and rendered as an
// error on the same template.
func TrainJobGetLog(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true

	var jobID = ctx.Params(":jobid")
	task, err := models.GetCloudbrainByJobID(jobID)
	if err != nil {
		log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
		return
	}

	versionID := strconv.FormatInt(task.VersionID, 10)

	resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, versionID)
	if err != nil {
		log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
		return
	}

	// A successful response may still carry no file names; indexing [0]
	// unguarded would panic with "index out of range".
	if len(resultLogFile.LogFileList) == 0 {
		log.Error("GetTrainJobLogFileNames(%s) returned no log file", jobID)
		ctx.RenderWithErr("no log file found for the train job", tplModelArtsTrainJobShow, nil)
		return
	}

	result, err := modelarts.GetTrainJobLog(jobID, versionID, "", resultLogFile.LogFileList[0], modelarts.OrderDesc, 20)
	if err != nil {
		log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
		return
	}

	ctx.Data["log"] = result
	ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

+ 1
- 0
routers/routes/routes.go View File

@@ -936,6 +936,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/debug", reqRepoCloudBrainReader, repo.NotebookDebug)
m.Post("/stop", reqRepoCloudBrainWriter, repo.NotebookStop)
m.Post("/del", reqRepoCloudBrainWriter, repo.NotebookDel)
m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog)
})
m.Get("/create", reqRepoCloudBrainWriter, repo.TrainJobNew)
m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate)


Loading…
Cancel
Save