Browse Source

Merge branch 'train-job' of https://git.openi.org.cn/OpenI/aiforge into train-job

pull/625/head
Gitea 4 years ago
parent
commit
1788dfa0ac
5 changed files with 359 additions and 9 deletions
  1. +59
    -3
      models/cloudbrain.go
  2. +61
    -1
      modules/modelarts/modelarts.go
  3. +176
    -4
      modules/modelarts/resty.go
  4. +61
    -0
      routers/repo/modelarts.go
  5. +2
    -1
      routers/routes/routes.go

+ 59
- 3
models/cloudbrain.go View File

@@ -60,6 +60,8 @@ type Cloudbrain struct {
DeletedAt time.Time `xorm:"deleted"`
CanDebug bool `xorm:"-"`
Type int `xorm:"INDEX DEFAULT 0"`
VersionID int64 `xorm:"INDEX DEFAULT 0"`
VersionName string

User *User `xorm:"-"`
Repo *Repository `xorm:"-"`
@@ -499,7 +501,7 @@ type Config struct {
LogUrl string `json:"log_url"`
//UserImageUrl string `json:"user_image_url"`
//UserCommand string `json:"user_command"`
//CreateVersion bool `json:"create_version"`
CreateVersion bool `json:"create_version"`
//Volumes []Volumes `json:"volumes"`
Flavor Flavor `json:"flavor"`
PoolID string `json:"pool_id"`
@@ -507,7 +509,7 @@ type Config struct {

type CreateConfigParams struct {
ConfigName string `json:"config_name"`
Description string `json:"config_desc"`
Description string `json:"config_desc"`
WorkServerNum int `json:"worker_server_num"`
AppUrl string `json:"app_url"` //训练作业的代码目录
BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
@@ -570,7 +572,7 @@ type CreateTrainJobResult struct {
JobName string `json:"job_name"`
JobID int64 `json:"job_id"`
Status int `json:"status"`
CreationTime int64 `json:"create_time"`
CreateTime int64 `json:"create_time"`
VersionID int64 `json:"version_id"`
ResourceID string `json:"resource_id"`
VersionName string `json:"version_name"`
@@ -610,6 +612,60 @@ type ErrorResult struct {
IsSuccess bool `json:"is_success"`
}

// GetTrainJobResult is the payload decoded from the ModelArts response when
// the details of a single train-job version are requested.
type GetTrainJobResult struct {
	IsSuccess      bool   `json:"is_success"`
	JobName        string `json:"job_name"`
	JobID          int64  `json:"job_id"`
	Description    string `json:"job_desc"`
	Status         int    `json:"status"`
	LongCreateTime int64  `json:"create_time"`

	// CreateTime carries no json tag; callers fill it with a formatted,
	// human-readable rendering derived from LongCreateTime.
	CreateTime string

	// Duration is the running time of the train job, in milliseconds.
	Duration      int64  `json:"duration"`
	VersionID     int64  `json:"version_id"`
	ResourceID    string `json:"resource_id"`
	VersionName   string `json:"version_name"`
	PreVersionID  int64  `json:"pre_version_id"`
	WorkServerNum int    `json:"worker_server_num"`

	// AppUrl is the code directory of the train job.
	AppUrl string `json:"app_url"`
	// BootFileUrl is the boot file of the train job's code; it has to live
	// under the code directory.
	BootFileUrl string      `json:"boot_file_url"`
	Parameter   []Parameter `json:"parameter"`
	// DataUrl is the OBS path URL of the dataset the train job needs.
	DataUrl string `json:"data_url"`

	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`

	// TrainUrl is the OBS path URL that receives the train job's output files.
	TrainUrl string `json:"train_url"`
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`

	Flavor       Flavor `json:"flavor"`
	PoolID       string `json:"pool_id"`
	PoolName     string `json:"pool_name"`
	NasMountPath string `json:"nas_mount_path"`
	NasShareAddr string `json:"nas_share_addr"`
}

// GetTrainJobLogResult is the payload decoded from the train-job log
// response: either an error code/message pair or a chunk of log text.
type GetTrainJobLogResult struct {
	// Error details; IsSuccess tells whether the request succeeded.
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`

	// Returned log text and its line bookkeeping. Note that the line
	// boundaries are decoded as strings while the count is an int.
	Content   string `json:"content"`
	Lines     int    `json:"lines"`
	StartLine string `json:"start_line"`
	EndLine   string `json:"end_line"`
}

// GetTrainJobLogFileNamesResult is the payload decoded from the response
// that lists the log file names of a train-job version.
type GetTrainJobLogFileNamesResult struct {
	// Error details; IsSuccess tells whether the request succeeded.
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`

	// LogFileList holds the names of the available log files.
	LogFileList []string `json:"log_file_list"`
}

func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
sess := x.NewSession()
defer sess.Close()


+ 61
- 1
modules/modelarts/modelarts.go View File

@@ -43,6 +43,8 @@ const (
OutputPath = "/output/"
LogPath = "/log/"
JobPath = "/job/"
OrderDesc = "desc"
OrderAsc = "asc"
)

type GenerateTrainJobReq struct {
@@ -149,6 +151,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
@@ -161,18 +164,75 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
}

err = models.CreateCloudbrain(&models.Cloudbrain{
Status: strconv.Itoa(jobResult.Status),
Status: transTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobName: req.JobName,
JobType: string(models.JobTypeDebug),
Type: models.TypeCloudBrainTrainJob,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
})

if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
return err
}

return nil
}

// trainJobStatusNames maps a numeric train-job status code (the array index)
// to the status name that is stored for the job.
var trainJobStatusNames = [...]string{
	"UNKNOWN",                 // 0
	"INIT",                    // 1
	"IMAGE_CREATING",          // 2
	"IMAGE_FAILED",            // 3
	"SUBMIT_TRYING",           // 4
	"SUBMIT_FAILED",           // 5
	"DELETE_FAILED",           // 6
	"WAITING",                 // 7
	"RUNNING",                 // 8
	"KILLING",                 // 9
	"COMPLETED",               // 10
	"FAILED",                  // 11
	"KILLED",                  // 12
	"CANCELED",                // 13
	"LOST",                    // 14
	"SCALING",                 // 15
	"SUBMIT_MODEL_FAILED",     // 16
	"DEPLOY_SERVICE_FAILED",   // 17
	"CHECK_INIT",              // 18
	"CHECK_RUNNING",           // 19
	"CHECK_RUNNING_COMPLETED", // 20
	"CHECK_FAILED",            // 21
}

// transTrainJobStatus translates a numeric train-job status code into its
// symbolic name. Codes outside the known range (including negative ones) are
// returned as their decimal representation so that no information is lost.
func transTrainJobStatus(status int) string {
	if status >= 0 && status < len(trainJobStatusNames) {
		return trainJobStatusNames[status]
	}
	return strconv.Itoa(status)
}

+ 176
- 4
modules/modelarts/resty.go View File

@@ -1,15 +1,15 @@
package modelarts

import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"crypto/tls"
"encoding/json"
"fmt"
"net/http"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/setting"
"github.com/go-resty/resty/v2"
"net/http"
"strconv"
)

var (
@@ -425,3 +425,175 @@ sendjob:

return &result, nil
}

// GetConfigList requests the train-job configuration list of the configured
// project (the urlTrainJobConfig endpoint).
//
// An Unauthorized response triggers one call to getToken followed by a single
// retry. The returned result is nil only when the request itself could not be
// performed; when the service answers with a non-200 status or with
// IsSuccess == false, the decoded result is returned together with a non-nil
// error.
//
// NOTE(review): the response is decoded into models.GetResourceSpecsResult,
// which looks carried over from GetResourceSpecs — confirm it matches the
// payload of this endpoint.
func GetConfigList() (*models.GetResourceSpecsResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetResourceSpecsResult

	for retry := 0; ; retry++ {
		res, err := client.R().
			SetHeader("Content-Type", "application/json").
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig)
		if err != nil {
			return nil, fmt.Errorf("resty GetConfigList: %v", err)
		}

		if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
			// Refresh the token once and resend. The refresh error is
			// ignored on purpose: if it failed, the retry yields another
			// 401, which is reported by the status check below.
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var temp models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
			return &result, fmt.Errorf("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		}

		return &result, nil
	}
}

// GetTrainJob fetches the details of one version of a train job.
//
// jobID and versionID are inserted verbatim into the request path
// ".../<jobID>/versions/<versionID>", the same version-scoped layout used by
// GetTrainJobLog and GetTrainJobLogFileNames.
//
// An Unauthorized response triggers one call to getToken followed by a single
// retry. The returned result is nil only when the request itself could not be
// performed; when the service answers with a non-200 status or with
// IsSuccess == false, the decoded result is returned together with a non-nil
// error.
func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobResult

	for retry := 0; ; retry++ {
		res, err := client.R().
			SetHeader("Content-Type", "application/json").
			SetAuthToken(TOKEN).
			SetResult(&result).
			// The version id must be addressed below the "/versions/"
			// segment; without it the path does not name a job version.
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID)
		if err != nil {
			return nil, fmt.Errorf("resty GetTrainJob: %v", err)
		}

		if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
			// Refresh the token once and resend. The refresh error is
			// ignored on purpose: if it failed, the retry yields another
			// 401, which is reported by the status check below.
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var temp models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetTrainJob(%s) failed", jobID)
			return &result, fmt.Errorf("获取作业详情失败")
		}

		return &result, nil
	}
}

// GetTrainJobLog fetches a portion of the log of one train-job version from
// the version's "aom-log" endpoint.
//
// baseLine, logFile and order are forwarded unchanged as the base_line,
// log_file and order query parameters; lines is sent as the decimal "lines"
// parameter.
//
// An Unauthorized response triggers one call to getToken followed by a single
// retry. The returned result is nil only when the request itself could not be
// performed; when the service answers with a non-200 status or with
// IsSuccess == false, the decoded result is returned together with a non-nil
// error.
func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) {
	checkSetting()
	client := getRestyClient()

	var result models.GetTrainJobLogResult
	query := map[string]string{
		"base_line": baseLine,
		"lines":     strconv.Itoa(lines),
		"log_file":  logFile,
		"order":     order,
	}

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetQueryParams(query).
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log")
		if err != nil {
			return nil, fmt.Errorf("resty GetTrainJobLog: %v", err)
		}

		// First 401 only: refresh the token and send the request again.
		if attempt < 1 && res.StatusCode() == http.StatusUnauthorized {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			// Non-200 answers are expected to carry an ErrorResult body.
			var failure models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &failure); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), failure.ErrorCode, failure.ErrorMsg)
			return &result, fmt.Errorf("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), failure.ErrorCode, failure.ErrorMsg)
		}

		if result.IsSuccess {
			return &result, nil
		}

		log.Error("GetTrainJobLog(%s) failed", jobID)
		return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg)
	}
}

// GetTrainJobLogFileNames lists the log file names of one train-job version
// (the version's "log/file-names" endpoint).
//
// An Unauthorized response triggers one call to getToken followed by a single
// retry. The returned result is nil only when the request itself could not be
// performed; when the service answers with a non-200 status or with
// IsSuccess == false, the decoded result is returned together with a non-nil
// error.
func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobLogFileNamesResult

	for retry := 0; ; retry++ {
		res, err := client.R().
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names")
		if err != nil {
			return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err)
		}

		if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
			// Refresh the token once and resend. The refresh error is
			// ignored on purpose: if it failed, the retry yields another
			// 401, which is reported by the status check below.
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var temp models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			// Name this function in the diagnostics so failures are not
			// mistaken for GetTrainJobLog failures.
			log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetTrainJobLogFileNames(%s) failed", jobID)
			return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg)
		}

		return &result, nil
	}
}

+ 61
- 0
routers/repo/modelarts.go View File

@@ -9,6 +9,7 @@ import (
"errors"
"github.com/unknwon/com"
"io"
"net/http"
"os"
"path"
"strconv"
@@ -379,6 +380,8 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
return
}

//todo: del local code?

if isSaveParam == "on" {
if form.ParameterTemplateName == "" {
log.Error("ParameterTemplateName is empty")
@@ -522,3 +525,61 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {

return nil
}

// TrainJobShow renders the detail page of a train job.
//
// It looks up the Cloudbrain record for the ":jobid" route parameter, fetches
// the details of the recorded job version through modelarts.GetTrainJob and
// renders tplModelArtsTrainJobShow with the "task", "jobID" and "result" data
// keys set. If either lookup fails, tplModelArtsTrainJobIndex is rendered with
// the error message instead.
func TrainJobShow(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true

	var jobID = ctx.Params(":jobid")
	task, err := models.GetCloudbrainByJobID(jobID)
	if err != nil {
		log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
		return
	}

	result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
	if err != nil {
		log.Error("GetJob(%s) failed:%v", jobID, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
		return
	}

	if result != nil {
		// LongCreateTime is already an int64, so use it directly. Passing it
		// through a string-typed conversion would turn the number into a
		// single rune rather than its decimal text, the parse would fail and
		// the page would always show the Unix epoch.
		// The value is treated as Unix milliseconds, hence the division.
		result.CreateTime = time.Unix(result.LongCreateTime/1000, 0).Format("2006-01-02 15:04:05")
	}

	ctx.Data["task"] = task
	ctx.Data["jobID"] = jobID
	ctx.Data["result"] = result
	ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

// TrainJobGetLog renders the train-job page with a chunk of the job's log.
//
// It looks up the Cloudbrain record for the ":jobid" route parameter, asks
// ModelArts for the log file names of the recorded job version and then
// requests 20 lines of the first listed file in descending order. The log
// result is exposed to the template under the "log" data key. Any failure,
// including an empty log file list, re-renders tplModelArtsTrainJobShow with
// an error message.
func TrainJobGetLog(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true

	var jobID = ctx.Params(":jobid")
	task, err := models.GetCloudbrainByJobID(jobID)
	if err != nil {
		log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
		return
	}

	versionID := strconv.FormatInt(task.VersionID, 10)

	resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, versionID)
	if err != nil {
		log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
		return
	}

	// Guard the index below: a job may not have produced any log file, and
	// indexing an empty list would panic inside the request handler.
	if resultLogFile == nil || len(resultLogFile.LogFileList) == 0 {
		log.Error("GetTrainJobLogFileNames(%s) returned no log files", jobID)
		ctx.RenderWithErr("no log files found for the train job", tplModelArtsTrainJobShow, nil)
		return
	}

	result, err := modelarts.GetTrainJobLog(jobID, versionID, "", resultLogFile.LogFileList[0], modelarts.OrderDesc, 20)
	if err != nil {
		log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
		return
	}

	ctx.Data["log"] = result
	ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

+ 2
- 1
routers/routes/routes.go View File

@@ -932,10 +932,11 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Group("/train-job", func() {
m.Get("", reqRepoCloudBrainReader, repo.TrainJobIndex)
m.Group("/:jobid", func() {
m.Get("", reqRepoCloudBrainReader, repo.NotebookShow)
m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow)
m.Get("/debug", reqRepoCloudBrainReader, repo.NotebookDebug)
m.Post("/stop", reqRepoCloudBrainWriter, repo.NotebookStop)
m.Post("/del", reqRepoCloudBrainWriter, repo.NotebookDel)
m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog)
})
m.Get("/create", reqRepoCloudBrainWriter, repo.TrainJobNew)
m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate)


Loading…
Cancel
Save