From 150d9ec6c553a852d8c7aad7830782dec8a20109 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Thu, 1 Apr 2021 19:17:02 +0800
Subject: [PATCH 1/2] job show

---
 models/cloudbrain.go           | 47 ++++++++++++++++++++--
 modules/modelarts/modelarts.go | 60 +++++++++++++++++++++++++++++-
 modules/modelarts/resty.go     | 88 ++++++++++++++++++++++++++++++++++++++++++
 routers/repo/modelarts.go      | 34 ++++++++++++++++
 routers/routes/routes.go       |  2 +-
 5 files changed, 226 insertions(+), 5 deletions(-)

diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index d3bf77922..bc09b2c57 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -60,6 +60,8 @@ type Cloudbrain struct {
 	DeletedAt time.Time `xorm:"deleted"`
 	CanDebug  bool      `xorm:"-"`
 	Type      int       `xorm:"INDEX DEFAULT 0"`
+	VersionID   int64 `xorm:"INDEX DEFAULT 0"`
+	VersionName string
 
 	User *User       `xorm:"-"`
 	Repo *Repository `xorm:"-"`
@@ -499,7 +501,7 @@ type Config struct {
 	LogUrl string `json:"log_url"`
 	//UserImageUrl string `json:"user_image_url"`
 	//UserCommand string `json:"user_command"`
-	//CreateVersion bool `json:"create_version"`
+	CreateVersion bool `json:"create_version"`
 	//Volumes []Volumes `json:"volumes"`
 	Flavor Flavor `json:"flavor"`
 	PoolID string `json:"pool_id"`
@@ -507,7 +509,7 @@ type Config struct {
 
 type CreateConfigParams struct {
 	ConfigName    string `json:"config_name"`
-	Description string `json:"config_desc"`
+	Description   string `json:"config_desc"`
 	WorkServerNum int    `json:"worker_server_num"`
 	AppUrl        string `json:"app_url"`       //训练作业的代码目录
 	BootFileUrl   string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
@@ -570,7 +572,7 @@ type CreateTrainJobResult struct {
 	JobName string `json:"job_name"`
 	JobID int64 `json:"job_id"`
 	Status int `json:"status"`
-	CreationTime int64 `json:"create_time"`
+	CreateTime int64 `json:"create_time"`
 	VersionID int64 `json:"version_id"`
 	ResourceID string `json:"resource_id"`
 	VersionName string `json:"version_name"`
@@ -610,6 +612,45 @@ type ErrorResult struct {
 	IsSuccess bool `json:"is_success"`
 }
 
+// GetTrainJobResult holds the JSON response decoded by modelarts.GetTrainJob.
+// CreateTime carries no JSON tag; TrainJobShow fills it in from LongCreateTime.
+type GetTrainJobResult struct {
+	IsSuccess      bool        `json:"is_success"`
+	JobName        string      `json:"job_name"`
+	JobID          int64       `json:"job_id"`
+	Description    string      `json:"job_desc"`
+	Status         int         `json:"status"`
+	LongCreateTime int64       `json:"create_time"`
+	CreateTime     string
+	Duration       int64       `json:"duration"` // running time of the train job, in milliseconds
+	VersionID      int64       `json:"version_id"`
+	ResourceID     string      `json:"resource_id"`
+	VersionName    string      `json:"version_name"`
+	PreVersionID   int64       `json:"pre_version_id"`
+	WorkServerNum  int         `json:"worker_server_num"`
+	AppUrl         string      `json:"app_url"`       // code directory of the train job
+	BootFileUrl    string      `json:"boot_file_url"` // boot file of the train job; must be under the code directory
+	Parameter      []Parameter `json:"parameter"`
+	DataUrl        string      `json:"data_url"` // OBS path URL of the dataset required by the train job
+	//DatasetID string `json:"dataset_id"`
+	//DataVersionID string `json:"dataset_version_id"`
+	//DataSource []DataSource `json:"data_source"`
+	//SpecID int64 `json:"spec_id"`
+	EngineID int64 `json:"engine_id"`
+	//ModelID int64 `json:"model_id"`
+	TrainUrl string `json:"train_url"` // OBS path URL for the train job's output files
+	LogUrl   string `json:"log_url"`
+	//UserImageUrl string `json:"user_image_url"`
+	//UserCommand string `json:"user_command"`
+	CreateVersion bool `json:"create_version"`
+	//Volumes []Volumes `json:"volumes"`
+	Flavor       Flavor `json:"flavor"`
+	PoolID       string `json:"pool_id"`
+	PoolName     string `json:"pool_name"`
+	NasMountPath string `json:"nas_mount_path"`
+	NasShareAddr string `json:"nas_share_addr"`
+}
+
 func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
 	sess := x.NewSession()
 	defer sess.Close()
diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go
index 34fd1195c..b865ec7ac 100755
--- a/modules/modelarts/modelarts.go
+++ b/modules/modelarts/modelarts.go
@@ -149,6 +149,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
 			TrainUrl: req.TrainUrl,
 			LogUrl:   req.LogUrl,
 			PoolID:   req.PoolID,
+			CreateVersion: true,
 			Flavor: models.Flavor{
 				Code: req.FlavorCode,
 			},
@@ -161,18 +162,75 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
 	}
 
 	err = models.CreateCloudbrain(&models.Cloudbrain{
-		Status:  strconv.Itoa(jobResult.Status),
+		Status:  transTrainJobStatus(jobResult.Status),
 		UserID:  ctx.User.ID,
 		RepoID:  ctx.Repo.Repository.ID,
 		JobID:   strconv.FormatInt(jobResult.JobID, 10),
 		JobName: req.JobName,
 		JobType: string(models.JobTypeDebug),
 		Type:    models.TypeCloudBrainTrainJob,
+		VersionID:   jobResult.VersionID,
+		VersionName: jobResult.VersionName,
 	})
 
 	if err != nil {
+		log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
 		return err
 	}
 
 	return nil
 }
+
+// transTrainJobStatus maps the numeric status code of a train job to its
+// symbolic name. Codes outside 0-21 are returned as their decimal string so
+// that an unexpected value is still visible to the user.
+func transTrainJobStatus(status int) string {
+	switch status {
+	case 0:
+		return "UNKNOWN"
+	case 1:
+		return "INIT"
+	case 2:
+		return "IMAGE_CREATING"
+	case 3:
+		return "IMAGE_FAILED"
+	case 4:
+		return "SUBMIT_TRYING"
+	case 5:
+		return "SUBMIT_FAILED"
+	case 6:
+		return "DELETE_FAILED"
+	case 7:
+		return "WAITING"
+	case 8:
+		return "RUNNING"
+	case 9:
+		return "KILLING"
+	case 10:
+		return "COMPLETED"
+	case 11:
+		return "FAILED"
+	case 12:
+		return "KILLED"
+	case 13:
+		return "CANCELED"
+	case 14:
+		return "LOST"
+	case 15:
+		return "SCALING"
+	case 16:
+		return "SUBMIT_MODEL_FAILED"
+	case 17:
+		return "DEPLOY_SERVICE_FAILED"
+	case 18:
+		return "CHECK_INIT"
+	case 19:
+		return "CHECK_RUNNING"
+	case 20:
+		return "CHECK_RUNNING_COMPLETED"
+	case 21:
+		return "CHECK_FAILED"
+	default:
+		return strconv.Itoa(status)
+	}
+}
diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go
index a9c1ffa76..110b01b96 100755
--- a/modules/modelarts/resty.go
+++ b/modules/modelarts/resty.go
@@ -425,3 +425,91 @@ sendjob:
 
 	return &result, nil
 }
+
+// GetConfigList sends a GET to the urlTrainJobConfig endpoint of the configured
+// project, calling getToken and retrying once when the reply is HTTP 401.
+func GetConfigList() (*models.GetResourceSpecsResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetResourceSpecsResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig)
+
+	if err != nil {
+		return nil, fmt.Errorf("resty GetConfigList: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		_ = getToken()
+		goto sendjob
+	}
+
+	if res.StatusCode() != http.StatusOK {
+		var temp models.ErrorResult
+		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
+			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+		}
+		log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+		return &result, fmt.Errorf("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+	}
+
+	if !result.IsSuccess {
+		log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+		return &result, fmt.Errorf("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// GetTrainJob fetches the details of version versionID of train job jobID,
+// calling getToken and retrying once when the reply is HTTP 401.
+func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetTrainJobResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID)
+
+	if err != nil {
+		return nil, fmt.Errorf("resty GetTrainJob: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		_ = getToken()
+		goto sendjob
+	}
+
+	if res.StatusCode() != http.StatusOK {
+		var temp models.ErrorResult
+		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
+			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+		}
+		log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+		return &result, fmt.Errorf("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+	}
+
+	if !result.IsSuccess {
+		log.Error("GetTrainJob(%s) failed", jobID)
+		return &result, fmt.Errorf("获取作业详情失败")
+	}
+
+	return &result, nil
+}
diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go
index 7edb62c94..cb0ea8d4d 100755
--- a/routers/repo/modelarts.go
+++ b/routers/repo/modelarts.go
@@ -9,6 +9,7 @@ import (
 	"errors"
 	"github.com/unknwon/com"
 	"io"
+	"net/http"
 	"os"
 	"path"
 	"strconv"
@@ -379,6 +380,8 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
 		return
 	}
 
+	//todo: del local code?
+
 	if isSaveParam == "on" {
 		if form.ParameterTemplateName == "" {
 			log.Error("ParameterTemplateName is empty")
@@ -522,3 +525,34 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
 
 	return nil
 }
+
+// TrainJobShow renders the detail page of the train job named by the :jobid
+// route parameter, using the version recorded on its Cloudbrain task.
+func TrainJobShow(ctx *context.Context) {
+	ctx.Data["PageIsCloudBrain"] = true
+
+	var jobID = ctx.Params(":jobid")
+	task, err := models.GetCloudbrainByJobID(jobID)
+	if err != nil {
+		log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
+		return
+	}
+
+	result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
+	if err != nil {
+		log.Error("GetTrainJob(%s) failed:%v", jobID, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
+		return
+	}
+
+	if result != nil {
+		// LongCreateTime is treated as Unix milliseconds; use the integer directly.
+		result.CreateTime = time.Unix(result.LongCreateTime/1000, 0).Format("2006-01-02 15:04:05")
+	}
+
+	ctx.Data["task"] = task
+	ctx.Data["jobID"] = jobID
+	ctx.Data["result"] = result
+	ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
+}
diff --git a/routers/routes/routes.go b/routers/routes/routes.go
index f14fac7f2..2e311a780 100755
--- a/routers/routes/routes.go
+++ b/routers/routes/routes.go
@@ -932,7 +932,7 @@ func RegisterRoutes(m *macaron.Macaron) {
 			m.Group("/train-job", func() {
 				m.Get("", reqRepoCloudBrainReader, repo.TrainJobIndex)
 				m.Group("/:jobid", func() {
-					m.Get("", reqRepoCloudBrainReader, repo.NotebookShow)
+					m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow)
 					m.Get("/debug", reqRepoCloudBrainReader, repo.NotebookDebug)
 					m.Post("/stop", reqRepoCloudBrainWriter, repo.NotebookStop)
 					m.Post("/del", reqRepoCloudBrainWriter, repo.NotebookDel)
From dcbe2adf0351dbe2f2e611e4b69c0c67d8ff7cc8 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Thu, 1 Apr 2021 19:58:29 +0800
Subject: [PATCH 2/2] train-job log

---
 models/cloudbrain.go           |  19 ++++++++
 modules/modelarts/modelarts.go |   2 +
 modules/modelarts/resty.go     | 100 +++++++++++++++++++++++++++++++++++++++--
 routers/repo/modelarts.go      |  38 ++++++++++++++++
 routers/routes/routes.go       |   1 +
 5 files changed, 156 insertions(+), 4 deletions(-)

diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index bc09b2c57..b58383e6e 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -651,6 +651,25 @@ type GetTrainJobResult struct {
 	NasShareAddr string `json:"nas_share_addr"`
 }
 
+// GetTrainJobLogResult is the JSON response decoded by modelarts.GetTrainJobLog.
+type GetTrainJobLogResult struct {
+	ErrorCode string `json:"error_code"`
+	ErrorMsg  string `json:"error_msg"`
+	IsSuccess bool   `json:"is_success"`
+	Content   string `json:"content"`
+	Lines     int    `json:"lines"`
+	StartLine string `json:"start_line"`
+	EndLine   string `json:"end_line"`
+}
+
+// GetTrainJobLogFileNamesResult is the JSON response decoded by modelarts.GetTrainJobLogFileNames.
+type GetTrainJobLogFileNamesResult struct {
+	ErrorCode   string   `json:"error_code"`
+	ErrorMsg    string   `json:"error_msg"`
+	IsSuccess   bool     `json:"is_success"`
+	LogFileList []string `json:"log_file_list"`
+}
+
 func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
 	sess := x.NewSession()
 	defer sess.Close()
diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go
index b865ec7ac..37c3972e1 100755
--- a/modules/modelarts/modelarts.go
+++ b/modules/modelarts/modelarts.go
@@ -43,6 +43,8 @@ const (
 	OutputPath = "/output/"
 	LogPath    = "/log/"
 	JobPath    = "/job/"
+	OrderDesc  = "desc" // value for the "order" query parameter of GetTrainJobLog
+	OrderAsc   = "asc"  // value for the "order" query parameter of GetTrainJobLog
 )
 
 type GenerateTrainJobReq struct {
diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go
index 110b01b96..d3a639ab0 100755
--- a/modules/modelarts/resty.go
+++ b/modules/modelarts/resty.go
@@ -1,15 +1,15 @@
 package modelarts
 
 import (
+	"code.gitea.io/gitea/models"
 	"code.gitea.io/gitea/modules/log"
+	"code.gitea.io/gitea/modules/setting"
 	"crypto/tls"
 	"encoding/json"
 	"fmt"
-	"net/http"
-
-	"code.gitea.io/gitea/models"
-	"code.gitea.io/gitea/modules/setting"
 	"github.com/go-resty/resty/v2"
+	"net/http"
+	"strconv"
 )
 
 var (
@@ -513,3 +513,95 @@ sendjob:
 
 	return &result, nil
 }
+
+// GetTrainJobLog queries the aom-log endpoint of the given train-job version,
+// passing baseLine, lines, logFile and order through as query parameters.
+func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetTrainJobLogResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetQueryParams(map[string]string{
+			"base_line": baseLine,
+			"lines":     strconv.Itoa(lines),
+			"log_file":  logFile,
+			"order":     order,
+		}).
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log")
+
+	if err != nil {
+		return nil, fmt.Errorf("resty GetTrainJobLog: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		_ = getToken()
+		goto sendjob
+	}
+
+	if res.StatusCode() != http.StatusOK {
+		var temp models.ErrorResult
+		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
+			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+		}
+		log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+		return &result, fmt.Errorf("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+	}
+
+	if !result.IsSuccess {
+		log.Error("GetTrainJobLog(%s) failed", jobID)
+		return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// GetTrainJobLogFileNames lists the log file names of version versionID of
+// train job jobID, calling getToken and retrying once on HTTP 401.
+func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetTrainJobLogFileNamesResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names")
+
+	if err != nil {
+		return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		_ = getToken()
+		goto sendjob
+	}
+
+	if res.StatusCode() != http.StatusOK {
+		var temp models.ErrorResult
+		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
+			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+		}
+		log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+		return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+	}
+
+	if !result.IsSuccess {
+		log.Error("GetTrainJobLogFileNames(%s) failed", jobID)
+		return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg)
+	}
+
+	return &result, nil
+}
diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go
index cb0ea8d4d..3bd73d931 100755
--- a/routers/repo/modelarts.go
+++ b/routers/repo/modelarts.go
@@ -556,3 +556,41 @@ func TrainJobShow(ctx *context.Context) {
 	ctx.Data["result"] = result
 	ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
 }
+
+// TrainJobGetLog renders the train-job page with the 20 lines that ModelArts
+// returns, in descending order, for the first log file of the :jobid job.
+func TrainJobGetLog(ctx *context.Context) {
+	ctx.Data["PageIsCloudBrain"] = true
+
+	var jobID = ctx.Params(":jobid")
+	task, err := models.GetCloudbrainByJobID(jobID)
+	if err != nil {
+		log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
+		return
+	}
+
+	resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
+	if err != nil {
+		log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
+		return
+	}
+
+	// An empty list would make the LogFileList[0] index below panic.
+	if len(resultLogFile.LogFileList) == 0 {
+		log.Error("GetTrainJobLogFileNames(%s): no log file found", jobID)
+		ctx.RenderWithErr("no log file found", tplModelArtsTrainJobShow, nil)
+		return
+	}
+
+	result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, 20)
+	if err != nil {
+		log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
+		return
+	}
+
+	ctx.Data["log"] = result
+	ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
+}
diff --git a/routers/routes/routes.go b/routers/routes/routes.go
index 2e311a780..7669b6908 100755
--- a/routers/routes/routes.go
+++ b/routers/routes/routes.go
@@ -936,6 +936,7 @@ func RegisterRoutes(m *macaron.Macaron) {
 					m.Get("/debug", reqRepoCloudBrainReader, repo.NotebookDebug)
 					m.Post("/stop", reqRepoCloudBrainWriter, repo.NotebookStop)
 					m.Post("/del", reqRepoCloudBrainWriter, repo.NotebookDel)
+					m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog)
 				})
 				m.Get("/create", reqRepoCloudBrainWriter, repo.TrainJobNew)
 				m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate)