@@ -60,6 +60,8 @@ type Cloudbrain struct { | |||||
DeletedAt time.Time `xorm:"deleted"` | DeletedAt time.Time `xorm:"deleted"` | ||||
CanDebug bool `xorm:"-"` | CanDebug bool `xorm:"-"` | ||||
Type int `xorm:"INDEX DEFAULT 0"` | Type int `xorm:"INDEX DEFAULT 0"` | ||||
VersionID int64 `xorm:"INDEX DEFAULT 0"` | |||||
VersionName string | |||||
User *User `xorm:"-"` | User *User `xorm:"-"` | ||||
Repo *Repository `xorm:"-"` | Repo *Repository `xorm:"-"` | ||||
@@ -499,7 +501,7 @@ type Config struct { | |||||
LogUrl string `json:"log_url"` | LogUrl string `json:"log_url"` | ||||
//UserImageUrl string `json:"user_image_url"` | //UserImageUrl string `json:"user_image_url"` | ||||
//UserCommand string `json:"user_command"` | //UserCommand string `json:"user_command"` | ||||
//CreateVersion bool `json:"create_version"` | |||||
CreateVersion bool `json:"create_version"` | |||||
//Volumes []Volumes `json:"volumes"` | //Volumes []Volumes `json:"volumes"` | ||||
Flavor Flavor `json:"flavor"` | Flavor Flavor `json:"flavor"` | ||||
PoolID string `json:"pool_id"` | PoolID string `json:"pool_id"` | ||||
@@ -507,7 +509,7 @@ type Config struct { | |||||
type CreateConfigParams struct { | type CreateConfigParams struct { | ||||
ConfigName string `json:"config_name"` | ConfigName string `json:"config_name"` | ||||
Description string `json:"config_desc"` | |||||
Description string `json:"config_desc"` | |||||
WorkServerNum int `json:"worker_server_num"` | WorkServerNum int `json:"worker_server_num"` | ||||
AppUrl string `json:"app_url"` //训练作业的代码目录 | AppUrl string `json:"app_url"` //训练作业的代码目录 | ||||
BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 | BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 | ||||
@@ -570,7 +572,7 @@ type CreateTrainJobResult struct { | |||||
JobName string `json:"job_name"` | JobName string `json:"job_name"` | ||||
JobID int64 `json:"job_id"` | JobID int64 `json:"job_id"` | ||||
Status int `json:"status"` | Status int `json:"status"` | ||||
CreationTime int64 `json:"create_time"` | |||||
CreateTime int64 `json:"create_time"` | |||||
VersionID int64 `json:"version_id"` | VersionID int64 `json:"version_id"` | ||||
ResourceID string `json:"resource_id"` | ResourceID string `json:"resource_id"` | ||||
VersionName string `json:"version_name"` | VersionName string `json:"version_name"` | ||||
@@ -610,6 +612,60 @@ type ErrorResult struct { | |||||
IsSuccess bool `json:"is_success"` | IsSuccess bool `json:"is_success"` | ||||
} | } | ||||
type GetTrainJobResult struct { | |||||
IsSuccess bool `json:"is_success"` | |||||
JobName string `json:"job_name"` | |||||
JobID int64 `json:"job_id"` | |||||
Description string `json:"job_desc"` | |||||
Status int `json:"status"` | |||||
LongCreateTime int64 `json:"create_time"` | |||||
CreateTime string | |||||
Duration int64 `json:"duration"` //训练作业的运行时间,单位为毫秒 | |||||
VersionID int64 `json:"version_id"` | |||||
ResourceID string `json:"resource_id"` | |||||
VersionName string `json:"version_name"` | |||||
PreVersionID int64 `json:"pre_version_id"` | |||||
WorkServerNum int `json:"worker_server_num"` | |||||
AppUrl string `json:"app_url"` //训练作业的代码目录 | |||||
BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 | |||||
Parameter []Parameter `json:"parameter"` | |||||
DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL | |||||
//DatasetID string `json:"dataset_id"` | |||||
//DataVersionID string `json:"dataset_version_id"` | |||||
//DataSource []DataSource `json:"data_source"` | |||||
//SpecID int64 `json:"spec_id"` | |||||
EngineID int64 `json:"engine_id"` | |||||
//ModelID int64 `json:"model_id"` | |||||
TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL | |||||
LogUrl string `json:"log_url"` | |||||
//UserImageUrl string `json:"user_image_url"` | |||||
//UserCommand string `json:"user_command"` | |||||
CreateVersion bool `json:"create_version"` | |||||
//Volumes []Volumes `json:"volumes"` | |||||
Flavor Flavor `json:"flavor"` | |||||
PoolID string `json:"pool_id"` | |||||
PoolName string `json:"pool_name"` | |||||
NasMountPath string `json:"nas_mount_path"` | |||||
NasShareAddr string `json:"nas_share_addr"` | |||||
} | |||||
// GetTrainJobLogResult is the decoded response body of the training-job log
// request.
type GetTrainJobLogResult struct {
	// Error details and success flag reported by the service.
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`

	// Log payload.
	Content   string `json:"content"`
	Lines     int    `json:"lines"`
	StartLine string `json:"start_line"`
	EndLine   string `json:"end_line"`
}
// GetTrainJobLogFileNamesResult is the decoded response body of the request
// that lists the log file names of a training-job version.
type GetTrainJobLogFileNamesResult struct {
	// Error details and success flag reported by the service.
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`

	// LogFileList holds the log file names returned by the service.
	LogFileList []string `json:"log_file_list"`
}
func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | ||||
sess := x.NewSession() | sess := x.NewSession() | ||||
defer sess.Close() | defer sess.Close() | ||||
@@ -43,6 +43,8 @@ const ( | |||||
OutputPath = "/output/" | OutputPath = "/output/" | ||||
LogPath = "/log/" | LogPath = "/log/" | ||||
JobPath = "/job/" | JobPath = "/job/" | ||||
OrderDesc = "desc" | |||||
OrderAsc = "asc" | |||||
) | ) | ||||
type GenerateTrainJobReq struct { | type GenerateTrainJobReq struct { | ||||
@@ -149,6 +151,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||||
TrainUrl: req.TrainUrl, | TrainUrl: req.TrainUrl, | ||||
LogUrl: req.LogUrl, | LogUrl: req.LogUrl, | ||||
PoolID: req.PoolID, | PoolID: req.PoolID, | ||||
CreateVersion: true, | |||||
Flavor: models.Flavor{ | Flavor: models.Flavor{ | ||||
Code: req.FlavorCode, | Code: req.FlavorCode, | ||||
}, | }, | ||||
@@ -161,18 +164,75 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||||
} | } | ||||
err = models.CreateCloudbrain(&models.Cloudbrain{ | err = models.CreateCloudbrain(&models.Cloudbrain{ | ||||
Status: strconv.Itoa(jobResult.Status), | |||||
Status: transTrainJobStatus(jobResult.Status), | |||||
UserID: ctx.User.ID, | UserID: ctx.User.ID, | ||||
RepoID: ctx.Repo.Repository.ID, | RepoID: ctx.Repo.Repository.ID, | ||||
JobID: strconv.FormatInt(jobResult.JobID, 10), | JobID: strconv.FormatInt(jobResult.JobID, 10), | ||||
JobName: req.JobName, | JobName: req.JobName, | ||||
JobType: string(models.JobTypeDebug), | JobType: string(models.JobTypeDebug), | ||||
Type: models.TypeCloudBrainTrainJob, | Type: models.TypeCloudBrainTrainJob, | ||||
VersionID: jobResult.VersionID, | |||||
VersionName: jobResult.VersionName, | |||||
}) | }) | ||||
if err != nil { | if err != nil { | ||||
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) | |||||
return err | return err | ||||
} | } | ||||
return nil | return nil | ||||
} | } | ||||
// trainJobStatusNames lists the textual name of each numeric train-job
// status; the slice index is the status code.
var trainJobStatusNames = [...]string{
	0:  "UNKNOWN",
	1:  "INIT",
	2:  "IMAGE_CREATING",
	3:  "IMAGE_FAILED",
	4:  "SUBMIT_TRYING",
	5:  "SUBMIT_FAILED",
	6:  "DELETE_FAILED",
	7:  "WAITING",
	8:  "RUNNING",
	9:  "KILLING",
	10: "COMPLETED",
	11: "FAILED",
	12: "KILLED",
	13: "CANCELED",
	14: "LOST",
	15: "SCALING",
	16: "SUBMIT_MODEL_FAILED",
	17: "DEPLOY_SERVICE_FAILED",
	18: "CHECK_INIT",
	19: "CHECK_RUNNING",
	20: "CHECK_RUNNING_COMPLETED",
	21: "CHECK_FAILED",
}

// transTrainJobStatus translates a numeric train-job status code into its
// textual name. Codes outside the known range (including negative values)
// are returned as their decimal string so that no information is lost.
func transTrainJobStatus(status int) string {
	if status < 0 || status >= len(trainJobStatusNames) {
		return strconv.Itoa(status)
	}
	return trainJobStatusNames[status]
}
@@ -1,15 +1,15 @@ | |||||
package modelarts | package modelarts | ||||
import ( | import ( | ||||
"code.gitea.io/gitea/models" | |||||
"code.gitea.io/gitea/modules/log" | "code.gitea.io/gitea/modules/log" | ||||
"code.gitea.io/gitea/modules/setting" | |||||
"crypto/tls" | "crypto/tls" | ||||
"encoding/json" | "encoding/json" | ||||
"fmt" | "fmt" | ||||
"net/http" | |||||
"code.gitea.io/gitea/models" | |||||
"code.gitea.io/gitea/modules/setting" | |||||
"github.com/go-resty/resty/v2" | "github.com/go-resty/resty/v2" | ||||
"net/http" | |||||
"strconv" | |||||
) | ) | ||||
var ( | var ( | ||||
@@ -425,3 +425,175 @@ sendjob: | |||||
return &result, nil | return &result, nil | ||||
} | } | ||||
func GetConfigList() (*models.GetResourceSpecsResult, error) { | |||||
checkSetting() | |||||
client := getRestyClient() | |||||
var result models.GetResourceSpecsResult | |||||
retry := 0 | |||||
sendjob: | |||||
res, err := client.R(). | |||||
SetHeader("Content-Type", "application/json"). | |||||
SetAuthToken(TOKEN). | |||||
SetResult(&result). | |||||
Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) | |||||
if err != nil { | |||||
return nil, fmt.Errorf("resty GetResourceSpecs: %v", err) | |||||
} | |||||
if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
retry++ | |||||
_ = getToken() | |||||
goto sendjob | |||||
} | |||||
if res.StatusCode() != http.StatusOK { | |||||
var temp models.ErrorResult | |||||
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
} | |||||
log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
} | |||||
if !result.IsSuccess { | |||||
log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
} | |||||
return &result, nil | |||||
} | |||||
func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) { | |||||
checkSetting() | |||||
client := getRestyClient() | |||||
var result models.GetTrainJobResult | |||||
retry := 0 | |||||
sendjob: | |||||
res, err := client.R(). | |||||
SetHeader("Content-Type", "application/json"). | |||||
SetAuthToken(TOKEN). | |||||
SetResult(&result). | |||||
Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/" + versionID) | |||||
if err != nil { | |||||
return nil, fmt.Errorf("resty GetTrainJob: %v", err) | |||||
} | |||||
if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
retry++ | |||||
_ = getToken() | |||||
goto sendjob | |||||
} | |||||
if res.StatusCode() != http.StatusOK { | |||||
var temp models.ErrorResult | |||||
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
} | |||||
log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
return &result, fmt.Errorf("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
} | |||||
if !result.IsSuccess { | |||||
log.Error("GetTrainJob(%s) failed", jobID) | |||||
return &result, fmt.Errorf("获取作业详情失败") | |||||
} | |||||
return &result, nil | |||||
} | |||||
func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) { | |||||
checkSetting() | |||||
client := getRestyClient() | |||||
var result models.GetTrainJobLogResult | |||||
retry := 0 | |||||
sendjob: | |||||
res, err := client.R(). | |||||
SetQueryParams(map[string]string{ | |||||
"base_line": baseLine, | |||||
"lines": strconv.Itoa(lines), | |||||
"log_file": logFile, | |||||
"order": order, | |||||
}). | |||||
SetAuthToken(TOKEN). | |||||
SetResult(&result). | |||||
Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log") | |||||
if err != nil { | |||||
return nil, fmt.Errorf("resty GetTrainJobLog: %v", err) | |||||
} | |||||
if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
retry++ | |||||
_ = getToken() | |||||
goto sendjob | |||||
} | |||||
if res.StatusCode() != http.StatusOK { | |||||
var temp models.ErrorResult | |||||
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
} | |||||
log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
return &result, fmt.Errorf("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
} | |||||
if !result.IsSuccess { | |||||
log.Error("GetTrainJobLog(%s) failed", jobID) | |||||
return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg) | |||||
} | |||||
return &result, nil | |||||
} | |||||
func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) { | |||||
checkSetting() | |||||
client := getRestyClient() | |||||
var result models.GetTrainJobLogFileNamesResult | |||||
retry := 0 | |||||
sendjob: | |||||
res, err := client.R(). | |||||
SetAuthToken(TOKEN). | |||||
SetResult(&result). | |||||
Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names") | |||||
if err != nil { | |||||
return nil, fmt.Errorf("resty GetTrainJobLog: %v", err) | |||||
} | |||||
if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
retry++ | |||||
_ = getToken() | |||||
goto sendjob | |||||
} | |||||
if res.StatusCode() != http.StatusOK { | |||||
var temp models.ErrorResult | |||||
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
} | |||||
log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
return &result, fmt.Errorf("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
} | |||||
if !result.IsSuccess { | |||||
log.Error("GetTrainJobLog(%s) failed", jobID) | |||||
return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg) | |||||
} | |||||
return &result, nil | |||||
} |
@@ -9,6 +9,7 @@ import ( | |||||
"errors" | "errors" | ||||
"github.com/unknwon/com" | "github.com/unknwon/com" | ||||
"io" | "io" | ||||
"net/http" | |||||
"os" | "os" | ||||
"path" | "path" | ||||
"strconv" | "strconv" | ||||
@@ -379,6 +380,8 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
return | return | ||||
} | } | ||||
//todo: del local code? | |||||
if isSaveParam == "on" { | if isSaveParam == "on" { | ||||
if form.ParameterTemplateName == "" { | if form.ParameterTemplateName == "" { | ||||
log.Error("ParameterTemplateName is empty") | log.Error("ParameterTemplateName is empty") | ||||
@@ -522,3 +525,61 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error { | |||||
return nil | return nil | ||||
} | } | ||||
func TrainJobShow(ctx *context.Context) { | |||||
ctx.Data["PageIsCloudBrain"] = true | |||||
var jobID = ctx.Params(":jobid") | |||||
task, err := models.GetCloudbrainByJobID(jobID) | |||||
if err != nil { | |||||
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) | |||||
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) | |||||
return | |||||
} | |||||
result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(task.VersionID, 10)) | |||||
if err != nil { | |||||
log.Error("GetJob(%s) failed:%v", jobID, err.Error()) | |||||
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) | |||||
return | |||||
} | |||||
if result != nil { | |||||
createTime, _ := com.StrTo(result.LongCreateTime).Int64() | |||||
result.CreateTime = time.Unix(int64(createTime/1000), 0).Format("2006-01-02 15:04:05") | |||||
} | |||||
ctx.Data["task"] = task | |||||
ctx.Data["jobID"] = jobID | |||||
ctx.Data["result"] = result | |||||
ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) | |||||
} | |||||
func TrainJobGetLog(ctx *context.Context) { | |||||
ctx.Data["PageIsCloudBrain"] = true | |||||
var jobID = ctx.Params(":jobid") | |||||
task, err := models.GetCloudbrainByJobID(jobID) | |||||
if err != nil { | |||||
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) | |||||
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) | |||||
return | |||||
} | |||||
resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10)) | |||||
if err != nil { | |||||
log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error()) | |||||
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) | |||||
return | |||||
} | |||||
result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, 20) | |||||
if err != nil { | |||||
log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error()) | |||||
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) | |||||
return | |||||
} | |||||
ctx.Data["log"] = result | |||||
ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) | |||||
} |
@@ -932,10 +932,11 @@ func RegisterRoutes(m *macaron.Macaron) { | |||||
m.Group("/train-job", func() { | m.Group("/train-job", func() { | ||||
m.Get("", reqRepoCloudBrainReader, repo.TrainJobIndex) | m.Get("", reqRepoCloudBrainReader, repo.TrainJobIndex) | ||||
m.Group("/:jobid", func() { | m.Group("/:jobid", func() { | ||||
m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) | |||||
m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow) | |||||
m.Get("/debug", reqRepoCloudBrainReader, repo.NotebookDebug) | m.Get("/debug", reqRepoCloudBrainReader, repo.NotebookDebug) | ||||
m.Post("/stop", reqRepoCloudBrainWriter, repo.NotebookStop) | m.Post("/stop", reqRepoCloudBrainWriter, repo.NotebookStop) | ||||
m.Post("/del", reqRepoCloudBrainWriter, repo.NotebookDel) | m.Post("/del", reqRepoCloudBrainWriter, repo.NotebookDel) | ||||
m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog) | |||||
}) | }) | ||||
m.Get("/create", reqRepoCloudBrainWriter, repo.TrainJobNew) | m.Get("/create", reqRepoCloudBrainWriter, repo.TrainJobNew) | ||||
m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate) | m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate) | ||||