你确认删除该任务么?此任务一旦删除不可恢复。
+diff --git a/custom/conf/app.ini.sample b/custom/conf/app.ini.sample index d294c8823..7a4298f6b 100755 --- a/custom/conf/app.ini.sample +++ b/custom/conf/app.ini.sample @@ -1141,3 +1141,9 @@ growth_issue=0.2 growth_contributors=0.2 growth_commit=0.2 growth_comments=0.2 + + +[grampus] +USERNAME = +PASSWORD = +SERVER_HOST = diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go new file mode 100755 index 000000000..b60afb5cc --- /dev/null +++ b/modules/grampus/grampus.go @@ -0,0 +1,294 @@ +package grampus + +import ( + "code.gitea.io/gitea/modules/timeutil" + "strconv" + + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/context" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/notification" +) + +const ( + //notebook + storageTypeOBS = "obs" + autoStopDuration = 4 * 60 * 60 + autoStopDurationMs = 4 * 60 * 60 * 1000 + + DataSetMountPath = "/home/ma-user/work" + NotebookEnv = "Python3" + NotebookType = "Ascend" + FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" + + //train-job + // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" + // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" + // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + + // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + + // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + + // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + + // "]}" + // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + + // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + + // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + + // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + + // "]}" + CodePath = "/code/" + OutputPath = "/output/" + ResultPath = "/result/" + LogPath = "/log/" + JobPath = "/job/" + OrderDesc = "desc" //向下查询 + OrderAsc = "asc" //向上查询 + Lines = 500 + TrainUrl = "train_url" + DataUrl = "data_url" + ResultUrl = "result_url" + CkptUrl = "ckpt_url" + DeviceTarget = "device_target" + Ascend = "Ascend" + PerPage = 10 + IsLatestVersion = "1" + NotLatestVersion = "0" + VersionCount = 1 + + SortByCreateTime = "create_time" + ConfigTypeCustom = "custom" + TotalVersionCount = 1 +) + +var ( + poolInfos *models.PoolInfos + FlavorInfos *models.FlavorInfos + ImageInfos *models.ImageInfosModelArts +) + +type GenerateTrainJobReq struct { + JobName string + DisplayJobName string + Uuid string + Description string + CodeObsPath string + BootFile string + BootFileUrl string + DataUrl string + TrainUrl string + FlavorCode string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + CommitID string + IsLatestVersion string + Params string + BranchName string + PreVersionId int64 + PreVersionName string + FlavorName string + VersionCount int + EngineName string + TotalVersionCount int +} + +type GenerateInferenceJobReq struct { + JobName string + DisplayJobName string + Uuid string + Description string + CodeObsPath string + BootFile string + BootFileUrl string + DataUrl string + TrainUrl string + FlavorCode string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + CommitID string + Params string + BranchName string + FlavorName string + EngineName string + LabelName string + IsLatestVersion string + VersionCount int + TotalVersionCount int + ModelName string + ModelVersion string + CkptName string + ResultUrl string +} + +type VersionInfo struct { + Version []struct { + ID int `json:"id"` + Value string `json:"value"` + } `json:"version"` +} + +type Flavor struct { + Info []struct { + Code string `json:"code"` + Value string `json:"value"` + } `json:"flavor"` +} + +type Engine struct { + Info []struct { + ID int `json:"id"` + Value string `json:"value"` + } `json:"engine"` +} + +type ResourcePool struct { + Info []struct { + ID string `json:"id"` + Value string `json:"value"` + } `json:"resource_pool"` +} + +// type Parameter struct { +// Label string `json:"label"` +// Value string `json:"value"` +// } + +// type Parameters struct { +// Parameter []Parameter `json:"parameter"` +// } + +type Parameters struct { + Parameter []struct { + Label string `json:"label"` + Value string `json:"value"` + } `json:"parameter"` +} + +func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { + createTime := timeutil.TimeStampNow() + jobResult, err := createTrainJob(models.CreateTrainJobParams{ + JobName: req.JobName, + Description: req.Description, + Config: models.Config{ + WorkServerNum: req.WorkServerNumber, + AppUrl: req.CodeObsPath, + BootFileUrl: req.BootFileUrl, + DataUrl: req.DataUrl, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + LogUrl: req.LogUrl, + PoolID: req.PoolID, + CreateVersion: true, + Flavor: models.Flavor{ + Code: req.FlavorCode, + }, + Parameter: req.Parameters, + }, + }) + if err != nil { + log.Error("CreateJob failed: %v", err.Error()) + return err + } + + attach, err := models.GetAttachmentByUUID(req.Uuid) + if err != nil { + log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) + return err + } + jobId := strconv.FormatInt(jobResult.JobID, 10) + err = models.CreateCloudbrain(&models.Cloudbrain{ + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobId, + JobName: req.JobName, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: attach.Name, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + ComputeResource: models.NPUResource, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + LogUrl: req.LogUrl, + FlavorCode: req.FlavorCode, + Description: req.Description, + WorkServerNumber: req.WorkServerNumber, + FlavorName: req.FlavorName, + EngineName: req.EngineName, + VersionCount: req.VersionCount, + TotalVersionCount: req.TotalVersionCount, + CreatedUnix: createTime, + UpdatedUnix: createTime, + }) + + if err != nil { + log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) + return err + } + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask) + return nil +} + +func TransTrainJobStatus(status int) string { + switch status { + case 0: + return "UNKNOWN" + case 1: + return "INIT" + case 2: + return "IMAGE_CREATING" + case 3: + return "IMAGE_FAILED" + case 4: + return "SUBMIT_TRYING" + case 5: + return "SUBMIT_FAILED" + case 6: + return "DELETE_FAILED" + case 7: + return "WAITING" + case 8: + return "RUNNING" + case 9: + return "KILLING" + case 10: + return "COMPLETED" + case 11: + return "FAILED" + case 12: + return "KILLED" + case 13: + return "CANCELED" + case 14: + return "LOST" + case 15: + return "SCALING" + case 16: + return "SUBMIT_MODEL_FAILED" + case 17: + return "DEPLOY_SERVICE_FAILED" + case 18: + return "CHECK_INIT" + case 19: + return "CHECK_RUNNING" + case 20: + return "CHECK_RUNNING_COMPLETED" + case 21: + return "CHECK_FAILED" + + default: + return strconv.Itoa(status) + } +} diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go new file mode 100755 index 000000000..693ba71a1 --- /dev/null +++ b/modules/grampus/resty.go @@ -0,0 +1,1112 @@ +package grampus + +import ( + "crypto/tls" + "encoding/json" + "fmt" + "net/http" + "strconv" + + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" + "github.com/go-resty/resty/v2" +) + +var ( + restyClient *resty.Client + HOST string + TOKEN string +) + +const ( + urlOpenApiV1 = "/openapi/v1/" + + urlGetToken = urlOpenApiV1 + "token" + urlNotebook = "/demanager/instances" + urlTrainJob = "/training-jobs" + urlResourceSpecs = "/job/resource-specs" + urlTrainJobConfig = "/training-job-configs" + errorCodeExceedLimit = "ModelArts.0118" + + urlNotebook2 = "" + + modelartsIllegalToken = "" +) + +type GetTokenParams struct { + UserName string `json:"user_name"` + Password string `json:"password"` +} + +type GetTokenResult struct { + Token string `json:"token"` + Expiration int64 `json:"expiration"` +} + +func getRestyClient() *resty.Client { + if restyClient == nil { + restyClient = resty.New() + restyClient.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true}) + } + return restyClient +} + +func checkSetting() { + if len(HOST) != 0 && len(TOKEN) != 0 && restyClient != nil { + return + } + + err := getToken() + if err != nil { + log.Error("getToken failed:%v", err) + } +} + +func getToken() error { + HOST = setting.Grampus.Host + + client := getRestyClient() + params := GetTokenParams{ + UserName: setting.Grampus.UserName, + Password: setting.Grampus.Password, + } + + var result GetTokenResult + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetBody(params). + SetResult(&result). + Post(HOST + urlGetToken) + if err != nil { + return fmt.Errorf("resty getToken: %v", err) + } + + if res.StatusCode() != http.StatusOK { + return fmt.Errorf("getToken failed:%s", res.String()) + } + + TOKEN = result.Token + log.Info(TOKEN) + + return nil +} + +func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateNotebookResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) + + if err != nil { + return nil, fmt.Errorf("resty create notebook: %s", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == errorCodeExceedLimit { + response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" + } + return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func GetJob(jobID string) (*models.GetNotebookResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetNotebookResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) + + if err != nil { + return nil, fmt.Errorf("resty GetJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func GetNotebook2(jobID string) (*models.GetNotebook2Result, error) { + checkSetting() + client := getRestyClient() + var result models.GetNotebook2Result + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) + + if err != nil { + return nil, fmt.Errorf("resty GetJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookActionResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetBody(param). + SetAuthToken(TOKEN). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/action") + + if err != nil { + return &result, fmt.Errorf("resty StopJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func ManageNotebook2(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookActionResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(autoStopDurationMs)) + + if err != nil { + return &result, fmt.Errorf("resty ManageNotebook2: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func DelNotebook(jobID string) (*models.NotebookDelResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookDelResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func DelNotebook2(jobID string) (*models.NotebookDelResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookDelResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func DelJob(jobID string) (*models.NotebookDelResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookDelResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func GetJobToken(jobID string) (*models.NotebookGetJobTokenResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookGetJobTokenResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/token") + + if err != nil { + return &result, fmt.Errorf("resty GetJobToken: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob) + + if err != nil { + return nil, fmt.Errorf("resty create train-job: %s", err) + } + + req, _ := json.Marshal(createJobParams) + log.Info("%s", req) + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + BootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'." + DataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'." + if temp.ErrorMsg == BootFileErrorMsg { + log.Error("启动文件错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("启动文件错误!") + } + if temp.ErrorMsg == DataSetErrorMsg { + log.Error("数据集错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("数据集错误!") + } + return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobVersionParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions") + + if err != nil { + return nil, fmt.Errorf("resty create train-job version: %s", err) + } + + req, _ := json.Marshal(createJobVersionParams) + log.Info("%s", req) + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + BootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'." + DataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'." + if temp.ErrorMsg == BootFileErrorMsg { + log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("启动文件错误!") + } + if temp.ErrorMsg == DataSetErrorMsg { + log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("数据集错误!") + } + return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetResourceSpecsResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) + + if err != nil { + return nil, fmt.Errorf("resty GetResourceSpecs: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobConfigResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(req). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) + + if err != nil { + return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + //temp, _ := json.Marshal(req) + //log.Info("%s", temp) + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetConfigListResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetQueryParams(map[string]string{ + "per_page": strconv.Itoa(perPage), + "page": strconv.Itoa(page), + "sortBy": sortBy, + "order": order, + "search_content": searchContent, + "config_type": configType, + }). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) + + if err != nil { + return nil, fmt.Errorf("resty GetConfigList: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("获取参数配置列表失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("获取参数配置列表失败(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetParaConfig(configName, configType string) (models.GetConfigResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetConfigResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetQueryParams(map[string]string{ + "config_type": configType, + }). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" + configName) + + if err != nil { + return result, fmt.Errorf("resty GetParaConfig: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetParaConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return result, fmt.Errorf("获取参数配置详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetParaConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return result, fmt.Errorf("获取参数配置详情失败(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return result, nil +} + +func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID) + + if err != nil { + return nil, fmt.Errorf("resty GetTrainJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("获取作业详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("获取作业详情失败") + } + + return &result, nil +} + +func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetTrainJobLogResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetQueryParams(map[string]string{ + "base_line": baseLine, + "lines": strconv.Itoa(lines), + "log_file": logFile, + "order": order, + }). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log") + + if err != nil { + return nil, fmt.Errorf("resty GetTrainJobLog: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("获取作业日志失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetTrainJobLog(%s) failed", jobID) + return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetTrainJobLogFileNamesResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names") + + if err != nil { + return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetTrainJobLogFileNames(%s) failed", jobID) + return &result, fmt.Errorf("获取作业日志文件失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func DelTrainJob(jobID string) (*models.TrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.TrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelTrainJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("删除训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("DelTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("删除训练作业失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func StopTrainJob(jobID, versionID string) (*models.TrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.TrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/stop") + + if err != nil { + return &result, fmt.Errorf("resty StopTrainJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("StopTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("停止训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("StopTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("停止训练作业失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func DelTrainJobVersion(jobID string, versionID string) (*models.TrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.TrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID) + + if err != nil { + return &result, fmt.Errorf("resty DelTrainJobVersion: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("删除训练作业版本失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("DelTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("删除训练作业版本失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func createInferenceJob(createJobParams models.CreateInferenceJobParams) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob) + + if err != nil { + return nil, fmt.Errorf("resty create inference-job: %s", err) + } + + req, _ := json.Marshal(createJobParams) + log.Info("%s", req) + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + BootFileErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.BootFileUrl + "'." + DataSetErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.DataUrl + "'." + if temp.ErrorMsg == BootFileErrorMsg { + log.Error("启动文件错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("启动文件错误!") + } + if temp.ErrorMsg == DataSetErrorMsg { + log.Error("数据集错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("数据集错误!") + } + return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func createNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateNotebookResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2) + + if err != nil { + return nil, fmt.Errorf("resty create notebook2: %s", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == errorCodeExceedLimit { + response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" + } + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 5c87b68c5..945a7c6f8 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -528,6 +528,13 @@ var ( FlavorInfos string TrainJobFLAVORINFOS string + //grampus config + Grampus = struct { + Host string + UserName string + Password string + }{} + //elk config ElkUrl string ElkUser string @@ -1382,6 +1389,15 @@ func NewContext() { Course.OrgName = sec.Key("org_name").MustString("") Course.TeamName = sec.Key("team_name").MustString("") + GetGrampusConfig() +} + +func GetGrampusConfig() { + sec := Cfg.Section("grampus") + + Grampus.Host = sec.Key("SERVER_HOST").MustString("") + Grampus.UserName = sec.Key("USERNAME").MustString("") + Grampus.Password = sec.Key("PASSWORD").MustString("") } func SetRadarMapConfig() { diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 3e8329dfb..8f3189661 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1,22 +1,435 @@ package repo import ( - "code.gitea.io/gitea/modules/base" + "code.gitea.io/gitea/modules/auth" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/modelarts" + "code.gitea.io/gitea/modules/util" + "encoding/json" + "io/ioutil" "net/http" + "os" + "path" + "strconv" + "strings" + "time" + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/base" + "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/modules/context" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" ) const ( - tplGrampusTrainJobNew base.TplName = "repo/grampus/trainjob/new" - tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show" + //GPU + tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new" + tplGrampusTrainJobGPUShow base.TplName = "repo/grampus/trainjob/gpu/show" + + //NPU + tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new" + tplGrampusTrainJobNPUShow base.TplName = "repo/grampus/trainjob/npu/show" ) -func GrampusNew(ctx *context.Context) { - err := cloudBrainNewDataPrepare(ctx) +func GrampusTrainJobGPUNew(ctx *context.Context) { + err := grampusGpuNewDataPrepare(ctx) + if err != nil { + ctx.ServerError("get new train-job info failed", err) + return + } + ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew) +} + +func grampusGpuNewDataPrepare(ctx *context.Context) error { + ctx.Data["PageIsCloudBrain"] = true + t := time.Now() + var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] + ctx.Data["display_job_name"] = displayJobName + + //get valid images + result, err := cloudbrain.GetImages() + if err != nil { + ctx.Data["error"] = err.Error() + log.Error("cloudbrain.GetImages failed:", err.Error(), ctx.Data["MsgID"]) + } + + for i, payload := range result.Payload.ImageInfo { + if strings.HasPrefix(result.Payload.ImageInfo[i].Place, "192.168") { + result.Payload.ImageInfo[i].PlaceView = payload.Place[strings.Index(payload.Place, "/"):len(payload.Place)] + } else { + result.Payload.ImageInfo[i].PlaceView = payload.Place + } + } + + ctx.Data["images"] = result.Payload.ImageInfo + + resultPublic, err := cloudbrain.GetPublicImages() + if err != nil { + ctx.Data["error"] = err.Error() + log.Error("cloudbrain.GetPublicImages failed:", err.Error(), ctx.Data["MsgID"]) + } + + for i, payload := range resultPublic.Payload.ImageInfo { + if strings.HasPrefix(resultPublic.Payload.ImageInfo[i].Place, "192.168") { + resultPublic.Payload.ImageInfo[i].PlaceView = payload.Place[strings.Index(payload.Place, "/"):len(payload.Place)] + } else { + resultPublic.Payload.ImageInfo[i].PlaceView = payload.Place + } + } + + ctx.Data["public_images"] = resultPublic.Payload.ImageInfo + + //get valid dataset + attachs, err := models.GetAllUserAttachments(ctx.User.ID) + if err != nil { + log.Error("GetAllUserAttachments failed: %v", err, ctx.Data["MsgID"]) + return err + } + + ctx.Data["attachments"] = attachs + ctx.Data["command"] = cloudbrain.Command + ctx.Data["code_path"] = cloudbrain.CodeMountPath + ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath + ctx.Data["model_path"] = cloudbrain.ModelMountPath + ctx.Data["benchmark_path"] = cloudbrain.BenchMarkMountPath + ctx.Data["is_benchmark_enabled"] = setting.IsBenchmarkEnabled + + //get valid resource specs + if categories == nil { + json.Unmarshal([]byte(setting.BenchmarkCategory), &categories) + } + ctx.Data["benchmark_categories"] = categories.Category + + ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType + + if gpuInfos == nil { + json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) + } + ctx.Data["gpu_types"] = gpuInfos.GpuInfo + + if trainGpuInfos == nil { + json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) + } + ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo + + if benchmarkGpuInfos == nil { + json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) + } + ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo + + if benchmarkResourceSpecs == nil { + json.Unmarshal([]byte(setting.BenchmarkResourceSpecs), &benchmarkResourceSpecs) + } + ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec + + if cloudbrain.ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) + } + ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec + + if cloudbrain.TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) + } + ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec + ctx.Data["params"] = "" + ctx.Data["branchName"] = ctx.Repo.BranchName + + ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath + ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled + + ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath + ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled + + ctx.Data["cloudbraintype"] = models.TypeCloudBrainOne + + ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode") + + return nil +} + +func GrampusTrainJobNPUNew(ctx *context.Context) { + err := trainJobNpuNewDataPrepare(ctx) if err != nil { ctx.ServerError("get new train-job info failed", err) return } - ctx.HTML(http.StatusOK, tplGrampusTrainJobNew) + ctx.HTML(200, tplGrampusTrainJobNPUNew) +} + +func trainJobNpuNewDataPrepare(ctx *context.Context) error { + ctx.Data["PageIsCloudBrain"] = true + + t := time.Now() + var displayJobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] + ctx.Data["display_job_name"] = displayJobName + + //get valid dataset + attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID) + if err != nil { + ctx.ServerError("GetAllUserAttachments failed:", err) + return err + } + ctx.Data["attachments"] = attachs + + //get valid resource specs + var resourcePools modelarts.ResourcePool + if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["resource_pools"] = resourcePools.Info + + var engines modelarts.Engine + if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["engines"] = engines.Info + + var versionInfos modelarts.VersionInfo + if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["engine_versions"] = versionInfos.Version + + var flavorInfos modelarts.Flavor + if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["flavor_infos"] = flavorInfos.Info + + ctx.Data["params"] = "" + ctx.Data["branchName"] = ctx.Repo.BranchName + + configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) + if err != nil { + ctx.ServerError("getConfigList failed:", err) + return err + } + ctx.Data["config_list"] = configList.ParaConfigs + ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo + + return nil +} + +func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { + VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) + displayJobName := form.DisplayJobName + jobName := util.ConvertDisplayJobNameToJobName(displayJobName) + uuid := form.Attachment + description := form.Description + workServerNumber := form.WorkServerNumber + engineID := form.EngineID + bootFile := form.BootFile + flavorCode := form.Flavor + params := form.Params + poolID := form.PoolID + isSaveParam := form.IsSaveParam + repo := ctx.Repo.Repository + codeLocalPath := setting.JobPath + jobName + modelarts.CodePath + codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/" + logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/" + dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" + branch_name := form.BranchName + isLatestVersion := modelarts.IsLatestVersion + FlavorName := form.FlavorName + VersionCount := modelarts.VersionCount + EngineName := form.EngineName + + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) + if err != nil { + log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) + return + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form) + return + } + } + + if err := paramCheckCreateTrainJob(form); err != nil { + log.Error("paramCheckCreateTrainJob failed:(%v)", err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + return + } + //Determine whether the task name of the task in the project is duplicated + tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName) + if err == nil { + if len(tasks) != 0 { + log.Error("the job name did already exist", ctx.Data["MsgID"]) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form) + return + } + } else { + if !models.IsErrJobNotExist(err) { + log.Error("system error, %v", err, ctx.Data["MsgID"]) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) + return + } + } + + //todo: del the codeLocalPath + _, err = ioutil.ReadDir(codeLocalPath) + if err == nil { + os.RemoveAll(codeLocalPath) + } + + gitRepo, _ := git.OpenRepository(repo.RepoPath()) + commitID, _ := gitRepo.GetBranchCommitID(branch_name) + + if err := downloadCode(repo, codeLocalPath, branch_name); err != nil { + log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsTrainJobNew, &form) + return + } + + //todo: upload code (send to file_server todo this work?) + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { + log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) + return + } + + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { + log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) + return + } + + // parentDir := VersionOutputPath + "/" + if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { + // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { + log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) + return + } + + var parameters models.Parameters + param := make([]models.Parameter, 0) + existDeviceTarget := false + if len(params) != 0 { + err := json.Unmarshal([]byte(params), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal params: %s (%v)", params, err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form) + return + } + + for _, parameter := range parameters.Parameter { + if parameter.Label == modelarts.DeviceTarget { + existDeviceTarget = true + } + if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { + param = append(param, models.Parameter{ + Label: parameter.Label, + Value: parameter.Value, + }) + } + } + } + if !existDeviceTarget { + param = append(param, models.Parameter{ + Label: modelarts.DeviceTarget, + Value: modelarts.Ascend, + }) + } + + //save param config + if isSaveParam == "on" { + saveparams := append(param, models.Parameter{ + Label: modelarts.TrainUrl, + Value: outputObsPath, + }, models.Parameter{ + Label: modelarts.DataUrl, + Value: dataPath, + }) + if form.ParameterTemplateName == "" { + log.Error("ParameterTemplateName is empty") + trainJobNewDataPrepare(ctx) + ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form) + return + } + + _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ + ConfigName: form.ParameterTemplateName, + Description: form.PrameterDescription, + DataUrl: dataPath, + AppUrl: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + TrainUrl: outputObsPath, + Flavor: models.Flavor{ + Code: flavorCode, + }, + WorkServerNum: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Parameter: saveparams, + }) + + if err != nil { + log.Error("Failed to CreateTrainJobConfig: %v", err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form) + return + } + } + + req := &modelarts.GenerateTrainJobReq{ + JobName: jobName, + DisplayJobName: displayJobName, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + BootFile: bootFile, + TrainUrl: outputObsPath, + FlavorCode: flavorCode, + WorkServerNumber: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Uuid: uuid, + Parameters: param, + CommitID: commitID, + IsLatestVersion: isLatestVersion, + BranchName: branch_name, + Params: form.Params, + FlavorName: FlavorName, + EngineName: EngineName, + VersionCount: VersionCount, + TotalVersionCount: modelarts.TotalVersionCount, + } + + //将params转换Parameters.Parameter,出错时返回给前端 + var Parameters modelarts.Parameters + if err := json.Unmarshal([]byte(params), &Parameters); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return + } + + err = modelarts.GenerateTrainJob(ctx, req) + if err != nil { + log.Error("GenerateTrainJob failed:%v", err.Error()) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + return + } + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } diff --git a/routers/routes/routes.go b/routers/routes/routes.go index a64eb0fae..ab3e2dd55 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1085,14 +1085,26 @@ func RegisterRoutes(m *macaron.Macaron) { }, context.RepoRef()) m.Group("/grampus", func() { m.Group("/train-job", func() { - m.Group("/:jobid", func() { - m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) - m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) - m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) - m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + m.Group("/gpu", func() { + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) + m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + }) + m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobGPUNew) + //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate) + }) + m.Group("/npu", func() { + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) + m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + }) + m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobNPUNew) + //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate) }) - m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusNew) - m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusCreate) }) }, context.RepoRef()) m.Group("/modelmanage", func() { diff --git a/templates/repo/grampus/trainjob/gpu/new.tmpl b/templates/repo/grampus/trainjob/gpu/new.tmpl new file mode 100755 index 000000000..7c42eba75 --- /dev/null +++ b/templates/repo/grampus/trainjob/gpu/new.tmpl @@ -0,0 +1,447 @@ +{{template "base/head" .}} + + +
+ {{$.i18n.Tr "repo.cloudbrain_task"}} + | +
+
+ {{.DisplayJobName}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.status"}} + | + +
+
+ {{.Status}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.start_time"}} + | + +
+
+ {{TimeSinceUnix1 .CreatedUnix}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}} + | + +
+
+ {{$.duration}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.resource_type"}} + | + +
+
+ {{$.resource_type}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.standard"}} + | + +
+
+ {{$.i18n.Tr "cloudbrain.gpu_num"}}:{{$.GpuNum}},{{$.i18n.Tr "cloudbrain.cpu_num"}}:{{$.CpuNum}},{{$.i18n.Tr "cloudbrain.memory"}}(MB):{{$.MemMiB}},{{$.i18n.Tr "cloudbrain.shared_memory"}}(MB):{{$.ShareMemMiB}}
+
+ |
+
+ {{$.i18n.Tr "cloudbrain.mirror"}} + | + +
+
+ {{.Image}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.code_version"}} + | + +
+
+ {{.BranchName}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.start_file"}} + | + +
+
+ {{.BootFile}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.train_dataset"}} + | + +
+
+ {{.DatasetName}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.run_parameter"}} + | + +
+
+ {{.Parameters}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.description"}} + | + +
+
+ {{.Description}}
+
+ |
+
你确认删除该任务么?此任务一旦删除不可恢复。
++ {{$.i18n.Tr "repo.cloudbrain_task"}} + | +
+
+ {{.DisplayJobName}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.status"}} + | + +
+
+ {{.Status}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.run_version"}} + | + +
+
+ {{.VersionName}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.start_time"}} + | + +
+
+
+ {{if not (eq .Cloudbrain.StartTime 0)}}
+ {{TimeSinceUnix1 .Cloudbrain.StartTime}}
+ {{else}}
+ {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}}
+ {{end}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}} + | + +
+
+ {{.TrainJobDuration}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.standard"}} + | + +
+
+ {{.FlavorName}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.compute_node"}} + | +
+
+ {{.WorkServerNumber}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.AI_driver"}} + | +
+
+ {{.EngineName}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.code_version"}} + | + +
+
+ {{.BranchName}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.start_file"}} + | + +
+
+ {{.BootFile}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.train_dataset"}} + | + +
+
+ {{.DatasetName}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.run_parameter"}} + | + +
+
+ {{.Parameters}}
+
+ |
+
+ {{$.i18n.Tr "repo.modelarts.train_job.description"}} + | + +
+
+ {{.Cloudbrain.Description}}
+
+ |
+
{{.i18n.Tr "cloudbrain.task_delete_confirm"}}
+