// Package modelarts creates ModelArts notebook, training and inference jobs,
// stores the matching Cloudbrain records and reconciles jobs whose creation
// request ended in an unknown state.
package modelarts

import (
	"encoding/json"
	"errors"
	"fmt"
	"path"
	"strconv"
	"strings"

	"code.gitea.io/gitea/models"
	"code.gitea.io/gitea/modules/context"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/modelarts_cd"
	"code.gitea.io/gitea/modules/notification"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/storage"
	"code.gitea.io/gitea/modules/timeutil"
)

const (
	// notebook
	storageTypeOBS                 = "obs"
	autoStopDuration               = 4 * 60 * 60        // presumably 4 hours in seconds
	autoStopDurationMs             = 4 * 60 * 60 * 1000 // same span, presumably in milliseconds
	MORDELART_USER_IMAGE_ENGINE_ID = -1
	DataSetMountPath               = "/home/ma-user/work"
	NotebookEnv                    = "Python3"
	NotebookType                   = "Ascend"
	FlavorInfo                     = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"

	// train-job
	//	ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
	//	Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
	//	EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
	//		"{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
	//		"{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
	//		"{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
	//		"]}"
	//	TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
	//		"{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
	//		"{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
	//		"{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
	//		"]}"
	CodePath   = "/code/"
	OutputPath = "/output/"
	ResultPath = "/result/"
	LogPath    = "/log/"
	JobPath    = "/job/"

	OrderDesc = "desc" // query downward
	OrderAsc  = "asc"  // query upward
	Lines     = 500

	TrainUrl     = "train_url"
	DataUrl      = "data_url"
	MultiDataUrl = "multi_data_url"
	ResultUrl    = "result_url"
	CkptUrl      = "ckpt_url"
	DeviceTarget = "device_target"
	Ascend       = "Ascend"

	PerPage           = 10
	IsLatestVersion   = "1"
	NotLatestVersion  = "0"
	VersionCountOne   = 1
	SortByCreateTime  = "create_time"
	ConfigTypeCustom  = "custom"
	TotalVersionCount = 1
)

var (
	// poolInfos caches the parsed setting.PoolInfos; it is filled on first use.
	poolInfos *models.PoolInfos
	// TrainFlavorInfos is not assigned in this file.
	TrainFlavorInfos *Flavor
	// SpecialPools is filled by InitSpecialPool from setting.ModelArtsSpecialPools.
	SpecialPools *models.SpecialPools
	// MultiNodeConfig is filled by InitMultiNode from setting.ModelArtsMultiNode.
	MultiNodeConfig *MultiNodes
)

// GenerateTrainJobReq carries the inputs of GenerateTrainJob,
// GenerateTrainJobVersion and GenerateModelConvertTrainJob.
type GenerateTrainJobReq struct {
	JobName           string
	DisplayJobName    string
	Uuid              string
	Description       string
	CodeObsPath       string
	BootFile          string
	BootFileUrl       string
	DataUrl           string
	TrainUrl          string
	LogUrl            string
	PoolID            string
	WorkServerNumber  int
	EngineID          int64
	Parameters        []models.Parameter
	CommitID          string
	IsLatestVersion   string
	Params            string
	BranchName        string
	PreVersionId      int64
	PreVersionName    string
	FlavorCode        string
	FlavorName        string
	VersionCount      int
	EngineName        string
	TotalVersionCount int
	UserImageUrl      string
	UserCommand       string
	DatasetName       string
	Spec              *models.Specification
	ModelName         string
	LabelName         string
	CkptName          string
	ModelVersion      string
	PreTrainModelUrl  string
}

// GenerateInferenceJobReq carries the inputs of GenerateInferenceJob.
type GenerateInferenceJobReq struct {
	JobName           string
	DisplayJobName    string
	Uuid              string
	Description       string
	CodeObsPath       string
	BootFile          string
	BootFileUrl       string
	DataUrl           string
	TrainUrl          string
	LogUrl            string
	PoolID            string
	WorkServerNumber  int
	EngineID          int64
	Parameters        []models.Parameter
	CommitID          string
	Params            string
	BranchName        string
	FlavorName        string
	EngineName        string
	LabelName         string
	IsLatestVersion   string
	VersionCount      int
	TotalVersionCount int
	ModelName         string
	ModelVersion      string
	CkptName          string
	ResultUrl         string
	Spec              *models.Specification
	DatasetName       string
	JobType           string
}

// VersionInfo is the JSON shape {"version":[{"id":..,"value":..,"url":..}]}.
type VersionInfo struct {
	Version []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
		Url   string `json:"url"`
	} `json:"version"`
}

// Flavor is the JSON shape {"flavor":[{"code":..,"value":..,"unitPrice":..}]}.
type Flavor struct {
	Info []struct {
		Code      string `json:"code"`
		Value     string `json:"value"`
		UnitPrice int64  `json:"unitPrice"`
	} `json:"flavor"`
}

// Engine is the JSON shape {"engine":[{"id":..,"value":..}]}.
type Engine struct {
	Info []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}

// ResourcePool is the JSON shape {"resource_pool":[{"id":..,"value":..}]}.
type ResourcePool struct {
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}

// MultiNodes is the JSON shape {"multinode":[{"org":..,"node":[..]}]}.
type MultiNodes struct {
	Info []OrgMultiNode `json:"multinode"`
}

// OrgMultiNode is one entry of MultiNodes: an org name and a list of ints.
type OrgMultiNode struct {
	Org  string `json:"org"`
	Node []int  `json:"node"`
}

// type Parameter struct {
// 	Label string `json:"label"`
// 	Value string `json:"value"`
// }

// type Parameters struct {
// 	Parameter []Parameter `json:"parameter"`
// }

// Parameters is the JSON shape {"parameter":[{"label":..,"value":..}]}.
type Parameters struct {
	Parameter []struct {
		Label string `json:"label"`
		Value string `json:"value"`
	} `json:"parameter"`
}

// trainJobStatusNames maps the integer status codes understood by
// TransTrainJobStatus (the slice index) to their names.
var trainJobStatusNames = [...]string{
	0:  "UNKNOWN",
	1:  "INIT",
	2:  "IMAGE_CREATING",
	3:  "IMAGE_FAILED",
	4:  "SUBMIT_TRYING",
	5:  "SUBMIT_FAILED",
	6:  "DELETE_FAILED",
	7:  "WAITING",
	8:  "RUNNING",
	9:  "KILLING",
	10: "COMPLETED",
	11: "FAILED",
	12: "KILLED",
	13: "CANCELED",
	14: "LOST",
	15: "SCALING",
	16: "SUBMIT_MODEL_FAILED",
	17: "DEPLOY_SERVICE_FAILED",
	18: "CHECK_INIT",
	19: "CHECK_RUNNING",
	20: "CHECK_RUNNING_COMPLETED",
	21: "CHECK_FAILED",
}

// configuredPoolInfos returns the pool list parsed from setting.PoolInfos.
// The value is parsed on first use and cached in poolInfos only when parsing
// succeeds. An error is returned when the setting cannot be parsed or lists
// no pool, so callers can index PoolInfo[0] safely.
func configuredPoolInfos() (*models.PoolInfos, error) {
	if poolInfos == nil {
		var parsed *models.PoolInfos
		if err := json.Unmarshal([]byte(setting.PoolInfos), &parsed); err != nil {
			return nil, fmt.Errorf("parse setting.PoolInfos: %v", err)
		}
		poolInfos = parsed
	}
	if poolInfos == nil || len(poolInfos.PoolInfo) == 0 {
		return nil, errors.New("no pool configured in setting.PoolInfos")
	}
	return poolInfos, nil
}

// recordTempJob inserts a CloudbrainTemp placeholder for a job whose creation
// request failed with an error starting with UnknownErrorPrefix, so that
// SyncTempStatusJob can look the job up later. It logs and returns the insert
// error, if any.
func recordTempJob(displayJobName, jobID, jobName, jobType string) error {
	log.Info("(%s)unknown error, set temp status", displayJobName)
	err := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
		JobID:     jobID,
		VersionID: models.TempVersionId,
		Status:    models.TempJobStatus,
		Type:      models.TypeCloudBrainTwo,
		JobName:   jobName,
		JobType:   jobType,
	})
	if err != nil {
		log.Error("InsertCloudbrainTemp failed: %v", err.Error())
	}
	return err
}

// GenerateTask creates a debug notebook job through CreateJob, stores the
// matching Cloudbrain record and sends a notification.
//
// When uuid is non-empty the notebook storage path is derived from
// setting.Bucket, setting.BasePath and the uuid; otherwise a per-user OBS
// directory under setting.UserBasePath is used and created if missing.
// It returns the first error encountered.
func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
	var dataActualPath string
	if uuid != "" {
		dataActualPath = setting.Bucket + "/" +
			setting.BasePath +
			path.Join(uuid[0:1], uuid[1:2]) +
			"/" + uuid + "/"
	} else {
		userPath := setting.UserBasePath + ctx.User.Name + "/"
		isExist, err := storage.ObsHasObject(userPath)
		if err != nil {
			log.Error("ObsHasObject failed:%v, MsgID:%v", err.Error(), ctx.Data["MsgID"])
			return err
		}
		if !isExist {
			if err = storage.ObsCreateObject(userPath); err != nil {
				log.Error("ObsCreateObject failed:%v, MsgID:%v", err.Error(), ctx.Data["MsgID"])
				return err
			}
		}
		dataActualPath = setting.Bucket + "/" + userPath
	}

	pools, err := configuredPoolInfos()
	if err != nil {
		log.Error("load pool infos failed: %v", err)
		return err
	}

	createTime := timeutil.TimeStampNow()
	jobResult, err := CreateJob(models.CreateNotebookParams{
		JobName:     jobName,
		Description: description,
		ProfileID:   setting.ProfileID,
		Flavor:      flavor,
		Pool: models.Pool{
			ID:   pools.PoolInfo[0].PoolId,
			Name: pools.PoolInfo[0].PoolName,
			Type: pools.PoolInfo[0].PoolType,
		},
		Spec: models.Spec{
			Storage: models.Storage{
				Type: storageTypeOBS,
				Location: models.Location{
					Path: dataActualPath,
				},
			},
			AutoStop: models.AutoStop{
				Enable:   true,
				Duration: autoStopDuration,
			},
		},
	})
	if err != nil {
		log.Error("CreateJob failed: %v", err.Error())
		return err
	}

	err = models.CreateCloudbrain(&models.Cloudbrain{
		Status:          string(models.JobWaiting),
		UserID:          ctx.User.ID,
		RepoID:          ctx.Repo.Repository.ID,
		JobID:           jobResult.ID,
		JobName:         jobName,
		JobType:         string(models.JobTypeDebug),
		Type:            models.TypeCloudBrainTwo,
		Uuid:            uuid,
		ComputeResource: models.NPUResource,
		CreatedUnix:     createTime,
		UpdatedUnix:     createTime,
	})
	if err != nil {
		return err
	}

	notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
	return nil
}

// GenerateNotebook2 creates a debug notebook through createNotebook2 using the
// given image and specification, stores the matching Cloudbrain record and
// sends a notification.
//
// If creation fails with an error starting with UnknownErrorPrefix, a temp
// record is stored first so the job can be reconciled later; the creation
// error is still returned (or the temp insert error if that fails).
func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error {
	pools, err := configuredPoolInfos()
	if err != nil {
		log.Error("load pool infos failed: %v", err)
		return err
	}

	imageName, err := GetNotebookImageName(imageId)
	if err != nil {
		log.Error("GetNotebookImageName failed: %v", err.Error())
		return err
	}

	createTime := timeutil.TimeStampNow()
	jobResult, err := createNotebook2(models.CreateNotebook2Params{
		JobName:     jobName,
		Description: description,
		Flavor:      spec.SourceSpecId,
		Duration:    autoStopDurationMs,
		ImageID:     imageId,
		PoolID:      pools.PoolInfo[0].PoolId,
		Feature:     models.NotebookFeature,
		Volume: models.VolumeReq{
			Capacity:  setting.Capacity,
			Category:  models.EVSCategory,
			Ownership: models.ManagedOwnership,
		},
		WorkspaceID: "0",
	})
	if err != nil {
		log.Error("createNotebook2 failed: %v", err.Error())
		if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
			if errTemp := recordTempJob(displayJobName, models.TempJobId, jobName, string(models.JobTypeDebug)); errTemp != nil {
				return errTemp
			}
		}
		return err
	}

	task := &models.Cloudbrain{
		Status:          jobResult.Status,
		UserID:          ctx.User.ID,
		RepoID:          ctx.Repo.Repository.ID,
		JobID:           jobResult.ID,
		JobName:         jobName,
		FlavorCode:      spec.SourceSpecId,
		DisplayJobName:  displayJobName,
		JobType:         string(models.JobTypeDebug),
		Type:            models.TypeCloudBrainTwo,
		Uuid:            uuid,
		ComputeResource: models.NPUResource,
		Image:           imageName,
		Description:     description,
		CreatedUnix:     createTime,
		UpdatedUnix:     createTime,
		Spec:            spec,
	}
	if err = models.CreateCloudbrain(task); err != nil {
		return err
	}

	stringId := strconv.FormatInt(task.ID, 10)
	notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
	return nil
}

// GenerateTrainJob creates a training job, stores the matching Cloudbrain
// record and sends a notification.
//
// A negative req.EngineID selects the user-image variant
// (createTrainJobUserImage); otherwise createTrainJob is used. If creation
// fails with an error starting with UnknownErrorPrefix, a temp record is
// stored first; the creation error is still returned (or the temp insert
// error if that fails).
func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
	createTime := timeutil.TimeStampNow()

	var jobResult *models.CreateTrainJobResult
	var createErr error
	if req.EngineID < 0 {
		jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
			JobName:     req.JobName,
			Description: req.Description,
			Config: models.UserImageConfig{
				WorkServerNum: req.WorkServerNumber,
				AppUrl:        req.CodeObsPath,
				BootFileUrl:   req.BootFileUrl,
				DataUrl:       req.DataUrl,
				TrainUrl:      req.TrainUrl,
				LogUrl:        req.LogUrl,
				PoolID:        req.PoolID,
				CreateVersion: true,
				Flavor:        models.Flavor{Code: req.Spec.SourceSpecId},
				Parameter:     req.Parameters,
				UserImageUrl:  req.UserImageUrl,
				UserCommand:   req.UserCommand,
			},
		})
	} else {
		jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
			JobName:     req.JobName,
			Description: req.Description,
			Config: models.Config{
				WorkServerNum: req.WorkServerNumber,
				AppUrl:        req.CodeObsPath,
				BootFileUrl:   req.BootFileUrl,
				DataUrl:       req.DataUrl,
				EngineID:      req.EngineID,
				TrainUrl:      req.TrainUrl,
				LogUrl:        req.LogUrl,
				PoolID:        req.PoolID,
				CreateVersion: true,
				Flavor:        models.Flavor{Code: req.Spec.SourceSpecId},
				Parameter:     req.Parameters,
			},
		})
	}
	if createErr != nil {
		log.Error("createTrainJob failed: %v", createErr.Error())
		if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
			if errTemp := recordTempJob(req.DisplayJobName, models.TempJobId, req.JobName, string(models.JobTypeTrain)); errTemp != nil {
				return errTemp
			}
		}
		return createErr
	}

	jobId := strconv.FormatInt(jobResult.JobID, 10)
	createErr = models.CreateCloudbrain(&models.Cloudbrain{
		Status:            TransTrainJobStatus(jobResult.Status),
		UserID:            ctx.User.ID,
		RepoID:            ctx.Repo.Repository.ID,
		JobID:             jobId,
		JobName:           req.JobName,
		DisplayJobName:    req.DisplayJobName,
		JobType:           string(models.JobTypeTrain),
		Type:              models.TypeCloudBrainTwo,
		VersionID:         jobResult.VersionID,
		VersionName:       jobResult.VersionName,
		Uuid:              req.Uuid,
		DatasetName:       req.DatasetName,
		CommitID:          req.CommitID,
		IsLatestVersion:   req.IsLatestVersion,
		ComputeResource:   models.NPUResource,
		EngineID:          req.EngineID,
		TrainUrl:          req.TrainUrl,
		BranchName:        req.BranchName,
		Parameters:        req.Params,
		BootFile:          req.BootFile,
		DataUrl:           req.DataUrl,
		LogUrl:            req.LogUrl,
		FlavorCode:        req.Spec.SourceSpecId,
		Description:       req.Description,
		WorkServerNumber:  req.WorkServerNumber,
		FlavorName:        req.FlavorName,
		EngineName:        req.EngineName,
		VersionCount:      req.VersionCount,
		TotalVersionCount: req.TotalVersionCount,
		CreatedUnix:       createTime,
		UpdatedUnix:       createTime,
		Spec:              req.Spec,
		ModelName:         req.ModelName,
		ModelVersion:      req.ModelVersion,
		LabelName:         req.LabelName,
		PreTrainModelUrl:  req.PreTrainModelUrl,
		CkptName:          req.CkptName,
	})
	if createErr != nil {
		log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
		return createErr
	}

	notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
	return nil
}

// GenerateModelConvertTrainJob submits a user-image training job built from
// req and returns the result of createTrainJobUserImage unchanged. Unlike
// GenerateTrainJob it takes the flavor from req.FlavorCode and stores no
// Cloudbrain record.
func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
	return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
		JobName:     req.JobName,
		Description: req.Description,
		Config: models.UserImageConfig{
			WorkServerNum: req.WorkServerNumber,
			AppUrl:        req.CodeObsPath,
			BootFileUrl:   req.BootFileUrl,
			DataUrl:       req.DataUrl,
			TrainUrl:      req.TrainUrl,
			LogUrl:        req.LogUrl,
			PoolID:        req.PoolID,
			CreateVersion: true,
			Flavor:        models.Flavor{Code: req.FlavorCode},
			Parameter:     req.Parameters,
			UserImageUrl:  req.UserImageUrl,
			UserCommand:   req.UserCommand,
		},
	})
}

// GenerateTrainJobVersion creates a new version of the training job jobId,
// stores a Cloudbrain record for it and then marks the previously latest
// version as no longer latest.
//
// A negative req.EngineID selects the user-image variant. If creation fails
// with an error starting with UnknownErrorPrefix, a temp record keyed by
// jobId is stored first; the creation error is still returned (or the temp
// insert error if that fails). An error is also returned when no existing
// version record can be found for the job.
func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
	createTime := timeutil.TimeStampNow()

	var jobResult *models.CreateTrainJobResult
	var createErr error
	if req.EngineID < 0 {
		jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
			Description: req.Description,
			Config: models.TrainJobVersionUserImageConfig{
				WorkServerNum: req.WorkServerNumber,
				AppUrl:        req.CodeObsPath,
				BootFileUrl:   req.BootFileUrl,
				DataUrl:       req.DataUrl,
				TrainUrl:      req.TrainUrl,
				LogUrl:        req.LogUrl,
				PoolID:        req.PoolID,
				Flavor:        models.Flavor{Code: req.Spec.SourceSpecId},
				Parameter:     req.Parameters,
				PreVersionId:  req.PreVersionId,
				UserImageUrl:  req.UserImageUrl,
				UserCommand:   req.UserCommand,
			},
		}, jobId)
	} else {
		jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
			Description: req.Description,
			Config: models.TrainJobVersionConfig{
				WorkServerNum: req.WorkServerNumber,
				AppUrl:        req.CodeObsPath,
				BootFileUrl:   req.BootFileUrl,
				DataUrl:       req.DataUrl,
				EngineID:      req.EngineID,
				TrainUrl:      req.TrainUrl,
				LogUrl:        req.LogUrl,
				PoolID:        req.PoolID,
				Flavor:        models.Flavor{Code: req.Spec.SourceSpecId},
				Parameter:     req.Parameters,
				PreVersionId:  req.PreVersionId,
			},
		}, jobId)
	}
	if createErr != nil {
		log.Error("createTrainJobVersion failed: %v", createErr.Error())
		if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
			if errTemp := recordTempJob(req.DisplayJobName, jobId, req.JobName, string(models.JobTypeTrain)); errTemp != nil {
				return errTemp
			}
		}
		return createErr
	}

	remoteJobID := strconv.FormatInt(jobResult.JobID, 10)
	versionTasks, versionCount, listErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
		RepoID:   ctx.Repo.Repository.ID,
		Type:     models.TypeCloudBrainTwo,
		JobTypes: []string{string(models.JobTypeTrain)},
		JobID:    remoteJobID,
	})
	if listErr != nil {
		ctx.ServerError("Cloudbrain", listErr)
		return listErr
	}
	// versionTasks[0] is read below as the previously latest version; without
	// any stored version there is nothing to derive the counters from.
	if len(versionTasks) == 0 {
		log.Error("CloudbrainsVersionList(%s) returned no version for job %s", req.DisplayJobName, remoteJobID)
		return errors.New("no existing version found for the train job")
	}

	// Store the new version and update the task counters: VersionCount is the
	// number of current versions, TotalVersionCount the number of versions
	// ever created. IsLatestVersion comes from the request.
	saveErr := models.CreateCloudbrain(&models.Cloudbrain{
		Status:            TransTrainJobStatus(jobResult.Status),
		UserID:            ctx.User.ID,
		RepoID:            ctx.Repo.Repository.ID,
		JobID:             remoteJobID,
		JobName:           req.JobName,
		DisplayJobName:    req.DisplayJobName,
		JobType:           string(models.JobTypeTrain),
		Type:              models.TypeCloudBrainTwo,
		VersionID:         jobResult.VersionID,
		VersionName:       jobResult.VersionName,
		Uuid:              req.Uuid,
		DatasetName:       req.DatasetName,
		CommitID:          req.CommitID,
		IsLatestVersion:   req.IsLatestVersion,
		PreVersionName:    req.PreVersionName,
		ComputeResource:   models.NPUResource,
		EngineID:          req.EngineID,
		TrainUrl:          req.TrainUrl,
		BranchName:        req.BranchName,
		Parameters:        req.Params,
		BootFile:          req.BootFile,
		DataUrl:           req.DataUrl,
		LogUrl:            req.LogUrl,
		PreVersionId:      req.PreVersionId,
		FlavorCode:        req.Spec.SourceSpecId,
		Description:       req.Description,
		WorkServerNumber:  req.WorkServerNumber,
		FlavorName:        req.FlavorName,
		EngineName:        req.EngineName,
		TotalVersionCount: versionTasks[0].TotalVersionCount + 1,
		VersionCount:      versionCount + 1,
		CreatedUnix:       createTime,
		UpdatedUnix:       createTime,
		Spec:              req.Spec,
		ModelName:         req.ModelName,
		ModelVersion:      req.ModelVersion,
		LabelName:         req.LabelName,
		PreTrainModelUrl:  req.PreTrainModelUrl,
		CkptName:          req.CkptName,
	})
	if saveErr != nil {
		log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, saveErr.Error())
		return saveErr
	}

	// Set isLatestVersion of the previous version of the train job to "0".
	updateErr := models.SetVersionCountAndLatestVersion(remoteJobID, versionTasks[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
	if updateErr != nil {
		ctx.ServerError("Update IsLatestVersion failed", updateErr)
		return updateErr
	}
	return nil
}

// TransTrainJobStatus converts an integer train-job status code into its
// name. Codes 0 through 21 have fixed names; any other value is returned as
// its decimal string.
func TransTrainJobStatus(status int) string {
	if status >= 0 && status < len(trainJobStatusNames) {
		return trainJobStatusNames[status]
	}
	return strconv.Itoa(status)
}

// GetOutputPathByCount returns "V" followed by TotalVersionCount zero-padded
// to at least four digits, e.g. 1 -> "V0001".
func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
	return fmt.Sprintf("V%04d", TotalVersionCount)
}

// GenerateInferenceJob creates an inference job through createInferenceJob,
// stores the matching Cloudbrain record, sends a notification and returns the
// new job id.
//
// If creation fails with an error starting with UnknownErrorPrefix, a temp
// record is stored first. The creation error is always reported to the caller
// (or the temp insert error if that fails); jobId is empty on every error.
func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (jobId string, err error) {
	createTime := timeutil.TimeStampNow()
	jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
		JobName:     req.JobName,
		Description: req.Description,
		InfConfig: models.InfConfig{
			WorkServerNum: req.WorkServerNumber,
			AppUrl:        req.CodeObsPath,
			BootFileUrl:   req.BootFileUrl,
			DataUrl:       req.DataUrl,
			EngineID:      req.EngineID,
			// TrainUrl:      req.TrainUrl,
			LogUrl:        req.LogUrl,
			PoolID:        req.PoolID,
			CreateVersion: true,
			Flavor:        models.Flavor{Code: req.Spec.SourceSpecId},
			Parameter:     req.Parameters,
		},
	})
	if err != nil {
		log.Error("createInferenceJob failed: %v", err.Error())
		if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
			// Keep the creation error in err: a successful temp insert must
			// not turn the failed creation into a nil error.
			if errTemp := recordTempJob(req.DisplayJobName, models.TempJobId, req.JobName, req.JobType); errTemp != nil {
				return "", errTemp
			}
		}
		return "", err
	}

	// attach, err := models.GetAttachmentByUUID(req.Uuid)
	// if err != nil {
	// 	log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
	// 	return err
	// }

	jobID := strconv.FormatInt(jobResult.JobID, 10)
	err = models.CreateCloudbrain(&models.Cloudbrain{
		Status:            TransTrainJobStatus(jobResult.Status),
		UserID:            ctx.User.ID,
		RepoID:            ctx.Repo.Repository.ID,
		JobID:             jobID,
		JobName:           req.JobName,
		DisplayJobName:    req.DisplayJobName,
		JobType:           req.JobType,
		Type:              models.TypeCloudBrainTwo,
		VersionID:         jobResult.VersionID,
		VersionName:       jobResult.VersionName,
		Uuid:              req.Uuid,
		DatasetName:       req.DatasetName,
		CommitID:          req.CommitID,
		EngineID:          req.EngineID,
		TrainUrl:          req.TrainUrl,
		BranchName:        req.BranchName,
		Parameters:        req.Params,
		BootFile:          req.BootFile,
		DataUrl:           req.DataUrl,
		LogUrl:            req.LogUrl,
		FlavorCode:        req.Spec.SourceSpecId,
		Description:       req.Description,
		WorkServerNumber:  req.WorkServerNumber,
		FlavorName:        req.FlavorName,
		EngineName:        req.EngineName,
		LabelName:         req.LabelName,
		IsLatestVersion:   req.IsLatestVersion,
		ComputeResource:   models.NPUResource,
		VersionCount:      req.VersionCount,
		TotalVersionCount: req.TotalVersionCount,
		ModelName:         req.ModelName,
		ModelVersion:      req.ModelVersion,
		CkptName:          req.CkptName,
		ResultUrl:         req.ResultUrl,
		CreatedUnix:       createTime,
		UpdatedUnix:       createTime,
		Spec:              req.Spec,
	})
	if err != nil {
		log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
		return "", err
	}

	if req.JobType == string(models.JobTypeModelSafety) {
		// Model-safety jobs are announced with the database id of the stored
		// task; the notification is skipped when the task cannot be loaded.
		task, lookupErr := models.GetCloudbrainByJobID(jobID)
		if lookupErr != nil {
			log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, lookupErr)
		} else {
			notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
		}
	} else {
		notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
	}

	return jobID, nil
}

// GetNotebookImageName returns the Value of the entry in
// setting.StImageInfos.ImageInfo whose Id equals imageId. If several entries
// share the id, the last one wins. An error is returned when no entry matches.
func GetNotebookImageName(imageId string) (string, error) {
	found := false
	name := ""
	for _, info := range setting.StImageInfos.ImageInfo {
		if info.Id == imageId {
			found, name = true, info.Value
		}
	}

	if !found {
		log.Error("the image id(%s) is invalid", imageId)
		return name, errors.New("the image id is invalid")
	}
	return name, nil
}

// InitSpecialPool parses setting.ModelArtsSpecialPools into SpecialPools the
// first time it is called with a non-empty setting. Parse errors are logged.
func InitSpecialPool() {
	if SpecialPools != nil || setting.ModelArtsSpecialPools == "" {
		return
	}
	if err := json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools); err != nil {
		log.Error("parse ModelArtsSpecialPools failed:%v", err)
	}
}

// InitMultiNode parses setting.ModelArtsMultiNode into MultiNodeConfig the
// first time it is called with a non-empty setting. Parse errors are logged.
func InitMultiNode() {
	if MultiNodeConfig != nil || setting.ModelArtsMultiNode == "" {
		return
	}
	if err := json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig); err != nil {
		log.Error("parse ModelArtsMultiNode failed:%v", err)
	}
}

// HandleTrainJobInfo refreshes task from the result of GetTrainJob: status,
// duration, start/end time and the duration string. It notifies on a status
// change and persists the task with models.UpdateJob. A nil result is a no-op.
func HandleTrainJobInfo(task *models.Cloudbrain) error {
	result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
	if err != nil {
		log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
		return err
	}
	if result == nil {
		return nil
	}

	oldStatus := task.Status
	task.Status = TransTrainJobStatus(result.IntStatus)
	task.Duration = result.Duration / 1000
	if task.StartTime == 0 && result.StartTime > 0 {
		task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
	}
	// The duration string is always derived from task.Duration, so the
	// value reported in result.TrainJobDuration is not used.
	task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
	if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
		task.EndTime = task.StartTime.Add(task.Duration)
	}
	task.CorrectCreateUnix()

	if oldStatus != task.Status {
		notification.NotifyChangeCloudbrainStatus(task, oldStatus)
	}

	if err = models.UpdateJob(task); err != nil {
		log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
		return err
	}
	return nil
}

// HandleNotebookInfo refreshes task from the notebook reported by
// GetNotebook2 (TypeCloudBrainTwo) or modelarts_cd.GetNotebook (TypeCDCenter):
// status, start/end time, duration and, if still empty, the flavor code. It
// notifies on a status change and persists the task with models.UpdateJob.
// For any other task type, or a nil result, it does nothing.
func HandleNotebookInfo(task *models.Cloudbrain) error {
	var result *models.GetNotebook2Result
	var err error
	switch task.Type {
	case models.TypeCloudBrainTwo:
		result, err = GetNotebook2(task.JobID)
	case models.TypeCDCenter:
		result, err = modelarts_cd.GetNotebook(task.JobID)
	}
	if err != nil {
		log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
		return err
	}
	if result == nil {
		return nil
	}

	oldStatus := task.Status
	task.Status = result.Status
	if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
		task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
	}
	if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
		task.EndTime = timeutil.TimeStampNow()
	}
	task.CorrectCreateUnix()
	task.ComputeAndSetDuration()

	if oldStatus != task.Status {
		notification.NotifyChangeCloudbrainStatus(task, oldStatus)
	}
	if task.FlavorCode == "" {
		task.FlavorCode = result.Flavor
	}

	if err = models.UpdateJob(task); err != nil {
		log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
		return err
	}
	return nil
}

// SyncTempStatusJob walks the records returned by
// models.GetCloudBrainTempJobs and reconciles each TypeCloudBrainTwo record
// according to its job type. Processing stops at the first handler error.
func SyncTempStatusJob() {
	jobs, err := models.GetCloudBrainTempJobs()
	if err != nil {
		log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
		return
	}

	for _, temp := range jobs {
		log.Info("start to handle record: %s", temp.JobName)
		if temp.Type != models.TypeCloudBrainTwo {
			continue
		}

		if temp.JobType == string(models.JobTypeDebug) {
			if err = handleNotebook(temp); err != nil {
				log.Error("handleNotebook falied:%v", err)
				return
			}
		} else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
			// No stored Cloudbrain for this job id means the job itself was
			// being created (one version); otherwise a new version of an
			// existing job was being created (multi version).
			if _, err = models.GetCloudbrainByJobID(temp.JobID); err != nil {
				if err = handleTrainJob(temp); err != nil {
					log.Error("handleTrainJob falied:%v", err)
					return
				}
			} else {
				if err = handleTrainJobMultiVersion(temp); err != nil {
					log.Error("handleTrainJobMultiVersion falied:%v", err)
					return
				}
			}
		}
	}
}

// handleNotebook advances one temp notebook record: a record still in
// TempJobStatus is looked up via handleTempNotebook; a record in the stopping
// state is re-queried and, once stopped, the notebook is deleted and the
// record marked deleted.
func handleNotebook(temp *models.CloudbrainTemp) error {
	if temp.Status == models.TempJobStatus {
		if err := handleTempNotebook(temp); err != nil {
			log.Error("handleTempNotebook failed:%v", err)
			return err
		}
		return nil
	}
	if temp.Status != string(models.ModelArtsStopping) {
		return nil
	}

	res, err := GetNotebook2(temp.JobID)
	if err != nil {
		log.Error("GetNotebook2 failed:%v", err)
		return err
	}

	temp.Status = res.Status
	if temp.Status != string(models.ModelArtsStopped) {
		return nil
	}

	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp failed:%v", err)
		return err
	}
	if _, err := DelNotebook2(temp.JobID); err != nil {
		log.Error("DelNotebook2 failed:%v", err)
		return err
	}

	temp.Status = string(models.ModelArtsDeleted)
	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp failed:%v", err)
		return err
	}
	return nil
}

// handleTempNotebook tries once to locate the notebook behind a temp record
// and start cleaning it up. When the notebook has not been found and
// temp.QueryTimes has reached setting.MaxTempQueryTimes, the record is marked
// failed.
//
// Only a failure to persist that final "failed" status is returned; lookup
// and clean-up problems are logged and retried on the next sync round.
// NOTE(review): lookup errors have never been propagated here, which keeps
// SyncTempStatusJob from stopping at a record that is merely not visible
// yet — confirm this is intended.
func handleTempNotebook(temp *models.CloudbrainTemp) error {
	found := findTempNotebook(temp)

	if temp.QueryTimes >= setting.MaxTempQueryTimes && !found {
		log.Info("reach MaxTempQueryTimes, set the job failed")
		temp.Status = string(models.ModelArtsTrainJobFailed)
		if err := models.UpdateCloudbrainTemp(temp); err != nil {
			log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
			return err
		}
	}
	return nil
}

// findTempNotebook performs one lookup round for handleTempNotebook and
// reports whether a matching notebook was found. On a match it copies the
// notebook's status and job id into temp, then deletes a notebook whose
// creation failed or asks any other notebook to stop.
func findTempNotebook(temp *models.CloudbrainTemp) bool {
	result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
	if err != nil {
		log.Error("GetNotebookList failed:%v", err)
		return false
	}

	temp.QueryTimes++
	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp failed:%v", err)
	}

	if result == nil {
		log.Error("can not find the record(%s) till now", temp.JobName)
		return false
	}

	// A record still carrying TempJobId belongs to a new notebook: the first
	// notebook with the same name matches. Otherwise this is a restart, where
	// a record can always be found, so only one that is not stopped matches.
	isNew := temp.JobID == models.TempJobId
	found := false
	for _, notebook := range result.NotebookList {
		if notebook.JobName != temp.JobName {
			continue
		}
		if isNew || notebook.Status != string(models.ModelArtsStopped) {
			found = true
			temp.Status = notebook.Status
			temp.JobID = notebook.JobID
			break
		}
	}
	if !found {
		log.Error("can not find the record(%s) till now", temp.JobName)
		return false
	}

	log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
	if temp.Status == string(models.ModelArtsCreateFailed) {
		if err = models.UpdateCloudbrainTemp(temp); err != nil {
			log.Error("UpdateCloudbrainTemp failed:%v", err)
			return true
		}
		if _, err := DelNotebook2(temp.JobID); err != nil {
			log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
			return true
		}
		temp.Status = string(models.ModelArtsDeleted)
	} else {
		if _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop}); err != nil {
			log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
			return true
		}
		temp.Status = string(models.ModelArtsStopping)
	}

	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
	}
	return true
}

// handleTrainJob advances one temp record of a single-version train job: a
// record still in TempJobStatus is looked up via handleTempTrainJob; a record
// in the killing state is re-queried and, once killed, the job is deleted and
// the record marked deleted.
func handleTrainJob(temp *models.CloudbrainTemp) error {
	if temp.Status == models.TempJobStatus {
		if err := handleTempTrainJob(temp); err != nil {
			log.Error("handleTempTrainJob failed:%v", err)
			return err
		}
		return nil
	}
	if temp.Status != string(models.ModelArtsTrainJobKilling) {
		return nil
	}

	res, err := GetTrainJob(temp.JobID, temp.VersionID)
	if err != nil {
		log.Error("GetTrainJob failed:%v", err)
		return err
	}

	temp.Status = TransTrainJobStatus(res.IntStatus)
	if temp.Status != string(models.ModelArtsTrainJobKilled) {
		return nil
	}

	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp failed:%v", err)
		return err
	}
	if _, err := DelTrainJob(temp.JobID); err != nil {
		log.Error("DelTrainJob failed:%v", err)
		return err
	}

	temp.Status = string(models.ModelArtsDeleted)
	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp failed:%v", err)
		return err
	}
	return nil
}

// handleTrainJobMultiVersion is the counterpart of handleTrainJob for a temp
// record that belongs to a new version of an existing job: lookup goes
// through handleTempTrainJobMultiVersion and only the version is deleted.
func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
	if temp.Status == models.TempJobStatus {
		if err := handleTempTrainJobMultiVersion(temp); err != nil {
			log.Error("handleTempTrainJobMultiVersion failed:%v", err)
			return err
		}
		return nil
	}
	if temp.Status != string(models.ModelArtsTrainJobKilling) {
		return nil
	}

	res, err := GetTrainJob(temp.JobID, temp.VersionID)
	if err != nil {
		log.Error("GetTrainJob failed:%v", err)
		return err
	}

	temp.Status = TransTrainJobStatus(res.IntStatus)
	if temp.Status != string(models.ModelArtsTrainJobKilled) {
		return nil
	}

	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp failed:%v", err)
		return err
	}
	if _, err := DelTrainJobVersion(temp.JobID, temp.VersionID); err != nil {
		log.Error("DelTrainJob failed:%v", err)
		return err
	}

	temp.Status = string(models.ModelArtsDeleted)
	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp failed:%v", err)
		return err
	}
	return nil
}

// handleTempTrainJobMultiVersion tries once to locate the job version behind
// a temp record and stop it. When the version has not been found and
// temp.QueryTimes has reached setting.MaxTempQueryTimes, the record is marked
// failed.
//
// Only a failure to persist that final "failed" status is returned; lookup
// and stop problems are logged and retried on the next sync round.
func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
	found := findTempTrainJobVersion(temp)

	if temp.QueryTimes >= setting.MaxTempQueryTimes && !found {
		log.Info("reach MaxTempQueryTimes, set the job failed")
		temp.Status = string(models.ModelArtsTrainJobFailed)
		if err := models.UpdateCloudbrainTemp(temp); err != nil {
			log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
			return err
		}
	}
	return nil
}

// findTempTrainJobVersion performs one lookup round for
// handleTempTrainJobMultiVersion and reports whether the new version was
// found. The version counts as found when the remote version count is one
// more than the number of stored records for the job name; the first entry
// of the version list is then stopped and the record moved to killing.
func findTempTrainJobVersion(temp *models.CloudbrainTemp) bool {
	result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
	if err != nil {
		log.Error("GetTrainJobVersionList failed:%v", err)
		return false
	}

	temp.QueryTimes++
	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp failed:%v", err)
	}

	if result == nil {
		return false
	}

	// NOTE(review): the second return value (presumably an error) has always
	// been ignored here; a failed count is then compared as-is — confirm.
	count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
	if result.VersionCount != int64(count+1) || len(result.JobVersionList) == 0 {
		log.Error("can not find the record(%s) till now", temp.JobName)
		return false
	}

	latest := result.JobVersionList[0]
	temp.Status = TransTrainJobStatus(latest.IntStatus)
	temp.VersionID = strconv.FormatInt(latest.VersionID, 10)
	log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)

	if _, err := StopTrainJob(temp.JobID, temp.VersionID); err != nil {
		log.Error("StopTrainJob failed:%v", err)
		return true
	}

	temp.Status = string(models.ModelArtsTrainJobKilling)
	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
	}
	return true
}

// handleTempTrainJob tries once to locate the train job behind a temp record
// and stop it. When the job has not been found and temp.QueryTimes has
// reached setting.MaxTempQueryTimes, the record is marked failed.
//
// Only a failure to persist that final "failed" status is returned; lookup
// and stop problems are logged and retried on the next sync round.
func handleTempTrainJob(temp *models.CloudbrainTemp) error {
	found := findTempTrainJob(temp)

	if temp.QueryTimes >= setting.MaxTempQueryTimes && !found {
		log.Info("reach MaxTempQueryTimes, set the job failed")
		temp.Status = string(models.ModelArtsTrainJobFailed)
		if err := models.UpdateCloudbrainTemp(temp); err != nil {
			log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
			return err
		}
	}
	return nil
}

// findTempTrainJob performs one lookup round for handleTempTrainJob and
// reports whether a job with the record's name and a status other than
// failed was found. Each match is copied into temp, stopped, and the record
// moved to killing.
func findTempTrainJob(temp *models.CloudbrainTemp) bool {
	result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
	if err != nil {
		log.Error("GetTrainJobList failed:%v", err)
		return false
	}

	temp.QueryTimes++
	if err = models.UpdateCloudbrainTemp(temp); err != nil {
		log.Error("UpdateCloudbrainTemp failed:%v", err)
	}

	if result == nil {
		return false
	}

	found := false
	// NOTE(review): the scan only stops early when stopping or saving fails;
	// after a successful stop it keeps going, so every further job with the
	// same name is stopped too — confirm this is intended.
	for _, job := range result.JobList {
		if temp.JobName != job.JobName || TransTrainJobStatus(job.IntStatus) == string(models.ModelArtsTrainJobFailed) {
			continue
		}

		found = true
		temp.Status = TransTrainJobStatus(job.IntStatus)
		temp.JobID = strconv.FormatInt(job.JobID, 10)
		temp.VersionID = strconv.FormatInt(job.VersionID, 10)
		log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)

		if _, err := StopTrainJob(temp.JobID, temp.VersionID); err != nil {
			log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
			break
		}

		temp.Status = string(models.ModelArtsTrainJobKilling)
		if err = models.UpdateCloudbrainTemp(temp); err != nil {
			log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
			break
		}
	}

	if !found {
		log.Error("can not find the record(%s) till now", temp.JobName)
	}
	return found
}