// Package modelarts builds ModelArts notebook, training and inference job
// requests and records the created jobs as Cloudbrain rows.
package modelarts

import (
	"encoding/json"
	"fmt"
	"path"
	"strconv"

	"code.gitea.io/gitea/models"
	"code.gitea.io/gitea/modules/context"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/storage"
)

const (
	// notebook
	storageTypeOBS = "obs"
	// autoStopDuration is passed as the notebook auto-stop duration.
	// NOTE(review): presumably seconds (4 hours) — confirm against the API.
	autoStopDuration = 4 * 60 * 60

	DataSetMountPath = "/home/ma-user/work"
	NotebookEnv      = "Python3"
	NotebookType     = "Ascend"
	FlavorInfo       = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"

	// train-job
	// ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
	// Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
	// EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
	// "]}"
	// TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
	// "]}"
	CodePath          = "/code/"
	OutputPath        = "/output/"
	ResultPath        = "/result/"
	LogPath           = "/log/"
	JobPath           = "/job/"
	OrderDesc         = "desc" // query downwards
	OrderAsc          = "asc"  // query upwards
	Lines             = 500
	TrainUrl          = "train_url"
	DataUrl           = "data_url"
	ResultUrl         = "result_url"
	CkptUrl           = "ckpt_url"
	PerPage           = 10
	IsLatestVersion   = "1"
	NotLatestVersion  = "0"
	DebugType         = -1
	VersionCount      = 1
	SortByCreateTime  = "create_time"
	ConfigTypeCustom  = "custom"
	TotalVersionCount = 1
)

var (
	// poolInfos caches the resource pools decoded from setting.PoolInfos; it
	// is filled on first use by GenerateTask.
	poolInfos *models.PoolInfos
	// FlavorInfos is not assigned anywhere in this file.
	// NOTE(review): presumably populated by another file or package — confirm.
	FlavorInfos *models.FlavorInfos
)

// GenerateTrainJobReq carries the values GenerateTrainJob and
// GenerateTrainJobVersion send to ModelArts and store on the Cloudbrain record.
type GenerateTrainJobReq struct {
	JobName           string
	Uuid              string
	Description       string
	CodeObsPath       string
	BootFile          string
	BootFileUrl       string
	DataUrl           string
	TrainUrl          string
	FlavorCode        string
	LogUrl            string
	PoolID            string
	WorkServerNumber  int
	EngineID          int64
	Parameters        []models.Parameter
	CommitID          string
	IsLatestVersion   string
	Params            string
	BranchName        string
	PreVersionId      int64
	PreVersionName    string
	FlavorName        string
	VersionCount      int
	EngineName        string
	TotalVersionCount int
}

// GenerateTrainJobVersionReq describes a request for a new version of a train
// job. It is not referenced by any function in this file.
type GenerateTrainJobVersionReq struct {
	JobName           string
	Uuid              string
	Description       string
	CodeObsPath       string
	BootFile          string
	BootFileUrl       string
	DataUrl           string
	TrainUrl          string
	FlavorCode        string
	LogUrl            string
	PoolID            string
	WorkServerNumber  int
	EngineID          int64
	Parameters        []models.Parameter
	Params            string
	PreVersionId      int64
	CommitID          string
	BranchName        string
	FlavorName        string
	EngineName        string
	PreVersionName    string
	TotalVersionCount int
}

// GenerateInferenceJobReq carries the values GenerateInferenceJob sends to
// ModelArts and stores on the Cloudbrain record.
type GenerateInferenceJobReq struct {
	JobName           string
	Uuid              string
	Description       string
	CodeObsPath       string
	BootFile          string
	BootFileUrl       string
	DataUrl           string
	TrainUrl          string
	FlavorCode        string
	LogUrl            string
	PoolID            string
	WorkServerNumber  int
	EngineID          int64
	Parameters        []models.Parameter
	CommitID          string
	Params            string
	BranchName        string
	FlavorName        string
	EngineName        string
	LabelName         string
	IsLatestVersion   string
	VersionCount      int
	TotalVersionCount int
	ModelName         string
	ModelVersion      string
	CkptName          string
	ResultUrl         string
}

// VersionInfo maps a JSON object whose "version" key holds id/value pairs.
type VersionInfo struct {
	Version []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"version"`
}

// Flavor maps a JSON object whose "flavor" key holds code/value pairs.
type Flavor struct {
	Info []struct {
		Code  string `json:"code"`
		Value string `json:"value"`
	} `json:"flavor"`
}

// Engine maps a JSON object whose "engine" key holds id/value pairs.
type Engine struct {
	Info []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}

// ResourcePool maps a JSON object whose "resource_pool" key holds id/value
// pairs.
type ResourcePool struct {
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}

// type Parameter struct {
// 	Label string `json:"label"`
// 	Value string `json:"value"`
// }

// type Parameters struct {
// 	Parameter []Parameter `json:"parameter"`
// }

// Parameters maps a JSON object whose "parameter" key holds label/value pairs.
type Parameters struct {
	Parameter []struct {
		Label string `json:"label"`
		Value string `json:"value"`
	} `json:"parameter"`
}

// GenerateTask creates a notebook job through CreateJob and stores it as a
// debug-type Cloudbrain record for the current user and repository.
//
// When uuid is non-empty the storage path is derived from setting.Bucket,
// setting.BasePath and the uuid. Otherwise a per-user OBS directory under
// setting.UserBasePath is used, and created first if storage.ObsHasObject
// reports it missing.
//
// The job runs in the first pool listed in setting.PoolInfos. An error is
// returned when that setting cannot be decoded or lists no pool, when uuid is
// shorter than two bytes, or when any OBS, job-creation or database call fails.
func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
	var dataActualPath string
	if uuid != "" {
		// The path shards on the first two bytes of the uuid; slicing a
		// shorter value would panic.
		if len(uuid) < 2 {
			return fmt.Errorf("invalid dataset uuid %q", uuid)
		}
		dataActualPath = setting.Bucket + "/" +
			setting.BasePath +
			path.Join(uuid[0:1], uuid[1:2]) +
			"/" + uuid + "/"
	} else {
		userPath := setting.UserBasePath + ctx.User.Name + "/"
		isExist, err := storage.ObsHasObject(userPath)
		if err != nil {
			log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
			return err
		}

		if !isExist {
			if err = storage.ObsCreateObject(userPath); err != nil {
				log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
				return err
			}
		}

		dataActualPath = setting.Bucket + "/" + userPath
	}

	if poolInfos == nil {
		// Decode into a local so a failed or partial decode never ends up in
		// the package-level cache.
		var parsed *models.PoolInfos
		if err := json.Unmarshal([]byte(setting.PoolInfos), &parsed); err != nil {
			log.Error("Unmarshal PoolInfos failed: %v", err.Error())
			return err
		}
		poolInfos = parsed
	}
	if poolInfos == nil || len(poolInfos.PoolInfo) == 0 {
		err := fmt.Errorf("no resource pool configured in PoolInfos setting")
		log.Error("GenerateTask failed: %v", err.Error())
		return err
	}
	pool := poolInfos.PoolInfo[0]

	jobResult, err := CreateJob(models.CreateNotebookParams{
		JobName:     jobName,
		Description: description,
		ProfileID:   setting.ProfileID,
		Flavor:      flavor,
		Pool: models.Pool{
			ID:   pool.PoolId,
			Name: pool.PoolName,
			Type: pool.PoolType,
		},
		Spec: models.Spec{
			Storage: models.Storage{
				Type: storageTypeOBS,
				Location: models.Location{
					Path: dataActualPath,
				},
			},
			AutoStop: models.AutoStop{
				Enable:   true,
				Duration: autoStopDuration,
			},
		},
	})
	if err != nil {
		log.Error("CreateJob failed: %v", err.Error())
		return err
	}

	return models.CreateCloudbrain(&models.Cloudbrain{
		Status:          string(models.JobWaiting),
		UserID:          ctx.User.ID,
		RepoID:          ctx.Repo.Repository.ID,
		JobID:           jobResult.ID,
		JobName:         jobName,
		JobType:         string(models.JobTypeDebug),
		Type:            models.TypeCloudBrainTwo,
		Uuid:            uuid,
		ComputeResource: models.NPUResource,
	})
}

// GenerateTrainJob creates a training job through createTrainJob, looks up the
// dataset attachment for req.Uuid, and stores the job as a train-type
// Cloudbrain record. It returns the first error encountered.
func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
	jobResult, err := createTrainJob(models.CreateTrainJobParams{
		JobName:     req.JobName,
		Description: req.Description,
		Config: models.Config{
			WorkServerNum: req.WorkServerNumber,
			AppUrl:        req.CodeObsPath,
			BootFileUrl:   req.BootFileUrl,
			DataUrl:       req.DataUrl,
			EngineID:      req.EngineID,
			TrainUrl:      req.TrainUrl,
			LogUrl:        req.LogUrl,
			PoolID:        req.PoolID,
			CreateVersion: true,
			Flavor: models.Flavor{
				Code: req.FlavorCode,
			},
			Parameter: req.Parameters,
		},
	})
	if err != nil {
		log.Error("CreateJob failed: %v", err.Error())
		return err
	}

	jobIDStr := strconv.FormatInt(jobResult.JobID, 10)

	attach, err := models.GetAttachmentByUUID(req.Uuid)
	if err != nil {
		log.Error("GetAttachmentByUUID(%s) failed:%v", jobIDStr, err.Error())
		return err
	}

	err = models.CreateCloudbrain(&models.Cloudbrain{
		Status:            TransTrainJobStatus(jobResult.Status),
		UserID:            ctx.User.ID,
		RepoID:            ctx.Repo.Repository.ID,
		JobID:             jobIDStr,
		JobName:           req.JobName,
		JobType:           string(models.JobTypeTrain),
		Type:              models.TypeCloudBrainTwo,
		VersionID:         jobResult.VersionID,
		VersionName:       jobResult.VersionName,
		Uuid:              req.Uuid,
		DatasetName:       attach.Name,
		CommitID:          req.CommitID,
		IsLatestVersion:   req.IsLatestVersion,
		ComputeResource:   models.NPUResource,
		EngineID:          req.EngineID,
		TrainUrl:          req.TrainUrl,
		BranchName:        req.BranchName,
		Parameters:        req.Params,
		BootFile:          req.BootFile,
		DataUrl:           req.DataUrl,
		LogUrl:            req.LogUrl,
		FlavorCode:        req.FlavorCode,
		Description:       req.Description,
		WorkServerNumber:  req.WorkServerNumber,
		FlavorName:        req.FlavorName,
		EngineName:        req.EngineName,
		VersionCount:      req.VersionCount,
		TotalVersionCount: req.TotalVersionCount,
	})
	if err != nil {
		log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
		return err
	}

	return nil
}

// GenerateTrainJobVersion creates a new version of the training job jobId
// through createTrainJobVersion and stores it as a Cloudbrain record marked
// with req.IsLatestVersion. It then updates the first entry returned by
// models.CloudbrainsVersionList with NotLatestVersion.
//
// The new record's VersionCount and TotalVersionCount are derived from the
// existing versions of the job. If the version list is empty an error is
// returned, because the counters and the previous version name cannot be
// determined.
func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
	jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{
		Description: req.Description,
		Config: models.TrainJobVersionConfig{
			WorkServerNum: req.WorkServerNumber,
			AppUrl:        req.CodeObsPath,
			BootFileUrl:   req.BootFileUrl,
			DataUrl:       req.DataUrl,
			EngineID:      req.EngineID,
			TrainUrl:      req.TrainUrl,
			LogUrl:        req.LogUrl,
			PoolID:        req.PoolID,
			Flavor: models.Flavor{
				Code: req.FlavorCode,
			},
			Parameter:    req.Parameters,
			PreVersionId: req.PreVersionId,
		},
	}, jobId)
	if err != nil {
		log.Error("CreateJob failed: %v", err.Error())
		return err
	}

	jobIDStr := strconv.FormatInt(jobResult.JobID, 10)

	attach, err := models.GetAttachmentByUUID(req.Uuid)
	if err != nil {
		log.Error("GetAttachmentByUUID(%s) failed:%v", jobIDStr, err.Error())
		return err
	}

	// Fetch the versions already recorded for this job; they are read before
	// the new record is inserted, so the list does not include it.
	versionList, versionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
		RepoID:   ctx.Repo.Repository.ID,
		Type:     models.TypeCloudBrainTwo,
		JobTypes: []string{string(models.JobTypeTrain)},
		JobID:    jobIDStr,
	})
	if err != nil {
		ctx.ServerError("Cloudbrain", err)
		return err
	}
	if len(versionList) == 0 {
		// Indexing an empty list would panic; report it instead.
		err = fmt.Errorf("no existing version found for train job %s", jobIDStr)
		log.Error("CloudbrainsVersionList(%s) failed:%v", jobIDStr, err.Error())
		return err
	}
	previous := versionList[0]

	// Store the current version with its isLatestVersion flag and updated task
	// counts: VersionCount is the number of current versions and
	// TotalVersionCount is the total number of versions ever created.
	err = models.CreateCloudbrain(&models.Cloudbrain{
		Status:            TransTrainJobStatus(jobResult.Status),
		UserID:            ctx.User.ID,
		RepoID:            ctx.Repo.Repository.ID,
		JobID:             jobIDStr,
		JobName:           req.JobName,
		JobType:           string(models.JobTypeTrain),
		Type:              models.TypeCloudBrainTwo,
		VersionID:         jobResult.VersionID,
		VersionName:       jobResult.VersionName,
		Uuid:              req.Uuid,
		DatasetName:       attach.Name,
		CommitID:          req.CommitID,
		IsLatestVersion:   req.IsLatestVersion,
		PreVersionName:    req.PreVersionName,
		ComputeResource:   models.NPUResource,
		EngineID:          req.EngineID,
		TrainUrl:          req.TrainUrl,
		BranchName:        req.BranchName,
		Parameters:        req.Params,
		BootFile:          req.BootFile,
		DataUrl:           req.DataUrl,
		LogUrl:            req.LogUrl,
		PreVersionId:      req.PreVersionId,
		FlavorCode:        req.FlavorCode,
		Description:       req.Description,
		WorkServerNumber:  req.WorkServerNumber,
		FlavorName:        req.FlavorName,
		EngineName:        req.EngineName,
		TotalVersionCount: previous.TotalVersionCount + 1,
		VersionCount:      versionListCount + 1,
	})
	if err != nil {
		log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
		return err
	}

	// Set isLatestVersion of the training job's previous version to "0".
	err = models.SetVersionCountAndLatestVersion(jobIDStr, previous.VersionName, VersionCount, NotLatestVersion, TotalVersionCount)
	if err != nil {
		ctx.ServerError("Update IsLatestVersion failed", err)
		return err
	}

	return nil
}

// TransTrainJobStatus converts a numeric train job status code (0-21) into its
// textual name. Codes outside that range are returned as their decimal string.
func TransTrainJobStatus(status int) string {
	switch status {
	case 0:
		return "UNKNOWN"
	case 1:
		return "INIT"
	case 2:
		return "IMAGE_CREATING"
	case 3:
		return "IMAGE_FAILED"
	case 4:
		return "SUBMIT_TRYING"
	case 5:
		return "SUBMIT_FAILED"
	case 6:
		return "DELETE_FAILED"
	case 7:
		return "WAITING"
	case 8:
		return "RUNNING"
	case 9:
		return "KILLING"
	case 10:
		return "COMPLETED"
	case 11:
		return "FAILED"
	case 12:
		return "KILLED"
	case 13:
		return "CANCELED"
	case 14:
		return "LOST"
	case 15:
		return "SCALING"
	case 16:
		return "SUBMIT_MODEL_FAILED"
	case 17:
		return "DEPLOY_SERVICE_FAILED"
	case 18:
		return "CHECK_INIT"
	case 19:
		return "CHECK_RUNNING"
	case 20:
		return "CHECK_RUNNING_COMPLETED"
	case 21:
		return "CHECK_FAILED"
	default:
		return strconv.Itoa(status)
	}
}

// GetOutputPathByCount returns "V" followed by TotalVersionCount zero-padded
// to at least four digits, e.g. 3 -> "V0003".
func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
	return fmt.Sprintf("V%04d", TotalVersionCount)
}

// GenerateInferenceJob creates an inference job through createInferenceJob,
// looks up the dataset attachment for req.Uuid, and stores the job as an
// inference-type Cloudbrain record. It returns the first error encountered.
func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
	jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
		JobName:     req.JobName,
		Description: req.Description,
		InfConfig: models.InfConfig{
			WorkServerNum: req.WorkServerNumber,
			AppUrl:        req.CodeObsPath,
			BootFileUrl:   req.BootFileUrl,
			DataUrl:       req.DataUrl,
			EngineID:      req.EngineID,
			// TrainUrl: req.TrainUrl,
			LogUrl:        req.LogUrl,
			PoolID:        req.PoolID,
			CreateVersion: true,
			Flavor: models.Flavor{
				Code: req.FlavorCode,
			},
			Parameter: req.Parameters,
		},
	})
	if err != nil {
		log.Error("CreateJob failed: %v", err.Error())
		return err
	}

	jobIDStr := strconv.FormatInt(jobResult.JobID, 10)

	attach, err := models.GetAttachmentByUUID(req.Uuid)
	if err != nil {
		log.Error("GetAttachmentByUUID(%s) failed:%v", jobIDStr, err.Error())
		return err
	}

	err = models.CreateCloudbrain(&models.Cloudbrain{
		Status:            TransTrainJobStatus(jobResult.Status),
		UserID:            ctx.User.ID,
		RepoID:            ctx.Repo.Repository.ID,
		JobID:             jobIDStr,
		JobName:           req.JobName,
		JobType:           string(models.JobTypeInference),
		Type:              models.TypeCloudBrainTwo,
		VersionID:         jobResult.VersionID,
		VersionName:       jobResult.VersionName,
		Uuid:              req.Uuid,
		DatasetName:       attach.Name,
		CommitID:          req.CommitID,
		EngineID:          req.EngineID,
		TrainUrl:          req.TrainUrl,
		BranchName:        req.BranchName,
		Parameters:        req.Params,
		BootFile:          req.BootFile,
		DataUrl:           req.DataUrl,
		LogUrl:            req.LogUrl,
		FlavorCode:        req.FlavorCode,
		Description:       req.Description,
		WorkServerNumber:  req.WorkServerNumber,
		FlavorName:        req.FlavorName,
		EngineName:        req.EngineName,
		LabelName:         req.LabelName,
		IsLatestVersion:   req.IsLatestVersion,
		VersionCount:      req.VersionCount,
		TotalVersionCount: req.TotalVersionCount,
		ModelName:         req.ModelName,
		ModelVersion:      req.ModelVersion,
		CkptName:          req.CkptName,
		ResultUrl:         req.ResultUrl,
	})
	if err != nil {
		log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
		return err
	}

	return nil
}