|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219 |
- package modelarts
-
- import (
- "encoding/json"
- "errors"
- "fmt"
- "strconv"
- "strings"
-
- "code.gitea.io/gitea/modules/cloudbrain"
-
- "code.gitea.io/gitea/modules/modelarts_cd"
-
- "code.gitea.io/gitea/models"
- "code.gitea.io/gitea/modules/context"
- "code.gitea.io/gitea/modules/log"
- "code.gitea.io/gitea/modules/notification"
- "code.gitea.io/gitea/modules/setting"
- "code.gitea.io/gitea/modules/timeutil"
- )
-
// Defaults for notebook (debug) tasks.
const (
	storageTypeOBS = "obs"

	// autoStopDuration is a four-hour span; presumably seconds, given the
	// millisecond variant below.
	autoStopDuration = 4 * 60 * 60
	// AutoStopDurationMs is the same four-hour span in milliseconds.
	AutoStopDurationMs = autoStopDuration * 1000
)

// Path segments used below a job's storage root.
const (
	CodePath   = "/code/"
	OutputPath = "/output/"
	ResultPath = "/result/"
	LogPath    = "/log/"
	JobPath    = "/job/"
)

// Log paging options.
const (
	OrderDesc = "desc" // query downward
	OrderAsc  = "asc"  // query upward
	Lines     = 500
)

// Well-known run parameter labels and values.
const (
	TrainUrl     = "train_url"
	DataUrl      = "data_url"
	MultiDataUrl = "multi_data_url"
	ResultUrl    = "result_url"
	CkptUrl      = "ckpt_url"
	DeviceTarget = "device_target"
	Ascend       = "Ascend"
)

// Listing and version bookkeeping.
const (
	PerPage          = 10
	IsLatestVersion  = "1"
	NotLatestVersion = "0"
	VersionCountOne  = 1

	SortByCreateTime  = "create_time"
	ConfigTypeCustom  = "custom"
	TotalVersionCount = 1
)
-
- var (
- poolInfos *models.PoolInfos
- TrainFlavorInfos *Flavor
- SpecialPools *models.SpecialPools
- MultiNodeConfig *MultiNodes
- )
-
- type GenerateTrainJobReq struct {
- JobName string
- DisplayJobName string
- Uuid string
- Description string
- CodeObsPath string
- BootFile string
- BootFileUrl string
- DataUrl string
- TrainUrl string
- LogUrl string
- PoolID string
- WorkServerNumber int
- EngineID int64
- Parameters []models.Parameter
- CommitID string
- IsLatestVersion string
- Params string
- BranchName string
- PreVersionId int64
- PreVersionName string
- FlavorCode string
- FlavorName string
- VersionCount int
- EngineName string
- TotalVersionCount int
- UserImageUrl string
- UserCommand string
- DatasetName string
- Spec *models.Specification
- ModelName string
- LabelName string
- CkptName string
- ModelVersion string
- PreTrainModelUrl string
- }
-
- type GenerateInferenceJobReq struct {
- JobName string
- DisplayJobName string
- Uuid string
- Description string
- CodeObsPath string
- BootFile string
- BootFileUrl string
- DataUrl string
- TrainUrl string
- LogUrl string
- PoolID string
- WorkServerNumber int
- EngineID int64
- Parameters []models.Parameter
- CommitID string
- Params string
- BranchName string
- FlavorName string
- EngineName string
- LabelName string
- IsLatestVersion string
- VersionCount int
- TotalVersionCount int
- ModelName string
- ModelVersion string
- CkptName string
- ResultUrl string
- Spec *models.Specification
- DatasetName string
- JobType string
- UserImageUrl string
- UserCommand string
- }
-
// VersionInfo is the JSON shape of a version list: an object whose "version"
// key holds entries with an id, a display value and a url.
type VersionInfo struct {
	Version []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
		Url   string `json:"url"`
	} `json:"version"`
}
-
// Flavor is the JSON shape of a flavor list: an object whose "flavor" key
// holds entries with a code, a display value and a unit price.
type Flavor struct {
	Info []struct {
		Code      string `json:"code"`
		Value     string `json:"value"`
		UnitPrice int64  `json:"unitPrice"`
	} `json:"flavor"`
}
-
// Engine is the JSON shape of an engine list: an object whose "engine" key
// holds entries with an id and a display value.
type Engine struct {
	Info []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}
-
// ResourcePool is the JSON shape of a resource pool list: an object whose
// "resource_pool" key holds entries with an id and a display value.
type ResourcePool struct {
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
-
// MultiNodes is the JSON shape of the multi-node configuration: an object
// whose "multinode" key holds one entry per organization.
type MultiNodes struct {
	Info []OrgMultiNode `json:"multinode"`
}

// OrgMultiNode pairs an organization name with a list of node counts.
type OrgMultiNode struct {
	Org  string `json:"org"`
	Node []int  `json:"node"`
}
-
// Parameters is the JSON shape of a run-parameter list: an object whose
// "parameter" key holds label/value pairs.
type Parameters struct {
	Parameter []struct {
		Label string `json:"label"`
		Value string `json:"value"`
	} `json:"parameter"`
}
-
- func GenerateNotebook2(ctx *context.Context, req cloudbrain.GenerateModelArtsNotebookReq) (string, error) {
- if poolInfos == nil {
- json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
- }
-
- imageName, err := GetNotebookImageName(req.ImageId)
- if err != nil {
- log.Error("GetNotebookImageName failed: %v", err.Error())
- return "", err
- }
- createTime := timeutil.TimeStampNow()
- jobResult, err := createNotebook2(models.CreateNotebook2Params{
- JobName: req.JobName,
- Description: req.Description,
- Flavor: req.Spec.SourceSpecId,
- Duration: req.AutoStopDurationMs,
- ImageID: req.ImageId,
- PoolID: poolInfos.PoolInfo[0].PoolId,
- Feature: models.NotebookFeature,
- Volume: models.VolumeReq{
- Capacity: setting.Capacity,
- Category: models.EVSCategory,
- Ownership: models.ManagedOwnership,
- },
- WorkspaceID: "0",
- })
- if err != nil {
- log.Error("createNotebook2 failed: %v", err.Error())
- if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
- log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
- errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
- JobID: models.TempJobId,
- VersionID: models.TempVersionId,
- Status: models.TempJobStatus,
- Type: models.TypeCloudBrainTwo,
- JobName: req.JobName,
- JobType: string(models.JobTypeDebug),
- })
- if errTemp != nil {
- log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
- return "", errTemp
- }
- }
- return "", err
- }
- task := &models.Cloudbrain{
- Status: jobResult.Status,
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: jobResult.ID,
- JobName: req.JobName,
- FlavorCode: req.Spec.SourceSpecId,
- DisplayJobName: req.DisplayJobName,
- JobType: string(models.JobTypeDebug),
- Type: models.TypeCloudBrainTwo,
- Uuid: req.Uuid,
- ComputeResource: models.NPUResource,
- Image: imageName,
- BootFile: req.BootFile,
- BranchName: req.BranchName,
- Description: req.Description,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- Spec: req.Spec,
- ModelName: req.ModelName,
- ModelVersion: req.ModelVersion,
- LabelName: req.LabelName,
- PreTrainModelUrl: req.PreTrainModelUrl,
- CkptName: req.CkptName,
- }
-
- err = models.CreateCloudbrain(task)
- if err != nil {
- return "", err
- }
-
- stringId := strconv.FormatInt(task.ID, 10)
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugNPUTask)
- return jobResult.ID, nil
- }
-
- func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
- createTime := timeutil.TimeStampNow()
- var jobResult *models.CreateTrainJobResult
- var createErr error
- if req.EngineID < 0 {
- jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
- JobName: req.JobName,
- Description: req.Description,
- Config: models.UserImageConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.Spec.SourceSpecId,
- },
- Parameter: req.Parameters,
- UserImageUrl: req.UserImageUrl,
- UserCommand: req.UserCommand,
- ShareAddr: setting.ModelArtsShareAddr,
- MountPath: setting.ModelArtsMountPath,
- NasType: setting.ModelArtsNasType,
- },
- })
- } else {
- jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
- JobName: req.JobName,
- Description: req.Description,
- Config: models.Config{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.Spec.SourceSpecId,
- },
- Parameter: req.Parameters,
- ShareAddr: setting.ModelArtsShareAddr,
- MountPath: setting.ModelArtsMountPath,
- NasType: setting.ModelArtsNasType,
- },
- })
- }
- if createErr != nil {
- log.Error("createTrainJob failed: %v", createErr.Error())
- if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
- log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
- errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
- JobID: models.TempJobId,
- VersionID: models.TempVersionId,
- Status: models.TempJobStatus,
- Type: models.TypeCloudBrainTwo,
- JobName: req.JobName,
- JobType: string(models.JobTypeTrain),
- })
- if errTemp != nil {
- log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
- return "", errTemp
- }
- }
- return "", createErr
- }
- jobID := strconv.FormatInt(jobResult.JobID, 10)
- createErr = models.CreateCloudbrain(&models.Cloudbrain{
- Status: TransTrainJobStatus(jobResult.Status),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: jobID,
- JobName: req.JobName,
- DisplayJobName: req.DisplayJobName,
- JobType: string(models.JobTypeTrain),
- Type: models.TypeCloudBrainTwo,
- VersionID: jobResult.VersionID,
- VersionName: jobResult.VersionName,
- Uuid: req.Uuid,
- DatasetName: req.DatasetName,
- CommitID: req.CommitID,
- IsLatestVersion: req.IsLatestVersion,
- ComputeResource: models.NPUResource,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- BranchName: req.BranchName,
- Parameters: req.Params,
- BootFile: req.BootFile,
- DataUrl: req.DataUrl,
- LogUrl: req.LogUrl,
- FlavorCode: req.Spec.SourceSpecId,
- Description: req.Description,
- WorkServerNumber: req.WorkServerNumber,
- FlavorName: req.FlavorName,
- EngineName: req.EngineName,
- VersionCount: req.VersionCount,
- TotalVersionCount: req.TotalVersionCount,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- Spec: req.Spec,
- ModelName: req.ModelName,
- ModelVersion: req.ModelVersion,
- LabelName: req.LabelName,
- PreTrainModelUrl: req.PreTrainModelUrl,
- CkptName: req.CkptName,
- })
-
- if createErr != nil {
- log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
- return "", createErr
- }
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateTrainTask)
- return jobID, nil
- }
-
- func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
-
- return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
- JobName: req.JobName,
- Description: req.Description,
- Config: models.UserImageConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.FlavorCode,
- },
- Parameter: req.Parameters,
- UserImageUrl: req.UserImageUrl,
- UserCommand: req.UserCommand,
- },
- })
- }
-
- func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
- createTime := timeutil.TimeStampNow()
- var jobResult *models.CreateTrainJobResult
- var createErr error
-
- if req.EngineID < 0 {
- jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
- Description: req.Description,
- Config: models.TrainJobVersionUserImageConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- Flavor: models.Flavor{
- Code: req.Spec.SourceSpecId,
- },
- Parameter: req.Parameters,
- PreVersionId: req.PreVersionId,
- UserImageUrl: req.UserImageUrl,
- UserCommand: req.UserCommand,
- ShareAddr: setting.ModelArtsShareAddr,
- MountPath: setting.ModelArtsMountPath,
- NasType: setting.ModelArtsNasType,
- },
- }, jobId)
- } else {
- jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
- Description: req.Description,
- Config: models.TrainJobVersionConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- Flavor: models.Flavor{
- Code: req.Spec.SourceSpecId,
- },
- Parameter: req.Parameters,
- PreVersionId: req.PreVersionId,
- ShareAddr: setting.ModelArtsShareAddr,
- MountPath: setting.ModelArtsMountPath,
- NasType: setting.ModelArtsNasType,
- },
- }, jobId)
- }
- if createErr != nil {
- log.Error("createTrainJobVersion failed: %v", createErr.Error())
- if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
- log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
- errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
- JobID: jobId,
- VersionID: models.TempVersionId,
- Status: models.TempJobStatus,
- Type: models.TypeCloudBrainTwo,
- JobName: req.JobName,
- JobType: string(models.JobTypeTrain),
- })
- if errTemp != nil {
- log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
- return errTemp
- }
- }
- return createErr
- }
-
- var jobTypes []string
- jobTypes = append(jobTypes, string(models.JobTypeTrain))
- repo := ctx.Repo.Repository
- VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
- RepoID: repo.ID,
- Type: models.TypeCloudBrainTwo,
- JobTypes: jobTypes,
- JobID: strconv.FormatInt(jobResult.JobID, 10),
- })
- if createErr != nil {
- ctx.ServerError("Cloudbrain", createErr)
- return createErr
- }
- //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
-
- createErr = models.CreateCloudbrain(&models.Cloudbrain{
- Status: TransTrainJobStatus(jobResult.Status),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: strconv.FormatInt(jobResult.JobID, 10),
- JobName: req.JobName,
- DisplayJobName: req.DisplayJobName,
- JobType: string(models.JobTypeTrain),
- Type: models.TypeCloudBrainTwo,
- VersionID: jobResult.VersionID,
- VersionName: jobResult.VersionName,
- Uuid: req.Uuid,
- DatasetName: req.DatasetName,
- CommitID: req.CommitID,
- IsLatestVersion: req.IsLatestVersion,
- PreVersionName: req.PreVersionName,
- ComputeResource: models.NPUResource,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- BranchName: req.BranchName,
- Parameters: req.Params,
- BootFile: req.BootFile,
- DataUrl: req.DataUrl,
- LogUrl: req.LogUrl,
- PreVersionId: req.PreVersionId,
- FlavorCode: req.Spec.SourceSpecId,
- Description: req.Description,
- WorkServerNumber: req.WorkServerNumber,
- FlavorName: req.FlavorName,
- EngineName: req.EngineName,
- TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
- VersionCount: VersionListCount + 1,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- Spec: req.Spec,
- ModelName: req.ModelName,
- ModelVersion: req.ModelVersion,
- LabelName: req.LabelName,
- PreTrainModelUrl: req.PreTrainModelUrl,
- CkptName: req.CkptName,
- })
- if createErr != nil {
- log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
- return createErr
- }
-
- //将训练任务的上一版本的isLatestVersion设置为"0"
- createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
- if createErr != nil {
- ctx.ServerError("Update IsLatestVersion failed", createErr)
- return createErr
- }
-
- return createErr
- }
-
// trainJobStatusNames maps a numeric train job status code (the index) to its
// name.
var trainJobStatusNames = [...]string{
	0:  "UNKNOWN",
	1:  "INIT",
	2:  "IMAGE_CREATING",
	3:  "IMAGE_FAILED",
	4:  "SUBMIT_TRYING",
	5:  "SUBMIT_FAILED",
	6:  "DELETE_FAILED",
	7:  "WAITING",
	8:  "RUNNING",
	9:  "KILLING",
	10: "COMPLETED",
	11: "FAILED",
	12: "KILLED",
	13: "CANCELED",
	14: "LOST",
	15: "SCALING",
	16: "SUBMIT_MODEL_FAILED",
	17: "DEPLOY_SERVICE_FAILED",
	18: "CHECK_INIT",
	19: "CHECK_RUNNING",
	20: "CHECK_RUNNING_COMPLETED",
	21: "CHECK_FAILED",
}

// TransTrainJobStatus translates a numeric train job status code into its
// name. Codes outside the known range 0-21 are returned as their decimal
// representation.
func TransTrainJobStatus(status int) string {
	if status < 0 || status >= len(trainJobStatusNames) {
		return strconv.Itoa(status)
	}
	return trainJobStatusNames[status]
}
-
// GetOutputPathByCount builds the output directory name for a version: the
// letter "V" followed by the count zero-padded to at least four characters,
// e.g. 1 -> "V0001".
func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
	return fmt.Sprintf("V%04d", TotalVersionCount)
}
-
- func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (jobId string, err error) {
- createTime := timeutil.TimeStampNow()
- var jobResult *models.CreateTrainJobResult
- var createErr error
- if req.EngineID < 0 {
- jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
- JobName: req.JobName,
- Description: req.Description,
- Config: models.InfUserImageConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- // TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.Spec.SourceSpecId,
- },
- Parameter: req.Parameters,
- UserImageUrl: req.UserImageUrl,
- UserCommand: req.UserCommand,
- },
- })
- } else {
- jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
- JobName: req.JobName,
- Description: req.Description,
- InfConfig: models.InfConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- EngineID: req.EngineID,
- // TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.Spec.SourceSpecId,
- },
- Parameter: req.Parameters,
- },
- })
- }
- if createErr != nil {
- log.Error("createInferenceJob failed: %v", err.Error())
- if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
- log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
- err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
- JobID: models.TempJobId,
- VersionID: models.TempVersionId,
- Status: models.TempJobStatus,
- Type: models.TypeCloudBrainTwo,
- JobName: req.JobName,
- JobType: req.JobType,
- })
- if err != nil {
- log.Error("InsertCloudbrainTemp failed: %v", err.Error())
- return "", err
- }
- }
- return "", err
- }
-
- // attach, err := models.GetAttachmentByUUID(req.Uuid)
- // if err != nil {
- // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
- // return err
- // }
- jobID := strconv.FormatInt(jobResult.JobID, 10)
- err = models.CreateCloudbrain(&models.Cloudbrain{
- Status: TransTrainJobStatus(jobResult.Status),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: jobID,
- JobName: req.JobName,
- DisplayJobName: req.DisplayJobName,
- JobType: req.JobType,
- Type: models.TypeCloudBrainTwo,
- VersionID: jobResult.VersionID,
- VersionName: jobResult.VersionName,
- Uuid: req.Uuid,
- DatasetName: req.DatasetName,
- CommitID: req.CommitID,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- BranchName: req.BranchName,
- Parameters: req.Params,
- BootFile: req.BootFile,
- DataUrl: req.DataUrl,
- LogUrl: req.LogUrl,
- FlavorCode: req.Spec.SourceSpecId,
- Description: req.Description,
- WorkServerNumber: req.WorkServerNumber,
- FlavorName: req.FlavorName,
- EngineName: req.EngineName,
- LabelName: req.LabelName,
- IsLatestVersion: req.IsLatestVersion,
- ComputeResource: models.NPUResource,
- VersionCount: req.VersionCount,
- TotalVersionCount: req.TotalVersionCount,
- ModelName: req.ModelName,
- ModelVersion: req.ModelVersion,
- CkptName: req.CkptName,
- ResultUrl: req.ResultUrl,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- Spec: req.Spec,
- })
-
- if err != nil {
- log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
- return "", err
- }
- if req.JobType == string(models.JobTypeModelSafety) {
- task, err := models.GetCloudbrainByJobID(jobID)
- if err == nil {
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
- }
- } else {
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
- }
-
- return jobID, nil
- }
-
- func GetNotebookImageName(imageId string) (string, error) {
- var validImage = false
- var imageName = ""
-
- for _, imageInfo := range setting.StImageInfos.ImageInfo {
- if imageInfo.Id == imageId {
- validImage = true
- imageName = imageInfo.Value
- }
- }
-
- if !validImage {
- log.Error("the image id(%s) is invalid", imageId)
- return imageName, errors.New("the image id is invalid")
- }
-
- return imageName, nil
- }
-
- func InitSpecialPool() {
- if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
- json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
- }
- }
-
- func InitMultiNode() {
- if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" {
- json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
- }
-
- }
-
- func HandleTrainJobInfo(task *models.Cloudbrain) error {
-
- result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
- if err != nil {
- log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
- return err
- }
-
- if result != nil {
- oldStatus := task.Status
- task.Status = TransTrainJobStatus(result.IntStatus)
- task.Duration = result.Duration / 1000
- task.TrainJobDuration = result.TrainJobDuration
-
- if task.StartTime == 0 && result.StartTime > 0 {
- task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
- }
- task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
- if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
- task.EndTime = task.StartTime.Add(task.Duration)
- }
- task.CorrectCreateUnix()
- if oldStatus != task.Status {
- notification.NotifyChangeCloudbrainStatus(task, oldStatus)
- }
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
- return err
- }
- }
-
- return nil
- }
-
- func HandleNotebookInfo(task *models.Cloudbrain) error {
- var result *models.GetNotebook2Result
- var err error
- if task.Type == models.TypeCloudBrainTwo {
- result, err = GetNotebook2(task.JobID)
- } else if task.Type == models.TypeCDCenter {
- result, err = modelarts_cd.GetNotebook(task.JobID)
- }
- if err != nil {
- log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
- return err
- }
-
- if result != nil {
- oldStatus := task.Status
- task.Status = result.Status
- if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
- task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
- }
- if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
- task.EndTime = timeutil.TimeStampNow()
- }
- task.CorrectCreateUnix()
- task.ComputeAndSetDuration()
- if oldStatus != task.Status {
- notification.NotifyChangeCloudbrainStatus(task, oldStatus)
- }
- if task.FlavorCode == "" {
- task.FlavorCode = result.Flavor
- }
-
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
- return err
- }
- }
-
- return nil
- }
-
- func SyncTempStatusJob() {
- jobs, err := models.GetCloudBrainTempJobs()
- if err != nil {
- log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
- return
- }
-
- for _, temp := range jobs {
- log.Info("start to handle record: %s", temp.JobName)
- if temp.Type == models.TypeCloudBrainTwo {
- if temp.JobType == string(models.JobTypeDebug) {
- err = handleNotebook(temp)
- if err != nil {
- log.Error("handleNotebook falied:%v", err)
- break
- }
- } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
- _, err = models.GetCloudbrainByJobID(temp.JobID)
- if err != nil {
- //one version
- err = handleTrainJob(temp)
- if err != nil {
- log.Error("handleTrainJob falied:%v", err)
- break
- }
- } else {
- //multi version
- err = handleTrainJobMultiVersion(temp)
- if err != nil {
- log.Error("handleTrainJobMultiVersion falied:%v", err)
- break
- }
- }
- }
- }
- }
-
- return
- }
-
- func handleNotebook(temp *models.CloudbrainTemp) error {
- if temp.Status == models.TempJobStatus {
- err := handleTempNotebook(temp)
- if err != nil {
- log.Error("handleTempNotebook failed:%v", err)
- return err
- }
- } else if temp.Status == string(models.ModelArtsStopping) {
- res, err := GetNotebook2(temp.JobID)
- if err != nil {
- log.Error("GetNotebook2 failed:%v", err)
- return err
- }
-
- temp.Status = res.Status
- if temp.Status == string(models.ModelArtsStopped) {
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp failed:%v", err)
- return err
- }
-
- _, err := DelNotebook2(temp.JobID)
- if err != nil {
- log.Error("DelNotebook2 failed:%v", err)
- return err
- }
-
- temp.Status = string(models.ModelArtsDeleted)
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp failed:%v", err)
- return err
- }
- }
- }
-
- return nil
- }
-
- func handleTempNotebook(temp *models.CloudbrainTemp) error {
- var err error
- var isExist bool
-
- for {
- result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
- if err != nil {
- log.Error("GetNotebookList failed:%v", err)
- break
- }
-
- temp.QueryTimes++
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp failed:%v", err)
- }
-
- if result != nil {
- for _, notebook := range result.NotebookList {
- if temp.JobID == models.TempJobId {
- //new notebook
- if notebook.JobName == temp.JobName {
- isExist = true
- temp.Status = notebook.Status
- temp.JobID = notebook.JobID
- break
- }
- } else {
- //restart: always can find one record
- if notebook.JobName == temp.JobName {
- if notebook.Status != string(models.ModelArtsStopped) {
- isExist = true
- temp.Status = notebook.Status
- temp.JobID = notebook.JobID
- break
- }
- }
- }
- }
-
- if isExist {
- log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
- if temp.Status == string(models.ModelArtsCreateFailed) {
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp failed:%v", err)
- break
- }
-
- _, err := DelNotebook2(temp.JobID)
- if err != nil {
- log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
- break
- }
-
- temp.Status = string(models.ModelArtsDeleted)
- } else {
- _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
- if err != nil {
- log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
- break
- }
- temp.Status = string(models.ModelArtsStopping)
- }
-
- models.UpdateCloudbrainTemp(temp)
- } else {
- log.Error("can not find the record(%s) till now", temp.JobName)
- err = errors.New("not found")
- break
- }
- } else {
- log.Error("can not find the record(%s) till now", temp.JobName)
- err = errors.New("not found")
- break
- }
-
- break
- }
-
- if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
- log.Info("reach MaxTempQueryTimes, set the job failed")
-
- temp.Status = string(models.ModelArtsTrainJobFailed)
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
- return err
- }
- }
-
- return err
- }
-
- func handleTrainJob(temp *models.CloudbrainTemp) error {
- if temp.Status == models.TempJobStatus {
- err := handleTempTrainJob(temp)
- if err != nil {
- log.Error("handleTempTrainJob failed:%v", err)
- return err
- }
- } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
- res, err := GetTrainJob(temp.JobID, temp.VersionID)
- if err != nil {
- log.Error("GetTrainJob failed:%v", err)
- return err
- }
-
- temp.Status = TransTrainJobStatus(res.IntStatus)
- if temp.Status == string(models.ModelArtsTrainJobKilled) {
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp failed:%v", err)
- return err
- }
-
- _, err := DelTrainJob(temp.JobID)
- if err != nil {
- log.Error("DelTrainJob failed:%v", err)
- return err
- }
-
- temp.Status = string(models.ModelArtsDeleted)
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp failed:%v", err)
- return err
- }
- }
- }
-
- return nil
- }
-
- func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
- if temp.Status == models.TempJobStatus {
- err := handleTempTrainJobMultiVersion(temp)
- if err != nil {
- log.Error("handleTempTrainJobMultiVersion failed:%v", err)
- return err
- }
- } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
- res, err := GetTrainJob(temp.JobID, temp.VersionID)
- if err != nil {
- log.Error("GetTrainJob failed:%v", err)
- return err
- }
-
- temp.Status = TransTrainJobStatus(res.IntStatus)
- if temp.Status == string(models.ModelArtsTrainJobKilled) {
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp failed:%v", err)
- return err
- }
-
- _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
- if err != nil {
- log.Error("DelTrainJob failed:%v", err)
- return err
- }
-
- temp.Status = string(models.ModelArtsDeleted)
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp failed:%v", err)
- return err
- }
- }
-
- }
-
- return nil
- }
-
- func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
- var err error
- var isExist bool
-
- for {
- result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
- if err != nil {
- log.Error("GetTrainJobVersionList failed:%v", err)
- break
- }
-
- temp.QueryTimes++
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp failed:%v", err)
- }
-
- if result != nil {
- count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
- if result.VersionCount == int64(count+1) {
- isExist = true
- temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
- temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
-
- log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
-
- _, err := StopTrainJob(temp.JobID, temp.VersionID)
- if err != nil {
- log.Error("StopTrainJob failed:%v", err)
- break
- }
- temp.Status = string(models.ModelArtsTrainJobKilling)
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
- break
- }
- } else {
- log.Error("can not find the record(%s) till now", temp.JobName)
- err = errors.New("not found")
- break
- }
- }
-
- break
- }
-
- if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
- log.Info("reach MaxTempQueryTimes, set the job failed")
-
- temp.Status = string(models.ModelArtsTrainJobFailed)
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
- return err
- }
- }
-
- return err
- }
-
- func handleTempTrainJob(temp *models.CloudbrainTemp) error {
- var err error
- var isExist bool
-
- for {
- result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
- if err != nil {
- log.Error("GetTrainJobList failed:%v", err)
- break
- }
-
- temp.QueryTimes++
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp failed:%v", err)
- }
-
- if result != nil {
- for _, job := range result.JobList {
- if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
- isExist = true
- temp.Status = TransTrainJobStatus(job.IntStatus)
- temp.JobID = strconv.FormatInt(job.JobID, 10)
- temp.VersionID = strconv.FormatInt(job.VersionID, 10)
-
- log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
-
- _, err = StopTrainJob(temp.JobID, temp.VersionID)
- if err != nil {
- log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
- break
- }
-
- temp.Status = string(models.ModelArtsTrainJobKilling)
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
- break
- }
- }
- }
-
- if !isExist {
- log.Error("can not find the record(%s) till now", temp.JobName)
- err = errors.New("not found")
- break
- }
- }
-
- break
- }
-
- if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
- log.Info("reach MaxTempQueryTimes, set the job failed")
-
- temp.Status = string(models.ModelArtsTrainJobFailed)
- err = models.UpdateCloudbrainTemp(temp)
- if err != nil {
- log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
- return err
- }
- }
-
- return err
- }
|