diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 91eaddd09..dc56efef7 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -25,7 +25,8 @@ type ModelArtsJobStatus string const ( TypeCloudBrainOne int = iota TypeCloudBrainTwo - TypeC2Net //智算网络 + TypeC2Net //智算网络 + TypeCDCenter //成都智算中心 TypeCloudBrainAll = -1 ) @@ -120,6 +121,11 @@ const ( //AI center AICenterOfCloudBrainOne = "OpenIOne" AICenterOfCloudBrainTwo = "OpenITwo" + AICenterOfChengdu = "OpenIChengdu" + + //ComputeResource + GPU = "GPU" + NPU = "NPU" ) type Cloudbrain struct { @@ -190,6 +196,7 @@ type Cloudbrain struct { BenchmarkTypeRankLink string `xorm:"-"` StartTime timeutil.TimeStamp EndTime timeutil.TimeStamp + Spec *Specification `xorm:"-"` } func (task *Cloudbrain) ComputeAndSetDuration() { @@ -596,37 +603,17 @@ type ResourceSpec struct { ShareMemMiB int `json:"shareMemMiB"` } -type FlavorInfos struct { - FlavorInfo []*FlavorInfo `json:"flavor_info"` -} - -type FlavorInfo struct { - Id int `json:"id"` - Value string `json:"value"` - Desc string `json:"desc"` -} - type SpecialPools struct { Pools []*SpecialPool `json:"pools"` } type SpecialPool struct { - Org string `json:"org"` - Type string `json:"type"` - IsExclusive bool `json:"isExclusive"` - Pool []*GpuInfo `json:"pool"` - JobType []string `json:"jobType"` - ResourceSpec []*ResourceSpec `json:"resourceSpecs"` - Flavor []*FlavorInfo `json:"flavor"` -} - -type ImageInfosModelArts struct { - ImageInfo []*ImageInfoModelArts `json:"image_info"` -} - -type ImageInfoModelArts struct { - Id string `json:"id"` - Value string `json:"value"` - Desc string `json:"desc"` + Org string `json:"org"` + Type string `json:"type"` + IsExclusive bool `json:"isExclusive"` + Pool []*GpuInfo `json:"pool"` + JobType []string `json:"jobType"` + ResourceSpec []*ResourceSpec `json:"resourceSpecs"` + Flavor []*setting.FlavorInfo `json:"flavor"` } type PoolInfos struct { @@ -732,6 +719,17 @@ type CreateNotebook2Params struct { Volume VolumeReq `json:"volume"` } +type CreateNotebookWithoutPoolParams struct { + JobName string `json:"name"` + Description string `json:"description"` + Duration int64 `json:"duration"` //ms + Feature string `json:"feature"` + Flavor string `json:"flavor"` + ImageID string `json:"image_id"` + WorkspaceID string `json:"workspace_id"` + Volume VolumeReq `json:"volume"` +} + type VolumeReq struct { Capacity int `json:"capacity"` Category string `json:"category"` @@ -955,6 +953,7 @@ type NotebookGetJobTokenResult struct { } type NotebookDelResult struct { + NotebookResult InstanceID string `json:"instance_id"` } @@ -1481,12 +1480,6 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } - if len(opts.ComputeResource) > 0 { - cond = cond.And( - builder.Eq{"cloudbrain.compute_resource": opts.ComputeResource}, - ) - } - if len(opts.JobTypes) > 0 { if opts.JobTypeNot { cond = cond.And( @@ -1506,7 +1499,7 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { if (opts.Cluster) != "" { if opts.Cluster == "resource_cluster_openi" { cond = cond.And( - builder.Or(builder.Eq{"cloudbrain.type": TypeCloudBrainOne}, builder.Eq{"cloudbrain.type": TypeCloudBrainTwo}), + builder.Or(builder.Eq{"cloudbrain.type": TypeCloudBrainOne}, builder.Eq{"cloudbrain.type": TypeCloudBrainTwo}, builder.Eq{"cloudbrain.type": TypeCDCenter}), ) } if opts.Cluster == "resource_cluster_c2net" { @@ -1720,11 +1713,24 @@ func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, e } func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) { + session := x.NewSession() + defer session.Close() + + err = session.Begin() cloudbrain.TrainJobDuration = DURATION_STR_ZERO - if _, err = x.NoAutoTime().Insert(cloudbrain); err != nil { + if _, err = session.NoAutoTime().InsertOne(cloudbrain); err != nil { + session.Rollback() return err } + if cloudbrain.Spec != nil { + if _, err = session.Insert(NewCloudBrainSpec(cloudbrain.ID, *cloudbrain.Spec)); err != nil { + session.Rollback() + return err + } + } + session.Commit() + go IncreaseDatasetUseCount(cloudbrain.Uuid) return nil } @@ -1959,7 +1965,7 @@ func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTy func GetCloudbrainNotebookCountByUserID(userID int64) (int, error) { count, err := x.In("status", ModelArtsCreateQueue, ModelArtsCreating, ModelArtsStarting, ModelArtsReadyToStart, ModelArtsResizing, ModelArtsStartQueuing, ModelArtsRunning, ModelArtsRestarting). - And("job_type = ? and user_id = ? and type = ?", JobTypeDebug, userID, TypeCloudBrainTwo).Count(new(Cloudbrain)) + And("job_type = ? and user_id = ? and type in (?,?)", JobTypeDebug, userID, TypeCloudBrainTwo, TypeCDCenter).Count(new(Cloudbrain)) return int(count), err } @@ -2003,11 +2009,18 @@ func RestartCloudbrain(old *Cloudbrain, new *Cloudbrain) (err error) { return err } - if _, err = sess.NoAutoTime().Insert(new); err != nil { + if _, err = sess.NoAutoTime().InsertOne(new); err != nil { sess.Rollback() return err } + if new.Spec != nil { + if _, err = sess.Insert(NewCloudBrainSpec(new.ID, *new.Spec)); err != nil { + sess.Rollback() + return err + } + } + if err = sess.Commit(); err != nil { return err } @@ -2399,7 +2412,57 @@ func GetCloudbrainByIDs(ids []int64) ([]*Cloudbrain, error) { Find(&cloudbrains) } +func GetCloudbrainWithDeletedByIDs(ids []int64) ([]*Cloudbrain, error) { + cloudbrains := make([]*Cloudbrain, 0) + return cloudbrains, x. + In("id", ids).Unscoped().Find(&cloudbrains) +} + func GetCloudbrainCountByJobName(jobName, jobType string, typeCloudbrain int) (int, error) { count, err := x.Where("job_name = ? and job_type= ? and type = ?", jobName, jobType, typeCloudbrain).Count(new(Cloudbrain)) return int(count), err } + +func LoadSpecs(tasks []*Cloudbrain) error { + cloudbrainIds := make([]int64, len(tasks)) + for i, v := range tasks { + cloudbrainIds[i] = v.ID + } + specs := make([]*CloudbrainSpec, 0) + err := x.In("cloudbrain_id", cloudbrainIds).Find(&specs) + if err != nil { + return err + } + specMap := make(map[int64]*CloudbrainSpec) + for _, v := range specs { + specMap[v.SpecId] = v + } + for _, v := range tasks { + if specMap[v.ID] != nil { + v.Spec = specMap[v.ID].ConvertToSpecification() + } + } + return nil +} + +func LoadSpecs4CloudbrainInfo(tasks []*CloudbrainInfo) error { + cloudbrainIds := make([]int64, len(tasks)) + for i, v := range tasks { + cloudbrainIds[i] = v.Cloudbrain.ID + } + specs := make([]*CloudbrainSpec, 0) + err := x.In("cloudbrain_id", cloudbrainIds).Find(&specs) + if err != nil { + return err + } + specMap := make(map[int64]*CloudbrainSpec) + for _, v := range specs { + specMap[v.CloudbrainID] = v + } + for _, v := range tasks { + if specMap[v.Cloudbrain.ID] != nil { + v.Cloudbrain.Spec = specMap[v.Cloudbrain.ID].ConvertToSpecification() + } + } + return nil +} diff --git a/models/cloudbrain_spec.go b/models/cloudbrain_spec.go new file mode 100644 index 000000000..8aa652b17 --- /dev/null +++ b/models/cloudbrain_spec.go @@ -0,0 +1,109 @@ +package models + +import ( + "code.gitea.io/gitea/modules/timeutil" +) + +type CloudbrainSpec struct { + CloudbrainID int64 `xorm:"pk"` + SpecId int64 `xorm:"index"` + SourceSpecId string + AccCardsNum int + AccCardType string + CpuCores int + MemGiB float32 + GPUMemGiB float32 + ShareMemGiB float32 + ComputeResource string + UnitPrice int + QueueId int64 + QueueCode string + Cluster string + AiCenterCode string + AiCenterName string + IsExclusive bool + ExclusiveOrg string + CreatedTime timeutil.TimeStamp `xorm:"created"` + UpdatedTime timeutil.TimeStamp `xorm:"updated"` +} + +func (s CloudbrainSpec) ConvertToSpecification() *Specification { + return &Specification{ + ID: s.SpecId, + SourceSpecId: s.SourceSpecId, + AccCardsNum: s.AccCardsNum, + AccCardType: s.AccCardType, + CpuCores: s.CpuCores, + MemGiB: s.MemGiB, + GPUMemGiB: s.GPUMemGiB, + ShareMemGiB: s.ShareMemGiB, + ComputeResource: s.ComputeResource, + UnitPrice: s.UnitPrice, + QueueId: s.QueueId, + QueueCode: s.QueueCode, + Cluster: s.Cluster, + AiCenterCode: s.AiCenterCode, + AiCenterName: s.AiCenterName, + IsExclusive: s.IsExclusive, + ExclusiveOrg: s.ExclusiveOrg, + } +} + +func NewCloudBrainSpec(cloudbrainId int64, s Specification) CloudbrainSpec { + return CloudbrainSpec{ + CloudbrainID: cloudbrainId, + SpecId: s.ID, + SourceSpecId: s.SourceSpecId, + AccCardsNum: s.AccCardsNum, + AccCardType: s.AccCardType, + CpuCores: s.CpuCores, + MemGiB: s.MemGiB, + GPUMemGiB: s.GPUMemGiB, + ShareMemGiB: s.ShareMemGiB, + ComputeResource: s.ComputeResource, + UnitPrice: s.UnitPrice, + QueueId: s.QueueId, + QueueCode: s.QueueCode, + Cluster: s.Cluster, + AiCenterCode: s.AiCenterCode, + AiCenterName: s.AiCenterName, + IsExclusive: s.IsExclusive, + ExclusiveOrg: s.ExclusiveOrg, + } +} + +func InsertCloudbrainSpec(c CloudbrainSpec) (int64, error) { + return x.Insert(&c) +} + +func GetCloudbrainSpecByID(cloudbrainId int64) (*CloudbrainSpec, error) { + r := &CloudbrainSpec{} + if has, err := x.Where("cloudbrain_id = ?", cloudbrainId).Get(r); err != nil { + return nil, err + } else if !has { + return nil, nil + } + return r, nil +} + +func FindCloudbrainTask(page, pageSize int) ([]*Cloudbrain, error) { + r := make([]*Cloudbrain, 0) + err := x.Unscoped(). + Limit(pageSize, (page-1)*pageSize). + OrderBy("cloudbrain.id"). + Find(&r) + if err != nil { + return nil, err + } + return r, nil +} + +func CountNoSpecHistoricTask() (int64, error) { + n, err := x.Unscoped(). + Where(" 1=1 and not exists (select 1 from cloudbrain_spec where cloudbrain.id = cloudbrain_spec.cloudbrain_id)"). + Count(&Cloudbrain{}) + if err != nil { + return 0, err + } + return n, nil +} diff --git a/models/dataset.go b/models/dataset.go index 22a20e328..4cff4d6d1 100755 --- a/models/dataset.go +++ b/models/dataset.go @@ -130,15 +130,11 @@ func (datasets DatasetList) loadAttachmentAttributes(opts *SearchDatasetOptions) permission = false datasets[i].Repo.GetOwner() - if datasets[i].Repo.Owner.IsOrganization() { - if datasets[i].Repo.Owner.IsUserPartOfOrg(opts.User.ID) { - log.Info("user is member of org.") - permission = true - } - } if !permission { isCollaborator, _ := datasets[i].Repo.IsCollaborator(opts.User.ID) - if isCollaborator ||datasets[i].Repo.IsOwnedBy(opts.User.ID){ + isInRepoTeam,_:=datasets[i].Repo.IsInRepoTeam(opts.User.ID) + + if isCollaborator ||isInRepoTeam { log.Info("Collaborator user may visit the attach.") permission = true } diff --git a/models/models.go b/models/models.go index af0f7ac79..0bd3f8a6c 100755 --- a/models/models.go +++ b/models/models.go @@ -150,6 +150,7 @@ func init() { new(ResourceScene), new(ResourceSceneSpec), new(AdminOperateLog), + new(CloudbrainSpec), new(CloudbrainTemp), new(DatasetReference), ) diff --git a/models/repo_collaboration.go b/models/repo_collaboration.go index bc71ad379..f0df0b2ec 100644 --- a/models/repo_collaboration.go +++ b/models/repo_collaboration.go @@ -130,6 +130,20 @@ func (repo *Repository) IsCollaborator(userID int64) (bool, error) { return repo.isCollaborator(x, userID) } +func (repo *Repository) IsInRepoTeam(userID int64) (bool, error) { + teams,err:=repo.GetRepoTeams() + if err!=nil || len(teams)==0{ + return false,err + } + for _,team :=range teams{ + if team.IsMember(userID){ + return true,nil + } + } + return false,nil + +} + func (repo *Repository) changeCollaborationAccessMode(e Engine, uid int64, mode AccessMode) error { // Discard invalid input if mode <= AccessModeNone || mode > AccessModeOwner { diff --git a/models/resource_queue.go b/models/resource_queue.go index ff78fcc40..fc0dd8cb5 100644 --- a/models/resource_queue.go +++ b/models/resource_queue.go @@ -71,6 +71,8 @@ func (r ResourceQueueReq) ToDTO() ResourceQueue { q.AiCenterName = "云脑一" } else if r.AiCenterCode == AICenterOfCloudBrainTwo { q.AiCenterName = "云脑二" + } else if r.AiCenterCode == AICenterOfChengdu { + q.AiCenterName = "启智成都智算" } } return q diff --git a/models/resource_specification.go b/models/resource_specification.go index dca6647ab..2da8d015d 100644 --- a/models/resource_specification.go +++ b/models/resource_specification.go @@ -2,6 +2,7 @@ package models import ( "code.gitea.io/gitea/modules/timeutil" + "fmt" "xorm.io/builder" ) @@ -22,6 +23,7 @@ type ResourceSpecification struct { ShareMemGiB float32 UnitPrice int Status int + IsAvailable bool IsAutomaticSync bool CreatedTime timeutil.TimeStamp `xorm:"created"` CreatedBy int64 @@ -40,6 +42,7 @@ func (r ResourceSpecification) ConvertToRes() *ResourceSpecificationRes { GPUMemGiB: r.GPUMemGiB, UnitPrice: r.UnitPrice, Status: r.Status, + IsAvailable: r.IsAvailable, UpdatedTime: r.UpdatedTime, } } @@ -72,14 +75,16 @@ func (r ResourceSpecificationReq) ToDTO() ResourceSpecification { IsAutomaticSync: r.IsAutomaticSync, CreatedBy: r.CreatorId, UpdatedBy: r.CreatorId, + IsAvailable: true, } } type SearchResourceSpecificationOptions struct { ListOptions - QueueId int64 - Status int - Cluster string + QueueId int64 + Status int + Cluster string + AvailableCode int } type SearchResourceBriefSpecificationOptions struct { @@ -113,6 +118,7 @@ type ResourceSpecificationRes struct { ShareMemGiB float32 UnitPrice int Status int + IsAvailable bool UpdatedTime timeutil.TimeStamp } @@ -141,6 +147,53 @@ func (r ResourceSpecAndQueue) ConvertToRes() *ResourceSpecAndQueueRes { } } +type FindSpecsOptions struct { + JobType JobType + ComputeResource string + Cluster string + AiCenterCode string + SpecId int64 + QueueCode string + SourceSpecId string + AccCardsNum int + UseAccCardsNum bool + AccCardType string + CpuCores int + UseCpuCores bool + MemGiB float32 + UseMemGiB bool + GPUMemGiB float32 + UseGPUMemGiB bool + ShareMemGiB float32 + UseShareMemGiB bool + //if true,find specs no matter used or not used in scene. if false,only find specs used in scene + RequestAll bool +} + +type Specification struct { + ID int64 + SourceSpecId string + AccCardsNum int + AccCardType string + CpuCores int + MemGiB float32 + GPUMemGiB float32 + ShareMemGiB float32 + ComputeResource string + UnitPrice int + QueueId int64 + QueueCode string + Cluster string + AiCenterCode string + AiCenterName string + IsExclusive bool + ExclusiveOrg string +} + +func (Specification) TableName() string { + return "resource_specification" +} + func InsertResourceSpecification(r ResourceSpecification) (int64, error) { return x.Insert(&r) } @@ -167,6 +220,11 @@ func SearchResourceSpecification(opts SearchResourceSpecificationOptions) (int64 if opts.Cluster != "" { cond = cond.And(builder.Eq{"resource_queue.cluster": opts.Cluster}) } + if opts.AvailableCode == 1 { + cond = cond.And(builder.Eq{"resource_specification.is_available": true}) + } else if opts.AvailableCode == 2 { + cond = cond.And(builder.Eq{"resource_specification.is_available": false}) + } //cond = cond.And(builder.Or(builder.Eq{"resource_queue.deleted_time": 0}).Or(builder.IsNull{"resource_queue.deleted_time"})) n, err := x.Where(cond).Join("INNER", "resource_queue", "resource_queue.ID = resource_specification.queue_id"). Unscoped().Count(&ResourceSpecAndQueue{}) @@ -256,7 +314,7 @@ func SyncGrampusSpecs(updateList []ResourceSpecification, insertList []ResourceS return err } if len(deleteIds) > 0 { - if _, err = sess.In("id", deleteIds).Update(&ResourceSpecification{Status: SpecOffShelf}); err != nil { + if _, err = sess.Cols("status", "is_available").In("id", deleteIds).Update(&ResourceSpecification{Status: SpecOffShelf, IsAvailable: false}); err != nil { return err } if _, err = sess.In("spec_id", deleteIds).Delete(&ResourceSceneSpec{}); err != nil { @@ -267,7 +325,7 @@ func SyncGrampusSpecs(updateList []ResourceSpecification, insertList []ResourceS //update exists specs if len(updateList) > 0 { for _, v := range updateList { - if _, err = sess.ID(v.ID).Update(&v); err != nil { + if _, err = sess.ID(v.ID).UseBool("is_available").Update(&v); err != nil { return err } } @@ -283,3 +341,221 @@ func SyncGrampusSpecs(updateList []ResourceSpecification, insertList []ResourceS return sess.Commit() } + +//FindSpecs +func FindSpecs(opts FindSpecsOptions) ([]*Specification, error) { + var cond = builder.NewCond() + if !opts.RequestAll && opts.JobType != "" { + cond = cond.And(builder.Eq{"resource_scene.job_type": opts.JobType}) + } + if opts.ComputeResource != "" { + cond = cond.And(builder.Eq{"resource_queue.compute_resource": opts.ComputeResource}) + } + if opts.Cluster != "" { + cond = cond.And(builder.Eq{"resource_queue.cluster": opts.Cluster}) + } + if opts.AiCenterCode != "" { + cond = cond.And(builder.Eq{"resource_queue.ai_center_code": opts.AiCenterCode}) + } + if opts.SpecId > 0 { + cond = cond.And(builder.Eq{"resource_specification.id": opts.SpecId}) + } + if opts.QueueCode != "" { + cond = cond.And(builder.Eq{"resource_queue.queue_code": opts.QueueCode}) + } + if opts.SourceSpecId != "" { + cond = cond.And(builder.Eq{"resource_specification.source_spec_id": opts.SourceSpecId}) + } + if opts.UseAccCardsNum { + cond = cond.And(builder.Eq{"resource_specification.acc_cards_num": opts.AccCardsNum}) + } + if opts.AccCardType != "" { + cond = cond.And(builder.Eq{"resource_queue.acc_card_type": opts.AccCardType}) + } + if opts.UseCpuCores { + cond = cond.And(builder.Eq{"resource_specification.cpu_cores": opts.CpuCores}) + } + if opts.UseMemGiB { + cond = cond.And(builder.Eq{"resource_specification.mem_gi_b": opts.MemGiB}) + } + if opts.UseGPUMemGiB { + cond = cond.And(builder.Eq{"resource_specification.gpu_mem_gi_b": opts.GPUMemGiB}) + } + if opts.UseShareMemGiB { + cond = cond.And(builder.Eq{"resource_specification.share_mem_gi_b": opts.ShareMemGiB}) + } + r := make([]*Specification, 0) + s := x.Where(cond). + Join("INNER", "resource_queue", "resource_queue.id = resource_specification.queue_id") + + if !opts.RequestAll { + s = s.Join("INNER", "resource_scene_spec", "resource_scene_spec.spec_id = resource_specification.id"). + Join("INNER", "resource_scene", "resource_scene_spec.scene_id = resource_scene.id") + } + err := s.OrderBy("resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc,resource_specification.cpu_cores asc,resource_specification.mem_gi_b asc,resource_specification.share_mem_gi_b asc"). + Unscoped().Find(&r) + if err != nil { + return nil, err + } + return r, nil +} + +func InitQueueAndSpec(queue ResourceQueue, spec ResourceSpecification) (*Specification, error) { + sess := x.NewSession() + defer sess.Close() + + sess.Begin() + param := ResourceQueue{ + QueueCode: queue.QueueCode, + Cluster: queue.Cluster, + AiCenterCode: queue.AiCenterCode, + ComputeResource: queue.ComputeResource, + AccCardType: queue.AccCardType, + } + _, err := sess.Get(¶m) + if err != nil { + sess.Rollback() + return nil, err + } + if param.ID == 0 { + _, err = sess.InsertOne(&queue) + if err != nil { + sess.Rollback() + return nil, err + } + } else { + queue = param + } + + spec.QueueId = queue.ID + _, err = sess.InsertOne(&spec) + if err != nil { + sess.Rollback() + return nil, err + } + sess.Commit() + return BuildSpecification(queue, spec), nil +} + +func BuildSpecification(queue ResourceQueue, spec ResourceSpecification) *Specification { + return &Specification{ + ID: spec.ID, + SourceSpecId: spec.SourceSpecId, + AccCardsNum: spec.AccCardsNum, + AccCardType: queue.AccCardType, + CpuCores: spec.CpuCores, + MemGiB: spec.MemGiB, + GPUMemGiB: spec.GPUMemGiB, + ShareMemGiB: spec.ShareMemGiB, + ComputeResource: queue.ComputeResource, + UnitPrice: spec.UnitPrice, + QueueId: queue.ID, + QueueCode: queue.QueueCode, + Cluster: queue.Cluster, + AiCenterCode: queue.AiCenterCode, + AiCenterName: queue.AiCenterName, + } +} + +func GetCloudbrainOneAccCardType(queueCode string) string { + switch queueCode { + case "a100": + return "A100" + case "openidebug": + return "T4" + case "openidgx": + return "V100" + + } + return "" +} + +var cloudbrainTwoSpecsInitFlag = false +var cloudbrainTwoSpecs map[string]*Specification + +func GetCloudbrainTwoSpecs() (map[string]*Specification, error) { + if !cloudbrainTwoSpecsInitFlag { + r, err := InitCloudbrainTwoSpecs() + if err != nil { + return nil, err + } + cloudbrainTwoSpecsInitFlag = true + cloudbrainTwoSpecs = r + } + return cloudbrainTwoSpecs, nil +} + +func InitCloudbrainTwoSpecs() (map[string]*Specification, error) { + r := make(map[string]*Specification, 0) + + queue, err := GetResourceQueue(&ResourceQueue{QueueCode: "openisupport"}) + if err != nil { + return nil, err + } + if queue == nil { + queue = &ResourceQueue{ + QueueCode: "openisupport", + Cluster: OpenICluster, + AiCenterCode: AICenterOfCloudBrainTwo, + AiCenterName: "云脑二", + ComputeResource: NPU, + AccCardType: "ASCEND910", + Remark: "处理历史云脑任务时自动生成", + } + _, err = x.InsertOne(queue) + if err != nil { + return nil, err + } + } + for i := 1; i <= 8; i = i * 2 { + sourceSpecId := "modelarts.bm.910.arm.public." + fmt.Sprint(i) + spec, err := GetResourceSpecification(&ResourceSpecification{ + SourceSpecId: sourceSpecId, + QueueId: queue.ID, + }) + if err != nil { + return nil, err + } + if spec == nil { + spec = &ResourceSpecification{ + QueueId: queue.ID, + SourceSpecId: sourceSpecId, + AccCardsNum: i, + CpuCores: i * 24, + MemGiB: float32(i * 256), + GPUMemGiB: float32(32), + Status: SpecOffShelf, + IsAvailable: true, + } + _, err = x.Insert(spec) + if err != nil { + return nil, err + } + } + r[sourceSpecId] = BuildSpecification(*queue, *spec) + } + return r, nil +} + +var grampusSpecsInitFlag = false +var grampusSpecs map[string]*Specification + +func GetGrampusSpecs() (map[string]*Specification, error) { + if !grampusSpecsInitFlag { + specMap := make(map[string]*Specification, 0) + r, err := FindSpecs(FindSpecsOptions{ + Cluster: C2NetCluster, + RequestAll: true, + }) + if err != nil { + return nil, err + } + for _, spec := range r { + specMap[spec.SourceSpecId] = spec + specMap[spec.SourceSpecId+"_"+spec.AiCenterCode] = spec + } + grampusSpecsInitFlag = true + grampusSpecs = specMap + } + return grampusSpecs, nil +} diff --git a/modules/auth/cloudbrain.go b/modules/auth/cloudbrain.go index 39685990d..5bd294f2a 100755 --- a/modules/auth/cloudbrain.go +++ b/modules/auth/cloudbrain.go @@ -24,6 +24,7 @@ type CreateCloudBrainForm struct { Params string `form:"run_para_list"` BranchName string `form:"branch_name"` DatasetName string `form:"dataset_name"` + SpecId int64 `form:"spec_id"` } type CommitImageCloudBrainForm struct { @@ -72,6 +73,7 @@ type CreateCloudBrainInferencForm struct { CkptName string `form:"ckpt_name" binding:"Required"` LabelName string `form:"label_names" binding:"Required"` DatasetName string `form:"dataset_name"` + SpecId int64 `form:"spec_id"` } func (f *CreateCloudBrainForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/auth/grampus.go b/modules/auth/grampus.go index 0338d2ae7..21008ea09 100755 --- a/modules/auth/grampus.go +++ b/modules/auth/grampus.go @@ -11,15 +11,14 @@ type CreateGrampusTrainJobForm struct { Attachment string `form:"attachment" binding:"Required"` BootFile string `form:"boot_file" binding:"Required"` ImageID string `form:"image_id" binding:"Required"` - FlavorID string `form:"flavor" binding:"Required"` Params string `form:"run_para_list" binding:"Required"` Description string `form:"description"` BranchName string `form:"branch_name" binding:"Required"` - FlavorName string `form:"flavor_name" binding:"Required"` EngineName string `form:"engine_name" binding:"Required"` WorkServerNumber int `form:"work_server_number" binding:"Required"` Image string `form:"image"` DatasetName string `form:"dataset_name"` + SpecId int64 `form:"spec_id"` } func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index ce41f5d1e..23e1f325a 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -22,6 +22,7 @@ type CreateModelArtsNotebookForm struct { Description string `form:"description"` Flavor string `form:"flavor" binding:"Required"` ImageId string `form:"image_id" binding:"Required"` + SpecId int64 `form:"spec_id" binding:"Required"` } func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { @@ -46,6 +47,7 @@ type CreateModelArtsTrainJobForm struct { VersionName string `form:"version_name" binding:"Required"` FlavorName string `form:"flaver_names" binding:"Required"` EngineName string `form:"engine_names" binding:"Required"` + SpecId int64 `form:"spec_id" binding:"Required"` } type CreateModelArtsInferenceJobForm struct { @@ -71,6 +73,7 @@ type CreateModelArtsInferenceJobForm struct { ModelName string `form:"model_name" binding:"Required"` ModelVersion string `form:"model_version" binding:"Required"` CkptName string `form:"ckpt_name" binding:"Required"` + SpecId int64 `form:"spec_id" binding:"Required"` } func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/auth/wechat/cloudbrain.go b/modules/auth/wechat/cloudbrain.go index 193edd1da..3cdf7ed05 100644 --- a/modules/auth/wechat/cloudbrain.go +++ b/modules/auth/wechat/cloudbrain.go @@ -62,7 +62,7 @@ type CloudbrainStopMsg struct { func (CloudbrainStopMsg) Data(ctx *TemplateContext) *DefaultWechatTemplate { return &DefaultWechatTemplate{ - First: TemplateValue{Value: setting.CloudbrainStoppedTitle}, + First: TemplateValue{Value: fmt.Sprintf(setting.CloudbrainStoppedTitle, ctx.Cloudbrain.Status)}, Keyword1: TemplateValue{Value: ctx.Cloudbrain.DisplayJobName}, Keyword2: TemplateValue{Value: getJobTypeDisplayName(ctx.Cloudbrain.JobType)}, Keyword3: TemplateValue{Value: time.Unix(int64(ctx.Cloudbrain.CreatedUnix), 0).Format("2006-01-02 15:04:05")}, diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 30f080335..748af4a29 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -20,7 +20,7 @@ import ( const ( //Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` - CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` + CommandBenchmark = `cd /benchmark && bash run_bk.sh >/model/benchmark-log.txt` CodeMountPath = "/code" DataSetMountPath = "/dataset" ModelMountPath = "/model" @@ -30,8 +30,8 @@ const ( Snn4imagenetMountPath = "/snn4imagenet" BrainScoreMountPath = "/brainscore" TaskInfoName = "/taskInfo" - Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s'` - BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s'` + Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s' >/model/benchmark-log.txt` + BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s' >/model/benchmark-log.txt` SubTaskName = "task1" @@ -61,7 +61,6 @@ type GenerateCloudBrainTaskReq struct { Snn4ImageNetPath string BrainScorePath string JobType string - GpuQueue string Description string BranchName string BootFile string @@ -72,13 +71,13 @@ type GenerateCloudBrainTaskReq struct { DatasetInfos map[string]models.DatasetInfo BenchmarkTypeID int BenchmarkChildTypeID int - ResourceSpecId int ResultPath string TrainUrl string ModelName string ModelVersion string CkptName string LabelName string + Spec *models.Specification } func GetCloudbrainDebugCommand() string { @@ -227,50 +226,9 @@ func AdminOrImageCreaterRight(ctx *context.Context) { } func GenerateTask(req GenerateCloudBrainTaskReq) error { - var resourceSpec *models.ResourceSpec var versionCount int if req.JobType == string(models.JobTypeTrain) { versionCount = 1 - if TrainResourceSpecs == nil { - json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) - } - for _, spec := range TrainResourceSpecs.ResourceSpec { - if req.ResourceSpecId == spec.Id { - resourceSpec = spec - break - } - } - } else if req.JobType == string(models.JobTypeInference) { - if InferenceResourceSpecs == nil { - json.Unmarshal([]byte(setting.InferenceResourceSpecs), &InferenceResourceSpecs) - } - for _, spec := range InferenceResourceSpecs.ResourceSpec { - if req.ResourceSpecId == spec.Id { - resourceSpec = spec - break - } - } - - } else { - if ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) - } - for _, spec := range ResourceSpecs.ResourceSpec { - if req.ResourceSpecId == spec.Id { - resourceSpec = spec - break - } - } - - } - //如果没有匹配到spec信息,尝试从专属资源池获取 - if resourceSpec == nil && SpecialPools != nil { - resourceSpec = geMatchResourceSpec(req.JobType, req.GpuQueue, req.ResourceSpecId) - } - - if resourceSpec == nil { - log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"]) - return errors.New("no such resourceSpec") } volumes := []models.Volume{ @@ -342,7 +300,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { jobResult, err := CreateJob(req.JobName, models.CreateJobParams{ JobName: req.JobName, RetryCount: 1, - GpuType: req.GpuQueue, + GpuType: req.Spec.QueueCode, Image: req.Image, TaskRoles: []models.TaskRole{ { @@ -350,10 +308,10 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { TaskNumber: 1, MinSucceededTaskCount: 1, MinFailedTaskCount: 1, - CPUNumber: resourceSpec.CpuNum, - GPUNumber: resourceSpec.GpuNum, - MemoryMB: resourceSpec.MemMiB, - ShmMB: resourceSpec.ShareMemMiB, + CPUNumber: req.Spec.CpuCores, + GPUNumber: req.Spec.AccCardsNum, + MemoryMB: int(req.Spec.MemGiB * 1024), + ShmMB: int(req.Spec.ShareMemGiB * 1024), Command: req.Command, NeedIBDevice: false, IsMainRole: false, @@ -384,8 +342,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { Type: models.TypeCloudBrainOne, Uuid: req.Uuids, Image: req.Image, - GpuQueue: req.GpuQueue, - ResourceSpecId: req.ResourceSpecId, + GpuQueue: req.Spec.QueueCode, ComputeResource: models.GPUResource, BenchmarkTypeID: req.BenchmarkTypeID, BenchmarkChildTypeID: req.BenchmarkChildTypeID, @@ -405,6 +362,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { CreatedUnix: createTime, UpdatedUnix: createTime, CommitID: req.CommitID, + Spec: req.Spec, }) if err != nil { @@ -416,6 +374,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { log.Error("GetCloudbrainByJobID failed: %v", err.Error()) return err } + stringId := strconv.FormatInt(task.ID, 10) if IsBenchmarkJob(req.JobType) { @@ -447,25 +406,7 @@ func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTy func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error { jobName := task.JobName - var resourceSpec *models.ResourceSpec - if ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) - } - for _, spec := range ResourceSpecs.ResourceSpec { - if task.ResourceSpecId == spec.Id { - resourceSpec = spec - } - } - - //如果没有匹配到spec信息,尝试从专属资源池获取 - if resourceSpec == nil && SpecialPools != nil { - resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId) - } - - if resourceSpec == nil { - log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) - return errors.New("no such resourceSpec") - } + spec := task.Spec var datasetInfos map[string]models.DatasetInfo if task.Uuid != "" { var err error @@ -547,10 +488,10 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e TaskNumber: 1, MinSucceededTaskCount: 1, MinFailedTaskCount: 1, - CPUNumber: resourceSpec.CpuNum, - GPUNumber: resourceSpec.GpuNum, - MemoryMB: resourceSpec.MemMiB, - ShmMB: resourceSpec.ShareMemMiB, + CPUNumber: spec.CpuCores, + GPUNumber: spec.AccCardsNum, + MemoryMB: int(spec.MemGiB * 1024), + ShmMB: int(spec.ShareMemGiB * 1024), Command: GetCloudbrainDebugCommand(), //Command, NeedIBDevice: false, IsMainRole: false, @@ -588,6 +529,7 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e CreatedUnix: createTime, UpdatedUnix: createTime, BranchName: task.BranchName, + Spec: spec, } err = models.RestartCloudbrain(task, newTask) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 0d84d7aa7..687fb4959 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -30,18 +30,17 @@ const ( var ( poolInfos *models.PoolInfos - FlavorInfos *models.FlavorInfos - ImageInfos *models.ImageInfosModelArts + FlavorInfos *setting.StFlavorInfos + ImageInfos *setting.StImageInfosModelArts SpecialPools *models.SpecialPools ) type GenerateTrainJobReq struct { - JobName string - Command string - ResourceSpecId string - ImageUrl string //与image_id二选一,都有的情况下优先image_url - ImageId string + JobName string + Command string + ImageUrl string //与image_id二选一,都有的情况下优先image_url + ImageId string DisplayJobName string Uuid string @@ -58,7 +57,6 @@ type GenerateTrainJobReq struct { BranchName string PreVersionId int64 PreVersionName string - FlavorName string VersionCount int EngineName string TotalVersionCount int @@ -66,6 +64,7 @@ type GenerateTrainJobReq struct { ProcessType string DatasetName string Params string + Spec *models.Specification } func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { @@ -79,7 +78,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error { Name: req.JobName, Command: req.Command, - ResourceSpecId: req.ResourceSpecId, + ResourceSpecId: req.Spec.SourceSpecId, ImageId: req.ImageId, ImageUrl: req.ImageUrl, CenterID: centerID, @@ -114,15 +113,14 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error Parameters: req.Params, BootFile: req.BootFile, DataUrl: req.DataUrl, - FlavorCode: req.ResourceSpecId, Description: req.Description, WorkServerNumber: req.WorkServerNumber, - FlavorName: req.FlavorName, EngineName: req.EngineName, VersionCount: req.VersionCount, TotalVersionCount: req.TotalVersionCount, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: req.Spec, }) if err != nil { diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 9e8447978..4539699ad 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -1,6 +1,7 @@ package modelarts import ( + "code.gitea.io/gitea/modules/modelarts_cd" "encoding/json" "errors" "fmt" @@ -68,10 +69,9 @@ const ( var ( poolInfos *models.PoolInfos - FlavorInfos *models.FlavorInfos - ImageInfos *models.ImageInfosModelArts TrainFlavorInfos *Flavor - SpecialPools *models.SpecialPools + SpecialPools *models.SpecialPools + MultiNodeConfig *MultiNodes ) type GenerateTrainJobReq struct { @@ -84,7 +84,6 @@ type GenerateTrainJobReq struct { BootFileUrl string DataUrl string TrainUrl string - FlavorCode string LogUrl string PoolID string WorkServerNumber int @@ -96,6 +95,7 @@ type GenerateTrainJobReq struct { BranchName string PreVersionId int64 PreVersionName string + FlavorCode string FlavorName string VersionCount int EngineName string @@ -103,6 +103,7 @@ type GenerateTrainJobReq struct { UserImageUrl string UserCommand string DatasetName string + Spec *models.Specification } type GenerateInferenceJobReq struct { @@ -115,7 +116,6 @@ type GenerateInferenceJobReq struct { BootFileUrl string DataUrl string TrainUrl string - FlavorCode string LogUrl string PoolID string WorkServerNumber int @@ -134,6 +134,7 @@ type GenerateInferenceJobReq struct { ModelVersion string CkptName string ResultUrl string + Spec *models.Specification DatasetName string } @@ -166,6 +167,14 @@ type ResourcePool struct { } `json:"resource_pool"` } +type MultiNodes struct{ + Info []OrgMultiNode `json:"multinode"` +} +type OrgMultiNode struct{ + Org string `json:"org"` + Node []int `json:"node"` +} + // type Parameter struct { // Label string `json:"label"` // Value string `json:"value"` @@ -257,7 +266,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor strin return nil } -func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error { +func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error { if poolInfos == nil { json.Unmarshal([]byte(setting.PoolInfos), &poolInfos) } @@ -271,7 +280,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc jobResult, err := createNotebook2(models.CreateNotebook2Params{ JobName: jobName, Description: description, - Flavor: flavor, + Flavor: spec.SourceSpecId, Duration: autoStopDurationMs, ImageID: imageId, PoolID: poolInfos.PoolInfo[0].PoolId, @@ -308,7 +317,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc RepoID: ctx.Repo.Repository.ID, JobID: jobResult.ID, JobName: jobName, - FlavorCode: flavor, + FlavorCode: spec.SourceSpecId, DisplayJobName: displayJobName, JobType: string(models.JobTypeDebug), Type: models.TypeCloudBrainTwo, @@ -318,6 +327,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc Description: description, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: spec, } err = models.CreateCloudbrain(task) @@ -348,7 +358,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error PoolID: req.PoolID, CreateVersion: true, Flavor: models.Flavor{ - Code: req.FlavorCode, + Code: req.Spec.SourceSpecId, }, Parameter: req.Parameters, UserImageUrl: req.UserImageUrl, @@ -370,7 +380,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error PoolID: req.PoolID, CreateVersion: true, Flavor: models.Flavor{ - Code: req.FlavorCode, + Code: req.Spec.SourceSpecId, }, Parameter: req.Parameters, }, @@ -419,7 +429,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error BootFile: req.BootFile, DataUrl: req.DataUrl, LogUrl: req.LogUrl, - FlavorCode: req.FlavorCode, + FlavorCode: req.Spec.SourceSpecId, Description: req.Description, WorkServerNumber: req.WorkServerNumber, FlavorName: req.FlavorName, @@ -428,6 +438,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error TotalVersionCount: req.TotalVersionCount, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: req.Spec, }) if createErr != nil { @@ -479,7 +490,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job LogUrl: req.LogUrl, PoolID: req.PoolID, Flavor: models.Flavor{ - Code: req.FlavorCode, + Code: req.Spec.SourceSpecId, }, Parameter: req.Parameters, PreVersionId: req.PreVersionId, @@ -500,7 +511,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job LogUrl: req.LogUrl, PoolID: req.PoolID, Flavor: models.Flavor{ - Code: req.FlavorCode, + Code: req.Spec.SourceSpecId, }, Parameter: req.Parameters, PreVersionId: req.PreVersionId, @@ -567,7 +578,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job DataUrl: req.DataUrl, LogUrl: req.LogUrl, PreVersionId: req.PreVersionId, - FlavorCode: req.FlavorCode, + FlavorCode: req.Spec.SourceSpecId, Description: req.Description, WorkServerNumber: req.WorkServerNumber, FlavorName: req.FlavorName, @@ -576,6 +587,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job VersionCount: VersionListCount + 1, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: req.Spec, }) if createErr != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error()) @@ -666,7 +678,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e PoolID: req.PoolID, CreateVersion: true, Flavor: models.Flavor{ - Code: req.FlavorCode, + Code: req.Spec.SourceSpecId, }, Parameter: req.Parameters, }, @@ -718,7 +730,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e BootFile: req.BootFile, DataUrl: req.DataUrl, LogUrl: req.LogUrl, - FlavorCode: req.FlavorCode, + FlavorCode: req.Spec.SourceSpecId, Description: req.Description, WorkServerNumber: req.WorkServerNumber, FlavorName: req.FlavorName, @@ -734,6 +746,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e ResultUrl: req.ResultUrl, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: req.Spec, }) if err != nil { @@ -748,11 +761,7 @@ func GetNotebookImageName(imageId string) (string, error) { var validImage = false var imageName = "" - if ImageInfos == nil { - json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos) - } - - for _, imageInfo := range ImageInfos.ImageInfo { + for _, imageInfo := range setting.StImageInfos.ImageInfo { if imageInfo.Id == imageId { validImage = true imageName = imageInfo.Value @@ -773,6 +782,13 @@ func InitSpecialPool() { } } +func InitMultiNode(){ + if MultiNodeConfig ==nil && setting.ModelArtsMultiNode!=""{ + json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig) + } + +} + func HandleTrainJobInfo(task *models.Cloudbrain) error { result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10)) @@ -809,8 +825,13 @@ func HandleTrainJobInfo(task *models.Cloudbrain) error { } func HandleNotebookInfo(task *models.Cloudbrain) error { - - result, err := GetNotebook2(task.JobID) + var result *models.GetNotebook2Result + var err error + if task.Type == models.TypeCloudBrainTwo { + result, err = GetNotebook2(task.JobID) + } else if task.Type == models.TypeCDCenter { + result, err = modelarts_cd.GetNotebook(task.JobID) + } if err != nil { log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err) return err diff --git a/modules/modelarts_cd/modelarts.go b/modules/modelarts_cd/modelarts.go new file mode 100755 index 000000000..330b048ca --- /dev/null +++ b/modules/modelarts_cd/modelarts.go @@ -0,0 +1,215 @@ +package modelarts_cd + +import ( + "errors" + "strconv" + "strings" + + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/context" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/notification" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/timeutil" +) + +const ( + //notebook + storageTypeOBS = "obs" + autoStopDuration = 4 * 60 * 60 + autoStopDurationMs = 4 * 60 * 60 * 1000 + MORDELART_USER_IMAGE_ENGINE_ID = -1 + DataSetMountPath = "/home/ma-user/work" + NotebookEnv = "Python3" + NotebookType = "Ascend" + FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" + + //train-job + CodePath = "/code/" + OutputPath = "/output/" + ResultPath = "/result/" + LogPath = "/log/" + JobPath = "/job/" + OrderDesc = "desc" //向下查询 + OrderAsc = "asc" //向上查询 + Lines = 500 + TrainUrl = "train_url" + DataUrl = "data_url" + MultiDataUrl = "multi_data_url" + ResultUrl = "result_url" + CkptUrl = "ckpt_url" + DeviceTarget = "device_target" + Ascend = "Ascend" + PerPage = 10 + IsLatestVersion = "1" + NotLatestVersion = "0" + VersionCountOne = 1 + + SortByCreateTime = "create_time" + ConfigTypeCustom = "custom" + TotalVersionCount = 1 +) + +var () + +type VersionInfo struct { + Version []struct { + ID int `json:"id"` + Value string `json:"value"` + Url string `json:"url"` + } `json:"version"` +} + +type Flavor struct { + Info []struct { + Code string `json:"code"` + Value string `json:"value"` + } `json:"flavor"` +} + +type Engine struct { + Info []struct { + ID int `json:"id"` + Value string `json:"value"` + } `json:"engine"` +} + +type ResourcePool struct { + Info []struct { + ID string `json:"id"` + Value string `json:"value"` + } `json:"resource_pool"` +} + +type Parameters struct { + Parameter []struct { + Label string `json:"label"` + Value string `json:"value"` + } `json:"parameter"` +} + +func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error { + imageName, err := GetNotebookImageName(imageId) + if err != nil { + log.Error("GetNotebookImageName failed: %v", err.Error()) + return err + } + createTime := timeutil.TimeStampNow() + jobResult, err := createNotebook(models.CreateNotebookWithoutPoolParams{ + JobName: jobName, + Description: description, + Flavor: spec.SourceSpecId, + Duration: autoStopDurationMs, + ImageID: imageId, + Feature: models.NotebookFeature, + Volume: models.VolumeReq{ + Capacity: setting.Capacity, + Category: models.EVSCategory, + Ownership: models.ManagedOwnership, + }, + WorkspaceID: "0", + }) + if err != nil { + log.Error("createNotebook failed: %v", err.Error()) + if strings.HasPrefix(err.Error(), UnknownErrorPrefix) { + log.Info("(%s)unknown error, set temp status", displayJobName) + errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{ + JobID: models.TempJobId, + VersionID: models.TempVersionId, + Status: models.TempJobStatus, + Type: models.TypeCDCenter, + JobName: jobName, + JobType: string(models.JobTypeDebug), + }) + if errTemp != nil { + log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error()) + return errTemp + } + } + return err + } + task := &models.Cloudbrain{ + Status: jobResult.Status, + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobResult.ID, + JobName: jobName, + FlavorCode: spec.SourceSpecId, + DisplayJobName: displayJobName, + JobType: string(models.JobTypeDebug), + Type: models.TypeCDCenter, + Uuid: uuid, + ComputeResource: models.NPUResource, + Image: imageName, + Description: description, + CreatedUnix: createTime, + UpdatedUnix: createTime, + Spec: spec, + } + + err = models.CreateCloudbrain(task) + if err != nil { + return err + } + + stringId := strconv.FormatInt(task.ID, 10) + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask) + return nil +} + +func GetNotebookImageName(imageId string) (string, error) { + var validImage = false + var imageName = "" + + for _, imageInfo := range setting.StImageInfos.ImageInfo { + if imageInfo.Id == imageId { + validImage = true + imageName = imageInfo.Value + } + } + + if !validImage { + log.Error("the image id(%s) is invalid", imageId) + return imageName, errors.New("the image id is invalid") + } + + return imageName, nil +} + +/* +func HandleNotebookInfo(task *models.Cloudbrain) error { + + result, err := GetNotebook(task.JobID) + if err != nil { + log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err) + return err + } + + if result != nil { + oldStatus := task.Status + task.Status = result.Status + if task.StartTime == 0 && result.Lease.UpdateTime > 0 { + task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000) + } + if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { + task.EndTime = timeutil.TimeStampNow() + } + task.CorrectCreateUnix() + task.ComputeAndSetDuration() + if oldStatus != task.Status { + notification.NotifyChangeCloudbrainStatus(task, oldStatus) + } + if task.FlavorCode == "" { + task.FlavorCode = result.Flavor + } + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) + return err + } + } + + return nil +} + +*/ diff --git a/modules/modelarts_cd/resty.go b/modules/modelarts_cd/resty.go new file mode 100755 index 000000000..6feb78967 --- /dev/null +++ b/modules/modelarts_cd/resty.go @@ -0,0 +1,220 @@ +package modelarts_cd + +import ( + "bytes" + "code.gitea.io/gitea/modules/modelarts_gateway/core" + "crypto/tls" + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "strconv" + "time" + + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" +) + +var ( + httpClient *http.Client + HOST string + TOKEN string +) + +const ( + errorCodeExceedLimit = "ModelArts.0118" + + //notebook 2.0 + urlNotebook2 = "/notebooks" + + //error code + modelartsIllegalToken = "ModelArts.6401" + NotebookNotFound = "ModelArts.6404" + NotebookNoPermission = "ModelArts.6407" + NotebookInvalid = "ModelArts.6400" + UnknownErrorPrefix = "UNKNOWN:" +) + +func getHttpClient() *http.Client { + if httpClient == nil { + httpClient = &http.Client{ + Timeout: 30 * time.Second, + Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}, + } + } + return httpClient +} + +func GetNotebook(jobID string) (*models.GetNotebook2Result, error) { + var result models.GetNotebook2Result + + client := getHttpClient() + s := core.Signer{ + Key: setting.ModelartsCD.AccessKey, + Secret: setting.ModelartsCD.SecretKey, + } + r, _ := http.NewRequest(http.MethodGet, + setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID, + nil) + + r.Header.Add("content-type", "application/json") + s.Sign(r) + + resp, err := client.Do(r) + if err != nil { + log.Error("client.Do failed: %s", err.Error()) + return &result, fmt.Errorf("client.Do failed: %s", err.Error()) + } + + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Error("ioutil.ReadAll failed: %s", err.Error()) + return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error()) + } + + err = json.Unmarshal(body, &result) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(result.ErrorCode) != 0 { + log.Error("GetNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { + var result models.NotebookActionResult + + client := getHttpClient() + s := core.Signer{ + Key: setting.ModelartsCD.AccessKey, + Secret: setting.ModelartsCD.SecretKey, + } + r, _ := http.NewRequest(http.MethodPost, + setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID+"/"+param.Action+"?duration="+strconv.Itoa(autoStopDurationMs), + nil) + + r.Header.Add("content-type", "application/json") + s.Sign(r) + + resp, err := client.Do(r) + if err != nil { + log.Error("client.Do failed: %s", err.Error()) + return &result, fmt.Errorf("client.Do failed: %s", err.Error()) + } + + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Error("ioutil.ReadAll failed: %s", err.Error()) + return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error()) + } + + err = json.Unmarshal(body, &result) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(result.ErrorCode) != 0 { + log.Error("ManageNotebook2 failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("ManageNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func DelNotebook(jobID string) (*models.NotebookDelResult, error) { + var result models.NotebookDelResult + + client := getHttpClient() + s := core.Signer{ + Key: setting.ModelartsCD.AccessKey, + Secret: setting.ModelartsCD.SecretKey, + } + + r, _ := http.NewRequest(http.MethodDelete, + setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID, + nil) + + r.Header.Add("content-type", "application/json") + s.Sign(r) + + resp, err := client.Do(r) + if err != nil { + log.Error("client.Do failed: %s", err.Error()) + return &result, fmt.Errorf("client.Do failed: %s", err.Error()) + } + + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Error("ioutil.ReadAll failed: %s", err.Error()) + return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error()) + } + + err = json.Unmarshal(body, &result) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(result.ErrorCode) != 0 { + log.Error("DelNotebook2 failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func createNotebook(createJobParams models.CreateNotebookWithoutPoolParams) (*models.CreateNotebookResult, error) { + var result models.CreateNotebookResult + client := getHttpClient() + s := core.Signer{ + Key: setting.ModelartsCD.AccessKey, + Secret: setting.ModelartsCD.SecretKey, + } + + req, _ := json.Marshal(createJobParams) + r, _ := http.NewRequest(http.MethodPost, + setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2, + ioutil.NopCloser(bytes.NewBuffer(req))) + + r.Header.Add("content-type", "application/json") + s.Sign(r) + + resp, err := client.Do(r) + if err != nil { + log.Error("client.Do failed: %s", err.Error()) + return &result, fmt.Errorf("client.Do failed: %s", err.Error()) + } + + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Error("ioutil.ReadAll failed: %s", err.Error()) + return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error()) + } + + err = json.Unmarshal(body, &result) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error()) + } + + if len(result.ErrorCode) != 0 { + log.Error("createNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg) + if result.ErrorCode == errorCodeExceedLimit { + result.ErrorMsg = "所选规格使用数量已超过最大配额限制。" + } + return &result, fmt.Errorf("createNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} diff --git a/modules/modelarts_gateway/core/escape.go b/modules/modelarts_gateway/core/escape.go new file mode 100755 index 000000000..e8c76b8ae --- /dev/null +++ b/modules/modelarts_gateway/core/escape.go @@ -0,0 +1,42 @@ +// based on https://github.com/golang/go/blob/master/src/net/url/url.go +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package core + +func shouldEscape(c byte) bool { + if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || c == '_' || c == '-' || c == '~' || c == '.' { + return false + } + return true +} +func escape(s string) string { + hexCount := 0 + for i := 0; i < len(s); i++ { + c := s[i] + if shouldEscape(c) { + hexCount++ + } + } + + if hexCount == 0 { + return s + } + + t := make([]byte, len(s)+2*hexCount) + j := 0 + for i := 0; i < len(s); i++ { + switch c := s[i]; { + case shouldEscape(c): + t[j] = '%' + t[j+1] = "0123456789ABCDEF"[c>>4] + t[j+2] = "0123456789ABCDEF"[c&15] + j += 3 + default: + t[j] = s[i] + j++ + } + } + return string(t) +} diff --git a/modules/modelarts_gateway/core/signer.go b/modules/modelarts_gateway/core/signer.go new file mode 100755 index 000000000..7992713b3 --- /dev/null +++ b/modules/modelarts_gateway/core/signer.go @@ -0,0 +1,208 @@ +// HWS API Gateway Signature +// based on https://github.com/datastream/aws/blob/master/signv4.go +// Copyright (c) 2014, Xianjie + +package core + +import ( + "bytes" + "crypto/hmac" + "crypto/sha256" + "fmt" + "io/ioutil" + "net/http" + "sort" + "strings" + "time" +) + +const ( + BasicDateFormat = "20060102T150405Z" + Algorithm = "SDK-HMAC-SHA256" + HeaderXDate = "X-Sdk-Date" + HeaderHost = "host" + HeaderAuthorization = "Authorization" + HeaderContentSha256 = "X-Sdk-Content-Sha256" +) + +func hmacsha256(key []byte, data string) ([]byte, error) { + h := hmac.New(sha256.New, []byte(key)) + if _, err := h.Write([]byte(data)); err != nil { + return nil, err + } + return h.Sum(nil), nil +} + +// Build a CanonicalRequest from a regular request string +// +// CanonicalRequest = +// HTTPRequestMethod + '\n' + +// CanonicalURI + '\n' + +// CanonicalQueryString + '\n' + +// CanonicalHeaders + '\n' + +// SignedHeaders + '\n' + +// HexEncode(Hash(RequestPayload)) +func CanonicalRequest(r *http.Request, signedHeaders []string) (string, error) { + var hexencode string + var err error + if hex := r.Header.Get(HeaderContentSha256); hex != "" { + hexencode = hex + } else { + data, err := RequestPayload(r) + if err != nil { + return "", err + } + hexencode, err = HexEncodeSHA256Hash(data) + if err != nil { + return "", err + } + } + return fmt.Sprintf("%s\n%s\n%s\n%s\n%s\n%s", r.Method, CanonicalURI(r), CanonicalQueryString(r), CanonicalHeaders(r, signedHeaders), strings.Join(signedHeaders, ";"), hexencode), err +} + +// CanonicalURI returns request uri +func CanonicalURI(r *http.Request) string { + pattens := strings.Split(r.URL.Path, "/") + var uri []string + for _, v := range pattens { + uri = append(uri, escape(v)) + } + urlpath := strings.Join(uri, "/") + if len(urlpath) == 0 || urlpath[len(urlpath)-1] != '/' { + urlpath = urlpath + "/" + } + return urlpath +} + +// CanonicalQueryString +func CanonicalQueryString(r *http.Request) string { + var keys []string + query := r.URL.Query() + for key := range query { + keys = append(keys, key) + } + sort.Strings(keys) + var a []string + for _, key := range keys { + k := escape(key) + sort.Strings(query[key]) + for _, v := range query[key] { + kv := fmt.Sprintf("%s=%s", k, escape(v)) + a = append(a, kv) + } + } + queryStr := strings.Join(a, "&") + r.URL.RawQuery = queryStr + return queryStr +} + +// CanonicalHeaders +func CanonicalHeaders(r *http.Request, signerHeaders []string) string { + var a []string + header := make(map[string][]string) + for k, v := range r.Header { + header[strings.ToLower(k)] = v + } + for _, key := range signerHeaders { + value := header[key] + if strings.EqualFold(key, HeaderHost) { + value = []string{r.Host} + } + sort.Strings(value) + for _, v := range value { + a = append(a, key+":"+strings.TrimSpace(v)) + } + } + return fmt.Sprintf("%s\n", strings.Join(a, "\n")) +} + +// SignedHeaders +func SignedHeaders(r *http.Request) []string { + var a []string + for key := range r.Header { + a = append(a, strings.ToLower(key)) + } + sort.Strings(a) + return a +} + +// RequestPayload +func RequestPayload(r *http.Request) ([]byte, error) { + if r.Body == nil { + return []byte(""), nil + } + b, err := ioutil.ReadAll(r.Body) + if err != nil { + return []byte(""), err + } + r.Body = ioutil.NopCloser(bytes.NewBuffer(b)) + return b, err +} + +// Create a "String to Sign". +func StringToSign(canonicalRequest string, t time.Time) (string, error) { + hash := sha256.New() + _, err := hash.Write([]byte(canonicalRequest)) + if err != nil { + return "", err + } + return fmt.Sprintf("%s\n%s\n%x", + Algorithm, t.UTC().Format(BasicDateFormat), hash.Sum(nil)), nil +} + +// Create the HWS Signature. +func SignStringToSign(stringToSign string, signingKey []byte) (string, error) { + hm, err := hmacsha256(signingKey, stringToSign) + return fmt.Sprintf("%x", hm), err +} + +// HexEncodeSHA256Hash returns hexcode of sha256 +func HexEncodeSHA256Hash(body []byte) (string, error) { + hash := sha256.New() + if body == nil { + body = []byte("") + } + _, err := hash.Write(body) + return fmt.Sprintf("%x", hash.Sum(nil)), err +} + +// Get the finalized value for the "Authorization" header. The signature parameter is the output from SignStringToSign +func AuthHeaderValue(signature, accessKey string, signedHeaders []string) string { + return fmt.Sprintf("%s Access=%s, SignedHeaders=%s, Signature=%s", Algorithm, accessKey, strings.Join(signedHeaders, ";"), signature) +} + +// Signature HWS meta +type Signer struct { + Key string + Secret string +} + +// SignRequest set Authorization header +func (s *Signer) Sign(r *http.Request) error { + var t time.Time + var err error + var dt string + if dt = r.Header.Get(HeaderXDate); dt != "" { + t, err = time.Parse(BasicDateFormat, dt) + } + if err != nil || dt == "" { + t = time.Now() + r.Header.Set(HeaderXDate, t.UTC().Format(BasicDateFormat)) + } + signedHeaders := SignedHeaders(r) + canonicalRequest, err := CanonicalRequest(r, signedHeaders) + if err != nil { + return err + } + stringToSign, err := StringToSign(canonicalRequest, t) + if err != nil { + return err + } + signature, err := SignStringToSign(stringToSign, []byte(s.Secret)) + if err != nil { + return err + } + authValue := AuthHeaderValue(signature, s.Key, signedHeaders) + r.Header.Set(HeaderAuthorization, authValue) + return nil +} diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 1e96ff9da..1992baf54 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -75,6 +75,26 @@ type C2NetSqInfos struct { C2NetSqInfo []*C2NetSequenceInfo `json:"sequence"` } +type StFlavorInfos struct { + FlavorInfo []*FlavorInfo `json:"flavor_info"` +} + +type FlavorInfo struct { + Id int `json:"id"` + Value string `json:"value"` + Desc string `json:"desc"` +} + +type StImageInfosModelArts struct { + ImageInfo []*ImageInfoModelArts `json:"image_info"` +} + +type ImageInfoModelArts struct { + Id string `json:"id"` + Value string `json:"value"` + Desc string `json:"desc"` +} + var ( // AppVer settings AppVer string @@ -535,18 +555,31 @@ var ( AllowedOrg string ProfileID string PoolInfos string - Flavor string + FlavorInfos string DebugHost string ImageInfos string Capacity int MaxTempQueryTimes int + StFlavorInfo *StFlavorInfos + StImageInfos *StImageInfosModelArts //train-job ResourcePools string Engines string EngineVersions string - FlavorInfos string TrainJobFLAVORINFOS string ModelArtsSpecialPools string + ModelArtsMultiNode string + + // modelarts-cd config + ModelartsCD = struct { + Enabled bool + EndPoint string + ProjectID string + AccessKey string + SecretKey string + ImageInfos string + FlavorInfos string + }{} //grampus config Grampus = struct { @@ -1422,9 +1455,8 @@ func NewContext() { AllowedOrg = sec.Key("ORGANIZATION").MustString("") ProfileID = sec.Key("PROFILE_ID").MustString("") PoolInfos = sec.Key("POOL_INFOS").MustString("") - Flavor = sec.Key("FLAVOR").MustString("") ImageInfos = sec.Key("IMAGE_INFOS").MustString("") - Capacity = sec.Key("IMAGE_INFOS").MustInt(100) + Capacity = sec.Key("CAPACITY").MustInt(100) MaxTempQueryTimes = sec.Key("MAX_TEMP_QUERY_TIMES").MustInt(30) ResourcePools = sec.Key("Resource_Pools").MustString("") Engines = sec.Key("Engines").MustString("") @@ -1432,6 +1464,7 @@ func NewContext() { FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("") TrainJobFLAVORINFOS = sec.Key("TrainJob_FLAVOR_INFOS").MustString("") ModelArtsSpecialPools = sec.Key("SPECIAL_POOL").MustString("") + ModelArtsMultiNode=sec.Key("MULTI_NODE").MustString("") sec = Cfg.Section("elk") ElkUrl = sec.Key("ELKURL").MustString("") @@ -1460,7 +1493,7 @@ func NewContext() { CloudbrainStartedRemark = sec.Key("CLOUDBRAIN_STARTED_REMARK").MustString("感谢您的耐心等待。") CloudbrainStoppedTemplateId = sec.Key("CLOUDBRAIN_STOPPED_TEMPLATE_ID").MustString("") CloudbrainStoppedNotifyList = strings.Split(sec.Key("CLOUDBRAIN_STOPPED_NOTIFY_LIST").MustString("TRAIN"), ",") - CloudbrainStoppedTitle = sec.Key("CLOUDBRAIN_STOPPED_TITLE").MustString("您好,您申请的算力资源已结束使用,任务已完成运行,请您关注运行结果。") + CloudbrainStoppedTitle = sec.Key("CLOUDBRAIN_STOPPED_TITLE").MustString("您好,您申请的算力资源已结束使用,任务已完成运行,状态为%s,请您关注运行结果") CloudbrainStoppedRemark = sec.Key("CLOUDBRAIN_STOPPED_REMARK").MustString("感谢您的耐心等待。") SetRadarMapConfig() @@ -1472,8 +1505,8 @@ func NewContext() { Course.OrgName = sec.Key("org_name").MustString("") Course.TeamName = sec.Key("team_name").MustString("") - GetGrampusConfig() - + getGrampusConfig() + getModelartsCDConfig() getModelConvertConfig() } @@ -1496,7 +1529,22 @@ func getModelConvertConfig() { ModelConvert.NPU_TENSORFLOW_IMAGE_ID = sec.Key("NPU_TENSORFLOW_IMAGE_ID").MustInt(35) } -func GetGrampusConfig() { +func getModelartsCDConfig() { + sec := Cfg.Section("modelarts-cd") + + ModelartsCD.Enabled = sec.Key("ENABLED").MustBool(false) + ModelartsCD.EndPoint = sec.Key("ENDPOINT").MustString("https://modelarts.cn-southwest-228.cdzs.cn") + ModelartsCD.ProjectID = sec.Key("PROJECT_ID").MustString("") + ModelartsCD.AccessKey = sec.Key("ACCESS_KEY").MustString("") + ModelartsCD.SecretKey = sec.Key("SECRET_KEY").MustString("") + ModelartsCD.ImageInfos = sec.Key("IMAGE_INFOS").MustString("") + ModelartsCD.FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("") + + getNotebookImageInfos() + getNotebookFlavorInfos() +} + +func getGrampusConfig() { sec := Cfg.Section("grampus") Grampus.Env = sec.Key("ENV").MustString("TEST") @@ -1630,6 +1678,26 @@ func ensureLFSDirectory() { } } +func getNotebookImageInfos() { + if StImageInfos == nil { + if ModelartsCD.Enabled { + json.Unmarshal([]byte(ModelartsCD.ImageInfos), &StImageInfos) + } else { + json.Unmarshal([]byte(ImageInfos), &StImageInfos) + } + } +} + +func getNotebookFlavorInfos() { + if StFlavorInfo == nil { + if ModelartsCD.Enabled { + json.Unmarshal([]byte(ModelartsCD.FlavorInfos), &StFlavorInfo) + } else { + json.Unmarshal([]byte(FlavorInfos), &StFlavorInfo) + } + } +} + // NewServices initializes the services func NewServices() { InitDBConfig() diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index 5eac4cf2e..f4619681f 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -1079,6 +1079,7 @@ balance.total_view = Total Balance balance.available = Available Balance: cloudbrain1 = cloudbrain1 cloudbrain2 = cloudbrain2 +cdCenter = cd_ai_center cloudbrain_selection = select cloudbrain cloudbrain_platform_selection = Select the cloudbrain platform you want to use: confirm_choice = Confirm @@ -1213,6 +1214,7 @@ modelarts.infer_job.select_model = Select Model modelarts.infer_job.boot_file_helper=The startup file is the entry file for your program execution and must end in.py.Such as inference.py, main.py, example/inference.py, case/main.py. modelarts.infer_job.tooltip = The model has been deleted and cannot be viewed. modelarts.download_log=Download log file +modelarts.no_node_right = The value of 'Amount of Compute Node' is wrong, you have no right to use the current value of 'Amount of Compute Node'. debug_task_not_created = Debug task has not been created @@ -3205,6 +3207,9 @@ gpu_num = GPU cpu_num = CPU memory = Memory shared_memory = Shared Memory +gpu_memory = GPU Memory +free = Free +point_hr = Point/hr DEBUG = DEBUG diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index 2fbd3ab52..c1c18305f 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -1080,6 +1080,7 @@ balance.total_view=余额总览 balance.available=可用余额: cloudbrain1=云脑1 cloudbrain2=云脑2 +cdCenter=成都智算中心 intelligent_net=智算网络 cloudbrain_selection=云脑选择 cloudbrain_platform_selection=选择您准备使用的云脑平台: @@ -1226,6 +1227,7 @@ modelarts.infer_job.select_model = 选择模型 modelarts.infer_job.boot_file_helper=启动文件是您程序执行的入口文件,必须是以.py结尾的文件。比如inference.py、main.py、example/inference.py、case/main.py。 modelarts.infer_job.tooltip = 该模型已删除,无法查看。 modelarts.download_log=下载日志文件 +modelarts.no_node_right = 计算节点数的值配置错误,您没有权限使用当前配置的计算节点数。 debug_task_not_created = 未创建过调试任务 @@ -3224,6 +3226,9 @@ gpu_num = GPU数 cpu_num = CPU数 memory = 内存 shared_memory = 共享内存 +gpu_memory = 显存 +free = 免费 +point_hr = 积分/时 DEBUG = 调试任务 SNN4IMAGENET = 评测任务 diff --git a/routers/admin/cloudbrains.go b/routers/admin/cloudbrains.go index ec0034f4f..fcb878627 100755 --- a/routers/admin/cloudbrains.go +++ b/routers/admin/cloudbrains.go @@ -92,13 +92,13 @@ func CloudBrains(ctx *context.Context) { return } + models.LoadSpecs4CloudbrainInfo(ciTasks) + for i, task := range ciTasks { ciTasks[i].CanDebug = true ciTasks[i].CanDel = true ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource ciTasks[i].Cloudbrain.AiCenter = repo.GetCloudbrainAiCenter(task.Cloudbrain, ctx) - _, cardType, _ := repo.GetCloudbrainCardNumAndType(task.Cloudbrain) - ciTasks[i].Cloudbrain.CardType = cardType ciTasks[i].Cloudbrain.Cluster = repo.GetCloudbrainCluster(task.Cloudbrain, ctx) } diff --git a/routers/admin/resources.go b/routers/admin/resources.go index 7d267c19c..8a8c55f86 100644 --- a/routers/admin/resources.go +++ b/routers/admin/resources.go @@ -8,6 +8,8 @@ import ( "code.gitea.io/gitea/routers/response" "code.gitea.io/gitea/services/cloudbrain/resource" "net/http" + "strconv" + "strings" ) const ( @@ -118,11 +120,13 @@ func GetResourceSpecificationList(ctx *context.Context) { queue := ctx.QueryInt64("queue") status := ctx.QueryInt("status") cluster := ctx.Query("cluster") + available := ctx.QueryInt("available") list, err := resource.GetResourceSpecificationList(models.SearchResourceSpecificationOptions{ - ListOptions: models.ListOptions{Page: page, PageSize: 10}, - QueueId: queue, - Status: status, - Cluster: cluster, + ListOptions: models.ListOptions{Page: page, PageSize: 10}, + QueueId: queue, + Status: status, + Cluster: cluster, + AvailableCode: available, }) if err != nil { log.Error("GetResourceSpecificationList error.%v", err) @@ -246,3 +250,37 @@ func UpdateResourceScene(ctx *context.Context, req models.ResourceSceneReq) { } ctx.JSON(http.StatusOK, response.Success()) } + +func RefreshHistorySpec(ctx *context.Context) { + scope := ctx.Query("scope") + list := ctx.Query("list") + + var scopeAll = false + if scope == "all" { + scopeAll = true + } + var ids = make([]int64, 0) + if list != "" { + strs := strings.Split(list, "|") + for _, s := range strs { + i, err := strconv.ParseInt(s, 10, 64) + if err != nil { + ctx.JSON(http.StatusOK, response.ServerError(err.Error())) + return + } + ids = append(ids, i) + } + + } + + total, success, err := resource.RefreshHistorySpec(scopeAll, ids) + if err != nil { + log.Error("RefreshHistorySpec error. %v", err) + ctx.JSON(http.StatusOK, response.ServerError(err.Error())) + return + } + r := make(map[string]interface{}, 0) + r["success"] = success + r["total"] = total + ctx.JSON(http.StatusOK, response.SuccessWithData(r)) +} diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index b450b2e26..d6b7bb076 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -405,46 +405,83 @@ func CloudbrainDownloadLogFile(ctx *context.Context) { func CloudbrainGetLog(ctx *context.Context) { ID := ctx.Params(":id") - startLine := ctx.QueryInt("base_line") - lines := ctx.QueryInt("lines") - endLine := startLine + lines - order := ctx.Query("order") - if order == "asc" { - endLine = startLine - startLine = endLine - lines - if startLine < 0 { - startLine = 0 - } - } job, err := models.GetCloudbrainByID(ID) if err != nil { log.Error("GetCloudbrainByJobName failed: %v", err, ctx.Data["MsgID"]) ctx.ServerError(err.Error(), err) return } - result := getLogFromModelDir(job.JobName, startLine, endLine) - if result == nil { - log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) - ctx.ServerError(err.Error(), err) - return + lines := ctx.QueryInt("lines") + baseLine := ctx.Query("base_line") + order := ctx.Query("order") + var result map[string]interface{} + resultPath := "/model" + if job.JobType == string(models.JobTypeInference) { + resultPath = "/result" + } + if baseLine == "" && order == "desc" { + result = getLastLogFromModelDir(job.JobName, lines, resultPath) + } else { + startLine := ctx.QueryInt("base_line") + endLine := startLine + lines + if order == "asc" { + if baseLine == "" { + startLine = 0 + endLine = lines + } else { + endLine = startLine + startLine = endLine - lines + if startLine < 0 { + startLine = 0 + } + } + } + result = getLogFromModelDir(job.JobName, startLine, endLine, resultPath) + if result == nil { + log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) + ctx.ServerError(err.Error(), err) + return + } } - re := map[string]interface{}{ "JobID": ID, "LogFileName": result["FileName"], - "StartLine": startLine, - "EndLine": result["endLine"], + "StartLine": result["StartLine"], + "EndLine": result["EndLine"], "Content": result["Content"], - "Lines": result["lines"], + "Lines": result["Lines"], "CanLogDownload": result["FileName"] != "", } //result := CloudbrainGetLogByJobId(job.JobID, job.JobName) - ctx.JSON(http.StatusOK, re) } -func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]interface{} { - prefix := "/" + setting.CBCodePathPrefix + jobName + "/model" +func getAllLineFromFile(path string) int { + count := 0 + reader, err := os.Open(path) + defer reader.Close() + if err == nil { + r := bufio.NewReader(reader) + for { + _, error := r.ReadString('\n') + if error == io.EOF { + log.Info("read file completed.") + break + } + if error != nil { + log.Info("read file error." + error.Error()) + break + } + count = count + 1 + } + } else { + log.Info("error:" + err.Error()) + } + return count +} + +func getLastLogFromModelDir(jobName string, lines int, resultPath string) map[string]interface{} { + prefix := "/" + setting.CBCodePathPrefix + jobName + resultPath files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "") if err != nil { log.Error("query cloudbrain model failed: %v", err) @@ -454,11 +491,81 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i re := "" fileName := "" count := 0 + allLines := 0 + startLine := 0 + for _, file := range files { + if strings.HasSuffix(file.FileName, "log.txt") { + fileName = file.FileName + path := storage.GetMinioPath(jobName+resultPath+"/", file.FileName) + allLines = getAllLineFromFile(path) + startLine = allLines - lines + if startLine < 0 { + startLine = 0 + } + count = allLines - startLine + log.Info("path=" + path) + reader, err := os.Open(path) + defer reader.Close() + if err == nil { + r := bufio.NewReader(reader) + for i := 0; i < allLines; i++ { + line, error := r.ReadString('\n') + if error == io.EOF { + log.Info("read file completed.") + break + } + if error != nil { + log.Info("read file error." + error.Error()) + break + } + if error == nil { + if i >= startLine { + re = re + line + } + } + } + } else { + log.Info("error:" + err.Error()) + } + break + } + } + + return map[string]interface{}{ + "JobName": jobName, + "Content": re, + "FileName": fileName, + "Lines": count, + "EndLine": allLines, + "StartLine": startLine, + } +} + +func getLogFromModelDir(jobName string, startLine int, endLine int, resultPath string) map[string]interface{} { + prefix := "/" + setting.CBCodePathPrefix + jobName + resultPath + files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "") + if err != nil { + log.Error("query cloudbrain model failed: %v", err) + return nil + } + if startLine == endLine { + return map[string]interface{}{ + "JobName": jobName, + "Content": "", + "FileName": "", + "Lines": 0, + "EndLine": startLine, + "StartLine": startLine, + } + } + re := "" + fileName := "" + count := 0 fileEndLine := endLine for _, file := range files { if strings.HasSuffix(file.FileName, "log.txt") { fileName = file.FileName - path := storage.GetMinioPath(jobName+"/model/", file.FileName) + path := storage.GetMinioPath(jobName+resultPath+"/", file.FileName) log.Info("path=" + path) reader, err := os.Open(path) defer reader.Close() @@ -467,7 +574,6 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i for i := 0; i < endLine; i++ { line, error := r.ReadString('\n') log.Info("line=" + line) - fileEndLine = i if error == io.EOF { log.Info("read file completed.") break @@ -478,11 +584,13 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i } if error == nil { if i >= startLine { + fileEndLine = i re = re + line count++ } } } + fileEndLine = fileEndLine + 1 } else { log.Info("error:" + err.Error()) } @@ -491,11 +599,12 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i } return map[string]interface{}{ - "JobName": jobName, - "Content": re, - "FileName": fileName, - "lines": count, - "endLine": fileEndLine, + "JobName": jobName, + "Content": re, + "FileName": fileName, + "Lines": count, + "EndLine": fileEndLine, + "StartLine": startLine, } } diff --git a/routers/private/internal.go b/routers/private/internal.go index 4731463b1..3e2eeab31 100755 --- a/routers/private/internal.go +++ b/routers/private/internal.go @@ -6,6 +6,7 @@ package private import ( + "code.gitea.io/gitea/routers/admin" "strings" "code.gitea.io/gitea/routers/repo" @@ -51,6 +52,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/tool/org_stat", OrgStatisticManually) m.Post("/tool/update_repo_visit/:date", UpdateRepoVisit) m.Post("/task/history_handle/duration", repo.HandleTaskWithNoDuration) + m.Post("/resources/specification/handle_historical_task", admin.RefreshHistorySpec) }, CheckInternalToken) } diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 2d8bebf4b..7020f0a61 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -2,6 +2,7 @@ package repo import ( "bufio" + "code.gitea.io/gitea/services/cloudbrain/resource" "encoding/json" "errors" "fmt" @@ -121,86 +122,7 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { ctx.Data["QueuesDetail"] = queuesDetail } - cloudbrain.InitSpecialPool() - - if gpuInfos == nil { - json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) - } - ctx.Data["gpu_types"] = gpuInfos.GpuInfo - - if trainGpuInfos == nil { - json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) - } - ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo - - if inferenceGpuInfos == nil && setting.InferenceGpuTypes != "" { - json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) - } - if inferenceGpuInfos != nil { - ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo - } - - if benchmarkGpuInfos == nil { - json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) - } - ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo - - if benchmarkResourceSpecs == nil { - json.Unmarshal([]byte(setting.BenchmarkResourceSpecs), &benchmarkResourceSpecs) - } - ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec - - if cloudbrain.ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) - } - ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec - - if cloudbrain.TrainResourceSpecs == nil { - json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) - } - ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec - - if cloudbrain.InferenceResourceSpecs == nil && setting.InferenceResourceSpecs != "" { - json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) - } - if cloudbrain.InferenceResourceSpecs != nil { - ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec - } - - if cloudbrain.SpecialPools != nil { - var debugGpuTypes []*models.GpuInfo - var trainGpuTypes []*models.GpuInfo - - for _, pool := range cloudbrain.SpecialPools.Pools { - isOrgMember, _ := models.IsOrganizationMemberByOrgName(pool.Org, ctx.User.ID) - if isOrgMember { - for _, jobType := range pool.JobType { - if jobType == string(models.JobTypeDebug) { - debugGpuTypes = append(debugGpuTypes, pool.Pool...) - if pool.ResourceSpec != nil { - ctx.Data["resource_specs"] = pool.ResourceSpec - } - } else if jobType == string(models.JobTypeTrain) { - trainGpuTypes = append(trainGpuTypes, pool.Pool...) - if pool.ResourceSpec != nil { - ctx.Data["train_resource_specs"] = pool.ResourceSpec - } - } - } - break - } - - } - - if len(debugGpuTypes) > 0 { - ctx.Data["gpu_types"] = debugGpuTypes - } - - if len(trainGpuTypes) > 0 { - ctx.Data["train_gpu_types"] = trainGpuTypes - } - - } + prepareCloudbrainOneSpecs(ctx) ctx.Data["params"] = "" ctx.Data["branchName"] = ctx.Repo.BranchName @@ -218,6 +140,40 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { return nil } +func prepareCloudbrainOneSpecs(ctx *context.Context) { + debugSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeDebug, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + }) + ctx.Data["debug_specs"] = debugSpecs + + trainSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + }) + ctx.Data["train_specs"] = trainSpecs + + inferenceSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeInference, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + }) + ctx.Data["inference_specs"] = inferenceSpecs + + benchmarkSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeBenchmark, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + }) + ctx.Data["benchmark_specs"] = benchmarkSpecs +} + func CloudBrainNew(ctx *context.Context) { err := cloudBrainNewDataPrepare(ctx) if err != nil { @@ -235,9 +191,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { image := strings.TrimSpace(form.Image) uuids := form.Attachment jobType := form.JobType - gpuQueue := form.GpuType codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath - resourceSpecId := form.ResourceSpecId branchName := form.BranchName bootFile := strings.TrimSpace(form.BootFile) repo := ctx.Repo.Repository @@ -325,18 +279,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { command = commandTrain } - errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId) - - if errStr != "" { - cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr(errStr, tpl, &form) - return - } - if branchName == "" { branchName = cloudbrain.DefaultBranchName } - errStr = loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath) + errStr := loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath) if errStr != "" { cloudBrainNewDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr(errStr), tpl, &form) @@ -345,6 +291,17 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobType(jobType), + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne}) + if err != nil || spec == nil { + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("Resource specification not available", tpl, &form) + return + } + req := cloudbrain.GenerateCloudBrainTaskReq{ Ctx: ctx, DisplayJobName: displayJobName, @@ -360,7 +317,6 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), JobType: jobType, - GpuQueue: gpuQueue, Description: form.Description, BranchName: branchName, BootFile: form.BootFile, @@ -368,8 +324,8 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { CommitID: commitID, BenchmarkTypeID: 0, BenchmarkChildTypeID: 0, - ResourceSpecId: resourceSpecId, ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"), + Spec: spec, } err = cloudbrain.GenerateTask(req) @@ -417,9 +373,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra image := strings.TrimSpace(form.Image) uuid := form.Attachment jobType := string(models.JobTypeInference) - gpuQueue := form.GpuType codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath - resourceSpecId := form.ResourceSpecId branchName := form.BranchName bootFile := strings.TrimSpace(form.BootFile) labelName := form.LabelName @@ -501,7 +455,16 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form) return } - + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeInference, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne}) + if err != nil || spec == nil { + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("Resource specification not available", tpl, &form) + return + } req := cloudbrain.GenerateCloudBrainTaskReq{ Ctx: ctx, DisplayJobName: displayJobName, @@ -517,19 +480,18 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), JobType: jobType, - GpuQueue: gpuQueue, Description: form.Description, BranchName: branchName, BootFile: form.BootFile, Params: form.Params, CommitID: commitID, - ResourceSpecId: resourceSpecId, ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"), ModelName: form.ModelName, ModelVersion: form.ModelVersion, CkptName: form.CkptName, TrainUrl: form.TrainUrl, LabelName: labelName, + Spec: spec, } err = cloudbrain.GenerateTask(req) @@ -607,34 +569,25 @@ func CloudBrainRestart(ctx *context.Context) { break } - var hasSameResource bool - if gpuInfos == nil { - json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) - } - for _, resourceType := range gpuInfos.GpuInfo { - if resourceType.Queue == task.GpuQueue { - hasSameResource = true - break - } - } - if !hasSameResource && cloudbrain.SpecialPools != nil { - - for _, specialPool := range cloudbrain.SpecialPools.Pools { - cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug)) - for _, pool := range specialPool.Pool { - if pool.Queue == task.GpuQueue { - hasSameResource = true - } - } - } + specOld, err := resource.GetCloudbrainSpec(task.ID) + if err != nil || specOld == nil { + log.Error("CloudBrainRestart GetCloudbrainSpec error.task.id = %d", task.ID) + resultCode = "-1" + errorMsg = "Resource specification not support any more" + break } - - if !hasSameResource { - log.Error("has no same resource, can not restart", ctx.Data["MsgID"]) + spec, err := resource.GetAndCheckSpec(ctx.User.ID, specOld.ID, models.FindSpecsOptions{ + JobType: models.JobType(task.JobType), + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne}) + if err != nil || spec == nil { + log.Error("CloudBrainRestart GetAndCheckSpec error.task.id = %d", task.ID) resultCode = "-1" - errorMsg = "the job's version is too old and can not be restarted" + errorMsg = "Resource specification not support any more" break } + task.Spec = spec count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, string(models.JobTypeDebug)) if err != nil { @@ -707,128 +660,13 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo ctx.NotFound(ctx.Req.URL.RequestURI(), nil) return } - hasSpec := false - if task.JobType == string(models.JobTypeTrain) { - if cloudbrain.TrainResourceSpecs == nil { - json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) - } - - for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec { - if tmp.Id == task.ResourceSpecId { - hasSpec = true - ctx.Data["GpuNum"] = tmp.GpuNum - ctx.Data["CpuNum"] = tmp.CpuNum - ctx.Data["MemMiB"] = tmp.MemMiB - ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB - break - } - } - - } else if task.JobType == string(models.JobTypeInference) { - if cloudbrain.InferenceResourceSpecs == nil { - json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) - } - for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { - if tmp.Id == task.ResourceSpecId { - hasSpec = true - ctx.Data["GpuNum"] = tmp.GpuNum - ctx.Data["CpuNum"] = tmp.CpuNum - ctx.Data["MemMiB"] = tmp.MemMiB - ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB - break - } - } - } else { - if cloudbrain.ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) - } - for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec { - if tmp.Id == task.ResourceSpecId { - hasSpec = true - ctx.Data["GpuNum"] = tmp.GpuNum - ctx.Data["CpuNum"] = tmp.CpuNum - ctx.Data["MemMiB"] = tmp.MemMiB - ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB - break - - } - } - } - - if !hasSpec && cloudbrain.SpecialPools != nil { - - for _, specialPool := range cloudbrain.SpecialPools.Pools { - - if specialPool.ResourceSpec != nil { - - for _, spec := range specialPool.ResourceSpec { - if task.ResourceSpecId == spec.Id { - ctx.Data["GpuNum"] = spec.GpuNum - ctx.Data["CpuNum"] = spec.CpuNum - ctx.Data["MemMiB"] = spec.MemMiB - ctx.Data["ShareMemMiB"] = spec.ShareMemMiB - break - } - } - } - } + prepareSpec4Show(ctx, task) + if ctx.Written() { + return } if result != nil { jobRes, _ := models.ConvertToJobResultPayload(result.Payload) - jobRes.Resource.Memory = strings.ReplaceAll(jobRes.Resource.Memory, "Mi", "MB") - spec := "GPU数:" + strconv.Itoa(jobRes.Resource.NvidiaComGpu) + ",CPU数:" + strconv.Itoa(jobRes.Resource.CPU) + ",内存(MB):" + jobRes.Resource.Memory - ctx.Data["resource_spec"] = spec - if task.JobType == string(models.JobTypeTrain) { - if trainGpuInfos == nil { - json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) - } - for _, resourceType := range trainGpuInfos.GpuInfo { - if resourceType.Queue == jobRes.Config.GpuType { - ctx.Data["resource_type"] = resourceType.Value - } - } - - } else if task.JobType == string(models.JobTypeInference) { - if inferenceGpuInfos == nil { - json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) - } - for _, resourceType := range inferenceGpuInfos.GpuInfo { - if resourceType.Queue == jobRes.Config.GpuType { - ctx.Data["resource_type"] = resourceType.Value - } - } - } else if cloudbrain.IsBenchmarkJob(task.JobType) { - if benchmarkGpuInfos == nil { - json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) - } - - for _, resourceType := range benchmarkGpuInfos.GpuInfo { - if resourceType.Queue == jobRes.Config.GpuType { - ctx.Data["resource_type"] = resourceType.Value - } - } - - } else { - if gpuInfos == nil { - json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) - } - for _, resourceType := range gpuInfos.GpuInfo { - if resourceType.Queue == jobRes.Config.GpuType { - ctx.Data["resource_type"] = resourceType.Value - } - } - } - - if cloudbrain.SpecialPools != nil { - for _, specialPool := range cloudbrain.SpecialPools.Pools { - for _, resourceType := range specialPool.Pool { - if resourceType.Queue == jobRes.Config.GpuType { - ctx.Data["resource_type"] = resourceType.Value - } - } - } - } taskRoles := jobRes.TaskRoles taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) ctx.Data["taskRes"] = taskRes @@ -952,6 +790,85 @@ func CloudBrainDebug(ctx *context.Context) { ctx.Redirect(debugUrl) } +func prepareSpec4Show(ctx *context.Context, task *models.Cloudbrain) { + s, err := resource.GetCloudbrainSpec(task.ID) + if err != nil { + log.Info("error:" + err.Error()) + ctx.NotFound(ctx.Req.URL.RequestURI(), nil) + return + } + ctx.Data["Spec"] = s +} + +func oldPrepareSpec4Show(ctx *context.Context, task *models.Cloudbrain) { + hasSpec := false + if task.JobType == string(models.JobTypeTrain) { + if cloudbrain.TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) + } + + for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + ctx.Data["GpuNum"] = tmp.GpuNum + ctx.Data["CpuNum"] = tmp.CpuNum + ctx.Data["MemMiB"] = tmp.MemMiB + ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB + break + } + } + + } else if task.JobType == string(models.JobTypeInference) { + if cloudbrain.InferenceResourceSpecs == nil { + json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) + } + for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + ctx.Data["GpuNum"] = tmp.GpuNum + ctx.Data["CpuNum"] = tmp.CpuNum + ctx.Data["MemMiB"] = tmp.MemMiB + ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB + break + } + } + } else { + if cloudbrain.ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) + } + for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + ctx.Data["GpuNum"] = tmp.GpuNum + ctx.Data["CpuNum"] = tmp.CpuNum + ctx.Data["MemMiB"] = tmp.MemMiB + ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB + break + + } + } + } + + if !hasSpec && cloudbrain.SpecialPools != nil { + + for _, specialPool := range cloudbrain.SpecialPools.Pools { + + if specialPool.ResourceSpec != nil { + + for _, spec := range specialPool.ResourceSpec { + if task.ResourceSpecId == spec.Id { + ctx.Data["GpuNum"] = spec.GpuNum + ctx.Data["CpuNum"] = spec.CpuNum + ctx.Data["MemMiB"] = spec.MemMiB + ctx.Data["ShareMemMiB"] = spec.ShareMemMiB + break + } + } + } + } + } +} + func CloudBrainCommitImageShow(ctx *context.Context) { ctx.Data["PageIsCloudBrain"] = true ctx.Data["Type"] = ctx.Cloudbrain.Type @@ -2285,10 +2202,8 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo displayJobName := form.DisplayJobName jobName := util.ConvertDisplayJobNameToJobName(displayJobName) image := strings.TrimSpace(form.Image) - gpuQueue := form.GpuType command := cloudbrain.CommandBenchmark codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath - resourceSpecId := cloudbrain.BenchMarkResourceID benchmarkTypeID := form.BenchmarkTypeID benchmarkChildTypeID := form.BenchmarkChildTypeID @@ -2329,19 +2244,14 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo return } - _, err = getBenchmarkGpuQueue(gpuQueue) - if err != nil { - log.Error("getBenchmarkGpuQueue failed:%v", err, ctx.Data["MsgID"]) - cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("gpu queue error", tplCloudBrainBenchmarkNew, &form) - return - } - - _, err = getBenchmarkResourceSpec(resourceSpecId) - if err != nil { - log.Error("getBenchmarkResourceSpec failed:%v", err, ctx.Data["MsgID"]) + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeBenchmark, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne}) + if err != nil || spec == nil { cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("resource spec error", tplCloudBrainBenchmarkNew, &form) + ctx.RenderWithErr("Resource specification not available", tplCloudBrainBenchmarkNew, &form) return } @@ -2402,14 +2312,8 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo } benchmarkPath := setting.JobPath + jobName + cloudbrain.BenchMarkMountPath - var gpuType string - for _, gpuInfo := range gpuInfos.GpuInfo { - if gpuInfo.Queue == gpuQueue { - gpuType = gpuInfo.Value - } - } - if err := downloadRateCode(repo, jobName, childInfo.Owner, childInfo.RepoName, benchmarkPath, form.BenchmarkCategory, gpuType, ctx.User.Name); err != nil { + if err := downloadRateCode(repo, jobName, childInfo.Owner, childInfo.RepoName, benchmarkPath, form.BenchmarkCategory, spec.AccCardType, ctx.User.Name); err != nil { log.Error("downloadRateCode failed, %v", err, ctx.Data["MsgID"]) //cloudBrainNewDataPrepare(ctx) //ctx.RenderWithErr("system error", tplCloudBrainBenchmarkNew, &form) @@ -2431,7 +2335,8 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, &form) return } - + log.Info("Command=" + command) + log.Info("ModelPath=" + storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/")) req := cloudbrain.GenerateCloudBrainTaskReq{ Ctx: ctx, DisplayJobName: displayJobName, @@ -2447,7 +2352,6 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), JobType: string(models.JobTypeBenchmark), - GpuQueue: gpuQueue, Description: form.Description, BranchName: cloudbrain.DefaultBranchName, BootFile: "", @@ -2455,8 +2359,8 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo CommitID: "", BenchmarkTypeID: benchmarkTypeID, BenchmarkChildTypeID: benchmarkChildTypeID, - ResourceSpecId: resourceSpecId, ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"), + Spec: spec, } err = cloudbrain.GenerateTask(req) @@ -2476,9 +2380,7 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) image := form.Image uuid := form.Attachment jobType := form.JobType - gpuQueue := form.GpuType codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath - resourceSpecId := form.ResourceSpecId branchName := cloudbrain.DefaultBranchName repo := ctx.Repo.Repository @@ -2560,7 +2462,18 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form) return } - + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeBenchmark, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne}) + if err != nil || spec == nil { + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("Resource specification not available", tpl, &form) + return + } + log.Info("Command=" + command) + log.Info("ModelPath=" + storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/")) req := cloudbrain.GenerateCloudBrainTaskReq{ Ctx: ctx, DisplayJobName: displayJobName, @@ -2576,7 +2489,6 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), JobType: jobType, - GpuQueue: gpuQueue, Description: form.Description, BranchName: branchName, BootFile: form.BootFile, @@ -2584,8 +2496,8 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) CommitID: "", BenchmarkTypeID: 0, BenchmarkChildTypeID: benchmarkChildTypeID, - ResourceSpecId: resourceSpecId, ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"), + Spec: spec, } err = cloudbrain.GenerateTask(req) @@ -2718,7 +2630,7 @@ func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { } } - command += "python /code/" + bootFile + param + " | tee " + cloudbrain.ModelMountPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile + command += "python /code/" + bootFile + param + " > " + cloudbrain.ModelMountPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile return command, nil } @@ -2763,6 +2675,8 @@ func GetCloudbrainAiCenter(task models.Cloudbrain, ctx *context.Context) string return ctx.Tr("repo.cloudbrain1") } else if task.Type == models.TypeCloudBrainTwo { return ctx.Tr("repo.cloudbrain2") + } else if task.Type == models.TypeCDCenter { + return ctx.Tr("repo.cdCenter") } else if task.Type == models.TypeC2Net { return getCutStringAiCenterByAiCenter(task.AiCenter) } @@ -2777,7 +2691,7 @@ func getCutStringAiCenterByAiCenter(aiCenter string) string { } func GetCloudbrainCluster(task models.Cloudbrain, ctx *context.Context) string { - if task.Type == models.TypeCloudBrainOne || task.Type == models.TypeCloudBrainTwo { + if task.Type == models.TypeCloudBrainOne || task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeCDCenter { return ctx.Tr("cloudbrain.resource_cluster_openi") } else if task.Type == models.TypeC2Net { return ctx.Tr("cloudbrain.resource_cluster_c2net") @@ -2864,10 +2778,10 @@ func GetCloudbrainFlavorName(task models.Cloudbrain) (string, error) { return CloudbrainOneFlavorName, nil } } - } else if (task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeC2Net) && task.FlavorName != "" { + } else if (task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeC2Net || task.Type == models.TypeCDCenter) && task.FlavorName != "" { replaceFlavorName := strings.ReplaceAll(task.FlavorName, ":", ":") return replaceFlavorName, nil - } else if task.Type == models.TypeCloudBrainTwo && task.FlavorName == "" && task.FlavorCode != "" { + } else if (task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeCDCenter) && task.FlavorName == "" && task.FlavorCode != "" { cloudbrainTwoFlavorName := getFlavorNameByFlavorCode(task.FlavorCode) return cloudbrainTwoFlavorName, nil } else if task.Type == models.TypeCloudBrainTwo && task.JobType == string(models.JobTypeDebug) && task.FlavorName == "" && task.FlavorCode == "" { diff --git a/routers/repo/dataset.go b/routers/repo/dataset.go index d65a9f2aa..f0e41024b 100755 --- a/routers/repo/dataset.go +++ b/routers/repo/dataset.go @@ -45,15 +45,10 @@ func newFilterPrivateAttachments(ctx *context.Context, list []*models.Attachment repo.GetOwner() } permission := false - if repo.Owner.IsOrganization() && ctx.User != nil { - if repo.Owner.IsUserPartOfOrg(ctx.User.ID) { - log.Info("user is member of org.") - permission = true - } - } if !permission && ctx.User != nil { isCollaborator, _ := repo.IsCollaborator(ctx.User.ID) - if isCollaborator { + isInRepoTeam,_:=repo.IsInRepoTeam(ctx.User.ID) + if isCollaborator ||isInRepoTeam { log.Info("Collaborator user may visit the attach.") permission = true } diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 6fc77a454..d7e799427 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1,6 +1,7 @@ package repo import ( + "code.gitea.io/gitea/services/cloudbrain/resource" "encoding/json" "errors" "fmt" @@ -106,15 +107,11 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err } } - //get valid resource specs - specs, err := grampus.GetResourceSpecs(processType) - - grampusSpecs := getFilterSpecBySpecialPool(specs, includeCenters, excludeCenters) - - if err != nil { - log.Error("GetResourceSpecs failed:", err.Error()) - } else { - ctx.Data["flavor_infos"] = grampusSpecs + //prepare available specs + if processType == grampus.ProcessorTypeNPU { + prepareGrampusTrainSpecs(ctx, models.NPU) + } else if processType == grampus.ProcessorTypeGPU { + prepareGrampusTrainSpecs(ctx, models.GPU) } //get branches @@ -140,6 +137,15 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err return nil } +func prepareGrampusTrainSpecs(ctx *context.Context, computeResource string) { + noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: computeResource, + Cluster: models.C2NetCluster, + }) + ctx.Data["Specs"] = noteBookSpecs +} + func getFilterSpecBySpecialPool(specs *models.GetGrampusResourceSpecsResult, includeCenters map[string]struct{}, excludeCenters map[string]struct{}) []models.GrampusSpec { if len(includeCenters) == 0 && len(excludeCenters) == 0 { return specs.Infos @@ -206,7 +212,6 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/" dataMinioPath := setting.Attachment.Minio.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid branchName := form.BranchName - flavorName := form.FlavorName image := strings.TrimSpace(form.Image) if !jobNamePattern.MatchString(displayJobName) { @@ -272,6 +277,18 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } } + //check specification + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.GPU, + Cluster: models.C2NetCluster, + }) + if err != nil || spec == nil { + grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) + ctx.RenderWithErr("Resource specification not available", tplGrampusTrainJobGPUNew, &form) + return + } + //check dataset attachment, err := models.GetAttachmentByUUID(uuid) if err != nil { @@ -336,7 +353,6 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain ComputeResource: models.GPUResource, ProcessType: grampus.ProcessorTypeGPU, Command: command, - ResourceSpecId: form.FlavorID, ImageUrl: image, Description: description, BootFile: bootFile, @@ -344,12 +360,12 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain CommitID: commitID, BranchName: branchName, Params: form.Params, - FlavorName: flavorName, EngineName: image, DatasetName: attachment.Name, IsLatestVersion: modelarts.IsLatestVersion, VersionCount: modelarts.VersionCountOne, WorkServerNumber: 1, + Spec: spec, } err = grampus.GenerateTrainJob(ctx, req) @@ -397,7 +413,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" branchName := form.BranchName isLatestVersion := modelarts.IsLatestVersion - flavorName := form.FlavorName versionCount := modelarts.VersionCountOne engineName := form.EngineName @@ -464,6 +479,18 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } } + //check specification + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.NPU, + Cluster: models.C2NetCluster, + }) + if err != nil || spec == nil { + grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) + ctx.RenderWithErr("Resource specification not available", tplGrampusTrainJobNPUNew, &form) + return + } + //check dataset attachment, err := models.GetAttachmentByUUID(uuid) if err != nil { @@ -518,7 +545,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain ComputeResource: models.NPUResource, ProcessType: grampus.ProcessorTypeNPU, Command: command, - ResourceSpecId: form.FlavorID, ImageId: form.ImageID, DataUrl: dataObsPath, Description: description, @@ -531,11 +557,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain IsLatestVersion: isLatestVersion, BranchName: branchName, Params: form.Params, - FlavorName: flavorName, EngineName: engineName, VersionCount: versionCount, TotalVersionCount: modelarts.TotalVersionCount, DatasetName: attachment.Name, + Spec: spec, } err = grampus.GenerateTrainJob(ctx, req) @@ -712,6 +738,7 @@ func GrampusTrainJobShow(ctx *context.Context) { taskList := make([]*models.Cloudbrain, 0) taskList = append(taskList, task) + prepareSpec4Show(ctx, task) ctx.Data["version_list_task"] = taskList ctx.Data["datasetDownload"] = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false) ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 847e831f6..b4f6f000e 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -2,6 +2,8 @@ package repo import ( "archive/zip" + "code.gitea.io/gitea/modules/modelarts_cd" + "code.gitea.io/gitea/services/cloudbrain/resource" "encoding/json" "errors" "fmt" @@ -60,18 +62,11 @@ func DebugJobIndex(ctx *context.Context) { if page <= 0 { page = 1 } - typeCloudBrain := models.TypeCloudBrainAll + jobTypeNot := false - if listType == models.GPUResource { - typeCloudBrain = models.TypeCloudBrainOne - } else if listType == models.NPUResource { - typeCloudBrain = models.TypeCloudBrainTwo - } else if listType == models.AllResource { - typeCloudBrain = models.TypeCloudBrainAll - } else { - log.Error("listType(%s) error", listType) - ctx.ServerError("listType error", errors.New("listType error")) - return + var computeResource string + if listType != models.AllResource { + computeResource = listType } var jobTypes []string @@ -81,10 +76,11 @@ func DebugJobIndex(ctx *context.Context) { Page: page, PageSize: setting.UI.IssuePagingNum, }, - RepoID: repo.ID, - Type: typeCloudBrain, - JobTypeNot: jobTypeNot, - JobTypes: jobTypes, + RepoID: repo.ID, + ComputeResource: computeResource, + Type: models.TypeCloudBrainAll, + JobTypeNot: jobTypeNot, + JobTypes: jobTypes, }) if err != nil { ctx.ServerError("Get debugjob faild:", err) @@ -134,17 +130,9 @@ func notebookNewDataPrepare(ctx *context.Context) error { return err } ctx.Data["attachments"] = attachs + ctx.Data["images"] = setting.StImageInfos.ImageInfo - if modelarts.ImageInfos == nil { - json.Unmarshal([]byte(setting.ImageInfos), &modelarts.ImageInfos) - } - ctx.Data["images"] = modelarts.ImageInfos.ImageInfo - - if modelarts.FlavorInfos == nil { - json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos) - } - ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeDebug)) + prepareCloudbrainTwoDebugSpecs(ctx) ctx.Data["datasetType"] = models.TypeCloudBrainTwo @@ -154,6 +142,20 @@ func notebookNewDataPrepare(ctx *context.Context) error { return nil } +func prepareCloudbrainTwoDebugSpecs(ctx *context.Context) { + aiCenterCode := models.AICenterOfCloudBrainTwo + if setting.ModelartsCD.Enabled { + aiCenterCode = models.AICenterOfChengdu + } + noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeDebug, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: aiCenterCode, + }) + ctx.Data["Specs"] = noteBookSpecs +} + func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) { ctx.Data["PageIsNotebook"] = true jobName := form.JobName @@ -204,7 +206,6 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm jobName := util.ConvertDisplayJobNameToJobName(displayJobName) uuid := form.Attachment description := form.Description - flavor := form.Flavor imageId := form.ImageId repo := ctx.Repo.Repository @@ -239,15 +240,26 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm return } } - - errStr := checkModelArtsSpecialPool(ctx, flavor, string(models.JobTypeDebug)) - if errStr != "" { + var aiCenterCode = models.AICenterOfCloudBrainTwo + if setting.ModelartsCD.Enabled { + aiCenterCode = models.AICenterOfChengdu + } + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeDebug, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: aiCenterCode}) + if err != nil || spec == nil { notebookNewDataPrepare(ctx) - ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsNotebookNew, &form) + ctx.RenderWithErr("Resource specification not available", tplModelArtsNotebookNew, &form) return } + if setting.ModelartsCD.Enabled { + err = modelarts_cd.GenerateNotebook(ctx, displayJobName, jobName, uuid, description, imageId, spec) + } else { + err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, imageId, spec) + } - err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, flavor, imageId) if err != nil { log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"]) notebookNewDataPrepare(ctx) @@ -292,24 +304,7 @@ func NotebookShow(ctx *context.Context) { if err == nil { task.User = user } - if modelarts.FlavorInfos == nil { - json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos) - } - - findSpec := false - if modelarts.FlavorInfos != nil { - ctx.Data["resource_spec"] = modelarts.FlavorInfos.FlavorInfo[0].Desc - for _, f := range modelarts.FlavorInfos.FlavorInfo { - if fmt.Sprint(f.Value) == task.FlavorCode { - ctx.Data["resource_spec"] = f.Desc - findSpec = true - break - } - } - } - - setShowSpecBySpecialPoolConfig(ctx, findSpec, task) - + prepareSpec4Show(ctx, task) if task.TrainJobDuration == "" { if task.Duration == 0 { var duration int64 @@ -394,36 +389,16 @@ func setShowSpecBySpecialPoolConfig(ctx *context.Context, findSpec bool, task *m } } -func NotebookDebug(ctx *context.Context) { - var jobID = ctx.Params(":jobid") - - result, err := modelarts.GetJob(jobID) - if err != nil { - ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) - return - } - - res, err := modelarts.GetJobToken(jobID) - if err != nil { - ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) - return - } - - urls := strings.Split(result.Spec.Annotations.Url, "/") - urlPrefix := result.Spec.Annotations.TargetDomain - for i, url := range urls { - if i > 2 { - urlPrefix += "/" + url - } - } - - debugUrl := urlPrefix + "?token=" + res.Token - ctx.Redirect(debugUrl) -} - func NotebookDebug2(ctx *context.Context) { + var err error + var result *models.GetNotebook2Result task := ctx.Cloudbrain - result, err := modelarts.GetNotebook2(task.JobID) + if task.Type == models.TypeCloudBrainTwo { + result, err = modelarts.GetNotebook2(task.JobID) + } else if task.Type == models.TypeCDCenter { + result, err = modelarts_cd.GetNotebook(task.JobID) + } + if err != nil { ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) return @@ -437,6 +412,7 @@ func NotebookRestart(ctx *context.Context) { var resultCode = "-1" var errorMsg = "" var status = "" + var spec *models.Specification task := ctx.Cloudbrain @@ -464,12 +440,40 @@ func NotebookRestart(ctx *context.Context) { } } + oldSpec, err := resource.GetCloudbrainSpec(task.ID) + if err != nil || oldSpec == nil { + log.Error("NotebookManage GetCloudbrainSpec error.%v", err) + errorMsg = "Resource specification not available" + break + } + + aiCenterCode := models.AICenterOfCloudBrainTwo + if task.Type == models.TypeCDCenter { + aiCenterCode = models.AICenterOfChengdu + } + spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{ + JobType: models.JobType(task.JobType), + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: aiCenterCode}) + if err != nil || spec == nil { + log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID) + errorMsg = "Resource specification not support any more" + break + } + createTime := timeutil.TimeStampNow() param := models.NotebookAction{ Action: models.ActionStart, } - res, err := modelarts.ManageNotebook2(task.JobID, param) + var res *models.NotebookActionResult + if task.Type == models.TypeCloudBrainTwo { + res, err = modelarts.ManageNotebook2(task.JobID, param) + } else if task.Type == models.TypeCDCenter { + res, err = modelarts_cd.ManageNotebook(task.JobID, param) + } + if err != nil { log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"]) /* 暂不处理再次调试502的场景,详情见方案 @@ -507,8 +511,7 @@ func NotebookRestart(ctx *context.Context) { Description: task.Description, CreatedUnix: createTime, UpdatedUnix: createTime, - FlavorCode: task.FlavorCode, - FlavorName: task.FlavorName, + Spec: spec, } err = models.RestartCloudbrain(task, newTask) @@ -555,7 +558,14 @@ func NotebookStop(ctx *context.Context) { Action: models.ActionStop, } - res, err := modelarts.ManageNotebook2(task.JobID, param) + var err error + var res *models.NotebookActionResult + if task.Type == models.TypeCloudBrainTwo { + res, err = modelarts.ManageNotebook2(task.JobID, param) + } else if task.Type == models.TypeCDCenter { + res, err = modelarts_cd.ManageNotebook(task.JobID, param) + } + if err != nil { log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) resultCode = "-1" @@ -605,7 +615,13 @@ func NotebookDel(ctx *context.Context) { return } - _, err := modelarts.DelNotebook2(task.JobID) + var err error + if task.Type == models.TypeCloudBrainTwo { + _, err = modelarts.DelNotebook2(task.JobID) + } else if task.Type == models.TypeCDCenter { + _, err = modelarts_cd.DelNotebook(task.JobID) + } + if err != nil { log.Error("DelNotebook2(%s) failed:%v", task.JobName, err.Error()) if strings.Contains(err.Error(), modelarts.NotebookNotFound) || strings.Contains(err.Error(), modelarts.NotebookNoPermission) || strings.Contains(err.Error(), modelarts.NotebookInvalid) { @@ -741,14 +757,7 @@ func trainJobNewDataPrepare(ctx *context.Context) error { } ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["flavor_infos"] = flavorInfos.Info - - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) + prepareCloudbrainTwoTrainSpecs(ctx) ctx.Data["params"] = "" ctx.Data["branchName"] = ctx.Repo.BranchName @@ -763,9 +772,33 @@ func trainJobNewDataPrepare(ctx *context.Context) error { waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount + setMultiNodeIfConfigureMatch(ctx) + return nil } +func prepareCloudbrainTwoTrainSpecs(ctx *context.Context) { + noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo, + }) + ctx.Data["Specs"] = noteBookSpecs +} + +func setMultiNodeIfConfigureMatch(ctx *context.Context) { + modelarts.InitMultiNode() + if modelarts.MultiNodeConfig != nil { + for _, info := range modelarts.MultiNodeConfig.Info { + if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, ctx.User.ID); isInOrg { + ctx.Data["WorkNode"] = info.Node + break + } + } + } +} + func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) { modelarts.InitSpecialPool() @@ -848,13 +881,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts } ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["flavor_infos"] = flavorInfos.Info - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) + prepareCloudbrainTwoTrainSpecs(ctx) configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) if err != nil { @@ -880,6 +907,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts ctx.Data["datasetType"] = models.TypeCloudBrainTwo waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount + setMultiNodeIfConfigureMatch(ctx) return nil } @@ -942,14 +970,12 @@ func trainJobNewVersionDataPrepare(ctx *context.Context) error { } ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err + prepareCloudbrainTwoTrainSpecs(ctx) + spec, _ := resource.GetCloudbrainSpec(task.ID) + if spec != nil { + log.Info("spec_id = %d", spec.ID) + ctx.Data["spec_id"] = spec.ID } - ctx.Data["flavor_infos"] = flavorInfos.Info - - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) var Parameters modelarts.Parameters if err = json.Unmarshal([]byte(task.Parameters), &Parameters); err != nil { @@ -1040,13 +1066,7 @@ func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrai } ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["flavor_infos"] = flavorInfos.Info - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) + prepareCloudbrainTwoTrainSpecs(ctx) var Parameters modelarts.Parameters if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil { @@ -1099,7 +1119,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) workServerNumber := form.WorkServerNumber engineID := form.EngineID bootFile := strings.TrimSpace(form.BootFile) - flavorCode := form.Flavor params := form.Params poolID := form.PoolID //isSaveParam := form.IsSaveParam @@ -1115,6 +1134,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) VersionCount := modelarts.VersionCountOne EngineName := form.EngineName + errStr := checkMultiNode(ctx.User.ID, form.WorkServerNumber) + if errStr != "" { + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) + return + } + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -1145,10 +1171,14 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) return } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) - if errStr != "" { + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo}) + if err != nil || spec == nil { trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobNew, &form) return } //Determine whether the task name of the task in the project is duplicated @@ -1311,7 +1341,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) BootFileUrl: codeObsPath + bootFile, BootFile: bootFile, TrainUrl: outputObsPath, - FlavorCode: flavorCode, WorkServerNumber: workServerNumber, EngineID: int64(engineID), LogUrl: logObsPath, @@ -1327,6 +1356,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) VersionCount: VersionCount, TotalVersionCount: modelarts.TotalVersionCount, DatasetName: datasetNames, + Spec: spec, } userCommand, userImageUrl := getUserCommand(engineID, req) req.UserCommand = userCommand @@ -1349,6 +1379,48 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func checkMultiNode(userId int64, serverNum int) string { + if serverNum == 1 { + return "" + } + modelarts.InitMultiNode() + var isServerNumValid = false + if modelarts.MultiNodeConfig != nil { + for _, info := range modelarts.MultiNodeConfig.Info { + if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, userId); isInOrg { + if isInNodes(info.Node, serverNum) { + isServerNumValid = true + break + } + + } + } + } + if isServerNumValid { + return "" + } else { + return "repo.modelarts.no_node_right" + } +} +func checkInferenceJobMultiNode(userId int64, serverNum int) string { + if serverNum == 1 { + return "" + } + + return "repo.modelarts.no_node_right" + +} + +func isInNodes(nodes []int, num int) bool { + for _, node := range nodes { + if node == num { + return true + } + } + return false + +} + func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, string) { userImageUrl := "" userCommand := "" @@ -1383,6 +1455,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") + errStr := checkMultiNode(ctx.User.ID, form.WorkServerNumber) + if errStr != "" { + versionErrorDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) + return + } + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -1412,7 +1491,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ workServerNumber := form.WorkServerNumber engineID := form.EngineID bootFile := strings.TrimSpace(form.BootFile) - flavorCode := form.Flavor params := form.Params poolID := form.PoolID //isSaveParam := form.IsSaveParam @@ -1450,10 +1528,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ return } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) - if errStr != "" { + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo}) + if err != nil || spec == nil { versionErrorDataPrepare(ctx, form) - ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobVersionNew, &form) return } @@ -1607,7 +1689,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ BootFileUrl: codeObsPath + bootFile, BootFile: bootFile, TrainUrl: outputObsPath, - FlavorCode: flavorCode, WorkServerNumber: workServerNumber, IsLatestVersion: isLatestVersion, EngineID: int64(engineID), @@ -1624,6 +1705,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ PreVersionName: PreVersionName, TotalVersionCount: latestTask.TotalVersionCount + 1, DatasetName: datasetNames, + Spec: spec, } userCommand, userImageUrl := getUserCommand(engineID, req) req.UserCommand = userCommand @@ -1719,10 +1801,6 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error { return errors.New("启动文件必须是python文件") } - if form.WorkServerNumber > 2 || form.WorkServerNumber < 1 { - log.Error("the WorkServerNumber(%d) must be in (1,2)", form.WorkServerNumber) - return errors.New("计算节点数必须在1-2之间") - } if form.BranchName == "" { log.Error("the branch must not be null!", form.BranchName) return errors.New("代码分支不能为空!") @@ -1811,7 +1889,6 @@ func TrainJobShow(ctx *context.Context) { for i, task := range VersionListTasks { var parameters models.Parameters - err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), ¶meters) if err != nil { log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err) @@ -1832,6 +1909,14 @@ func TrainJobShow(ctx *context.Context) { datasetList = append(datasetList, GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)) VersionListTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) VersionListTasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain) + + //add spec + s, err := resource.GetCloudbrainSpec(task.Cloudbrain.ID) + if err != nil { + log.Error("TrainJobShow GetCloudbrainSpec error:" + err.Error()) + continue + } + VersionListTasks[i].Cloudbrain.Spec = s } pager := context.NewPagination(VersionListCount, setting.UI.IssuePagingNum, page, 5) @@ -1999,7 +2084,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference workServerNumber := form.WorkServerNumber engineID := form.EngineID bootFile := strings.TrimSpace(form.BootFile) - flavorCode := form.Flavor params := form.Params poolID := form.PoolID repo := ctx.Repo.Repository @@ -2021,6 +2105,13 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference ckptUrl := "/" + form.TrainUrl + form.CkptName log.Info("ckpt url:" + ckptUrl) + errStr := checkInferenceJobMultiNode(ctx.User.ID, form.WorkServerNumber) + if errStr != "" { + inferenceJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) + return + } + count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -2069,13 +2160,16 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference } } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) - if errStr != "" { + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeInference, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo}) + if err != nil || spec == nil { inferenceJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) + ctx.RenderWithErr("Resource specification not available", tplModelArtsInferenceJobNew, &form) return } - //todo: del the codeLocalPath _, err = ioutil.ReadDir(codeLocalPath) if err == nil { @@ -2127,7 +2221,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid) if err != nil { inferenceJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) + ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form) return } dataPath := dataUrl @@ -2183,7 +2277,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference BootFileUrl: codeObsPath + bootFile, BootFile: bootFile, TrainUrl: trainUrl, - FlavorCode: flavorCode, WorkServerNumber: workServerNumber, EngineID: int64(engineID), LogUrl: logObsPath, @@ -2203,6 +2296,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference ModelVersion: modelVersion, CkptName: ckptName, ResultUrl: resultObsPath, + Spec: spec, DatasetName: datasetNames, } @@ -2245,7 +2339,7 @@ func checkModelArtsSpecialPool(ctx *context.Context, flavorCode string, jobType if !isMatchPool { isMatchSpec := false if jobType == string(models.JobTypeDebug) { - for _, flavor := range modelarts.FlavorInfos.FlavorInfo { + for _, flavor := range setting.StFlavorInfo.FlavorInfo { if flavor.Value == flavorCode { isMatchSpec = true break @@ -2383,14 +2477,7 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error { } ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - - ctx.Data["flavor_infos"] = flavorInfos.Info - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeInference)) + prepareCloudbrainTwoInferenceSpecs(ctx) ctx.Data["params"] = "" ctx.Data["branchName"] = ctx.Repo.BranchName @@ -2421,6 +2508,16 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error { return nil } +func prepareCloudbrainTwoInferenceSpecs(ctx *context.Context) { + noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeInference, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo, + }) + ctx.Data["Specs"] = noteBookSpecs +} + func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error { ctx.Data["PageIsCloudBrain"] = true @@ -2455,14 +2552,7 @@ func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModel return err } ctx.Data["engine_versions"] = versionInfos.Version - - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["flavor_infos"] = flavorInfos.Info - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeInference)) + prepareCloudbrainTwoInferenceSpecs(ctx) configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) if err != nil { @@ -2537,7 +2627,7 @@ func InferenceJobShow(ctx *context.Context) { } else { task.Parameters = "" } - + prepareSpec4Show(ctx, task) LabelName := strings.Fields(task.LabelName) ctx.Data["labelName"] = LabelName ctx.Data["jobID"] = jobID diff --git a/routers/response/response_list.go b/routers/response/response_list.go index 5e057bfd0..6514f3edd 100644 --- a/routers/response/response_list.go +++ b/routers/response/response_list.go @@ -2,3 +2,4 @@ package response var RESOURCE_QUEUE_NOT_AVAILABLE = &BizError{Code: 1001, Err: "resource queue not available"} var SPECIFICATION_NOT_EXIST = &BizError{Code: 1002, Err: "specification not exist"} +var SPECIFICATION_NOT_AVAILABLE = &BizError{Code: 1003, Err: "specification not available"} diff --git a/routers/user/home.go b/routers/user/home.go index d8c2565c6..78e6c00e9 100755 --- a/routers/user/home.go +++ b/routers/user/home.go @@ -836,14 +836,12 @@ func Cloudbrains(ctx *context.Context) { ctx.ServerError("Get job failed:", err) return } - + models.LoadSpecs4CloudbrainInfo(ciTasks) for i, task := range ciTasks { ciTasks[i].CanDebug = true ciTasks[i].CanDel = true ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource ciTasks[i].Cloudbrain.AiCenter = repo.GetCloudbrainAiCenter(task.Cloudbrain, ctx) - _, cardType, _ := repo.GetCloudbrainCardNumAndType(task.Cloudbrain) - ciTasks[i].Cloudbrain.CardType = cardType ciTasks[i].Cloudbrain.Cluster = repo.GetCloudbrainCluster(task.Cloudbrain, ctx) } diff --git a/services/cloudbrain/resource/resource_specification.go b/services/cloudbrain/resource/resource_specification.go index 680b98933..b68abbb88 100644 --- a/services/cloudbrain/resource/resource_specification.go +++ b/services/cloudbrain/resource/resource_specification.go @@ -2,12 +2,19 @@ package resource import ( "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/modules/grampus" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/modelarts" + "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/routers/response" "code.gitea.io/gitea/services/admin/operate_log" + "encoding/json" + "errors" "fmt" + "strconv" "strings" + "time" ) func AddResourceSpecification(doerId int64, req models.ResourceSpecificationReq) error { @@ -92,6 +99,7 @@ func SyncGrampusSpecs(doerId int64) error { GPUMemGiB: gpuMemGiB, Status: models.SpecNotVerified, IsAutomaticSync: true, + IsAvailable: true, CreatedBy: doerId, UpdatedBy: doerId, }) @@ -103,6 +111,7 @@ func SyncGrampusSpecs(doerId int64) error { CpuCores: spec.SpecInfo.CpuCoreNum, MemGiB: memGiB, GPUMemGiB: gpuMemGiB, + IsAvailable: true, UpdatedBy: doerId, }) } @@ -142,7 +151,9 @@ func ResourceSpecOnShelf(doerId int64, id int64, unitPrice int) *response.BizErr if q, err := models.GetResourceQueue(&models.ResourceQueue{ID: spec.QueueId}); err != nil || q == nil { return response.RESOURCE_QUEUE_NOT_AVAILABLE } - + if !spec.IsAvailable { + return response.SPECIFICATION_NOT_AVAILABLE + } err = models.ResourceSpecOnShelf(id, unitPrice) if err != nil { return response.NewBizError(err) @@ -184,3 +195,461 @@ func AddSpecOperateLog(doerId int64, operateType string, newValue, oldValue *mod Comment: comment, }) } + +func FindAvailableSpecs(userId int64, opts models.FindSpecsOptions) ([]*models.Specification, error) { + r, err := models.FindSpecs(opts) + if err != nil { + log.Error("FindAvailableSpecs error.%v", err) + return nil, err + } + //filter exclusive specs + specs := filterExclusiveSpecs(r, userId) + + //distinct by sourceSpecId + specs = distinctSpecs(specs) + return specs, err +} + +func filterExclusiveSpecs(r []*models.Specification, userId int64) []*models.Specification { + specs := make([]*models.Specification, 0, len(r)) + specMap := make(map[int64]string, 0) + for i := 0; i < len(r); i++ { + spec := r[i] + if _, has := specMap[spec.ID]; has { + continue + } + if !spec.IsExclusive { + specs = append(specs, spec) + specMap[spec.ID] = "" + continue + } + orgs := strings.Split(spec.ExclusiveOrg, ";") + for _, org := range orgs { + isMember, _ := models.IsOrganizationMemberByOrgName(org, userId) + if isMember { + specs = append(specs, spec) + specMap[spec.ID] = "" + break + } + } + } + return specs +} + +func distinctSpecs(r []*models.Specification) []*models.Specification { + specs := make([]*models.Specification, 0, len(r)) + sourceSpecIdMap := make(map[string]string, 0) + for i := 0; i < len(r); i++ { + spec := r[i] + if spec.SourceSpecId == "" { + specs = append(specs, spec) + continue + } + if _, has := sourceSpecIdMap[spec.SourceSpecId]; has { + continue + } + specs = append(specs, spec) + sourceSpecIdMap[spec.SourceSpecId] = "" + } + return specs +} + +func GetAndCheckSpec(userId int64, specId int64, opts models.FindSpecsOptions) (*models.Specification, error) { + if specId == 0 { + return nil, nil + } + opts.SpecId = specId + r, err := FindAvailableSpecs(userId, opts) + if err != nil { + return nil, err + } + if r == nil || len(r) == 0 { + return nil, nil + } + return r[0], nil +} + +func InsertCloudbrainSpec(cloudbrainId int64, s *models.Specification) error { + c := models.CloudbrainSpec{ + CloudbrainID: cloudbrainId, + SpecId: s.ID, + SourceSpecId: s.SourceSpecId, + AccCardsNum: s.AccCardsNum, + AccCardType: s.AccCardType, + CpuCores: s.CpuCores, + MemGiB: s.MemGiB, + GPUMemGiB: s.GPUMemGiB, + ShareMemGiB: s.ShareMemGiB, + ComputeResource: s.ComputeResource, + UnitPrice: s.UnitPrice, + QueueId: s.QueueId, + QueueCode: s.QueueCode, + Cluster: s.Cluster, + AiCenterCode: s.AiCenterCode, + AiCenterName: s.AiCenterName, + IsExclusive: s.IsExclusive, + ExclusiveOrg: s.ExclusiveOrg, + } + _, err := models.InsertCloudbrainSpec(c) + if err != nil { + log.Error("InsertCloudbrainSpec error.CloudbrainSpec=%v. err=%v", c, err) + return err + } + return nil +} + +func GetCloudbrainSpec(cloudbrainId int64) (*models.Specification, error) { + c, err := models.GetCloudbrainSpecByID(cloudbrainId) + if err != nil { + return nil, err + } + if c == nil { + return nil, nil + } + return c.ConvertToSpecification(), nil +} + +func RefreshHistorySpec(scopeAll bool, ids []int64) (int64, int64, error) { + var success int64 + var total int64 + + if !scopeAll { + if ids == nil || len(ids) == 0 { + return 0, 0, nil + } + total = int64(len(ids)) + tasks, err := models.GetCloudbrainWithDeletedByIDs(ids) + if err != nil { + return total, 0, err + } + for _, task := range tasks { + err = RefreshOneHistorySpec(task) + if err != nil { + log.Error("RefreshOneHistorySpec error.%v", err) + continue + } + success++ + } + + } else { + page := 1 + pageSize := 100 + n, err := models.CountNoSpecHistoricTask() + if err != nil { + log.Error("FindNoSpecHistoricTask CountNoSpecHistoricTask error. e=%v", err) + return 0, 0, err + } + total = n + for i := 0; i < 500; i++ { + list, err := models.FindCloudbrainTask(page, pageSize) + page++ + if err != nil { + log.Error("FindCloudbrainTask error.page=%d pageSize=%d e=%v", page, pageSize, err) + return total, success, err + } + if len(list) == 0 { + log.Info("RefreshHistorySpec. list is empty") + break + } + for _, task := range list { + s, err := GetCloudbrainSpec(task.ID) + if err != nil { + log.Error("RefreshHistorySpec GetCloudbrainSpec error.%v", err) + continue + } + if s != nil { + continue + } + err = RefreshOneHistorySpec(task) + if err != nil { + log.Error("RefreshOneHistorySpec error.%v", err) + continue + } + success++ + } + if len(list) < pageSize { + log.Info("RefreshHistorySpec. list < pageSize") + break + } + } + } + return total, success, nil + +} + +func RefreshOneHistorySpec(task *models.Cloudbrain) error { + var spec *models.Specification + var err error + switch task.Type { + case models.TypeCloudBrainOne: + spec, err = getCloudbrainOneSpec(task) + case models.TypeCloudBrainTwo: + spec, err = getCloudbrainTwoSpec(task) + case models.TypeC2Net: + spec, err = getGrampusSpec(task) + } + if err != nil { + log.Error("find spec error,task.ID=%d err=%v", task.ID, err) + return err + } + if spec == nil { + log.Error("find spec failed,task.ID=%d", task.ID) + return errors.New("find spec failed") + } + return InsertCloudbrainSpec(task.ID, spec) +} + +func getCloudbrainOneSpec(task *models.Cloudbrain) (*models.Specification, error) { + if task.GpuQueue == "" { + log.Info("gpu queue is empty.task.ID = %d", task.ID) + return nil, nil + } + //find from config + spec, err := findCloudbrainOneSpecFromConfig(task) + if err != nil { + log.Error("getCloudbrainOneSpec findCloudbrainOneSpecFromConfig error.%v", err) + return nil, err + } + if spec != nil { + return spec, nil + } + //find from remote + return findCloudbrainOneSpecFromRemote(task) + +} + +func findCloudbrainOneSpecFromRemote(task *models.Cloudbrain) (*models.Specification, error) { + time.Sleep(200 * time.Millisecond) + log.Info("start findCloudbrainOneSpecFromRemote") + result, err := cloudbrain.GetJob(task.JobID) + if err != nil { + log.Error("getCloudbrainOneSpec error. %v", err) + return nil, err + } + + if result == nil { + log.Info("findCloudbrainOneSpecFromRemote failed,result is empty.task.ID=%d", task.ID) + return nil, nil + } + jobRes, _ := models.ConvertToJobResultPayload(result.Payload) + memSize, _ := models.ParseMemSizeFromGrampus(jobRes.Resource.Memory) + if task.ComputeResource == "CPU/GPU" { + task.ComputeResource = models.GPU + } + var shmMB float32 + if jobRes.Config.TaskRoles != nil && len(jobRes.Config.TaskRoles) > 0 { + shmMB = float32(jobRes.Config.TaskRoles[0].ShmMB) / 1024 + if jobRes.Config.TaskRoles[0].ShmMB == 103600 { + shmMB = 100 + } else if jobRes.Config.TaskRoles[0].ShmMB == 51800 { + shmMB = 50 + } + } + opt := models.FindSpecsOptions{ + ComputeResource: task.ComputeResource, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + QueueCode: task.GpuQueue, + AccCardsNum: jobRes.Resource.NvidiaComGpu, + UseAccCardsNum: true, + CpuCores: jobRes.Resource.CPU, + UseCpuCores: true, + MemGiB: memSize, + UseMemGiB: memSize > 0, + ShareMemGiB: shmMB, + UseShareMemGiB: shmMB > 0, + RequestAll: true, + } + specs, err := models.FindSpecs(opt) + if err != nil { + log.Error("getCloudbrainOneSpec from remote error,%v", err) + return nil, err + } + if len(specs) == 1 { + return specs[0], nil + } + if len(specs) == 0 { + s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加") + if err != nil { + log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err) + return nil, nil + } + return s, nil + } + log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt) + return nil, nil +} + +func findCloudbrainOneSpecFromConfig(task *models.Cloudbrain) (*models.Specification, error) { + //find from config + var specConfig *models.ResourceSpec + hasSpec := false + if task.JobType == string(models.JobTypeTrain) { + if cloudbrain.TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) + } + for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + specConfig = tmp + break + } + } + } else if task.JobType == string(models.JobTypeInference) { + if cloudbrain.InferenceResourceSpecs == nil { + json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) + } + for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + specConfig = tmp + break + } + } + } else { + if cloudbrain.ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) + } + for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + specConfig = tmp + break + + } + } + } + if !hasSpec && cloudbrain.SpecialPools != nil { + + for _, specialPool := range cloudbrain.SpecialPools.Pools { + + if specialPool.ResourceSpec != nil { + + for _, spec := range specialPool.ResourceSpec { + if task.ResourceSpecId == spec.Id { + hasSpec = true + specConfig = spec + break + } + } + } + } + } + if specConfig == nil { + log.Error("getCloudbrainOneSpec from config failed,task.ResourceSpecId=%d", task.ResourceSpecId) + return nil, nil + } + if task.ComputeResource == "CPU/GPU" { + task.ComputeResource = models.GPU + } + + shareMemMiB := float32(specConfig.ShareMemMiB) / 1024 + if specConfig.ShareMemMiB == 103600 { + shareMemMiB = 100 + } else if specConfig.ShareMemMiB == 51800 { + shareMemMiB = 50 + } + opt := models.FindSpecsOptions{ + JobType: models.JobType(task.JobType), + ComputeResource: task.ComputeResource, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + QueueCode: task.GpuQueue, + AccCardsNum: specConfig.GpuNum, + UseAccCardsNum: true, + CpuCores: specConfig.CpuNum, + UseCpuCores: true, + MemGiB: float32(specConfig.MemMiB) / 1024, + UseMemGiB: true, + ShareMemGiB: shareMemMiB, + UseShareMemGiB: true, + RequestAll: true, + } + specs, err := models.FindSpecs(opt) + if err != nil { + log.Error("getCloudbrainOneSpec from config error,%v", err) + return nil, err + } + if len(specs) > 1 { + log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt) + return nil, nil + } + if len(specs) == 0 { + s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加") + if err != nil { + log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err) + return nil, nil + } + return s, nil + } + return specs[0], nil +} + +func getCloudbrainTwoSpec(task *models.Cloudbrain) (*models.Specification, error) { + specMap, err := models.GetCloudbrainTwoSpecs() + if err != nil { + log.Error("InitCloudbrainTwoSpecs err.%v", err) + return nil, err + } + if task.FlavorCode != "" { + return specMap[task.FlavorCode], nil + } + time.Sleep(200 * time.Millisecond) + log.Info("start getCloudbrainTwoSpec FromRemote") + if task.JobType == string(models.JobTypeDebug) { + result, err := modelarts.GetNotebook2(task.JobID) + if err != nil { + log.Error("getCloudbrainTwoSpec GetNotebook2 error.%v", err) + return nil, err + } + if result != nil { + return specMap[result.Flavor], nil + } + } else if task.JobType == string(models.JobTypeTrain) || task.JobType == string(models.JobTypeInference) { + result, err := modelarts.GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10)) + if err != nil { + log.Error("getCloudbrainTwoSpec GetTrainJob error:%v", task.JobName, err) + return nil, err + } + if result != nil { + return specMap[result.Flavor.Code], nil + } + } + return nil, nil +} + +func getGrampusSpec(task *models.Cloudbrain) (*models.Specification, error) { + specMap, err := models.GetGrampusSpecs() + if err != nil { + log.Error("GetGrampusSpecs err.%v", err) + return nil, err + } + if task.AiCenter != "" { + c := strings.Split(task.AiCenter, "+") + spec := specMap[task.FlavorCode+"_"+c[0]] + if spec != nil { + return spec, nil + } + } + return specMap[task.FlavorCode], nil +} + +func InitQueueAndSpec(opt models.FindSpecsOptions, aiCenterName string, remark string) (*models.Specification, error) { + return models.InitQueueAndSpec(models.ResourceQueue{ + QueueCode: opt.QueueCode, + Cluster: opt.Cluster, + AiCenterCode: opt.AiCenterCode, + AiCenterName: aiCenterName, + ComputeResource: opt.ComputeResource, + AccCardType: models.GetCloudbrainOneAccCardType(opt.QueueCode), + Remark: remark, + }, models.ResourceSpecification{ + AccCardsNum: opt.AccCardsNum, + CpuCores: opt.CpuCores, + MemGiB: opt.MemGiB, + GPUMemGiB: opt.GPUMemGiB, + ShareMemGiB: opt.ShareMemGiB, + Status: models.SpecOffShelf, + IsAvailable: true, + }) +} diff --git a/templates/admin/cloudbrain/list.tmpl b/templates/admin/cloudbrain/list.tmpl index 83510f268..4bac45f2b 100755 --- a/templates/admin/cloudbrain/list.tmpl +++ b/templates/admin/cloudbrain/list.tmpl @@ -1,4 +1,5 @@ {{template "base/head" .}} +
@@ -175,10 +176,17 @@
- - {{if .CardType}}{{.CardType}}{{else}}--{{end}} - -
+ +
+
{{if .User.Name}} diff --git a/templates/admin/resources/queue.tmpl b/templates/admin/resources/queue.tmpl index 3f2d83b99..13c30690a 100644 --- a/templates/admin/resources/queue.tmpl +++ b/templates/admin/resources/queue.tmpl @@ -4,7 +4,7 @@ {{template "admin/navbar" .}}
- +
{{template "base/footer" .}} diff --git a/templates/admin/resources/scene.tmpl b/templates/admin/resources/scene.tmpl index 53af0352c..f0a94a703 100644 --- a/templates/admin/resources/scene.tmpl +++ b/templates/admin/resources/scene.tmpl @@ -4,7 +4,7 @@ {{template "admin/navbar" .}}
- +
{{template "base/footer" .}} diff --git a/templates/admin/resources/specification.tmpl b/templates/admin/resources/specification.tmpl index 34992c5c9..265aafa81 100644 --- a/templates/admin/resources/specification.tmpl +++ b/templates/admin/resources/specification.tmpl @@ -4,7 +4,7 @@ {{template "admin/navbar" .}}
- +
{{template "base/footer" .}} diff --git a/templates/custom/task_wait_count.tmpl b/templates/custom/task_wait_count.tmpl new file mode 100644 index 000000000..fb8ee71fb --- /dev/null +++ b/templates/custom/task_wait_count.tmpl @@ -0,0 +1,25 @@ +
+
+ + {{.i18n.Tr "repo.wait_count_start"}} {{.WaitCount}} {{.i18n.Tr "repo.wait_count_end"}} +
+
+ diff --git a/templates/repo/cloudbrain/benchmark/new.tmpl b/templates/repo/cloudbrain/benchmark/new.tmpl index 32e715ab7..58f471ede 100755 --- a/templates/repo/cloudbrain/benchmark/new.tmpl +++ b/templates/repo/cloudbrain/benchmark/new.tmpl @@ -51,9 +51,12 @@ {{.i18n.Tr "repo.cloudbrain.benchmark.model"}} - {{template "custom/wait_count_train" Dict "ctx" $ "type" .benchmark_gpu_types}}
+
+ + {{template "custom/task_wait_count" .}} +
{{.description}}
-
+
  @@ -112,7 +115,7 @@
{{template "custom/select_dataset_train" .}} -
+ +
+ +
- {{template "custom/wait_count_train" Dict "ctx" $ "type" .benchmark_gpu_types}}
+
+ + {{template "custom/task_wait_count" .}} +
{{.description}}
-
+
 
@@ -201,7 +214,7 @@
-
+ + +
+ +
@@ -245,7 +266,7 @@
{{template "base/footer" .}} - + \ No newline at end of file diff --git a/templates/repo/cloudbrain/benchmark/show.tmpl b/templates/repo/cloudbrain/benchmark/show.tmpl index add7d34d4..6f213b8c5 100755 --- a/templates/repo/cloudbrain/benchmark/show.tmpl +++ b/templates/repo/cloudbrain/benchmark/show.tmpl @@ -256,8 +256,9 @@
@@ -453,7 +454,7 @@ {{$.i18n.Tr "cloudbrain.gpu_type"}} - +
{{$.resource_type}}
@@ -464,9 +465,9 @@ {{$.i18n.Tr "repo.modelarts.train_job.standard"}} - +
- {{$.resource_spec}} + {{$.resource_spec}}
@@ -528,19 +529,42 @@
- -
- - -

-                            
- -
- -
+ + + {{$.i18n.Tr "repo.modelarts.download_log"}} + + +
+
+ + + + + + + +
+
+
+
+ + +

+                             
+ +
+ + +
@@ -571,7 +595,22 @@
{{template "base/footer" .}} - + + \ No newline at end of file diff --git a/templates/repo/cloudbrain/inference/new.tmpl b/templates/repo/cloudbrain/inference/new.tmpl index 9932ecf49..08cc23557 100644 --- a/templates/repo/cloudbrain/inference/new.tmpl +++ b/templates/repo/cloudbrain/inference/new.tmpl @@ -82,8 +82,11 @@ Ascend NPU
- {{template "custom/wait_count_train" Dict "ctx" $ "type" .inference_gpu_types}} -
+
+
+ + {{template "custom/task_wait_count" .}} +
{{.i18n.Tr "cloudbrain.new_infer_gpu_tooltips" "/dataset" "/model" "/result" | Safe}}
@@ -140,7 +143,7 @@ - +
@@ -168,7 +171,7 @@
-
+
@@ -227,7 +230,7 @@
-
+ +
+ +
-
@@ -264,7 +271,7 @@
{{template "base/footer" .}} - + diff --git a/templates/repo/cloudbrain/inference/show.tmpl b/templates/repo/cloudbrain/inference/show.tmpl index 3ec01417e..c076b00b3 100644 --- a/templates/repo/cloudbrain/inference/show.tmpl +++ b/templates/repo/cloudbrain/inference/show.tmpl @@ -228,7 +228,7 @@ {{with .task}}
+ data-repopath="{{$.RepoRelPath}}/cloudbrain" data-jobid="{{.ID}}" data-version="{{.VersionName}}">
@@ -264,7 +264,8 @@ data-tab="first">{{$.i18n.Tr "repo.modelarts.train_job.config"}} {{$.i18n.Tr "repo.cloudbrain.runinfo"}} - + {{$.i18n.Tr "repo.modelarts.log"}} {{$.i18n.Tr "repo.model_download"}}
@@ -340,7 +341,7 @@ {{$.i18n.Tr "repo.modelarts.train_job.resource_type"}} - +
{{$.resource_type}}
@@ -482,7 +483,7 @@ {{$.i18n.Tr "repo.modelarts.train_job.standard"}} - +
{{$.i18n.Tr "cloudbrain.gpu_num"}}:{{$.GpuNum}},{{$.i18n.Tr "cloudbrain.cpu_num"}}:{{$.CpuNum}},{{$.i18n.Tr "cloudbrain.memory"}}(MB):{{$.MemMiB}},{{$.i18n.Tr "cloudbrain.shared_memory"}}(MB):{{$.ShareMemMiB}}
@@ -524,7 +525,7 @@ -
@@ -537,7 +538,44 @@
- +
+ +
+ + + + + + + +
+
+
+
+ + +

+                             
+ +
+ + +
@@ -561,6 +599,7 @@
{{template "base/footer" .}} + \ No newline at end of file diff --git a/templates/repo/cloudbrain/new.tmpl b/templates/repo/cloudbrain/new.tmpl index 84785d6e3..b9efe1d4b 100755 --- a/templates/repo/cloudbrain/new.tmpl +++ b/templates/repo/cloudbrain/new.tmpl @@ -25,7 +25,7 @@
{{template "base/alert" .}} -
+
@@ -55,8 +55,11 @@ d="M3 2.992C3 2.444 3.445 2 3.993 2h16.014a1 1 0 0 1 .993.992v18.016a.993.993 0 0 1-.993.992H3.993A1 1 0 0 1 3 21.008V2.992zM19 11V4H5v7h14zm0 2H5v7h14v-7zM9 6h6v2H9V6zm0 9h6v2H9v-2z" /> Ascend NPU -
- {{template "custom/wait_count" .}} +
+
+
+ + {{template "custom/task_wait_count" .}}
@@ -108,8 +111,8 @@ {{end}} {{end}} -
-
+
+
@@ -125,8 +128,8 @@
- -
+ + + +
+ +
@@ -188,6 +199,7 @@
{{template "base/footer" .}} + \ No newline at end of file diff --git a/templates/repo/cloudbrain/show.tmpl b/templates/repo/cloudbrain/show.tmpl index 76363314d..46d4f5140 100755 --- a/templates/repo/cloudbrain/show.tmpl +++ b/templates/repo/cloudbrain/show.tmpl @@ -345,7 +345,7 @@ {{$.i18n.Tr "cloudbrain.gpu_type"}} - +
{{$.resource_type}}
@@ -400,10 +400,8 @@ {{$.i18n.Tr "repo.modelarts.train_job.standard"}} - -
- {{$.i18n.Tr "cloudbrain.gpu_num"}}:{{$.GpuNum}},{{$.i18n.Tr "cloudbrain.cpu_num"}}:{{$.CpuNum}},{{$.i18n.Tr "cloudbrain.memory"}}(MB):{{$.MemMiB}},{{$.i18n.Tr "cloudbrain.shared_memory"}}(MB):{{$.ShareMemMiB}} -
+ +
@@ -560,7 +558,7 @@
{{template "base/footer" .}} - + \ No newline at end of file diff --git a/templates/repo/cloudbrain/trainjob/new.tmpl b/templates/repo/cloudbrain/trainjob/new.tmpl index cf25ae91c..709490ac1 100755 --- a/templates/repo/cloudbrain/trainjob/new.tmpl +++ b/templates/repo/cloudbrain/trainjob/new.tmpl @@ -14,7 +14,9 @@ .width { width: 100% !important; } - + .width48 { + width: 48.5% !important; + } .width80 { width: 80.7% !important; margin-left: 10px; @@ -30,7 +32,7 @@ margin-left: 10.5rem !important; align-items: center; } - + .width81 { margin-left: 1.5rem !important; width: 81% !important; @@ -114,8 +116,11 @@ Ascend NPU
- {{template "custom/wait_count_train" Dict "ctx" $ "type" .train_gpu_types}} -
+
+
+ + {{template "custom/task_wait_count" .}} +
{{.i18n.Tr "cloudbrain.new_train_gpu_tooltips" "/code" "/dataset" "/model" | Safe}}
@@ -171,7 +176,7 @@
-
+
@@ -224,10 +229,10 @@ class="plus square outline icon">{{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}}
- +
-
+ + +
+ +
@@ -267,7 +279,7 @@
{{template "base/footer" .}} - + \ No newline at end of file + ;(function() { + var SPECS = {{ .train_specs }}; + var showPoint = true; + renderSpecsSelect($('#__specs__'), SPECS, showPoint, { + gpu_memory: {{$.i18n.Tr "cloudbrain.gpu_memory"}}, + free: {{$.i18n.Tr "cloudbrain.free"}}, + point_hr: {{$.i18n.Tr "cloudbrain.point_hr"}}, + memory: {{$.i18n.Tr "cloudbrain.memory"}}, + shared_memory: {{$.i18n.Tr "cloudbrain.shared_memory"}}, + }); + })(); + diff --git a/templates/repo/cloudbrain/trainjob/show.tmpl b/templates/repo/cloudbrain/trainjob/show.tmpl index ba886cd2f..86d8e54cb 100644 --- a/templates/repo/cloudbrain/trainjob/show.tmpl +++ b/templates/repo/cloudbrain/trainjob/show.tmpl @@ -359,7 +359,7 @@ {{$.i18n.Tr "repo.modelarts.train_job.resource_type"}} - +
{{$.resource_type}}
@@ -370,7 +370,7 @@ {{$.i18n.Tr "repo.modelarts.train_job.standard"}} - +
{{$.i18n.Tr "cloudbrain.gpu_num"}}:{{$.GpuNum}},{{$.i18n.Tr "cloudbrain.cpu_num"}}:{{$.CpuNum}},{{$.i18n.Tr "cloudbrain.memory"}}(MB):{{$.MemMiB}},{{$.i18n.Tr "cloudbrain.shared_memory"}}(MB):{{$.ShareMemMiB}}
@@ -677,6 +677,7 @@ {{template "base/footer" .}} + \ No newline at end of file diff --git a/templates/repo/grampus/trainjob/gpu/new.tmpl b/templates/repo/grampus/trainjob/gpu/new.tmpl index 445ecbd01..90b38c422 100755 --- a/templates/repo/grampus/trainjob/gpu/new.tmpl +++ b/templates/repo/grampus/trainjob/gpu/new.tmpl @@ -30,7 +30,9 @@ .width81{ width: 81% !important; } - +.width48{ + width: 48.5% !important; +} .add{font-size: 18px; padding: 0.5rem; border: 1px solid rgba(187, 187, 187, 100); @@ -104,9 +106,12 @@ Ascend NPU -
- {{template "custom/wait_count_train" Dict "ctx" $}} -
+
+
+
+ + {{template "custom/task_wait_count" .}} +
{{.i18n.Tr "cloudbrain.new_train_gpu_tooltips" "/tmp/code" "/tmp/dataset" "/tmp/output" | Safe}}
@@ -116,7 +121,7 @@ {{.i18n.Tr "repo.cloudbrain_jobname_err"}}
- +
{{if .description}} @@ -128,7 +133,7 @@

{{.i18n.Tr "repo.modelarts.train_job.parameter_setting"}}:

- +
@@ -167,7 +172,7 @@ {{.i18n.Tr "cloudbrain.view_sample"}}
- + {{template "custom/select_dataset_train" .}}
@@ -175,11 +180,10 @@ {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}}
- +
- -
+ +
+ +
- + - + - +
{{template "base/footer" .}} + \ No newline at end of file + + ;(function() { + var SPECS = {{ .Specs }}; + var showPoint = true; + renderSpecsSelect($('#__specs__'), SPECS, showPoint, { + gpu_memory: {{$.i18n.Tr "cloudbrain.gpu_memory"}}, + free: {{$.i18n.Tr "cloudbrain.free"}}, + point_hr: {{$.i18n.Tr "cloudbrain.point_hr"}}, + memory: {{$.i18n.Tr "cloudbrain.memory"}}, + shared_memory: {{$.i18n.Tr "cloudbrain.shared_memory"}}, + }); + })(); + diff --git a/templates/repo/grampus/trainjob/npu/new.tmpl b/templates/repo/grampus/trainjob/npu/new.tmpl index b84c5cc34..88a41779e 100755 --- a/templates/repo/grampus/trainjob/npu/new.tmpl +++ b/templates/repo/grampus/trainjob/npu/new.tmpl @@ -25,7 +25,9 @@ .width81{ width: 81% !important; } - +.width48 { + width: 48.5% !important; +} .add{font-size: 18px; padding: 0.5rem; border: 1px solid rgba(187, 187, 187, 100); @@ -99,13 +101,15 @@ Ascend NPU
- {{template "custom/wait_count_train" Dict "ctx" $}} -
+
+
+ + {{template "custom/task_wait_count" .}} +
{{.i18n.Tr "cloudbrain.new_train_gpu_tooltips" "/cache/code" "/cache/dataset" "/cache/output" | Safe}}
-
@@ -179,7 +183,7 @@ {{.i18n.Tr "cloudbrain.view_sample"}}
- + {{template "custom/select_dataset_train" .}}
@@ -187,11 +191,11 @@ {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}}
- +
-
+ +
+ +
@@ -242,6 +250,7 @@
{{template "base/footer" .}} + diff --git a/templates/repo/grampus/trainjob/show.tmpl b/templates/repo/grampus/trainjob/show.tmpl index 1b6dfc901..fd790dabe 100755 --- a/templates/repo/grampus/trainjob/show.tmpl +++ b/templates/repo/grampus/trainjob/show.tmpl @@ -357,7 +357,7 @@ {{$.i18n.Tr "repo.modelarts.train_job.standard"}} - +
{{.FlavorName}}
@@ -634,8 +634,20 @@ {{template "base/footer" .}} - + diff --git a/templates/repo/modelarts/inferencejob/show.tmpl b/templates/repo/modelarts/inferencejob/show.tmpl index c3855cafd..957c11234 100644 --- a/templates/repo/modelarts/inferencejob/show.tmpl +++ b/templates/repo/modelarts/inferencejob/show.tmpl @@ -423,7 +423,7 @@ td, th { {{$.i18n.Tr "repo.modelarts.train_job.standard"}} - +
{{.FlavorName}}
@@ -522,6 +522,7 @@ td, th { {{template "base/footer" .}} + diff --git a/templates/repo/modelarts/notebook/new.tmpl b/templates/repo/modelarts/notebook/new.tmpl index a29794aca..4f10f43fb 100755 --- a/templates/repo/modelarts/notebook/new.tmpl +++ b/templates/repo/modelarts/notebook/new.tmpl @@ -38,7 +38,10 @@ Ascend NPU - {{template "custom/wait_count" .}} + +
+ + {{template "custom/task_wait_count" .}}
@@ -65,7 +68,7 @@
--> -
+ +
+ +
+
+ +
@@ -287,8 +294,24 @@ id="trainjob_work_server_num" tabindex="3" autofocus required maxlength="255" value="1" readonly>
- + {{if .WorkNode}} + {{range .WorkNode}} + + {{if $.work_server_number}} + {{if eq . $.work_server_number }} + + {{else}} + + {{end}} + {{else}} + + {{end}} + {{end}} + + {{else}} + {{end}}
@@ -310,7 +333,7 @@
{{template "base/footer" .}} - + \ No newline at end of file + + ;(function() { + var SPECS = {{ .Specs }}; + var showPoint = true; + renderSpecsSelect($('#__specs__'), SPECS, showPoint, { + gpu_memory: {{$.i18n.Tr "cloudbrain.gpu_memory"}}, + free: {{$.i18n.Tr "cloudbrain.free"}}, + point_hr: {{$.i18n.Tr "cloudbrain.point_hr"}}, + memory: {{$.i18n.Tr "cloudbrain.memory"}}, + shared_memory: {{$.i18n.Tr "cloudbrain.shared_memory"}}, + }); + })(); + diff --git a/templates/repo/modelarts/trainjob/show.tmpl b/templates/repo/modelarts/trainjob/show.tmpl index 73b8ac61a..739364d04 100755 --- a/templates/repo/modelarts/trainjob/show.tmpl +++ b/templates/repo/modelarts/trainjob/show.tmpl @@ -1,5 +1,7 @@ {{template "base/head" .}} + +