package models

import (
	"encoding/json"
	"errors"
	"fmt"
	"time"

	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/timeutil"

	"xorm.io/builder"
	"xorm.io/xorm"
)

// CloudbrainStatus is the status string of a cloudbrain job.
type CloudbrainStatus string

// JobType is the kind of a cloudbrain job.
type JobType string

// ModelArtsJobStatus is the status string of a ModelArts job.
type ModelArtsJobStatus string

// Known cloudbrain job statuses, job types and ModelArts job statuses.
const (
	JobWaiting   CloudbrainStatus = "WAITING"
	JobStopped   CloudbrainStatus = "STOPPED"
	JobSucceeded CloudbrainStatus = "SUCCEEDED"
	JobFailed    CloudbrainStatus = "FAILED"
	JobRunning   CloudbrainStatus = "RUNNING"

	JobTypeDebug        JobType = "DEBUG"
	JobTypeBenchmark    JobType = "BENCHMARK"
	JobTypeSnn4imagenet JobType = "SNN4IMAGENET"

	ModelArtsCreateQueue  ModelArtsJobStatus = "CREATE_QUEUING" // queuing for creation on free resources
	ModelArtsCreating     ModelArtsJobStatus = "CREATING"       // being created
	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED"  // creation failed
	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING"  // queuing for start on free resources
	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resources, waiting to start
	ModelArtsStarting     ModelArtsJobStatus = "STARTING"       // starting
	ModelArtsRestarting   ModelArtsJobStatus = "RESTARTING"     // restarting
	ModelArtsStartFailed  ModelArtsJobStatus = "START_FAILED"   // start failed
	ModelArtsRunning      ModelArtsJobStatus = "RUNNING"        // running
	ModelArtsStopping     ModelArtsJobStatus = "STOPPING"       // stopping
	ModelArtsStopped      ModelArtsJobStatus = "STOPPED"        // stopped
	ModelArtsUnavailable  ModelArtsJobStatus = "UNAVAILABLE"    // faulty
	ModelArtsDeleted      ModelArtsJobStatus = "DELETED"        // deleted
	ModelArtsResizing     ModelArtsJobStatus = "RESIZING"       // flavor change in progress
	ModelArtsResizFailed  ModelArtsJobStatus = "RESIZE_FAILED"  // flavor change failed
)

// Cloudbrain is the xorm model of a cloudbrain task. CanDebug, User and Repo
// are tagged `xorm:"-"` and therefore are not persisted.
type Cloudbrain struct {
	ID          int64  `xorm:"pk autoincr"`
	JobID       string `xorm:"INDEX NOT NULL"`
	JobType     string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
	JobName     string `xorm:"INDEX"`
	Status      string `xorm:"INDEX"`
	UserID      int64  `xorm:"INDEX"`
	RepoID      int64  `xorm:"INDEX"`
	SubTaskName string `xorm:"INDEX"`
	ContainerID string
	ContainerIp string
	CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
	UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
	DeletedAt   time.Time          `xorm:"deleted"`
	CanDebug    bool               `xorm:"-"`
	Type        int                `xorm:"INDEX DEFAULT 0"`

	User *User       `xorm:"-"`
	Repo *Repository `xorm:"-"`
}

// CloudBrainLoginResult holds the code, message and payload of a login response.
type CloudBrainLoginResult struct {
	Code    string
	Msg     string
	Payload map[string]interface{}
}

// TaskRole describes one task role of a job creation request.
type TaskRole struct {
	Name                  string `json:"name"`
	TaskNumber            int8   `json:"taskNumber"`
	MinSucceededTaskCount int8   `json:"minSucceededTaskCount"`
	MinFailedTaskCount    int8   `json:"minFailedTaskCount"`
	CPUNumber             int8   `json:"cpuNumber"`
	GPUNumber             int8   `json:"gpuNumber"`
	MemoryMB              int    `json:"memoryMB"`
	ShmMB                 int    `json:"shmMB"`
	Command               string `json:"command"`
	NeedIBDevice          bool   `json:"needIBDevice"`
	IsMainRole            bool   `json:"isMainRole"`
	UseNNI                bool   `json:"useNNI"`
}

// StHostPath describes a host path and where it is mounted.
type StHostPath struct {
	Path      string `json:"path"`
	MountPath string `json:"mountPath"`
	ReadOnly  bool   `json:"readOnly"`
}

// Volume wraps a host-path volume of a job creation request.
type Volume struct {
	HostPath StHostPath `json:"hostPath"`
}

// CreateJobParams is the JSON body of a job creation request.
type CreateJobParams struct {
	JobName    string     `json:"jobName"`
	RetryCount int8       `json:"retryCount"`
	GpuType    string     `json:"gpuType"`
	Image      string     `json:"image"`
	TaskRoles  []TaskRole `json:"taskRoles"`
	Volumes    []Volume   `json:"volumes"`
}

// CreateJobResult is the JSON response of a job creation request.
type CreateJobResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// GetJobResult is the JSON response of a job query.
type GetJobResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// GetImagesResult is the JSON response of an image list query.
type GetImagesResult struct {
	Code    string           `json:"code"`
	Msg     string           `json:"msg"`
	Payload GetImagesPayload `json:"payload"`
}

// GetImagesPayload is the payload of GetImagesResult.
type GetImagesPayload struct {
	Count     int          `json:"count"`
	ImageInfo []*ImageInfo `json:"rows"`
}

// CloudbrainsOptions holds the filter and paging options accepted by Cloudbrains.
type CloudbrainsOptions struct {
	ListOptions
	RepoID        int64 // include all repos if empty
	UserID        int64
	JobID         int64
	SortType      string
	CloudbrainIDs []int64
	// JobStatus CloudbrainStatus
	Type int
}

// TaskPod is the decoded form of one task role entry of a job result.
// StartTime and FinishedTime carry no JSON tag; ConvertToTaskPod fills them
// for the first task status.
type TaskPod struct {
	TaskRoleStatus struct {
		Name string `json:"name"`
	} `json:"taskRoleStatus"`
	TaskStatuses []struct {
		TaskIndex       int       `json:"taskIndex"`
		PodUID          string    `json:"podUid"`
		PodIP           string    `json:"podIp"`
		PodName         string    `json:"podName"`
		ContainerID     string    `json:"containerId"`
		ContainerIP     string    `json:"containerIp"`
		ContainerGpus   string    `json:"containerGpus"`
		State           string    `json:"state"`
		StartAt         time.Time `json:"startAt"`
		FinishedAt      time.Time `json:"finishedAt"`
		ExitCode        int       `json:"exitCode"`
		ExitDiagnostics string    `json:"exitDiagnostics"`
		RetriedCount    int       `json:"retriedCount"`
		StartTime       string
		FinishedTime    string
	} `json:"taskStatuses"`
}

// TaskInfo describes a submitted task and the code it refers to.
type TaskInfo struct {
	Username          string   `json:"username"`
	TaskName          string   `json:"task_name"`
	CodeName          string   `json:"code_name"`
	BenchmarkCategory []string `json:"selected_category"`
	CodeLink          string   `json:"code_link"`
}

// ConvertToTaskPod decodes a generic JSON map into a TaskPod by round-tripping
// it through encoding/json.
//
// When the result contains at least one task status, the StartTime and
// FinishedTime of the first status are filled with StartAt and FinishedAt
// shifted by +8 hours and formatted as "2006-01-02 15:04:05". Other statuses
// are left untouched.
//
// A marshal or unmarshal failure is returned immediately; the returned
// TaskPod must not be relied upon in that case. An input without task
// statuses is not an error and yields a TaskPod with no statuses.
func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
	const (
		layout     = "2006-01-02 15:04:05"
		utc8Offset = 8 * 3600 // seconds added before formatting in UTC
	)

	var taskPod TaskPod

	data, err := json.Marshal(input)
	if err != nil {
		return taskPod, err
	}
	if err = json.Unmarshal(data, &taskPod); err != nil {
		return taskPod, err
	}

	// Guard the index below: a payload without task statuses must not panic.
	if len(taskPod.TaskStatuses) == 0 {
		return taskPod, nil
	}

	first := &taskPod.TaskStatuses[0]
	first.StartTime = time.Unix(first.StartAt.Unix()+utc8Offset, 0).UTC().Format(layout)
	first.FinishedTime = time.Unix(first.FinishedAt.Unix()+utc8Offset, 0).UTC().Format(layout)
	return taskPod, nil
}

// JobResultPayload is the decoded payload of a job query. JobStatus.StartTime
// and JobStatus.EndTime carry no JSON tag; ConvertToJobResultPayload fills them.
type JobResultPayload struct {
	ID        string `json:"id"`
	Name      string `json:"name"`
	Platform  string `json:"platform"`
	JobStatus struct {
		Username           string      `json:"username"`
		State              string      `json:"state"`
		SubState           string      `json:"subState"`
		ExecutionType      string      `json:"executionType"`
		Retries            int         `json:"retries"`
		CreatedTime        int64       `json:"createdTime"`
		CompletedTime      int64       `json:"completedTime"`
		AppID              string      `json:"appId"`
		AppProgress        string      `json:"appProgress"`
		AppTrackingURL     string      `json:"appTrackingUrl"`
		AppLaunchedTime    int64       `json:"appLaunchedTime"`
		AppCompletedTime   interface{} `json:"appCompletedTime"`
		AppExitCode        int         `json:"appExitCode"`
		AppExitDiagnostics string      `json:"appExitDiagnostics"`
		AppExitType        interface{} `json:"appExitType"`
		VirtualCluster     string      `json:"virtualCluster"`
		StartTime          string
		EndTime            string
	} `json:"jobStatus"`
	TaskRoles map[string]interface{} `json:"taskRoles"`
	Resource  struct {
		CPU          int    `json:"cpu"`
		Memory       string `json:"memory"`
		NvidiaComGpu int    `json:"nvidia.com/gpu"`
	} `json:"resource"`
	Config struct {
		Image     string `json:"image"`
		JobID     string `json:"jobId"`
		GpuType   string `json:"gpuType"`
		JobName   string `json:"jobName"`
		JobType   string `json:"jobType"`
		TaskRoles []struct {
			Name                  string `json:"name"`
			ShmMB                 int    `json:"shmMB"`
			Command               string `json:"command"`
			MemoryMB              int    `json:"memoryMB"`
			CPUNumber             int    `json:"cpuNumber"`
			GpuNumber             int    `json:"gpuNumber"`
			IsMainRole            bool   `json:"isMainRole"`
			TaskNumber            int    `json:"taskNumber"`
			NeedIBDevice          bool   `json:"needIBDevice"`
			MinFailedTaskCount    int    `json:"minFailedTaskCount"`
			MinSucceededTaskCount int    `json:"minSucceededTaskCount"`
		} `json:"taskRoles"`
		RetryCount int `json:"retryCount"`
	} `json:"config"`
	Userinfo struct {
		User  string `json:"user"`
		OrgID string `json:"org_id"`
	} `json:"userinfo"`
}

// ConvertToJobResultPayload decodes a generic JSON map into a JobResultPayload
// by round-tripping it through encoding/json.
//
// On success JobStatus.StartTime and JobStatus.EndTime are derived from
// CreatedTime and CompletedTime, which are divided by 1000 before being
// treated as Unix seconds, and formatted as "2006-01-02 15:04:05" in the
// process-local time zone. A zero CompletedTime therefore formats as the Unix
// epoch.
//
// A marshal or unmarshal failure is returned immediately; the returned payload
// must not be relied upon in that case.
func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
	const layout = "2006-01-02 15:04:05"

	var jobResultPayload JobResultPayload

	data, err := json.Marshal(input)
	if err != nil {
		return jobResultPayload, err
	}
	if err = json.Unmarshal(data, &jobResultPayload); err != nil {
		return jobResultPayload, err
	}

	status := &jobResultPayload.JobStatus
	status.StartTime = time.Unix(status.CreatedTime/1000, 0).Format(layout)
	status.EndTime = time.Unix(status.CompletedTime/1000, 0).Format(layout)
	return jobResultPayload, nil
}

// ImagesResultPayload is a list of image descriptions.
// NOTE(review): the list is decoded from the "taskStatuses" key, which looks
// like a copy/paste leftover; confirm against the API before changing it.
type ImagesResultPayload struct {
	Images []struct {
		ID          int    `json:"id"`
		Name        string `json:"name"`
		Place       string `json:"place"`
		Description string `json:"description"`
		Provider    string `json:"provider"`
		Createtime  string `json:"createtime"`
		Remark      string `json:"remark"`
	} `json:"taskStatuses"`
}

// ImageInfo describes one image. PlaceView carries no JSON tag.
type ImageInfo struct {
	ID          int    `json:"id"`
	Name        string `json:"name"`
	Place       string `json:"place"`
	Description string `json:"description"`
	Provider    string `json:"provider"`
	Createtime  string `json:"createtime"`
	Remark      string `json:"remark"`
	IsPublic    int    `json:"isPublic"`
	PlaceView   string
}

// Categories is a list of Category entries.
type Categories struct {
	Category []*Category `json:"category"`
}

// Category is an id/value pair.
type Category struct {
	Id    int    `json:"id"`
	Value string `json:"value"`
}

// CommitImageParams is the JSON body of an image commit request.
type CommitImageParams struct {
	Ip               string `json:"ip"`
	TaskContainerId  string `json:"taskContainerId"`
	ImageTag         string `json:"imageTag"`
	ImageDescription string `json:"imageDescription"`
}

// CommitImageResult is the JSON response of an image commit request.
type CommitImageResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// StopJobResult is the JSON response of a job stop request.
type StopJobResult struct {
	Code string `json:"code"`
	Msg  string `json:"msg"`
}

// CreateNotebookParams is the JSON body of a notebook creation request.
type CreateNotebookParams struct {
	JobName     string    `json:"name"`
	Description string    `json:"description"`
	ProfileID   string    `json:"profile_id"`
	Flavor      string    `json:"flavor"`
	Spec        Spec      `json:"spec"`
	Workspace   Workspace `json:"workspace"`
	Pool        Pool      `json:"pool"`
}

// Pool identifies a resource pool by ID, name and type.
type Pool struct {
	ID   string `json:"id"`
	Name string `json:"name"`
	Type string `json:"type"`
}

// Workspace identifies a workspace by ID.
type Workspace struct {
	ID string `json:"id"`
}

// Spec holds the storage and auto-stop settings of a notebook.
type Spec struct {
	Storage  Storage  `json:"storage"`
	AutoStop AutoStop `json:"auto_stop"`
}

// AutoStop holds the auto-stop switch and its duration.
type AutoStop struct {
	Enable   bool `json:"enable"`
	Duration int  `json:"duration"`
}

// Storage holds the storage type and location of a notebook.
type Storage struct {
	Type     string   `json:"type"`
	Location Location `json:"location"`
}

// Location holds a storage path.
type Location struct {
	Path string `json:"path"`
}

// NotebookResult holds the error code and message of a notebook response.
type NotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
}

// CreateNotebookResult is the JSON response of a notebook creation request.
type CreateNotebookResult struct {
	ErrorCode             string `json:"error_code"`
	ErrorMsg              string `json:"error_msg"`
	ID                    string `json:"id"`
	Name                  string `json:"name"`
	Description           string `json:"description"`
	Status                string `json:"status"`
	CreationTimestamp     string `json:"creation_timestamp"`
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	Profile               struct {
		ID          string `json:"id"`
		Name        string `json:"name"`
		Description string `json:"description"`
		DeType      string `json:"de_type"`
		FlavorType  string `json:"flavor_type"`
	} `json:"profile"`
	Flavor        string `json:"flavor"`
	FlavorDetails struct {
		Name          string `json:"name"`
		Status        string `json:"status"`
		QueuingNum    int    `json:"queuing_num"`
		QueueLeftTime int    `json:"queue_left_time"` // seconds
		Duration      int    `json:"duration"`        // auto-stop time, in seconds
	} `json:"flavor_details"`
}

// GetNotebookResult is the JSON response of a notebook query. The untagged
// fields (CreateTime, LatestUpdateTime, BeginTime, EndTime) are not populated
// by anything in this file.
type GetNotebookResult struct {
	ErrorCode             string `json:"error_code"`
	ErrorMsg              string `json:"error_msg"`
	ID                    string `json:"id"`
	Name                  string `json:"name"`
	Description           string `json:"description"`
	Status                string `json:"status"`
	CreationTimestamp     string `json:"creation_timestamp"`
	CreateTime            string
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	LatestUpdateTime      string
	Profile               struct {
		ID          string `json:"id"`
		Name        string `json:"name"`
		Description string `json:"description"`
		DeType      string `json:"de_type"`
		FlavorType  string `json:"flavor_type"`
	} `json:"profile"`
	Flavor        string `json:"flavor"`
	FlavorDetails struct {
		Name          string `json:"name"`
		Status        string `json:"status"`
		QueuingNum    int    `json:"queuing_num"`
		QueueLeftTime int    `json:"queue_left_time"` // seconds
		Duration      int    `json:"duration"`        // auto-stop time, in seconds
	} `json:"flavor_details"`
	QueuingInfo struct {
		ID             string `json:"id"`
		Name           string `json:"name"`
		Flavor         string `json:"flavor"`
		DeType         string `json:"de_type"`
		Status         string `json:"status"`
		BeginTimestamp int    `json:"begin_timestamp"` // time the instance entered the queue
		BeginTime      string
		RemainTime     int `json:"remain_time"` // remaining time of the instance
		EndTimestamp   int `json:"end_timestamp"`
		EndTime        string
		Rank           int `json:"rank"` // rank of the instance in the queue
	} `json:"queuing_info"`
	Spec struct {
		Annotations struct {
			TargetDomain string `json:"target_domain"`
			Url          string `json:"url"`
		} `json:"annotations"`
	} `json:"spec"`
}

// GetTokenParams is the JSON body of a token request.
type GetTokenParams struct {
	Auth Auth `json:"auth"`
}

// Auth holds the identity and scope of a token request.
type Auth struct {
	Identity Identity `json:"identity"`
	Scope    Scope    `json:"scope"`
}

// Scope holds the project a token request is scoped to.
type Scope struct {
	Project Project `json:"project"`
}

// Project identifies a project by name.
type Project struct {
	Name string `json:"name"`
}

// Identity holds the authentication methods and password section of a token request.
type Identity struct {
	Methods  []string `json:"methods"`
	Password Password `json:"password"`
}

// Password wraps the user section of a token request.
type Password struct {
	User NotebookUser `json:"user"`
}

// NotebookUser holds the name, password and domain of a user.
type NotebookUser struct {
	Name     string `json:"name"`
	Password string `json:"password"`
	Domain   Domain `json:"domain"`
}

// Domain identifies a domain by name.
type Domain struct {
	Name string `json:"name"`
}

// Action names accepted by NotebookAction.
const (
	ActionStart   = "start"
	ActionStop    = "stop"
	ActionRestart = "restart"
	ActionQueue   = "queue"
	ActionDequeue = "dequeue"
)

// NotebookAction is the JSON body of a notebook action request.
type NotebookAction struct {
	Action string `json:"action"`
}

// NotebookActionResult is the JSON response of a notebook action request.
type NotebookActionResult struct {
	ErrorCode     string `json:"error_code"`
	ErrorMsg      string `json:"error_msg"`
	CurrentStatus string `json:"current_status"`
	PreviousState string `json:"previous_state"`
}

// NotebookGetJobTokenResult is the JSON response of a notebook token request.
type NotebookGetJobTokenResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	Token     string `json:"token"`
}

// NotebookDelResult is the JSON response of a notebook deletion request.
type NotebookDelResult struct {
	InstanceID string `json:"instance_id"`
}

// CreateTrainJobParams is the JSON body of a training job creation request.
type CreateTrainJobParams struct {
	JobName     string `json:"job_name"`
	Description string `json:"job_desc"`
	Config      Config `json:"config"`
	WorkspaceID string `json:"workspace_id"`
}

// Config is the configuration section of CreateTrainJobParams.
type Config struct {
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be inside the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset required by the training job
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the output files of the training job
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}

// CreateConfigParams is the JSON body of a training job configuration creation request.
type CreateConfigParams struct {
	ConfigName    string      `json:"config_name"`
	Description   string      `json:"config_desc"`
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be inside the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset required by the training job
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the output files of the training job
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}

// Parameter is a label/value pair passed to a training job.
type Parameter struct {
	Label string `json:"label"`
	Value string `json:"value"`
}

// DataSource describes a dataset source by ID, version, type and URL.
type DataSource struct {
	DatasetID      string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`
	Type           string `json:"type"`
	DataUrl        string `json:"data_url"`
}

// Volumes holds the NFS and host-path volume descriptions.
type Volumes struct {
	Nfs      Nfs      `json:"nfs"`
	HostPath HostPath `json:"host_path"`
}

// Nfs describes an NFS volume and its source and destination paths.
type Nfs struct {
	ID         string `json:"id"`
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}

// HostPath describes a host-path volume and its source and destination paths.
type HostPath struct {
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}

// Flavor identifies a resource flavor by code.
type Flavor struct {
	Code string `json:"code"`
}

// CreateTrainJobResult is the JSON response of a training job creation request.
type CreateTrainJobResult struct {
	ErrorCode    string `json:"error_code"`
	ErrorMsg     string `json:"error_msg"`
	IsSuccess    bool   `json:"is_success"`
	JobName      string `json:"job_name"`
	JobID        int64  `json:"job_id"`
	Status       int    `json:"status"`
	CreationTime int64  `json:"create_time"`
	VersionID    int64  `json:"version_id"`
	ResourceID   string `json:"resource_id"`
	VersionName  string `json:"version_name"`
}

// CreateTrainJobConfigResult is the JSON response of a training job
// configuration creation request.
type CreateTrainJobConfigResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
}

// GetResourceSpecsResult is the JSON response of a resource specification query.
type GetResourceSpecsResult struct {
	ErrorCode      string  `json:"error_code"`
	ErrorMsg       string  `json:"error_msg"`
	IsSuccess      bool    `json:"is_success"`
	SpecTotalCount int     `json:"spec_total_count"`
	Specs          []Specs `json:"specs"`
}

// Specs describes one resource specification.
type Specs struct {
	Core          string `json:"core"`
	Cpu           string `json:"cpu"`
	IsNoResource  bool   `json:"no_resource"`
	GpuType       string `json:"gpu_type"`
	SpecID        int64  `json:"spec_id"`
	GpuNum        int    `json:"gpu_num"`
	SpecCode      string `json:"spec_code"`
	Storage       string `json:"storage"`
	MaxNum        int    `json:"max_num"`
	UnitNum       int    `json:"unit_num"`
	InterfaceType int    `json:"interface_type"`
}

// ErrorResult is a generic error response. Unlike the other result types in
// this file, its message is decoded from the "error_message" key.
type ErrorResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_message"`
	IsSuccess bool   `json:"is_success"`
}

// Cloudbrains returns the cloudbrain tasks matching opts, ordered by creation
// time descending, together with the total number of matching rows.
//
// RepoID, UserID and JobID filter only when positive, and CloudbrainIDs only
// when non-empty. Type filters whenever it is non-negative, so the zero value
// restricts the result to type 0. Paging is applied when Page >= 0 and
// PageSize > 0; pages 0 and 1 both start at the first row. The count is taken
// before paging is applied.
func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
	sess := x.NewSession()
	defer sess.Close()

	cond := builder.NewCond()
	if opts.RepoID > 0 {
		cond = cond.And(builder.Eq{"cloudbrain.repo_id": opts.RepoID})
	}
	if opts.UserID > 0 {
		cond = cond.And(builder.Eq{"cloudbrain.user_id": opts.UserID})
	}
	if opts.JobID > 0 {
		cond = cond.And(builder.Eq{"cloudbrain.job_id": opts.JobID})
	}
	if opts.Type >= 0 {
		cond = cond.And(builder.Eq{"cloudbrain.type": opts.Type})
	}

	// switch opts.JobStatus {
	// case JobWaiting:
	// 	cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
	// case JobFailed:
	// 	cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
	// case JobStopped:
	// 	cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
	// case JobSucceeded:
	// 	cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
	// }

	if len(opts.CloudbrainIDs) > 0 {
		cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
	}

	count, err := sess.Where(cond).Count(new(Cloudbrain))
	if err != nil {
		return nil, 0, fmt.Errorf("Count: %v", err)
	}

	if opts.Page >= 0 && opts.PageSize > 0 {
		start := 0
		if opts.Page > 0 {
			start = (opts.Page - 1) * opts.PageSize
		}
		sess.Limit(opts.PageSize, start)
	}
	sess.OrderBy("cloudbrain.created_unix DESC")

	cloudbrains := make([]*Cloudbrain, 0, setting.UI.IssuePagingNum)
	if err := sess.Where(cond).Find(&cloudbrains); err != nil {
		return nil, 0, fmt.Errorf("Find: %v", err)
	}

	// The deferred Close above releases the session; no explicit Close here.
	return cloudbrains, count, nil
}

// CreateCloudbrain inserts cloudbrain as a new row.
func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
	_, err = x.Insert(cloudbrain)
	return err
}

// getRepoCloudBrain loads the row matching the non-zero fields of cb into cb.
// It returns an error when the query fails or when no row matches.
func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
	has, err := x.Get(cb)
	if err != nil {
		return nil, err
	}
	if !has {
		return nil, errors.New("cloudbrain task is not found")
	}
	return cb, nil
}

// GetRepoCloudBrainByJobID returns the cloudbrain task with the given job ID
// in the given repository.
func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
	return getRepoCloudBrain(&Cloudbrain{JobID: jobID, RepoID: repoID})
}

// GetCloudbrainByJobID returns the cloudbrain task with the given job ID.
func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
	return getRepoCloudBrain(&Cloudbrain{JobID: jobID})
}

// SetCloudbrainStatusByJobID updates the status column of the rows whose
// job_id equals jobID.
func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
	cb := &Cloudbrain{JobID: jobID, Status: string(status)}
	_, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
	return err
}

// UpdateJob writes the status, container ID and container IP of job to the
// rows sharing its job ID.
func UpdateJob(job *Cloudbrain) error {
	return updateJob(x, job)
}

// updateJob is UpdateJob on an explicit engine.
func updateJob(e Engine, job *Cloudbrain) error {
	// The explicit type is the only use of the xorm import in this file.
	var sess *xorm.Session = e.Where("job_id = ?", job.JobID)
	_, err := sess.Cols("status", "container_id", "container_ip").Update(job)
	return err
}

// DeleteJob deletes job by its ID.
func DeleteJob(job *Cloudbrain) error {
	return deleteJob(x, job)
}

// deleteJob is DeleteJob on an explicit engine.
func deleteJob(e Engine, job *Cloudbrain) error {
	_, err := e.ID(job.ID).Delete(job)
	return err
}