package models import ( "encoding/json" "fmt" "strings" "time" "xorm.io/builder" "xorm.io/xorm" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/timeutil" ) type CloudbrainStatus string type JobType string type ModelArtsJobStatus string const ( NPUResource = "NPU" GPUResource = "CPU/GPU" //notebook storage category EVSCategory = "EVS" EFSCategory = "EFS" ManagedOwnership = "MANAGED" DetectedOwnership = "DEDICATED" NotebookFeature = "NOTEBOOK" DefaultFeature = "DEFAULT" JobWaiting CloudbrainStatus = "WAITING" JobStopped CloudbrainStatus = "STOPPED" JobSucceeded CloudbrainStatus = "SUCCEEDED" JobFailed CloudbrainStatus = "FAILED" JobRunning CloudbrainStatus = "RUNNING" JobTypeDebug JobType = "DEBUG" JobTypeBenchmark JobType = "BENCHMARK" JobTypeSnn4imagenet JobType = "SNN4IMAGENET" JobTypeBrainScore JobType = "BRAINSCORE" JobTypeTrain JobType = "TRAIN" JobTypeInference JobType = "INFERENCE" //notebook ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中 ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中 ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" //创建失败 ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING" //免费资源启动排队中 ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" //免费资源等待启动 ModelArtsStarting ModelArtsJobStatus = "STARTING" //启动中 ModelArtsRestarting ModelArtsJobStatus = "RESTARTING" //重启中 ModelArtsStartFailed ModelArtsJobStatus = "START_FAILED" //启动失败 ModelArtsRunning ModelArtsJobStatus = "RUNNING" //运行中 ModelArtsStopping ModelArtsJobStatus = "STOPPING" //停止中 ModelArtsStopped ModelArtsJobStatus = "STOPPED" //停止 ModelArtsUnavailable ModelArtsJobStatus = "UNAVAILABLE" //故障 ModelArtsDeleted ModelArtsJobStatus = "DELETED" //已删除 ModelArtsResizing ModelArtsJobStatus = "RESIZING" //规格变更中 ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" //规格变更失败 //trainjob ModelArtsTrainJobUnknown ModelArtsJobStatus = "UNKNOWN" //作业状态未知 ModelArtsTrainJobInit ModelArtsJobStatus 
= "INIT" //作业初始化状态 ModelArtsTrainJobImageCreating ModelArtsJobStatus = "IMAGE_CREATING" //作业镜像正在创建 ModelArtsTrainJobImageFailed ModelArtsJobStatus = "IMAGE_FAILED" //作业镜像创建失败 ModelArtsTrainJobSubmitTrying ModelArtsJobStatus = "SUBMIT_TRYING" //作业正在提交 ModelArtsTrainJobSubmitFailed ModelArtsJobStatus = "SUBMIT_FAILED" //作业提交失败 ModelArtsTrainJobDeleteFailed ModelArtsJobStatus = "DELETE_FAILED" //作业删除失败 ModelArtsTrainJobWaiting ModelArtsJobStatus = "WAITING" //作业正在排队中 ModelArtsTrainJobRunning ModelArtsJobStatus = "RUNNING" //作业正在运行中 ModelArtsTrainJobKilling ModelArtsJobStatus = "KILLING" //作业正在取消 ModelArtsTrainJobCompleted ModelArtsJobStatus = "COMPLETED" //作业已经完成 ModelArtsTrainJobFailed ModelArtsJobStatus = "FAILED" //作业运行失败 ModelArtsTrainJobKilled ModelArtsJobStatus = "KILLED" //作业取消成功 ModelArtsTrainJobCanceled ModelArtsJobStatus = "CANCELED" //作业取消 ModelArtsTrainJobLost ModelArtsJobStatus = "LOST" //作业丢失 ModelArtsTrainJobScaling ModelArtsJobStatus = "SCALING" //作业正在扩容 ModelArtsTrainJobSubmitModelFailed ModelArtsJobStatus = "SUBMIT_MODEL_FAILED" //提交模型失败 ModelArtsTrainJobDeployServiceFailed ModelArtsJobStatus = "DEPLOY_SERVICE_FAILED" //部署服务失败 ModelArtsTrainJobCheckInit ModelArtsJobStatus = "CHECK_INIT" //审核作业初始化 ModelArtsTrainJobCheckRunning ModelArtsJobStatus = "CHECK_RUNNING" //审核作业正在运行中 ModelArtsTrainJobCheckRunningCompleted ModelArtsJobStatus = "CHECK_RUNNING_COMPLETED" //审核作业已经完成 ModelArtsTrainJobCheckFailed ModelArtsJobStatus = "CHECK_FAILED" //审核作业失败 ) type Cloudbrain struct { ID int64 `xorm:"pk autoincr"` JobID string `xorm:"INDEX NOT NULL"` JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"` JobName string Status string UserID int64 RepoID int64 SubTaskName string ContainerID string ContainerIp string CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"` UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` Duration int64 TrainJobDuration string Image string //GPU镜像名称 GpuQueue string //GPU类型即GPU队列 ResourceSpecId int //GPU规格id DeletedAt time.Time 
`xorm:"deleted"` CanDebug bool `xorm:"-"` CanDel bool `xorm:"-"` CanModify bool `xorm:"-"` Type int BenchmarkTypeID int BenchmarkChildTypeID int VersionID int64 //版本id VersionName string `xorm:"INDEX"` //当前版本 Uuid string //数据集id DatasetName string VersionCount int //任务的当前版本数量,不包括删除的 IsLatestVersion string //是否是最新版本,1是,0否 CommitID string //提交的仓库代码id PreVersionName string //父版本名称 ComputeResource string //计算资源,例如npu EngineID int64 //引擎id TrainUrl string //输出模型的obs路径 BranchName string //分支名称 Parameters string //传给modelarts的param参数 BootFile string //启动文件 DataUrl string //数据集的obs路径 LogUrl string //日志输出的obs路径 PreVersionId int64 //父版本的版本id FlavorCode string //modelarts上的规格id Description string `xorm:"varchar(256)"` //描述 WorkServerNumber int //节点数 FlavorName string //规格名称 EngineName string //引擎名称 TotalVersionCount int //任务的所有版本数量,包括删除的 LabelName string //标签名称 ModelName string //模型名称 ModelVersion string //模型版本 CkptName string //权重文件名称 ResultUrl string //推理结果的obs路径 User *User `xorm:"-"` Repo *Repository `xorm:"-"` } type CloudbrainInfo struct { Cloudbrain `xorm:"extends"` User `xorm:"extends"` } type CloudBrainLoginResult struct { Code string Msg string Payload map[string]interface{} } type TaskRole struct { Name string `json:"name"` TaskNumber int `json:"taskNumber"` MinSucceededTaskCount int `json:"minSucceededTaskCount"` MinFailedTaskCount int `json:"minFailedTaskCount"` CPUNumber int `json:"cpuNumber"` GPUNumber int `json:"gpuNumber"` MemoryMB int `json:"memoryMB"` ShmMB int `json:"shmMB"` Command string `json:"command"` NeedIBDevice bool `json:"needIBDevice"` IsMainRole bool `json:"isMainRole"` UseNNI bool `json:"useNNI"` } type StHostPath struct { Path string `json:"path"` MountPath string `json:"mountPath"` ReadOnly bool `json:"readOnly"` } type Volume struct { HostPath StHostPath `json:"hostPath"` } type CreateJobParams struct { JobName string `json:"jobName"` RetryCount int8 `json:"retryCount"` GpuType string `json:"gpuType"` Image string `json:"image"` TaskRoles 
[]TaskRole `json:"taskRoles"` Volumes []Volume `json:"volumes"` } type CreateJobResult struct { Code string `json:"code"` Msg string `json:"msg"` Payload map[string]interface{} `json:"payload"` } type GetJobResult struct { Code string `json:"code"` Msg string `json:"msg"` Payload map[string]interface{} `json:"payload"` } type GetImagesResult struct { Code string `json:"code"` Msg string `json:"msg"` Payload GetImagesPayload `json:"payload"` } type GetImagesPayload struct { Count int `json:"count"` TotalPages int `json:"totalPages,omitempty"` ImageInfo []*ImageInfo `json:"rows"` } type CloudbrainsOptions struct { ListOptions RepoID int64 // include all repos if empty UserID int64 JobID string SortType string CloudbrainIDs []int64 JobStatus []string JobStatusNot bool Keyword string Type int JobTypes []string VersionName string IsLatestVersion string JobTypeNot bool NeedRepoInfo bool } type TaskPod struct { TaskRoleStatus struct { Name string `json:"name"` } `json:"taskRoleStatus"` //TaskStatuses []struct { // TaskIndex int `json:"taskIndex"` // PodUID string `json:"podUid"` // PodIP string `json:"podIp"` // PodName string `json:"podName"` // ContainerID string `json:"containerId"` // ContainerIP string `json:"containerIp"` // ContainerGpus string `json:"containerGpus"` // State string `json:"state"` // StartAt time.Time `json:"startAt"` // FinishedAt time.Time `json:"finishedAt"` // ExitCode int `json:"exitCode"` // ExitDiagnostics string `json:"exitDiagnostics"` // RetriedCount int `json:"retriedCount"` // StartTime string // FinishedTime string //} `json:"taskStatuses"` TaskStatuses []TaskStatuses `json:"taskStatuses"` } type TaskStatuses struct { TaskIndex int `json:"taskIndex"` PodUID string `json:"podUid"` PodIP string `json:"podIp"` PodName string `json:"podName"` ContainerID string `json:"containerId"` ContainerIP string `json:"containerIp"` ContainerGpus string `json:"containerGpus"` State string `json:"state"` StartAt time.Time `json:"startAt"` FinishedAt 
time.Time `json:"finishedAt"`
	ExitCode        int    `json:"exitCode"`
	ExitDiagnostics string `json:"exitDiagnostics"`
	RetriedCount    int    `json:"retriedCount"`
	// StartTime and FinishedTime carry no json tag; ConvertToTaskPod derives
	// them from StartAt / FinishedAt for the first status entry.
	StartTime    string
	FinishedTime string
}

// TaskInfo is a JSON-serializable description of a task (user, task and code
// names, selected benchmark categories, code link and GPU type).
type TaskInfo struct {
	Username          string   `json:"username"`
	TaskName          string   `json:"task_name"`
	CodeName          string   `json:"code_name"`
	BenchmarkCategory []string `json:"selected_category"`
	CodeLink          string   `json:"code_link"`
	GpuType           string   `json:"gpu_type"`
}

// ConvertToTaskPod re-encodes the generic map as JSON and decodes it into a
// TaskPod, then fills the display fields StartTime and FinishedTime of the
// first task status.
//
// Both display fields are rendered as "2006-01-02 15:04:05" with a fixed
// +8 hour offset applied to the decoded instant. A finished time that falls in
// year 0001 (i.e. the task has not finished) is shown as "-".
//
// An error is returned when the input cannot be marshalled, when decoding
// reports a problem, or when the payload contains no task statuses. On a
// decode error the pod is still decorated as far as possible, so callers that
// only look at the value keep seeing the fields they saw before.
func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
	const layout = "2006-01-02 15:04:05"

	var taskPod TaskPod

	data, err := json.Marshal(input)
	if err != nil {
		return taskPod, err
	}

	// json.Unmarshal keeps decoding the remaining fields after a type
	// mismatch, so the error is remembered and returned at the end instead of
	// aborting here.
	err = json.Unmarshal(data, &taskPod)

	// Guard the [0] access below: an empty or missing "taskStatuses" array
	// used to cause an index-out-of-range panic.
	if len(taskPod.TaskStatuses) == 0 {
		if err == nil {
			err = fmt.Errorf("convert to task pod: payload contains no task statuses")
		}
		return taskPod, err
	}

	// shifted formats t as UTC wall-clock time moved forward by eight hours.
	shifted := func(t time.Time) string {
		return time.Unix(t.Unix()+8*3600, 0).UTC().Format(layout)
	}

	first := &taskPod.TaskStatuses[0]
	first.StartTime = shifted(first.StartAt)
	first.FinishedTime = shifted(first.FinishedAt)

	// If the task is not finished or stopped, cloudbrain returns
	// 0001-01-01 08:00:00; show the finished time as "-" in that case.
	if strings.HasPrefix(first.FinishedTime, "0001") {
		first.FinishedTime = "-"
	}

	return taskPod, err
}

// JobResultPayload is the decoded form of a job-detail payload: identity,
// job status, task roles, resources, submitted configuration and user info.
type JobResultPayload struct {
	ID        string `json:"id"`
	Name      string `json:"name"`
	Platform  string `json:"platform"`
	JobStatus struct {
		Username           string      `json:"username"`
		State              string      `json:"state"`
		SubState           string      `json:"subState"`
		ExecutionType      string      `json:"executionType"`
		Retries            int         `json:"retries"`
		CreatedTime        int64       `json:"createdTime"`
		CompletedTime      int64       `json:"completedTime"`
		AppID              string      `json:"appId"`
		AppProgress        string      `json:"appProgress"`
		AppTrackingURL     string      `json:"appTrackingUrl"`
		AppLaunchedTime    int64       `json:"appLaunchedTime"`
		AppCompletedTime   interface{} `json:"appCompletedTime"`
		AppExitCode        int         `json:"appExitCode"`
		AppExitDiagnostics string      `json:"appExitDiagnostics"`
		AppExitType        interface{} `json:"appExitType"`
		VirtualCluster     string      `json:"virtualCluster"`
		// StartTime and EndTime are display strings filled in by
		// ConvertToJobResultPayload from CreatedTime / CompletedTime.
		StartTime string
		EndTime   string
	} `json:"jobStatus"`
	TaskRoles map[string]interface{} `json:"taskRoles"`
	Resource  struct {
		CPU          int    `json:"cpu"`
		Memory       string `json:"memory"`
		NvidiaComGpu int    `json:"nvidia.com/gpu"`
	} `json:"resource"`
	Config struct {
		Image     string `json:"image"`
		JobID     string `json:"jobId"`
		GpuType   string `json:"gpuType"`
		JobName   string `json:"jobName"`
		JobType   string `json:"jobType"`
		TaskRoles []struct {
			Name                  string `json:"name"`
			ShmMB                 int    `json:"shmMB"`
			Command               string `json:"command"`
			MemoryMB              int    `json:"memoryMB"`
			CPUNumber             int    `json:"cpuNumber"`
			GpuNumber             int    `json:"gpuNumber"`
			IsMainRole            bool   `json:"isMainRole"`
			TaskNumber            int    `json:"taskNumber"`
			NeedIBDevice          bool   `json:"needIBDevice"`
			MinFailedTaskCount    int    `json:"minFailedTaskCount"`
			MinSucceededTaskCount int    `json:"minSucceededTaskCount"`
		} `json:"taskRoles"`
		RetryCount int `json:"retryCount"`
	} `json:"config"`
	Userinfo struct {
		User  string `json:"user"`
		OrgID string `json:"org_id"`
	} `json:"userinfo"`
}

// ConvertToJobResultPayload re-encodes the generic map as JSON and decodes it
// into a JobResultPayload, then fills JobStatus.StartTime and
// JobStatus.EndTime.
//
// CreatedTime and CompletedTime are divided by 1000 before being handed to
// time.Unix (so they are treated as milliseconds) and formatted in the
// process-local time zone as "2006-01-02 15:04:05". For a job whose state
// equals JobWaiting both strings are "-".
//
// A marshalling error is returned immediately. A decoding error is returned
// together with the payload decoded so far, with the display fields still
// filled in.
func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
	const layout = "2006-01-02 15:04:05"

	var jobResultPayload JobResultPayload

	data, err := json.Marshal(input)
	if err != nil {
		return jobResultPayload, err
	}

	// Best effort: the display fields are derived even when Unmarshal
	// reports an error, and that error is handed back to the caller.
	err = json.Unmarshal(data, &jobResultPayload)

	status := &jobResultPayload.JobStatus
	if status.State == string(JobWaiting) {
		// A waiting job has neither started nor ended yet.
		status.StartTime = "-"
		status.EndTime = "-"
	} else {
		status.StartTime = time.Unix(status.CreatedTime/1000, 0).Format(layout)
		status.EndTime = time.Unix(status.CompletedTime/1000, 0).Format(layout)
	}

	return jobResultPayload, err
}

// ImagesResultPayload wraps a list of image descriptions.
// NOTE(review): the Images field is tagged "taskStatuses"; this looks like a
// copy-paste leftover, but it is kept because it defines the wire format.
type ImagesResultPayload struct {
	Images []struct {
		ID          int    `json:"id"`
		Name        string `json:"name"`
		Place       string `json:"place"`
		Description string `json:"description"`
		Provider    string `json:"provider"`
		Createtime  string `json:"createtime"`
		Remark      string `json:"remark"`
	} `json:"taskStatuses"`
}

// ImageInfo describes a single image entry.
type ImageInfo struct {
	ID          int    `json:"id"`
	Name        string `json:"name"`
	Place       string `json:"place"`
	Description string `json:"description"`
	Provider    string
`json:"provider"` Createtime string `json:"createtime"` Remark string `json:"remark"` IsPublic int `json:"isPublic"` PlaceView string } type Categories struct { Category []*Category `json:"category"` } type Category struct { Id int `json:"id"` Value string `json:"value"` } type BenchmarkTypes struct { BenchmarkType []*BenchmarkType `json:"type"` } type BenchmarkType struct { Id int `json:"id"` First string `json:"first"` //一级算法类型名称 Second []*BenchmarkDataset `json:"second"` } type BenchmarkDataset struct { Id int `json:"id"` Value string `json:"value"` //二级算法类型名称 Attachment string `json:"attachment"` //数据集的uuid Owner string `json:"owner"` //评估脚本所在仓库的拥有者 RepoName string `json:"repo_name"` //评估脚本所在仓库的名称 } type GpuInfos struct { GpuInfo []*GpuInfo `json:"gpu_type"` } type GpuInfo struct { Id int `json:"id"` Value string `json:"value"` Queue string `json:"queue"` } type ResourceSpecs struct { ResourceSpec []*ResourceSpec `json:"resorce_specs"` } type ResourceSpec struct { Id int `json:"id"` CpuNum int `json:"cpu"` GpuNum int `json:"gpu"` MemMiB int `json:"memMiB"` ShareMemMiB int `json:"shareMemMiB"` } type FlavorInfos struct { FlavorInfo []*FlavorInfo `json:"flavor_info"` } type FlavorInfo struct { Id int `json:"id"` Value string `json:"value"` Desc string `json:"desc"` } type PoolInfos struct { PoolInfo []*PoolInfo `json:"pool_info"` } type PoolInfo struct { PoolId string `json:"pool_id"` PoolName string `json:"pool_name"` PoolType string `json:"pool_type"` } type CommitImageParams struct { Ip string `json:"ip"` TaskContainerId string `json:"taskContainerId"` ImageTag string `json:"imageTag"` ImageDescription string `json:"imageDescription"` } type CommitImageResult struct { Code string `json:"code"` Msg string `json:"msg"` Payload map[string]interface{} `json:"payload"` } type GetJobLogParams struct { Size string `json:"size"` Sort string `json:"sort"` QueryInfo QueryInfo `json:"query"` } type QueryInfo struct { MatchInfo MatchInfo `json:"match"` } type MatchInfo 
struct { PodName string `json:"kubernetes.pod.name"` } type GetJobLogResult struct { ScrollID string `json:"_scroll_id"` Took int `json:"took"` TimedOut bool `json:"timed_out"` Shards struct { Total int `json:"total"` Successful int `json:"successful"` Skipped int `json:"skipped"` Failed int `json:"failed"` } `json:"_shards"` Hits struct { Hits []Hits `json:"hits"` } `json:"hits"` } type Hits struct { Index string `json:"_index"` Type string `json:"_type"` ID string `json:"_id"` Source struct { Message string `json:"message"` } `json:"_source"` Sort []int `json:"sort"` } type GetAllJobLogParams struct { Scroll string `json:"scroll"` ScrollID string `json:"scroll_id"` } type DeleteJobLogTokenParams struct { ScrollID string `json:"scroll_id"` } type DeleteJobLogTokenResult struct { Succeeded bool `json:"succeeded"` NumFreed int `json:"num_freed"` } type CloudBrainResult struct { Code string `json:"code"` Msg string `json:"msg"` } type CreateNotebook2Params struct { JobName string `json:"name"` Description string `json:"description"` Duration int64 `json:"duration"` //ms Feature string `json:"feature"` PoolID string `json:"pool_id"` Flavor string `json:"flavor"` ImageID string `json:"image_id"` WorkspaceID string `json:"workspace_id"` Volume VolumeReq `json:"volume"` } type VolumeReq struct { Capacity int `json:"capacity"` Category string `json:"category"` Ownership string `json:"ownership"` Uri string `json:"uri"` } type CreateNotebookParams struct { JobName string `json:"name"` Description string `json:"description"` ProfileID string `json:"profile_id"` Flavor string `json:"flavor"` Spec Spec `json:"spec"` Workspace Workspace `json:"workspace"` Pool Pool `json:"pool"` } type Pool struct { ID string `json:"id"` Name string `json:"name"` Type string `json:"type"` } type Workspace struct { ID string `json:"id"` } type Spec struct { Storage Storage `json:"storage"` AutoStop AutoStop `json:"auto_stop"` } type AutoStop struct { Enable bool `json:"enable"` Duration int 
`json:"duration"` } type Storage struct { Type string `json:"type"` Location Location `json:"location"` } type Location struct { Path string `json:"path"` } type NotebookResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` } type CreateNotebookResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` ID string `json:"id"` Name string `json:"name"` Description string `json:"description"` Status string `json:"status"` CreationTimestamp string `json:"creation_timestamp"` LatestUpdateTimestamp string `json:"latest_update_timestamp"` Profile struct { ID string `json:"id"` Name string `json:"name"` Description string `json:"description"` DeType string `json:"de_type"` FlavorType string `json:"flavor_type"` } `json:"profile"` Flavor string `json:"flavor"` FlavorDetails struct { Name string `json:"name"` Status string `json:"status"` QueuingNum int `json:"queuing_num"` QueueLeftTime int `json:"queue_left_time"` //s Duration int `json:"duration"` //auto_stop_time s } `json:"flavor_details"` } type GetNotebookResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` ID string `json:"id"` Name string `json:"name"` Description string `json:"description"` Status string `json:"status"` CreationTimestamp string `json:"creation_timestamp"` CreateTime string LatestUpdateTimestamp string `json:"latest_update_timestamp"` LatestUpdateTime string Profile struct { ID string `json:"id"` Name string `json:"name"` Description string `json:"description"` DeType string `json:"de_type"` FlavorType string `json:"flavor_type"` } `json:"profile"` Flavor string `json:"flavor"` FlavorDetails struct { Name string `json:"name"` Status string `json:"status"` QueuingNum int `json:"queuing_num"` QueueLeftTime int `json:"queue_left_time"` //s Duration int `json:"duration"` //auto_stop_time s } `json:"flavor_details"` QueuingInfo struct { ID string `json:"id"` Name string `json:"name"` Flavor string 
`json:"flavor"` DeType string `json:"de_type"` Status string `json:"status"` BeginTimestamp int `json:"begin_timestamp"` //time of instance begin in queue BeginTime string RemainTime int `json:"remain_time"` //remain time of instance EndTimestamp int `json:"end_timestamp"` // EndTime string Rank int `json:"rank"` //rank of instance in queue } `json:"queuing_info"` Spec struct { Annotations struct { TargetDomain string `json:"target_domain"` Url string `json:"url"` } `json:"annotations"` } `json:"spec"` } type GetNotebook2Result struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` FailReason string `json:"fail_reason"` ID string `json:"id"` Name string `json:"name"` Description string `json:"description"` Status string `json:"status"` Url string `json:"url"` //实例访问的URL Token string `json:"token"` //notebook鉴权使用的token信息 Flavor string `json:"flavor"` CreateTime string LatestUpdateTime string CreateAt int64 `json:"create_at"` //实例创建的时间,UTC毫秒 UpdateAt int64 `json:"update_at"` //实例最后更新(不包括保活心跳)的时间,UTC毫秒 Image struct { Name string `json:"name"` Status string `json:"status"` QueuingNum int `json:"queuing_num"` QueueLeftTime int `json:"queue_left_time"` //s Duration int `json:"duration"` //auto_stop_time s } `json:"image"` Lease struct { CreateTime int64 `json:"create_at"` //实例创建的时间,UTC毫秒 Duration int64 `json:"duration"` //实例运行时长,以创建时间为起点计算,即“创建时间+duration > 当前时刻”时,系统会自动停止实例 UpdateTime int64 `json:"update_at"` //实例最后更新(不包括保活心跳)的时间,UTC毫秒 } `json:"lease"` //实例自动停止的倒计时信息 VolumeRes struct { Capacity int `json:"capacity"` Category string `json:"category"` MountPath string `json:"mount_path"` Ownership string `json:"ownership"` Status string `json:"status"` } `json:"volume"` } type GetTokenParams struct { Auth Auth `json:"auth"` } type Auth struct { Identity Identity `json:"identity"` Scope Scope `json:"scope"` } type Scope struct { Project Project `json:"project"` } type Project struct { Name string `json:"name"` } type Identity struct { Methods 
[]string `json:"methods"` Password Password `json:"password"` } type Password struct { User NotebookUser `json:"user"` } type NotebookUser struct { Name string `json:"name"` Password string `json:"password"` Domain Domain `json:"domain"` } type Domain struct { Name string `json:"name"` } const ( ActionStart = "start" ActionStop = "stop" ActionRestart = "restart" ActionQueue = "queue" ActionDequeue = "dequeue" ) type NotebookAction struct { Action string `json:"action"` } type NotebookActionResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` CurrentStatus string `json:"current_status"` PreviousState string `json:"previous_state"` Status string `json:"status"` } type NotebookGetJobTokenResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` Token string `json:"token"` } type NotebookDelResult struct { InstanceID string `json:"instance_id"` } type CreateTrainJobParams struct { JobName string `json:"job_name"` Description string `json:"job_desc"` Config Config `json:"config"` WorkspaceID string `json:"workspace_id"` } type Config struct { WorkServerNum int `json:"worker_server_num"` AppUrl string `json:"app_url"` //训练作业的代码目录 BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 Parameter []Parameter `json:"parameter"` DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL EngineID int64 `json:"engine_id"` TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL LogUrl string `json:"log_url"` //UserImageUrl string `json:"user_image_url"` //UserCommand string `json:"user_command"` CreateVersion bool `json:"create_version"` Flavor Flavor `json:"flavor"` PoolID string `json:"pool_id"` } type CreateInferenceJobParams struct { JobName string `json:"job_name"` Description string `json:"job_desc"` InfConfig InfConfig `json:"config"` WorkspaceID string `json:"workspace_id"` } type InfConfig struct { WorkServerNum int `json:"worker_server_num"` AppUrl string `json:"app_url"` //训练作业的代码目录 BootFileUrl 
string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 Parameter []Parameter `json:"parameter"` DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL EngineID int64 `json:"engine_id"` LogUrl string `json:"log_url"` CreateVersion bool `json:"create_version"` Flavor Flavor `json:"flavor"` PoolID string `json:"pool_id"` } type CreateTrainJobVersionParams struct { Description string `json:"job_desc"` Config TrainJobVersionConfig `json:"config"` } type TrainJobVersionConfig struct { WorkServerNum int `json:"worker_server_num"` AppUrl string `json:"app_url"` //训练作业的代码目录 BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 Parameter []Parameter `json:"parameter"` DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL EngineID int64 `json:"engine_id"` TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL LogUrl string `json:"log_url"` Flavor Flavor `json:"flavor"` PoolID string `json:"pool_id"` PreVersionId int64 `json:"pre_version_id"` } type CreateConfigParams struct { ConfigName string `json:"config_name"` Description string `json:"config_desc"` WorkServerNum int `json:"worker_server_num"` AppUrl string `json:"app_url"` //训练作业的代码目录 BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 Parameter []Parameter `json:"parameter"` DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL EngineID int64 `json:"engine_id"` TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL LogUrl string `json:"log_url"` Flavor Flavor `json:"flavor"` PoolID string `json:"pool_id"` } type Parameter struct { Label string `json:"label"` Value string `json:"value"` } type Parameters struct { Parameter []Parameter `json:"parameter"` } type DataSource struct { DatasetID string `json:"dataset_id"` DatasetVersion string `json:"dataset_version"` Type string `json:"type"` DataUrl string `json:"data_url"` } type Volumes struct { Nfs Nfs `json:"nfs"` HostPath HostPath `json:"host_path"` } type Nfs struct { ID string `json:"id"` SourcePath string `json:"src_path"` DestPath string 
`json:"dest_path"` ReadOnly bool `json:"read_only"` } type HostPath struct { SourcePath string `json:"src_path"` DestPath string `json:"dest_path"` ReadOnly bool `json:"read_only"` } type Flavor struct { Code string `json:"code"` } type CreateTrainJobResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` IsSuccess bool `json:"is_success"` JobName string `json:"job_name"` JobID int64 `json:"job_id"` Status int `json:"status"` CreateTime int64 `json:"create_time"` VersionID int64 `json:"version_id"` ResourceID string `json:"resource_id"` VersionName string `json:"version_name"` } type CreateTrainJobConfigResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` IsSuccess bool `json:"is_success"` } type GetResourceSpecsResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` IsSuccess bool `json:"is_success"` SpecTotalCount int `json:"spec_total_count"` Specs []Specs `json:"specs"` } type Specs struct { Core string `json:"core"` Cpu string `json:"cpu"` IsNoResource bool `json:"no_resource"` GpuType string `json:"gpu_type"` SpecID int64 `json:"spec_id"` GpuNum int `json:"gpu_num"` SpecCode string `json:"spec_code"` Storage string `json:"storage"` MaxNum int `json:"max_num"` UnitNum int `json:"unit_num"` InterfaceType int `json:"interface_type"` } type GetConfigListResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` IsSuccess bool `json:"is_success"` ConfigTotalCount int `json:"config_total_count"` ParaConfigs []ParaConfig `json:"configs"` } type ParaConfig struct { ConfigName string `json:"config_name"` ConfigDesc string `json:"config_desc"` CreateTime int64 `json:"create_time"` EngineType int `json:"engine_type"` EngineName string `json:"engine_name"` EngineId int64 `json:"engine_id"` EngineVersion string `json:"engine_version"` UserImageUrl string `json:"user_image_url"` UserCommand string `json:"user_command"` Result GetConfigResult } 
type GetConfigResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` IsSuccess bool `json:"is_success"` ConfigName string `json:"config_name"` Description string `json:"config_desc"` WorkServerNum int `json:"worker_server_num"` AppUrl string `json:"app_url"` //训练作业的代码目录 BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 Parameter []Parameter `json:"parameter"` DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL EngineID int64 `json:"engine_id"` TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL LogUrl string `json:"log_url"` Flavor Flavor `json:"flavor"` PoolID string `json:"pool_id"` } type ErrorResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_message"` IsSuccess bool `json:"is_success"` } type GetTrainJobResult struct { IsSuccess bool `json:"is_success"` JobName string `json:"job_name"` JobID int64 `json:"job_id"` Description string `json:"job_desc"` IntStatus int `json:"status"` Status string LongCreateTime int64 `json:"create_time"` CreateTime string Duration int64 `json:"duration"` //训练作业的运行时间,单位为毫秒 TrainJobDuration string //训练作业的运行时间,格式为hh:mm:ss VersionID int64 `json:"version_id"` ResourceID string `json:"resource_id"` VersionName string `json:"version_name"` PreVersionID int64 `json:"pre_version_id"` WorkServerNum int `json:"worker_server_num"` AppUrl string `json:"app_url"` //训练作业的代码目录 BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 Parameter []Parameter `json:"parameter"` DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL EngineID int64 `json:"engine_id"` EngineName string `json:"engine_name"` EngineVersion string `json:"engine_version"` TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL LogUrl string `json:"log_url"` Flavor Flavor `json:"flavor"` PoolID string `json:"pool_id"` PoolName string `json:"pool_name"` NasMountPath string `json:"nas_mount_path"` NasShareAddr string `json:"nas_share_addr"` DatasetName string ModelMetricList string 
`json:"model_metric_list"` //列表里包含f1_score,recall,precision,accuracy,若有的话 } type GetTrainJobLogResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` IsSuccess bool `json:"is_success"` Content string `json:"content"` Lines int `json:"lines"` StartLine string `json:"start_line"` EndLine string `json:"end_line"` } type GetTrainJobLogFileNamesResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` IsSuccess bool `json:"is_success"` LogFileList []string `json:"log_file_list"` } type TrainJobResult struct { ErrorCode string `json:"error_code"` ErrorMsg string `json:"error_msg"` IsSuccess bool `json:"is_success"` } type LogFile struct { Name string } func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { sess := x.NewSession() defer sess.Close() var cond = builder.NewCond() if opts.RepoID > 0 { cond = cond.And( builder.Eq{"cloudbrain.repo_id": opts.RepoID}, ) } if opts.UserID > 0 { cond = cond.And( builder.Eq{"cloudbrain.user_id": opts.UserID}, ) } if (opts.JobID) != "" { cond = cond.And( builder.Eq{"cloudbrain.job_id": opts.JobID}, ) } if (opts.Type) >= 0 { cond = cond.And( builder.Eq{"cloudbrain.type": opts.Type}, ) } if len(opts.JobTypes) > 0 { if opts.JobTypeNot { cond = cond.And( builder.NotIn("cloudbrain.job_type", opts.JobTypes), ) } else { cond = cond.And( builder.In("cloudbrain.job_type", opts.JobTypes), ) } } if (opts.IsLatestVersion) != "" { cond = cond.And( builder.Eq{"cloudbrain.is_latest_version": opts.IsLatestVersion}, ) } if len(opts.CloudbrainIDs) > 0 { cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs)) } if len(opts.JobStatus) > 0 { if opts.JobStatusNot { cond = cond.And( builder.NotIn("cloudbrain.status", opts.JobStatus), ) } else { cond = cond.And( builder.In("cloudbrain.status", opts.JobStatus), ) } } var count int64 var err error condition := "cloudbrain.user_id = `user`.id" if len(opts.Keyword) == 0 { count, err = 
sess.Where(cond).Count(new(Cloudbrain))
	} else {
		lowerKeyWord := strings.ToLower(opts.Keyword)

		cond = cond.And(builder.Or(builder.Like{"LOWER(cloudbrain.job_name)", lowerKeyWord}, builder.Like{"`user`.lower_name", lowerKeyWord}))
		count, err = sess.Table(&Cloudbrain{}).Where(cond).
			Join("left", "`user`", condition).Count(new(CloudbrainInfo))
	}

	if err != nil {
		return nil, 0, fmt.Errorf("Count: %v", err)
	}

	if opts.Page >= 0 && opts.PageSize > 0 {
		var start int
		if opts.Page == 0 {
			start = 0
		} else {
			start = (opts.Page - 1) * opts.PageSize
		}
		sess.Limit(opts.PageSize, start)
	}

	sess.OrderBy("cloudbrain.created_unix DESC")
	cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
	if err := sess.Table(&Cloudbrain{}).Where(cond).
		Join("left", "`user`", condition).
		Find(&cloudbrains); err != nil {
		return nil, 0, fmt.Errorf("Find: %v", err)
	}
	if opts.NeedRepoInfo {
		var ids []int64
		for _, task := range cloudbrains {
			ids = append(ids, task.RepoID)
		}
		// Repository info is best-effort: when the lookup fails the tasks are
		// returned without Repo set instead of failing the whole listing.
		repositoryMap, err := GetRepositoriesMapByIDs(ids)
		if err == nil {
			for _, task := range cloudbrains {
				task.Repo = repositoryMap[task.RepoID]
			}
		}
	}

	return cloudbrains, count, nil
}

// QueryModelTrainJobVersionList returns all cloudbrain rows with the given
// job id whose status is "COMPLETED", ordered by creation time (newest first),
// together with the number of rows returned.
func QueryModelTrainJobVersionList(jobId string) ([]*CloudbrainInfo, int, error) {
	sess := x.NewSession()
	defer sess.Close()

	cond := builder.NewCond()
	cond = cond.And(
		builder.Eq{"cloudbrain.job_id": jobId},
	)
	cond = cond.And(
		builder.Eq{"cloudbrain.Status": "COMPLETED"},
	)

	sess.OrderBy("cloudbrain.created_unix DESC")
	cloudbrains := make([]*CloudbrainInfo, 0)
	if err := sess.Table(&Cloudbrain{}).Where(cond).
		Find(&cloudbrains); err != nil {
		return nil, 0, fmt.Errorf("Find: %v", err)
	}

	return cloudbrains, len(cloudbrains), nil
}

// QueryModelTrainJobList returns the "COMPLETED" jobs of type "TRAIN" that
// belong to the repository repoId. Only job_id and job_name are selected.
// Rows are read newest first and de-duplicated by job id, so each job id
// appears once, represented by its most recently created row. The second
// result is the number of entries returned.
func QueryModelTrainJobList(repoId int64) ([]*CloudbrainInfo, int, error) {
	sess := x.NewSession()
	defer sess.Close()

	cond := builder.NewCond()
	cond = cond.And(
		builder.Eq{"repo_id": repoId},
	)
	cond = cond.And(
		builder.Eq{"Status": "COMPLETED"},
	)
	cond = cond.And(
		builder.Eq{"job_type": "TRAIN"},
	)

	cloudbrains := make([]*CloudbrainInfo, 0)
	if err := sess.Select("job_id,job_name").Table(&Cloudbrain{}).Where(cond).OrderBy("created_unix DESC").
		Find(&cloudbrains); err != nil {
		return nil, 0, fmt.Errorf("Find: %v", err)
	}

	// Keep only the first row seen for every job id.
	seen := make(map[string]struct{}, len(cloudbrains))
	uniqueElements := make([]*CloudbrainInfo, 0)
	for _, entry := range cloudbrains {
		if _, dup := seen[entry.JobID]; dup {
			continue
		}
		seen[entry.JobID] = struct{}{}
		uniqueElements = append(uniqueElements, entry)
	}

	return uniqueElements, len(uniqueElements), nil
}

// CloudbrainsVersionList lists cloudbrain rows (left-joined with the user
// table) that match the filters set in opts, newest first.
//
// Filters are applied only when set: RepoID > 0, UserID > 0, Type >= 0,
// non-empty JobID, non-empty JobTypes and non-empty CloudbrainIDs.
// Pagination is applied when opts.Page >= 0 and opts.PageSize > 0; page 0 and
// page 1 both start at offset 0. The returned int is the total number of
// matching rows, not the size of the returned page.
func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, error) {
	sess := x.NewSession()
	defer sess.Close()

	cond := builder.NewCond()
	if opts.RepoID > 0 {
		cond = cond.And(
			builder.Eq{"cloudbrain.repo_id": opts.RepoID},
		)
	}
	if opts.UserID > 0 {
		cond = cond.And(
			builder.Eq{"cloudbrain.user_id": opts.UserID},
		)
	}
	if opts.Type >= 0 {
		cond = cond.And(
			builder.Eq{"cloudbrain.type": opts.Type},
		)
	}
	if opts.JobID != "" {
		cond = cond.And(
			builder.Eq{"cloudbrain.job_id": opts.JobID},
		)
	}
	if len(opts.JobTypes) > 0 {
		cond = cond.And(
			builder.In("cloudbrain.job_type", opts.JobTypes),
		)
	}
	if len(opts.CloudbrainIDs) > 0 {
		cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
	}

	count, err := sess.Where(cond).Count(new(Cloudbrain))
	if err != nil {
		return nil, 0, fmt.Errorf("Count: %v", err)
	}

	if opts.Page >= 0 && opts.PageSize > 0 {
		start := 0
		if opts.Page > 0 {
			start = (opts.Page - 1) * opts.PageSize
		}
		sess.Limit(opts.PageSize, start)
	}

	sess.OrderBy("cloudbrain.created_unix DESC")
	cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
	if err := sess.Table(&Cloudbrain{}).Where(cond).
		Join("left", "`user`", "cloudbrain.user_id = `user`.id").
		Find(&cloudbrains); err != nil {
		return nil, 0, fmt.Errorf("Find: %v", err)
	}

	return cloudbrains, int(count), nil
}

// CreateCloudbrain inserts cloudbrain as a new row.
func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
	_, err = x.Insert(cloudbrain)
	return err
}

// getRepoCloudBrain loads a single row using cb as the query bean (xorm Get
// matches on the bean's non-zero fields). It returns ErrJobNotExist when no
// row matches.
func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
	has, err := x.Get(cb)
	if err != nil {
		return nil, err
	}
	if !has {
		return nil, ErrJobNotExist{}
	}
	return cb, nil
}

// GetRepoCloudBrainByJobID returns the cloudbrain row with the given job id in
// the given repository, or ErrJobNotExist.
func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
	return getRepoCloudBrain(&Cloudbrain{JobID: jobID, RepoID: repoID})
}

// GetCloudbrainByJobID returns a cloudbrain row with the given job id, or
// ErrJobNotExist.
func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
	return getRepoCloudBrain(&Cloudbrain{JobID: jobID})
}

// GetCloudbrainByJobIDAndVersionName returns the cloudbrain row with the given
// job id and version name, or ErrJobNotExist.
func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) {
	return getRepoCloudBrain(&Cloudbrain{JobID: jobID, VersionName: versionName})
}

// GetCloudbrainByJobIDAndIsLatestVersion returns the cloudbrain row with the
// given job id and is-latest-version marker, or ErrJobNotExist.
func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string) (*Cloudbrain, error) {
	return getRepoCloudBrain(&Cloudbrain{JobID: jobID, IsLatestVersion: isLatestVersion})
}

// GetCloudbrainsNeededStopByUserID returns the rows owned by userID whose
// status is not JobStopped. Only job_id, status, type, job_type and version_id
// are loaded.
func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
	cloudBrains := make([]*Cloudbrain, 0)
	err := x.Cols("job_id", "status", "type", "job_type", "version_id").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
	return cloudBrains, err
}

// GetCloudbrainsNeededStopByRepoID returns the rows of repository repoID whose
// status is not JobStopped. Only job_id, status, type, job_type and version_id
// are loaded.
func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
	cloudBrains := make([]*Cloudbrain, 0)
	err := x.Cols("job_id", "status", "type", "job_type", "version_id").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
	return cloudBrains, err
}

// SetCloudbrainStatusByJobID updates the status column of every row with the
// given job id.
func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
	cb := &Cloudbrain{JobID: jobID, Status: string(status)}
	_, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
	return err
}

// SetTrainJobStatusByJobID updates status, duration and train_job_duration of
// every row with the given job id.
func SetTrainJobStatusByJobID(jobID string, status string, duration int64, trainjobduration string) (err error) {
	cb := &Cloudbrain{JobID: jobID, Status: status, Duration: duration, TrainJobDuration: trainjobduration}
	_, err = x.Cols("status", "duration", "train_job_duration").Where("cloudbrain.job_id=?", jobID).Update(cb)
	return err
}

// SetVersionCountAndLatestVersion updates the version count, is-latest-version
// marker and total version count of the row identified by jobID and
// versionName.
func SetVersionCountAndLatestVersion(jobID string, versionName string, versionCount int, isLatestVersion string, totalVersionCount int) (err error) {
	cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion, TotalVersionCount: totalVersionCount}
	// NOTE(review): "version_Count" is mixed-case unlike the other column
	// names; kept as-is, confirm the column filter matches case-insensitively.
	_, err = x.Cols("version_Count", "is_latest_version", "total_version_count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb)
	return err
}

// UpdateJob writes all columns of job to the row with job.ID.
func UpdateJob(job *Cloudbrain) error {
	return updateJob(x, job)
}

// updateJob writes all columns of job (AllCols, so zero values are written
// too) to the row with job.ID using engine e.
func updateJob(e Engine, job *Cloudbrain) error {
	_, err := e.ID(job.ID).AllCols().Update(job)
	return err
}

// UpdateTrainJobVersion updates status and train_job_duration of the row
// identified by job.JobID and job.VersionName.
func UpdateTrainJobVersion(job *Cloudbrain) error {
	return updateJobTrainVersion(x, job)
}

// updateJobTrainVersion updates status and train_job_duration of the row
// identified by job.JobID and job.VersionName using engine e.
func updateJobTrainVersion(e Engine, job *Cloudbrain) error {
	// The explicit type keeps the file's xorm import referenced.
	var sess *xorm.Session = e.Where("job_id = ? AND version_name=?", job.JobID, job.VersionName)
	_, err := sess.Cols("status", "train_job_duration").Update(job)
	return err
}

// DeleteJob deletes the row with job.ID.
func DeleteJob(job *Cloudbrain) error {
	return deleteJob(x, job)
}

// deleteJob deletes the row with job.ID using engine e.
func deleteJob(e Engine, job *Cloudbrain) error {
	_, err := e.ID(job.ID).Delete(job)
	return err
}

// GetCloudbrainByName returns a cloudbrain row with the given job name, or
// ErrJobNotExist.
func GetCloudbrainByName(jobName string) (*Cloudbrain, error) {
	return getRepoCloudBrain(&Cloudbrain{JobName: jobName})
}

// CanDelJob reports whether user may delete job.
//
// Deletion requires a signed-in user and a job whose status is one of
// JobStopped, JobFailed, ModelArtsStartFailed or ModelArtsCreateFailed. The
// user must then be a site admin, have admin access to the job's repository,
// or be the job's creator with at least write access. Any failure to load the
// repository or the user's permission is logged and treated as "not allowed".
func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
	if !isSigned {
		return false
	}
	switch job.Status {
	case string(JobStopped), string(JobFailed), string(ModelArtsStartFailed), string(ModelArtsCreateFailed):
		// Job is in a state from which it may be deleted.
	default:
		return false
	}

	repo, err := GetRepositoryByID(job.RepoID)
	if err != nil {
		log.Error("GetRepositoryByID failed:%v", err.Error())
		return false
	}

	// The error must be captured here: discarding it would leave the check
	// below testing the (already nil) error from GetRepositoryByID.
	permission, err := GetUserRepoPermission(repo, user)
	if err != nil {
		log.Error("GetUserRepoPermission failed:%v", err.Error())
		return false
	}

	isOwnerWithWrite := user.ID == job.UserID && permission.AccessMode >= AccessModeWrite
	return isOwnerWithWrite || user.IsAdmin || permission.AccessMode >= AccessModeAdmin
}

// GetCloudBrainUnStoppedJob returns up to 100 rows whose status is not one of
// the listed stopped, failed, deleted or completed states.
func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) {
	cloudbrains := make([]*Cloudbrain, 0, 10)
	// Run the query before building the return values: Find mutates the slice
	// through its pointer, and Go does not specify whether a plain variable in
	// a return list is read before or after a call in the same list.
	err := x.
		NotIn("status",
			JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted,
			ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed,
			ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed).
		Limit(100).
		Find(&cloudbrains)
	return cloudbrains, err
}

// GetCloudbrainCountByUserID counts the TypeCloudBrainOne rows of the given
// job type owned by userID whose status is JobWaiting or JobRunning.
func GetCloudbrainCountByUserID(userID int64, jobType string) (int, error) {
	count, err := x.In("status", JobWaiting, JobRunning).And("job_type = ? and user_id = ? and type = ?", jobType, userID, TypeCloudBrainOne).Count(new(Cloudbrain))
	return int(count), err
}

// GetCloudbrainNotebookCountByUserID counts the TypeCloudBrainTwo rows of job
// type JobTypeDebug owned by userID whose status is one of the listed
// queuing, creating, starting, resizing, running or restarting states.
func GetCloudbrainNotebookCountByUserID(userID int64) (int, error) {
	count, err := x.In("status", ModelArtsCreateQueue, ModelArtsCreating, ModelArtsStarting, ModelArtsReadyToStart, ModelArtsResizing, ModelArtsStartQueuing, ModelArtsRunning, ModelArtsRestarting).
		And("job_type = ? and user_id = ? and type = ?", JobTypeDebug, userID, TypeCloudBrainTwo).Count(new(Cloudbrain))
	return int(count), err
}

// GetCloudbrainTrainJobCountByUserID counts the TypeCloudBrainTwo rows of job
// type JobTypeTrain owned by userID whose status is one of the listed
// train-job states.
func GetCloudbrainTrainJobCountByUserID(userID int64) (int, error) {
	count, err := x.In("status", ModelArtsTrainJobInit, ModelArtsTrainJobImageCreating, ModelArtsTrainJobSubmitTrying, ModelArtsTrainJobWaiting, ModelArtsTrainJobRunning, ModelArtsTrainJobScaling, ModelArtsTrainJobCheckInit, ModelArtsTrainJobCheckRunning, ModelArtsTrainJobCheckRunningCompleted).
		And("job_type = ? and user_id = ? and type = ?", JobTypeTrain, userID, TypeCloudBrainTwo).Count(new(Cloudbrain))
	return int(count), err
}

// GetCloudbrainInferenceJobCountByUserID counts the TypeCloudBrainTwo rows of
// job type JobTypeInference owned by userID whose status is one of the listed
// train-job states.
func GetCloudbrainInferenceJobCountByUserID(userID int64) (int, error) {
	count, err := x.In("status", ModelArtsTrainJobInit, ModelArtsTrainJobImageCreating, ModelArtsTrainJobSubmitTrying, ModelArtsTrainJobWaiting, ModelArtsTrainJobRunning, ModelArtsTrainJobScaling, ModelArtsTrainJobCheckInit, ModelArtsTrainJobCheckRunning, ModelArtsTrainJobCheckRunningCompleted).
		And("job_type = ? and user_id = ? and type = ?", JobTypeInference, userID, TypeCloudBrainTwo).Count(new(Cloudbrain))
	return int(count), err
}

// UpdateInferenceJob updates status and train_job_duration of the rows with
// job.JobID.
func UpdateInferenceJob(job *Cloudbrain) error {
	return updateInferenceJob(x, job)
}

// updateInferenceJob updates status and train_job_duration of the rows with
// job.JobID using engine e.
func updateInferenceJob(e Engine, job *Cloudbrain) error {
	// The explicit type keeps the file's xorm import referenced.
	var sess *xorm.Session = e.Where("job_id = ?", job.JobID)
	_, err := sess.Cols("status", "train_job_duration").Update(job)
	return err
}

// RestartCloudbrain replaces old with new inside a single transaction: old is
// deleted and new is inserted, and the transaction is rolled back if either
// step fails.
func RestartCloudbrain(old *Cloudbrain, new *Cloudbrain) (err error) {
	sess := x.NewSession()
	defer sess.Close()

	if err = sess.Begin(); err != nil {
		return err
	}

	if _, err = sess.Delete(old); err != nil {
		sess.Rollback()
		return err
	}

	if _, err = sess.Insert(new); err != nil {
		sess.Rollback()
		return err
	}

	return sess.Commit()
}