You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 17 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago

  1. package models
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "time"
  7. "xorm.io/xorm"
  8. "code.gitea.io/gitea/modules/setting"
  9. "code.gitea.io/gitea/modules/timeutil"
  10. "xorm.io/builder"
  11. )
  // Named string types for the job states and job kinds used by cloudbrain tasks.
  type (
  	// CloudbrainStatus is the state string of a cloudbrain job.
  	CloudbrainStatus string
  	// JobType is the kind of a job.
  	JobType string
  	// ModelArtsJobStatus is the state string of a ModelArts job.
  	ModelArtsJobStatus string
  )

  // Cloudbrain job states.
  const (
  	JobWaiting   CloudbrainStatus = "WAITING"
  	JobStopped   CloudbrainStatus = "STOPPED"
  	JobSucceeded CloudbrainStatus = "SUCCEEDED"
  	JobFailed    CloudbrainStatus = "FAILED"
  	JobRunning   CloudbrainStatus = "RUNNING"
  )

  // Job kinds.
  const (
  	JobTypeDebug     JobType = "DEBUG"
  	JobTypeBenchmark JobType = "BENCHMARK"
  )

  // ModelArts job states.
  const (
  	ModelArtsCreateQueue  ModelArtsJobStatus = "CREATE_QUEUING" // queuing for creation on free resources
  	ModelArtsCreating     ModelArtsJobStatus = "CREATING"       // being created
  	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED"  // creation failed
  	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING"  // queuing for start on free resources
  	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resources, waiting to start
  	ModelArtsStarting     ModelArtsJobStatus = "STARTING"       // starting
  	ModelArtsRestarting   ModelArtsJobStatus = "RESTARTING"     // restarting
  	ModelArtsStartFailed  ModelArtsJobStatus = "START_FAILED"   // start failed
  	ModelArtsRunning      ModelArtsJobStatus = "RUNNING"        // running
  	ModelArtsStopping     ModelArtsJobStatus = "STOPPING"       // stopping
  	ModelArtsStopped      ModelArtsJobStatus = "STOPPED"        // stopped
  	ModelArtsUnavailable  ModelArtsJobStatus = "UNAVAILABLE"    // faulty
  	ModelArtsDeleted      ModelArtsJobStatus = "DELETED"        // deleted
  	ModelArtsResizing     ModelArtsJobStatus = "RESIZING"       // flavor (spec) change in progress
  	ModelArtsResizFailed  ModelArtsJobStatus = "RESIZE_FAILED"  // flavor (spec) change failed
  )
  39. type Cloudbrain struct {
  40. ID int64 `xorm:"pk autoincr"`
  41. JobID string `xorm:"INDEX NOT NULL"`
  42. JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
  43. JobName string `xorm:"INDEX"`
  44. Status string `xorm:"INDEX"`
  45. UserID int64 `xorm:"INDEX"`
  46. RepoID int64 `xorm:"INDEX"`
  47. SubTaskName string `xorm:"INDEX"`
  48. ContainerID string
  49. ContainerIp string
  50. CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
  51. UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
  52. DeletedAt time.Time `xorm:"deleted"`
  53. CanDebug bool `xorm:"-"`
  54. Type int `xorm:"INDEX DEFAULT 0"`
  55. User *User `xorm:"-"`
  56. Repo *Repository `xorm:"-"`
  57. }
  // CloudBrainLoginResult carries a result code, a message and a free-form
  // payload. The fields have no JSON tags, so encoding/json uses the Go field
  // names as keys.
  type CloudBrainLoginResult struct {
  	Code, Msg string
  	Payload   map[string]interface{}
  }
  // StHostPath describes one host-path mount: the source path, where it is
  // mounted, and whether the mount is read-only.
  type StHostPath struct {
  	Path      string `json:"path"`
  	MountPath string `json:"mountPath"`
  	ReadOnly  bool   `json:"readOnly"`
  }

  // Volume wraps a single host-path mount under the "hostPath" key.
  type Volume struct {
  	HostPath StHostPath `json:"hostPath"`
  }

  // TaskRole is one entry of the "taskRoles" list in a job-creation request.
  type TaskRole struct {
  	Name string `json:"name"`

  	// Task counts.
  	TaskNumber            int8 `json:"taskNumber"`
  	MinSucceededTaskCount int8 `json:"minSucceededTaskCount"`
  	MinFailedTaskCount    int8 `json:"minFailedTaskCount"`

  	// Resources; the memory sizes are in MB as the field names state.
  	CPUNumber int8 `json:"cpuNumber"`
  	GPUNumber int8 `json:"gpuNumber"`
  	MemoryMB  int  `json:"memoryMB"`
  	ShmMB     int  `json:"shmMB"`

  	Command string `json:"command"`

  	// Flags.
  	NeedIBDevice bool `json:"needIBDevice"`
  	IsMainRole   bool `json:"isMainRole"`
  	UseNNI       bool `json:"useNNI"`
  }

  // CreateJobParams is the JSON body of a job-creation request.
  type CreateJobParams struct {
  	JobName    string     `json:"jobName"`
  	RetryCount int8       `json:"retryCount"`
  	GpuType    string     `json:"gpuType"`
  	Image      string     `json:"image"`
  	TaskRoles  []TaskRole `json:"taskRoles"`
  	Volumes    []Volume   `json:"volumes"`
  }
  // CreateJobResult is the JSON envelope decoded from a job-creation
  // response: a code, a message and an untyped payload object.
  type CreateJobResult struct {
  	Code    string                 `json:"code"`
  	Msg     string                 `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }
  // GetJobResult is the JSON envelope decoded from a job query response:
  // a code, a message and an untyped payload object.
  type GetJobResult struct {
  	Code    string                 `json:"code"`
  	Msg     string                 `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }
  103. type GetImagesResult struct {
  104. Code string `json:"code"`
  105. Msg string `json:"msg"`
  106. Payload map[string]*ImageInfo `json:"payload"`
  107. }
  108. type CloudbrainsOptions struct {
  109. ListOptions
  110. RepoID int64 // include all repos if empty
  111. UserID int64
  112. JobID int64
  113. SortType string
  114. CloudbrainIDs []int64
  115. // JobStatus CloudbrainStatus
  116. Type int
  117. }
  // TaskPod is the decoded form of a task-role status document: the role's
  // name plus one status entry per task.
  type TaskPod struct {
  	TaskRoleStatus struct {
  		Name string `json:"name"`
  	} `json:"taskRoleStatus"`
  	TaskStatuses []struct {
  		TaskIndex       int       `json:"taskIndex"`
  		PodUID          string    `json:"podUid"`
  		PodIP           string    `json:"podIp"`
  		PodName         string    `json:"podName"`
  		ContainerID     string    `json:"containerId"`
  		ContainerIP     string    `json:"containerIp"`
  		ContainerGpus   string    `json:"containerGpus"`
  		State           string    `json:"state"`
  		StartAt         time.Time `json:"startAt"`
  		FinishedAt      time.Time `json:"finishedAt"`
  		ExitCode        int       `json:"exitCode"`
  		ExitDiagnostics string    `json:"exitDiagnostics"`
  		RetriedCount    int       `json:"retriedCount"`
  		// StartTime and FinishedTime are formatted copies of StartAt and
  		// FinishedAt; ConvertToTaskPod fills them in for the first entry.
  		StartTime    string
  		FinishedTime string
  	} `json:"taskStatuses"`
  }

  // TaskInfo carries a user name, a task name and a code name.
  type TaskInfo struct {
  	Username string `json:"username"`
  	TaskName string `json:"task_name"`
  	CodeName string `json:"code_name"`
  }

  // ConvertToTaskPod converts a generic JSON object into a TaskPod by
  // round-tripping it through encoding/json.
  //
  // For the first task status it also fills StartTime and FinishedTime with
  // StartAt and FinishedAt shifted by a fixed +8 hours and formatted as
  // "2006-01-02 15:04:05". Only the first entry is formatted.
  //
  // An error is returned when input cannot be marshalled, when decoding
  // reports an error, or when the result contains no task statuses. Earlier
  // versions panicked in the last case. When decoding fails but at least one
  // task status was decoded, the display times are still filled in and the
  // decode error is returned alongside the partially decoded value.
  func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  	const (
  		displayLayout = "2006-01-02 15:04:05"
  		offsetSeconds = 8 * 3600 // fixed UTC+8 shift applied to display times
  	)

  	var taskPod TaskPod

  	data, err := json.Marshal(input)
  	if err != nil {
  		return taskPod, fmt.Errorf("marshal task pod input: %w", err)
  	}
  	if err = json.Unmarshal(data, &taskPod); err != nil {
  		err = fmt.Errorf("unmarshal task pod: %w", err)
  	}

  	// Guard the index below: nothing to format without a first entry.
  	if len(taskPod.TaskStatuses) == 0 {
  		if err == nil {
  			err = errors.New("task pod has no task statuses")
  		}
  		return taskPod, err
  	}

  	first := &taskPod.TaskStatuses[0]
  	first.StartTime = time.Unix(first.StartAt.Unix()+offsetSeconds, 0).UTC().Format(displayLayout)
  	first.FinishedTime = time.Unix(first.FinishedAt.Unix()+offsetSeconds, 0).UTC().Format(displayLayout)
  	return taskPod, err
  }
  // JobResultPayload is the decoded form of the payload object of a job
  // query response.
  type JobResultPayload struct {
  	ID       string `json:"id"`
  	Name     string `json:"name"`
  	Platform string `json:"platform"`

  	JobStatus struct {
  		Username           string      `json:"username"`
  		State              string      `json:"state"`
  		SubState           string      `json:"subState"`
  		ExecutionType      string      `json:"executionType"`
  		Retries            int         `json:"retries"`
  		CreatedTime        int64       `json:"createdTime"`
  		CompletedTime      int64       `json:"completedTime"`
  		AppID              string      `json:"appId"`
  		AppProgress        string      `json:"appProgress"`
  		AppTrackingURL     string      `json:"appTrackingUrl"`
  		AppLaunchedTime    int64       `json:"appLaunchedTime"`
  		AppCompletedTime   interface{} `json:"appCompletedTime"`
  		AppExitCode        int         `json:"appExitCode"`
  		AppExitDiagnostics string      `json:"appExitDiagnostics"`
  		AppExitType        interface{} `json:"appExitType"`
  		VirtualCluster     string      `json:"virtualCluster"`
  		// StartTime and EndTime are formatted copies of CreatedTime and
  		// CompletedTime, filled in by ConvertToJobResultPayload.
  		StartTime string
  		EndTime   string
  	} `json:"jobStatus"`

  	TaskRoles map[string]interface{} `json:"taskRoles"`

  	Resource struct {
  		CPU          int    `json:"cpu"`
  		Memory       string `json:"memory"`
  		NvidiaComGpu int    `json:"nvidia.com/gpu"`
  	} `json:"resource"`

  	Config struct {
  		Image     string `json:"image"`
  		JobID     string `json:"jobId"`
  		GpuType   string `json:"gpuType"`
  		JobName   string `json:"jobName"`
  		JobType   string `json:"jobType"`
  		TaskRoles []struct {
  			Name                  string `json:"name"`
  			ShmMB                 int    `json:"shmMB"`
  			Command               string `json:"command"`
  			MemoryMB              int    `json:"memoryMB"`
  			CPUNumber             int    `json:"cpuNumber"`
  			GpuNumber             int    `json:"gpuNumber"`
  			IsMainRole            bool   `json:"isMainRole"`
  			TaskNumber            int    `json:"taskNumber"`
  			NeedIBDevice          bool   `json:"needIBDevice"`
  			MinFailedTaskCount    int    `json:"minFailedTaskCount"`
  			MinSucceededTaskCount int    `json:"minSucceededTaskCount"`
  		} `json:"taskRoles"`
  		RetryCount int `json:"retryCount"`
  	} `json:"config"`

  	Userinfo struct {
  		User  string `json:"user"`
  		OrgID string `json:"org_id"`
  	} `json:"userinfo"`
  }

  // ConvertToJobResultPayload converts a generic JSON object into a
  // JobResultPayload by round-tripping it through encoding/json.
  //
  // JobStatus.StartTime and JobStatus.EndTime are derived from CreatedTime and
  // CompletedTime, which are divided by 1000 before being handed to time.Unix
  // (i.e. treated as milliseconds), and formatted as "2006-01-02 15:04:05" in
  // the process's local time zone.
  //
  // A marshalling failure is returned as such; it used to be dropped and then
  // reappear as an unrelated decode error. A decode error is returned together
  // with whatever was decoded, with the display times still filled in.
  func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  	const displayLayout = "2006-01-02 15:04:05"

  	var jobResultPayload JobResultPayload

  	data, err := json.Marshal(input)
  	if err != nil {
  		return jobResultPayload, fmt.Errorf("marshal job result payload input: %w", err)
  	}
  	if err = json.Unmarshal(data, &jobResultPayload); err != nil {
  		err = fmt.Errorf("unmarshal job result payload: %w", err)
  	}

  	status := &jobResultPayload.JobStatus
  	status.StartTime = time.Unix(status.CreatedTime/1000, 0).Format(displayLayout)
  	status.EndTime = time.Unix(status.CompletedTime/1000, 0).Format(displayLayout)
  	return jobResultPayload, err
  }
  // ImagesResultPayload holds a list of image descriptions.
  type ImagesResultPayload struct {
  	// NOTE(review): the list is decoded from the "taskStatuses" key, which
  	// looks copied from TaskPod; confirm against the actual response.
  	Images []struct {
  		ID          int    `json:"id"`
  		Name        string `json:"name"`
  		Place       string `json:"place"`
  		Description string `json:"description"`
  		Provider    string `json:"provider"`
  		Createtime  string `json:"createtime"`
  		Remark      string `json:"remark"`
  	} `json:"taskStatuses"`
  }
  // ImageInfo describes one image.
  type ImageInfo struct {
  	ID          int    `json:"id"`
  	Name        string `json:"name"`
  	Place       string `json:"place"`
  	Description string `json:"description"`
  	Provider    string `json:"provider"`
  	Createtime  string `json:"createtime"`
  	Remark      string `json:"remark"`

  	// PlaceView has no JSON tag, so it is encoded under its Go field name.
  	PlaceView string
  }
  // CommitImageParams is the JSON body of an image-commit request: the
  // container to commit (by IP and container id) and the tag and description
  // for the resulting image.
  type CommitImageParams struct {
  	Ip              string `json:"ip"`
  	TaskContainerId string `json:"taskContainerId"`

  	ImageTag         string `json:"imageTag"`
  	ImageDescription string `json:"imageDescription"`
  }
  // CommitImageResult is the JSON envelope decoded from an image-commit
  // response: a code, a message and an untyped payload object.
  type CommitImageResult struct {
  	Code    string                 `json:"code"`
  	Msg     string                 `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }
  // StopJobResult is the JSON envelope decoded from a stop-job response; it
  // carries only a code and a message.
  type StopJobResult struct {
  	Code string `json:"code"`
  	Msg  string `json:"msg"`
  }
  // Location is a storage location identified by its path.
  type Location struct {
  	Path string `json:"path"`
  }

  // Storage describes a notebook's storage: its type and its location.
  type Storage struct {
  	Type     string   `json:"type"`
  	Location Location `json:"location"`
  }

  // AutoStop configures automatic stopping of a notebook.
  type AutoStop struct {
  	Enable   bool `json:"enable"`
  	Duration int  `json:"duration"`
  }

  // Spec groups the storage and auto-stop settings of a notebook.
  type Spec struct {
  	Storage  Storage  `json:"storage"`
  	AutoStop AutoStop `json:"auto_stop"`
  }

  // Workspace identifies a workspace by id.
  type Workspace struct {
  	ID string `json:"id"`
  }

  // CreateNotebookParams is the JSON body of a notebook-creation request.
  // JobName is sent under the "name" key.
  type CreateNotebookParams struct {
  	JobName     string    `json:"name"`
  	Description string    `json:"description"`
  	ProfileID   string    `json:"profile_id"`
  	Flavor      string    `json:"flavor"`
  	Spec        Spec      `json:"spec"`
  	Workspace   Workspace `json:"workspace"`
  }
  // CreateNotebookResult is the decoded response of a notebook-creation
  // request. ErrorCode and ErrorMsg carry the error fields of the response;
  // the remaining fields describe the notebook.
  type CreateNotebookResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg  string `json:"error_msg"`

  	ID                    string `json:"id"`
  	Name                  string `json:"name"`
  	Description           string `json:"description"`
  	Status                string `json:"status"`
  	CreationTimestamp     string `json:"creation_timestamp"`
  	LatestUpdateTimestamp string `json:"latest_update_timestamp"`

  	Profile struct {
  		ID          string `json:"id"`
  		Name        string `json:"name"`
  		Description string `json:"description"`
  		DeType      string `json:"de_type"`
  		FlavorType  string `json:"flavor_type"`
  	} `json:"profile"`

  	Flavor string `json:"flavor"`

  	FlavorDetails struct {
  		Name          string `json:"name"`
  		Status        string `json:"status"`
  		QueuingNum    int    `json:"queuing_num"`
  		QueueLeftTime int    `json:"queue_left_time"` // seconds
  		Duration      int    `json:"duration"`        // auto-stop time, in seconds
  	} `json:"flavor_details"`
  }
  // GetNotebookResult is the decoded response of a notebook query. It has the
  // same fields as CreateNotebookResult plus QueuingInfo.
  type GetNotebookResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg  string `json:"error_msg"`

  	ID                    string `json:"id"`
  	Name                  string `json:"name"`
  	Description           string `json:"description"`
  	Status                string `json:"status"`
  	CreationTimestamp     string `json:"creation_timestamp"`
  	LatestUpdateTimestamp string `json:"latest_update_timestamp"`

  	Profile struct {
  		ID          string `json:"id"`
  		Name        string `json:"name"`
  		Description string `json:"description"`
  		DeType      string `json:"de_type"`
  		FlavorType  string `json:"flavor_type"`
  	} `json:"profile"`

  	Flavor string `json:"flavor"`

  	FlavorDetails struct {
  		Name          string `json:"name"`
  		Status        string `json:"status"`
  		QueuingNum    int    `json:"queuing_num"`
  		QueueLeftTime int    `json:"queue_left_time"` // seconds
  		Duration      int    `json:"duration"`        // auto-stop time, in seconds
  	} `json:"flavor_details"`

  	QueuingInfo struct {
  		ID             string `json:"id"`
  		Name           string `json:"name"`
  		Flavor         string `json:"flavor"`
  		DeType         string `json:"de_type"`
  		Status         string `json:"status"`
  		BeginTimestamp int    `json:"begin_timestamp"` // time the instance entered the queue
  		RemainTime     int    `json:"remain_time"`     // remaining time of the instance
  		EndTimestamp   int    `json:"end_timestamp"`
  		Rank           int    `json:"rank"` // rank of the instance in the queue
  	} `json:"queuing_info"`
  }
  // The types below form the nested JSON body of a token request:
  //
  //	{"auth": {"identity": {"methods": [...], "password": {"user": {...}}},
  //	          "scope": {"project": {"name": ...}}}}
  //
  // Struct tags must use the quoted form `json:"key"`. Most of these tags were
  // previously written without quotes (e.g. `json:auth`), which encoding/json
  // cannot parse, so those fields were emitted under their capitalised Go
  // names ("Auth", "Identity", ...) instead of the intended lowercase keys.

  // GetTokenParams is the top-level body of a token request.
  type GetTokenParams struct {
  	Auth Auth `json:"auth"`
  }

  // Auth combines the identity to authenticate with and the requested scope.
  type Auth struct {
  	Identity Identity `json:"identity"`
  	Scope    Scope    `json:"scope"`
  }

  // Scope names the project the token is requested for.
  type Scope struct {
  	Project Project `json:"project"`
  }

  // Project identifies a project by name.
  type Project struct {
  	Name string `json:"name"`
  }

  // Identity lists the authentication methods and the password credentials.
  type Identity struct {
  	Methods  []string `json:"methods"`
  	Password Password `json:"password"`
  }

  // Password wraps the user whose credentials are used.
  type Password struct {
  	User NotebookUser `json:"user"`
  }

  // NotebookUser holds a user's name, password and domain.
  type NotebookUser struct {
  	Name     string `json:"name"`
  	Password string `json:"password"`
  	Domain   Domain `json:"domain"`
  }

  // Domain identifies a domain by name.
  type Domain struct {
  	Name string `json:"name"`
  }
  368. func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
  369. sess := x.NewSession()
  370. defer sess.Close()
  371. var cond = builder.NewCond()
  372. if opts.RepoID > 0 {
  373. cond = cond.And(
  374. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  375. )
  376. }
  377. if opts.UserID > 0 {
  378. cond = cond.And(
  379. builder.Eq{"cloudbrain.user_id": opts.UserID},
  380. )
  381. }
  382. if (opts.JobID) > 0 {
  383. cond = cond.And(
  384. builder.Eq{"cloudbrain.job_id": opts.JobID},
  385. )
  386. }
  387. if (opts.Type) >= 0 {
  388. cond = cond.And(
  389. builder.Eq{"cloudbrain.type": opts.Type},
  390. )
  391. }
  392. // switch opts.JobStatus {
  393. // case JobWaiting:
  394. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  395. // case JobFailed:
  396. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  397. // case JobStopped:
  398. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  399. // case JobSucceeded:
  400. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  401. // }
  402. if len(opts.CloudbrainIDs) > 0 {
  403. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  404. }
  405. count, err := sess.Where(cond).Count(new(Cloudbrain))
  406. if err != nil {
  407. return nil, 0, fmt.Errorf("Count: %v", err)
  408. }
  409. if opts.Page >= 0 && opts.PageSize > 0 {
  410. var start int
  411. if opts.Page == 0 {
  412. start = 0
  413. } else {
  414. start = (opts.Page - 1) * opts.PageSize
  415. }
  416. sess.Limit(opts.PageSize, start)
  417. }
  418. sess.OrderBy("cloudbrain.created_unix DESC")
  419. cloudbrains := make([]*Cloudbrain, 0, setting.UI.IssuePagingNum)
  420. if err := sess.Where(cond).Find(&cloudbrains); err != nil {
  421. return nil, 0, fmt.Errorf("Find: %v", err)
  422. }
  423. sess.Close()
  424. return cloudbrains, count, nil
  425. }
  426. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  427. if _, err = x.Insert(cloudbrain); err != nil {
  428. return err
  429. }
  430. return nil
  431. }
  432. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  433. has, err := x.Get(cb)
  434. if err != nil {
  435. return nil, err
  436. } else if !has {
  437. return nil, errors.New("cloudbrain task is not found")
  438. }
  439. return cb, nil
  440. }
  441. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  442. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  443. return getRepoCloudBrain(cb)
  444. }
  445. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  446. cb := &Cloudbrain{JobID: jobID}
  447. return getRepoCloudBrain(cb)
  448. }
  449. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  450. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  451. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  452. return
  453. }
  454. func UpdateJob(job *Cloudbrain) error {
  455. return updateJob(x, job)
  456. }
  457. func updateJob(e Engine, job *Cloudbrain) error {
  458. var sess *xorm.Session
  459. sess = e.Where("job_id = ?", job.JobID)
  460. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  461. return err
  462. }
  463. func DeleteJob(job *Cloudbrain) error {
  464. return deleteJob(x, job)
  465. }
  466. func deleteJob(e Engine, job *Cloudbrain) error {
  467. _, err := e.ID(job.ID).Delete(job)
  468. return err
  469. }