You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 18 kB

5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. package models
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "time"
  7. "xorm.io/xorm"
  8. "code.gitea.io/gitea/modules/setting"
  9. "code.gitea.io/gitea/modules/timeutil"
  10. "xorm.io/builder"
  11. )
  // CloudbrainStatus is the string status of a cloudbrain job.
  type CloudbrainStatus string

  // JobType is the string kind of a cloudbrain job.
  type JobType string

  // ModelArtsJobStatus is the string status of a ModelArts job.
  type ModelArtsJobStatus string

  // Cloudbrain job statuses.
  const (
  	JobWaiting   CloudbrainStatus = "WAITING"
  	JobStopped   CloudbrainStatus = "STOPPED"
  	JobSucceeded CloudbrainStatus = "SUCCEEDED"
  	JobFailed    CloudbrainStatus = "FAILED"
  	JobRunning   CloudbrainStatus = "RUNNING"
  )

  // Cloudbrain job types.
  const (
  	JobTypeDebug     JobType = "DEBUG"
  	JobTypeBenchmark JobType = "BENCHMARK"
  )

  // ModelArts job statuses.
  const (
  	ModelArtsCreateQueue  ModelArtsJobStatus = "CREATE_QUEUING" // free resource: queuing for creation
  	ModelArtsCreating     ModelArtsJobStatus = "CREATING"       // being created
  	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED"  // creation failed
  	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING"  // free resource: queuing for start
  	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resource: waiting to start
  	ModelArtsStarting     ModelArtsJobStatus = "STARTING"       // starting
  	ModelArtsRestarting   ModelArtsJobStatus = "RESTARTING"     // restarting
  	ModelArtsStartFailed  ModelArtsJobStatus = "START_FAILED"   // start failed
  	ModelArtsRunning      ModelArtsJobStatus = "RUNNING"        // running
  	ModelArtsStopping     ModelArtsJobStatus = "STOPPING"       // stopping
  	ModelArtsStopped      ModelArtsJobStatus = "STOPPED"        // stopped
  	ModelArtsUnavailable  ModelArtsJobStatus = "UNAVAILABLE"    // faulty
  	ModelArtsDeleted      ModelArtsJobStatus = "DELETED"        // deleted
  	ModelArtsResizing     ModelArtsJobStatus = "RESIZING"       // specification change in progress
  	ModelArtsResizFailed  ModelArtsJobStatus = "RESIZE_FAILED"  // specification change failed
  )
  39. type Cloudbrain struct {
  40. ID int64 `xorm:"pk autoincr"`
  41. JobID string `xorm:"INDEX NOT NULL"`
  42. JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
  43. JobName string `xorm:"INDEX"`
  44. Status string `xorm:"INDEX"`
  45. UserID int64 `xorm:"INDEX"`
  46. RepoID int64 `xorm:"INDEX"`
  47. SubTaskName string `xorm:"INDEX"`
  48. ContainerID string
  49. ContainerIp string
  50. CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
  51. UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
  52. DeletedAt time.Time `xorm:"deleted"`
  53. CanDebug bool `xorm:"-"`
  54. Type int `xorm:"INDEX DEFAULT 0"`
  55. User *User `xorm:"-"`
  56. Repo *Repository `xorm:"-"`
  57. }
  // CloudBrainLoginResult carries the code, message and free-form payload of
  // a login response. The fields have no JSON tags, so encoding/json uses the
  // Go field names.
  type CloudBrainLoginResult struct {
  	Code    string
  	Msg     string
  	Payload map[string]interface{}
  }
  // TaskRole is one entry of CreateJobParams.TaskRoles: the resources and
  // command of a task role.
  type TaskRole struct {
  	Name                  string `json:"name"`
  	TaskNumber            int8   `json:"taskNumber"`
  	MinSucceededTaskCount int8   `json:"minSucceededTaskCount"`
  	MinFailedTaskCount    int8   `json:"minFailedTaskCount"`
  	CPUNumber             int8   `json:"cpuNumber"`
  	GPUNumber             int8   `json:"gpuNumber"`
  	MemoryMB              int    `json:"memoryMB"`
  	ShmMB                 int    `json:"shmMB"`
  	Command               string `json:"command"`
  	NeedIBDevice          bool   `json:"needIBDevice"`
  	IsMainRole            bool   `json:"isMainRole"`
  	UseNNI                bool   `json:"useNNI"`
  }

  // StHostPath maps a host path to a mount path, optionally read-only.
  type StHostPath struct {
  	Path      string `json:"path"`
  	MountPath string `json:"mountPath"`
  	ReadOnly  bool   `json:"readOnly"`
  }

  // Volume wraps a single host-path mount.
  type Volume struct {
  	HostPath StHostPath `json:"hostPath"`
  }

  // CreateJobParams is the JSON body used to create a job.
  type CreateJobParams struct {
  	JobName    string     `json:"jobName"`
  	RetryCount int8       `json:"retryCount"`
  	GpuType    string     `json:"gpuType"`
  	Image      string     `json:"image"`
  	TaskRoles  []TaskRole `json:"taskRoles"`
  	Volumes    []Volume   `json:"volumes"`
  }
  // CreateJobResult is the JSON response to a job creation request: a code,
  // a message and a free-form payload.
  type CreateJobResult struct {
  	Code    string                 `json:"code"`
  	Msg     string                 `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }
  // GetJobResult is the JSON response to a job query: a code, a message and
  // a free-form payload.
  type GetJobResult struct {
  	Code    string                 `json:"code"`
  	Msg     string                 `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }
  103. type GetImagesResult struct {
  104. Code string `json:"code"`
  105. Msg string `json:"msg"`
  106. Payload GetImagesPayload `json:"payload"`
  107. }
  108. type GetImagesPayload struct {
  109. Count int `json:"count"`
  110. ImageInfo []*ImageInfo `json:"rows"`
  111. }
  112. type CloudbrainsOptions struct {
  113. ListOptions
  114. RepoID int64 // include all repos if empty
  115. UserID int64
  116. JobID int64
  117. SortType string
  118. CloudbrainIDs []int64
  119. // JobStatus CloudbrainStatus
  120. Type int
  121. }
  // TaskPod is the decoded status of one task role and its task instances.
  type TaskPod struct {
  	TaskRoleStatus struct {
  		Name string `json:"name"`
  	} `json:"taskRoleStatus"`
  	TaskStatuses []struct {
  		TaskIndex       int       `json:"taskIndex"`
  		PodUID          string    `json:"podUid"`
  		PodIP           string    `json:"podIp"`
  		PodName         string    `json:"podName"`
  		ContainerID     string    `json:"containerId"`
  		ContainerIP     string    `json:"containerIp"`
  		ContainerGpus   string    `json:"containerGpus"`
  		State           string    `json:"state"`
  		StartAt         time.Time `json:"startAt"`
  		FinishedAt      time.Time `json:"finishedAt"`
  		ExitCode        int       `json:"exitCode"`
  		ExitDiagnostics string    `json:"exitDiagnostics"`
  		RetriedCount    int       `json:"retriedCount"`
  		// StartTime and FinishedTime are display strings that
  		// ConvertToTaskPod derives from StartAt and FinishedAt.
  		StartTime    string
  		FinishedTime string
  	} `json:"taskStatuses"`
  }

  // TaskInfo names a task by user name, task name and code name.
  type TaskInfo struct {
  	Username string `json:"username"`
  	TaskName string `json:"task_name"`
  	CodeName string `json:"code_name"`
  }

  // ConvertToTaskPod decodes a generic JSON-like map into a TaskPod by
  // round-tripping it through encoding/json, then fills StartTime and
  // FinishedTime of every task status with StartAt/FinishedAt rendered as
  // "2006-01-02 15:04:05" at a fixed UTC+8 offset.
  //
  // It returns a zero TaskPod and a non-nil error when the input cannot be
  // marshalled, cannot be decoded into a TaskPod, or contains no task
  // statuses. Previously the last two cases caused an index-out-of-range
  // panic.
  func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  	const layout = "2006-01-02 15:04:05"

  	data, err := json.Marshal(input)
  	if err != nil {
  		return TaskPod{}, fmt.Errorf("marshal task pod input: %w", err)
  	}

  	var taskPod TaskPod
  	if err := json.Unmarshal(data, &taskPod); err != nil {
  		return TaskPod{}, fmt.Errorf("unmarshal task pod: %w", err)
  	}
  	if len(taskPod.TaskStatuses) == 0 {
  		return TaskPod{}, errors.New("task pod has no task statuses")
  	}

  	// Same result as the former "Unix()+8*3600, then format as UTC".
  	utc8 := time.FixedZone("UTC+8", 8*3600)
  	for i := range taskPod.TaskStatuses {
  		st := &taskPod.TaskStatuses[i]
  		st.StartTime = st.StartAt.In(utc8).Format(layout)
  		st.FinishedTime = st.FinishedAt.In(utc8).Format(layout)
  	}
  	return taskPod, nil
  }
  // JobResultPayload is the decoded payload of a job query: identity,
  // status, resources, the submitted configuration and user information.
  type JobResultPayload struct {
  	ID        string `json:"id"`
  	Name      string `json:"name"`
  	Platform  string `json:"platform"`
  	JobStatus struct {
  		Username           string      `json:"username"`
  		State              string      `json:"state"`
  		SubState           string      `json:"subState"`
  		ExecutionType      string      `json:"executionType"`
  		Retries            int         `json:"retries"`
  		CreatedTime        int64       `json:"createdTime"`
  		CompletedTime      int64       `json:"completedTime"`
  		AppID              string      `json:"appId"`
  		AppProgress        string      `json:"appProgress"`
  		AppTrackingURL     string      `json:"appTrackingUrl"`
  		AppLaunchedTime    int64       `json:"appLaunchedTime"`
  		AppCompletedTime   interface{} `json:"appCompletedTime"`
  		AppExitCode        int         `json:"appExitCode"`
  		AppExitDiagnostics string      `json:"appExitDiagnostics"`
  		AppExitType        interface{} `json:"appExitType"`
  		VirtualCluster     string      `json:"virtualCluster"`
  		// StartTime and EndTime are display strings that
  		// ConvertToJobResultPayload derives from CreatedTime and
  		// CompletedTime.
  		StartTime string
  		EndTime   string
  	} `json:"jobStatus"`
  	TaskRoles map[string]interface{} `json:"taskRoles"`
  	Resource  struct {
  		CPU          int    `json:"cpu"`
  		Memory       string `json:"memory"`
  		NvidiaComGpu int    `json:"nvidia.com/gpu"`
  	} `json:"resource"`
  	Config struct {
  		Image     string `json:"image"`
  		JobID     string `json:"jobId"`
  		GpuType   string `json:"gpuType"`
  		JobName   string `json:"jobName"`
  		JobType   string `json:"jobType"`
  		TaskRoles []struct {
  			Name                  string `json:"name"`
  			ShmMB                 int    `json:"shmMB"`
  			Command               string `json:"command"`
  			MemoryMB              int    `json:"memoryMB"`
  			CPUNumber             int    `json:"cpuNumber"`
  			GpuNumber             int    `json:"gpuNumber"`
  			IsMainRole            bool   `json:"isMainRole"`
  			TaskNumber            int    `json:"taskNumber"`
  			NeedIBDevice          bool   `json:"needIBDevice"`
  			MinFailedTaskCount    int    `json:"minFailedTaskCount"`
  			MinSucceededTaskCount int    `json:"minSucceededTaskCount"`
  		} `json:"taskRoles"`
  		RetryCount int `json:"retryCount"`
  	} `json:"config"`
  	Userinfo struct {
  		User  string `json:"user"`
  		OrgID string `json:"org_id"`
  	} `json:"userinfo"`
  }

  // ConvertToJobResultPayload decodes a generic JSON-like map into a
  // JobResultPayload by round-tripping it through encoding/json, then fills
  // JobStatus.StartTime and JobStatus.EndTime from CreatedTime and
  // CompletedTime. Those values are divided by 1000 before use (i.e. treated
  // as milliseconds) and rendered as "2006-01-02 15:04:05" in the process's
  // local time zone.
  //
  // On any marshal or decode failure it returns a zero JobResultPayload and
  // a non-nil error, rather than a partially populated value.
  func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  	const layout = "2006-01-02 15:04:05"

  	data, err := json.Marshal(input)
  	if err != nil {
  		return JobResultPayload{}, fmt.Errorf("marshal job result input: %w", err)
  	}

  	var payload JobResultPayload
  	if err := json.Unmarshal(data, &payload); err != nil {
  		return JobResultPayload{}, fmt.Errorf("unmarshal job result payload: %w", err)
  	}

  	status := &payload.JobStatus
  	status.StartTime = time.Unix(status.CreatedTime/1000, 0).Format(layout)
  	status.EndTime = time.Unix(status.CompletedTime/1000, 0).Format(layout)
  	return payload, nil
  }
  // ImagesResultPayload holds a list of image descriptions.
  //
  // NOTE(review): Images is decoded from the JSON key "taskStatuses", which
  // looks copied from TaskPod; confirm against the actual API response
  // before changing it.
  type ImagesResultPayload struct {
  	Images []struct {
  		ID          int    `json:"id"`
  		Name        string `json:"name"`
  		Place       string `json:"place"`
  		Description string `json:"description"`
  		Provider    string `json:"provider"`
  		Createtime  string `json:"createtime"`
  		Remark      string `json:"remark"`
  	} `json:"taskStatuses"`
  }
  // ImageInfo describes one image entry of GetImagesPayload.
  type ImageInfo struct {
  	ID          int    `json:"id"`
  	Name        string `json:"name"`
  	Place       string `json:"place"`
  	Description string `json:"description"`
  	Provider    string `json:"provider"`
  	Createtime  string `json:"createtime"`
  	Remark      string `json:"remark"`
  	IsPublic    int    `json:"isPublic"`
  	// PlaceView has no JSON tag; presumably a display form of Place that
  	// callers fill in (verify against callers).
  	PlaceView string
  }
  // CommitImageParams is the JSON body of an image commit request.
  type CommitImageParams struct {
  	Ip               string `json:"ip"`
  	TaskContainerId  string `json:"taskContainerId"`
  	ImageTag         string `json:"imageTag"`
  	ImageDescription string `json:"imageDescription"`
  }
  // CommitImageResult is the JSON response to an image commit request: a
  // code, a message and a free-form payload.
  type CommitImageResult struct {
  	Code    string                 `json:"code"`
  	Msg     string                 `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }
  // StopJobResult is the JSON response to a stop-job request: a code and a
  // message.
  type StopJobResult struct {
  	Code string `json:"code"`
  	Msg  string `json:"msg"`
  }
  // CreateNotebookParams is the JSON body of a notebook creation request.
  // JobName is sent under the key "name".
  type CreateNotebookParams struct {
  	JobName     string    `json:"name"`
  	Description string    `json:"description"`
  	ProfileID   string    `json:"profile_id"`
  	Flavor      string    `json:"flavor"`
  	Spec        Spec      `json:"spec"`
  	Workspace   Workspace `json:"workspace"`
  }

  // Workspace identifies a workspace by ID.
  type Workspace struct {
  	ID string `json:"id"`
  }

  // Spec groups the storage and auto-stop settings of a notebook.
  type Spec struct {
  	Storage  Storage  `json:"storage"`
  	AutoStop AutoStop `json:"auto_stop"`
  }

  // AutoStop holds the auto-stop switch and its duration.
  type AutoStop struct {
  	Enable   bool `json:"enable"`
  	Duration int  `json:"duration"`
  }

  // Storage names a storage type and its location.
  type Storage struct {
  	Type     string   `json:"type"`
  	Location Location `json:"location"`
  }

  // Location is a storage path.
  type Location struct {
  	Path string `json:"path"`
  }
  // NotebookResult carries the error code and error message of a notebook
  // API response.
  type NotebookResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg  string `json:"error_msg"`
  }
  // CreateNotebookResult is the JSON response to a notebook creation
  // request: error fields plus the created notebook's identity, status,
  // profile and flavor details.
  type CreateNotebookResult struct {
  	ErrorCode             string `json:"error_code"`
  	ErrorMsg              string `json:"error_msg"`
  	ID                    string `json:"id"`
  	Name                  string `json:"name"`
  	Description           string `json:"description"`
  	Status                string `json:"status"`
  	CreationTimestamp     string `json:"creation_timestamp"`
  	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  	Profile               struct {
  		ID          string `json:"id"`
  		Name        string `json:"name"`
  		Description string `json:"description"`
  		DeType      string `json:"de_type"`
  		FlavorType  string `json:"flavor_type"`
  	} `json:"profile"`
  	Flavor        string `json:"flavor"`
  	FlavorDetails struct {
  		Name          string `json:"name"`
  		Status        string `json:"status"`
  		QueuingNum    int    `json:"queuing_num"`
  		QueueLeftTime int    `json:"queue_left_time"` // seconds
  		Duration      int    `json:"duration"`        // auto-stop time, seconds
  	} `json:"flavor_details"`
  }
  // GetNotebookResult is the JSON response to a notebook query: error
  // fields plus the notebook's identity, status, profile, flavor details,
  // queuing information and spec annotations.
  //
  // CreateTime, LatestUpdateTime, BeginTime and EndTime have no JSON tags;
  // presumably callers fill them with display strings derived from the
  // matching timestamp fields (verify against callers).
  type GetNotebookResult struct {
  	ErrorCode             string `json:"error_code"`
  	ErrorMsg              string `json:"error_msg"`
  	ID                    string `json:"id"`
  	Name                  string `json:"name"`
  	Description           string `json:"description"`
  	Status                string `json:"status"`
  	CreationTimestamp     string `json:"creation_timestamp"`
  	CreateTime            string
  	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  	LatestUpdateTime      string
  	Profile               struct {
  		ID          string `json:"id"`
  		Name        string `json:"name"`
  		Description string `json:"description"`
  		DeType      string `json:"de_type"`
  		FlavorType  string `json:"flavor_type"`
  	} `json:"profile"`
  	Flavor        string `json:"flavor"`
  	FlavorDetails struct {
  		Name          string `json:"name"`
  		Status        string `json:"status"`
  		QueuingNum    int    `json:"queuing_num"`
  		QueueLeftTime int    `json:"queue_left_time"` // seconds
  		Duration      int    `json:"duration"`        // auto-stop time, seconds
  	} `json:"flavor_details"`
  	QueuingInfo struct {
  		ID             string `json:"id"`
  		Name           string `json:"name"`
  		Flavor         string `json:"flavor"`
  		DeType         string `json:"de_type"`
  		Status         string `json:"status"`
  		BeginTimestamp int    `json:"begin_timestamp"` // time the instance entered the queue
  		BeginTime      string
  		RemainTime     int `json:"remain_time"` // remaining time of the instance
  		EndTimestamp   int `json:"end_timestamp"`
  		EndTime        string
  		Rank           int `json:"rank"` // rank of the instance in the queue
  	} `json:"queuing_info"`
  	Spec struct {
  		Annotations struct {
  			TargetDomain string `json:"target_domain"`
  			Url          string `json:"url"`
  		} `json:"annotations"`
  	} `json:"spec"`
  }
  // GetTokenParams is the JSON body of a token request; everything is
  // nested under the "auth" key.
  type GetTokenParams struct {
  	Auth Auth `json:"auth"`
  }

  // Auth pairs the identity used to authenticate with the requested scope.
  type Auth struct {
  	Identity Identity `json:"identity"`
  	Scope    Scope    `json:"scope"`
  }

  // Scope names the project the token is requested for.
  type Scope struct {
  	Project Project `json:"project"`
  }

  // Project is a project name.
  type Project struct {
  	Name string `json:"name"`
  }

  // Identity lists the authentication methods and the password credentials.
  type Identity struct {
  	Methods  []string `json:"methods"`
  	Password Password `json:"password"`
  }

  // Password wraps the user whose password is used to authenticate.
  type Password struct {
  	User NotebookUser `json:"user"`
  }

  // NotebookUser is a user name, password and domain.
  type NotebookUser struct {
  	Name     string `json:"name"`
  	Password string `json:"password"`
  	Domain   Domain `json:"domain"`
  }

  // Domain is a domain name.
  type Domain struct {
  	Name string `json:"name"`
  }
  // Action names accepted in NotebookAction.Action.
  const (
  	ActionStart   = "start"
  	ActionStop    = "stop"
  	ActionRestart = "restart"
  	ActionQueue   = "queue"
  	ActionDequeue = "dequeue"
  )
  // NotebookAction is the JSON body of a notebook action request; Action is
  // sent under the key "action".
  type NotebookAction struct {
  	Action string `json:"action"`
  }
  // NotebookActionResult is the JSON response to a notebook action: error
  // fields plus the current status and the previous state.
  type NotebookActionResult struct {
  	ErrorCode     string `json:"error_code"`
  	ErrorMsg      string `json:"error_msg"`
  	CurrentStatus string `json:"current_status"`
  	PreviousState string `json:"previous_state"`
  }
  // NotebookGetJobTokenResult is the JSON response to a job token request:
  // error fields plus the token.
  type NotebookGetJobTokenResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg  string `json:"error_msg"`
  	Token     string `json:"token"`
  }
  // NotebookDelResult is the JSON response to a notebook deletion; it
  // carries the instance ID.
  type NotebookDelResult struct {
  	InstanceID string `json:"instance_id"`
  }
  411. func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
  412. sess := x.NewSession()
  413. defer sess.Close()
  414. var cond = builder.NewCond()
  415. if opts.RepoID > 0 {
  416. cond = cond.And(
  417. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  418. )
  419. }
  420. if opts.UserID > 0 {
  421. cond = cond.And(
  422. builder.Eq{"cloudbrain.user_id": opts.UserID},
  423. )
  424. }
  425. if (opts.JobID) > 0 {
  426. cond = cond.And(
  427. builder.Eq{"cloudbrain.job_id": opts.JobID},
  428. )
  429. }
  430. if (opts.Type) >= 0 {
  431. cond = cond.And(
  432. builder.Eq{"cloudbrain.type": opts.Type},
  433. )
  434. }
  435. // switch opts.JobStatus {
  436. // case JobWaiting:
  437. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  438. // case JobFailed:
  439. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  440. // case JobStopped:
  441. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  442. // case JobSucceeded:
  443. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  444. // }
  445. if len(opts.CloudbrainIDs) > 0 {
  446. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  447. }
  448. count, err := sess.Where(cond).Count(new(Cloudbrain))
  449. if err != nil {
  450. return nil, 0, fmt.Errorf("Count: %v", err)
  451. }
  452. if opts.Page >= 0 && opts.PageSize > 0 {
  453. var start int
  454. if opts.Page == 0 {
  455. start = 0
  456. } else {
  457. start = (opts.Page - 1) * opts.PageSize
  458. }
  459. sess.Limit(opts.PageSize, start)
  460. }
  461. sess.OrderBy("cloudbrain.created_unix DESC")
  462. cloudbrains := make([]*Cloudbrain, 0, setting.UI.IssuePagingNum)
  463. if err := sess.Where(cond).Find(&cloudbrains); err != nil {
  464. return nil, 0, fmt.Errorf("Find: %v", err)
  465. }
  466. sess.Close()
  467. return cloudbrains, count, nil
  468. }
  469. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  470. if _, err = x.Insert(cloudbrain); err != nil {
  471. return err
  472. }
  473. return nil
  474. }
  475. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  476. has, err := x.Get(cb)
  477. if err != nil {
  478. return nil, err
  479. } else if !has {
  480. return nil, errors.New("cloudbrain task is not found")
  481. }
  482. return cb, nil
  483. }
  484. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  485. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  486. return getRepoCloudBrain(cb)
  487. }
  488. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  489. cb := &Cloudbrain{JobID: jobID}
  490. return getRepoCloudBrain(cb)
  491. }
  492. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  493. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  494. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  495. return
  496. }
  497. func UpdateJob(job *Cloudbrain) error {
  498. return updateJob(x, job)
  499. }
  500. func updateJob(e Engine, job *Cloudbrain) error {
  501. var sess *xorm.Session
  502. sess = e.Where("job_id = ?", job.JobID)
  503. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  504. return err
  505. }
  506. func DeleteJob(job *Cloudbrain) error {
  507. return deleteJob(x, job)
  508. }
  509. func deleteJob(e Engine, job *Cloudbrain) error {
  510. _, err := e.ID(job.ID).Delete(job)
  511. return err
  512. }