You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 19 kB

5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago

  1. package models
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "time"
  7. "xorm.io/xorm"
  8. "code.gitea.io/gitea/modules/setting"
  9. "code.gitea.io/gitea/modules/timeutil"
  10. "xorm.io/builder"
  11. )
  12. type CloudbrainStatus string
  13. type JobType string
  14. type ModelArtsJobStatus string
  15. const (
  16. JobWaiting CloudbrainStatus = "WAITING"
  17. JobStopped CloudbrainStatus = "STOPPED"
  18. JobSucceeded CloudbrainStatus = "SUCCEEDED"
  19. JobFailed CloudbrainStatus = "FAILED"
  20. JobRunning CloudbrainStatus = "RUNNING"
  21. JobTypeDebug JobType = "DEBUG"
  22. JobTypeBenchmark JobType = "BENCHMARK"
  23. JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
  24. ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中
  25. ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中
  26. ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" //创建失败
  27. ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING" //免费资源启动排队中
  28. ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" //免费资源等待启动
  29. ModelArtsStarting ModelArtsJobStatus = "STARTING" //启动中
  30. ModelArtsRestarting ModelArtsJobStatus = "RESTARTING" //重启中
  31. ModelArtsStartFailed ModelArtsJobStatus = "START_FAILED" //启动失败
  32. ModelArtsRunning ModelArtsJobStatus = "RUNNING" //运行中
  33. ModelArtsStopping ModelArtsJobStatus = "STOPPING" //停止中
  34. ModelArtsStopped ModelArtsJobStatus = "STOPPED" //停止
  35. ModelArtsUnavailable ModelArtsJobStatus = "UNAVAILABLE" //故障
  36. ModelArtsDeleted ModelArtsJobStatus = "DELETED" //已删除
  37. ModelArtsResizing ModelArtsJobStatus = "RESIZING" //规格变更中
  38. ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" //规格变更失败
  39. )
  40. type Cloudbrain struct {
  41. ID int64 `xorm:"pk autoincr"`
  42. JobID string `xorm:"INDEX NOT NULL"`
  43. JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
  44. JobName string `xorm:"INDEX"`
  45. Status string `xorm:"INDEX"`
  46. UserID int64 `xorm:"INDEX"`
  47. RepoID int64 `xorm:"INDEX"`
  48. SubTaskName string `xorm:"INDEX"`
  49. ContainerID string
  50. ContainerIp string
  51. CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
  52. UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
  53. DeletedAt time.Time `xorm:"deleted"`
  54. CanDebug bool `xorm:"-"`
  55. Type int `xorm:"INDEX DEFAULT 0"`
  56. User *User `xorm:"-"`
  57. Repo *Repository `xorm:"-"`
  58. }
  59. type CloudBrainLoginResult struct {
  60. Code string
  61. Msg string
  62. Payload map[string]interface{}
  63. }
  64. type TaskRole struct {
  65. Name string `json:"name"`
  66. TaskNumber int8 `json:"taskNumber"`
  67. MinSucceededTaskCount int8 `json:"minSucceededTaskCount"`
  68. MinFailedTaskCount int8 `json:"minFailedTaskCount"`
  69. CPUNumber int8 `json:"cpuNumber"`
  70. GPUNumber int8 `json:"gpuNumber"`
  71. MemoryMB int `json:"memoryMB"`
  72. ShmMB int `json:"shmMB"`
  73. Command string `json:"command"`
  74. NeedIBDevice bool `json:"needIBDevice"`
  75. IsMainRole bool `json:"isMainRole"`
  76. UseNNI bool `json:"useNNI"`
  77. }
  78. type StHostPath struct {
  79. Path string `json:"path"`
  80. MountPath string `json:"mountPath"`
  81. ReadOnly bool `json:"readOnly"`
  82. }
  83. type Volume struct {
  84. HostPath StHostPath `json:"hostPath"`
  85. }
  86. type CreateJobParams struct {
  87. JobName string `json:"jobName"`
  88. RetryCount int8 `json:"retryCount"`
  89. GpuType string `json:"gpuType"`
  90. Image string `json:"image"`
  91. TaskRoles []TaskRole `json:"taskRoles"`
  92. Volumes []Volume `json:"volumes"`
  93. }
  94. type CreateJobResult struct {
  95. Code string `json:"code"`
  96. Msg string `json:"msg"`
  97. Payload map[string]interface{} `json:"payload"`
  98. }
  99. type GetJobResult struct {
  100. Code string `json:"code"`
  101. Msg string `json:"msg"`
  102. Payload map[string]interface{} `json:"payload"`
  103. }
  104. type GetImagesResult struct {
  105. Code string `json:"code"`
  106. Msg string `json:"msg"`
  107. Payload GetImagesPayload `json:"payload"`
  108. }
  109. type GetImagesPayload struct {
  110. Count int `json:"count"`
  111. ImageInfo []*ImageInfo `json:"rows"`
  112. }
  113. type CloudbrainsOptions struct {
  114. ListOptions
  115. RepoID int64 // include all repos if empty
  116. UserID int64
  117. JobID int64
  118. SortType string
  119. CloudbrainIDs []int64
  120. // JobStatus CloudbrainStatus
  121. Type int
  122. }
  123. type TaskPod struct {
  124. TaskRoleStatus struct {
  125. Name string `json:"name"`
  126. } `json:"taskRoleStatus"`
  127. TaskStatuses []struct {
  128. TaskIndex int `json:"taskIndex"`
  129. PodUID string `json:"podUid"`
  130. PodIP string `json:"podIp"`
  131. PodName string `json:"podName"`
  132. ContainerID string `json:"containerId"`
  133. ContainerIP string `json:"containerIp"`
  134. ContainerGpus string `json:"containerGpus"`
  135. State string `json:"state"`
  136. StartAt time.Time `json:"startAt"`
  137. FinishedAt time.Time `json:"finishedAt"`
  138. ExitCode int `json:"exitCode"`
  139. ExitDiagnostics string `json:"exitDiagnostics"`
  140. RetriedCount int `json:"retriedCount"`
  141. StartTime string
  142. FinishedTime string
  143. } `json:"taskStatuses"`
  144. }
  145. type TaskInfo struct {
  146. Username string `json:"username"`
  147. TaskName string `json:"task_name"`
  148. CodeName string `json:"code_name"`
  149. BenchmarkCategory string `json:"selected_category"`
  150. }
  151. func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  152. data, _ := json.Marshal(input)
  153. var taskPod TaskPod
  154. err := json.Unmarshal(data, &taskPod)
  155. taskPod.TaskStatuses[0].StartTime = time.Unix(taskPod.TaskStatuses[0].StartAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  156. taskPod.TaskStatuses[0].FinishedTime = time.Unix(taskPod.TaskStatuses[0].FinishedAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  157. return taskPod, err
  158. }
  159. type JobResultPayload struct {
  160. ID string `json:"id"`
  161. Name string `json:"name"`
  162. Platform string `json:"platform"`
  163. JobStatus struct {
  164. Username string `json:"username"`
  165. State string `json:"state"`
  166. SubState string `json:"subState"`
  167. ExecutionType string `json:"executionType"`
  168. Retries int `json:"retries"`
  169. CreatedTime int64 `json:"createdTime"`
  170. CompletedTime int64 `json:"completedTime"`
  171. AppID string `json:"appId"`
  172. AppProgress string `json:"appProgress"`
  173. AppTrackingURL string `json:"appTrackingUrl"`
  174. AppLaunchedTime int64 `json:"appLaunchedTime"`
  175. AppCompletedTime interface{} `json:"appCompletedTime"`
  176. AppExitCode int `json:"appExitCode"`
  177. AppExitDiagnostics string `json:"appExitDiagnostics"`
  178. AppExitType interface{} `json:"appExitType"`
  179. VirtualCluster string `json:"virtualCluster"`
  180. StartTime string
  181. EndTime string
  182. } `json:"jobStatus"`
  183. TaskRoles map[string]interface{} `json:"taskRoles"`
  184. Resource struct {
  185. CPU int `json:"cpu"`
  186. Memory string `json:"memory"`
  187. NvidiaComGpu int `json:"nvidia.com/gpu"`
  188. } `json:"resource"`
  189. Config struct {
  190. Image string `json:"image"`
  191. JobID string `json:"jobId"`
  192. GpuType string `json:"gpuType"`
  193. JobName string `json:"jobName"`
  194. JobType string `json:"jobType"`
  195. TaskRoles []struct {
  196. Name string `json:"name"`
  197. ShmMB int `json:"shmMB"`
  198. Command string `json:"command"`
  199. MemoryMB int `json:"memoryMB"`
  200. CPUNumber int `json:"cpuNumber"`
  201. GpuNumber int `json:"gpuNumber"`
  202. IsMainRole bool `json:"isMainRole"`
  203. TaskNumber int `json:"taskNumber"`
  204. NeedIBDevice bool `json:"needIBDevice"`
  205. MinFailedTaskCount int `json:"minFailedTaskCount"`
  206. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  207. } `json:"taskRoles"`
  208. RetryCount int `json:"retryCount"`
  209. } `json:"config"`
  210. Userinfo struct {
  211. User string `json:"user"`
  212. OrgID string `json:"org_id"`
  213. } `json:"userinfo"`
  214. }
  215. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  216. data, _ := json.Marshal(input)
  217. var jobResultPayload JobResultPayload
  218. err := json.Unmarshal(data, &jobResultPayload)
  219. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  220. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  221. return jobResultPayload, err
  222. }
  223. type ImagesResultPayload struct {
  224. Images []struct {
  225. ID int `json:"id"`
  226. Name string `json:"name"`
  227. Place string `json:"place"`
  228. Description string `json:"description"`
  229. Provider string `json:"provider"`
  230. Createtime string `json:"createtime"`
  231. Remark string `json:"remark"`
  232. } `json:"taskStatuses"`
  233. }
  234. type ImageInfo struct {
  235. ID int `json:"id"`
  236. Name string `json:"name"`
  237. Place string `json:"place"`
  238. Description string `json:"description"`
  239. Provider string `json:"provider"`
  240. Createtime string `json:"createtime"`
  241. Remark string `json:"remark"`
  242. IsPublic int `json:"isPublic"`
  243. PlaceView string
  244. }
  245. type Categories struct {
  246. Category []*Category `json:"category"`
  247. }
  248. type Category struct {
  249. Id int `json:"id"`
  250. Value string `json:"value"`
  251. }
  252. type CommitImageParams struct {
  253. Ip string `json:"ip"`
  254. TaskContainerId string `json:"taskContainerId"`
  255. ImageTag string `json:"imageTag"`
  256. ImageDescription string `json:"imageDescription"`
  257. }
  258. type CommitImageResult struct {
  259. Code string `json:"code"`
  260. Msg string `json:"msg"`
  261. Payload map[string]interface{} `json:"payload"`
  262. }
  263. type StopJobResult struct {
  264. Code string `json:"code"`
  265. Msg string `json:"msg"`
  266. }
  267. type CreateNotebookParams struct {
  268. JobName string `json:"name"`
  269. Description string `json:"description"`
  270. ProfileID string `json:"profile_id"`
  271. Flavor string `json:"flavor"`
  272. Spec Spec `json:"spec"`
  273. Workspace Workspace `json:"workspace"`
  274. }
  275. type Workspace struct {
  276. ID string `json:"id"`
  277. }
  278. type Spec struct {
  279. Storage Storage `json:"storage"`
  280. AutoStop AutoStop `json:"auto_stop"`
  281. }
  282. type AutoStop struct {
  283. Enable bool `json:"enable"`
  284. Duration int `json:"duration"`
  285. }
  286. type Storage struct {
  287. Type string `json:"type"`
  288. Location Location `json:"location"`
  289. }
  290. type Location struct {
  291. Path string `json:"path"`
  292. }
  293. type NotebookResult struct {
  294. ErrorCode string `json:"error_code"`
  295. ErrorMsg string `json:"error_msg"`
  296. }
  297. type CreateNotebookResult struct {
  298. ErrorCode string `json:"error_code"`
  299. ErrorMsg string `json:"error_msg"`
  300. ID string `json:"id"`
  301. Name string `json:"name"`
  302. Description string `json:"description"`
  303. Status string `json:"status"`
  304. CreationTimestamp string `json:"creation_timestamp"`
  305. LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  306. Profile struct {
  307. ID string `json:"id"`
  308. Name string `json:"name"`
  309. Description string `json:"description"`
  310. DeType string `json:"de_type"`
  311. FlavorType string `json:"flavor_type"`
  312. } `json:"profile"`
  313. Flavor string `json:"flavor"`
  314. FlavorDetails struct{
  315. Name string `json:"name"`
  316. Status string `json:"status"`
  317. QueuingNum int `json:"queuing_num"`
  318. QueueLeftTime int `json:"queue_left_time"` //s
  319. Duration int `json:"duration"` //auto_stop_time s
  320. } `json:"flavor_details"`
  321. }
  322. type GetNotebookResult struct {
  323. ErrorCode string `json:"error_code"`
  324. ErrorMsg string `json:"error_msg"`
  325. ID string `json:"id"`
  326. Name string `json:"name"`
  327. Description string `json:"description"`
  328. Status string `json:"status"`
  329. CreationTimestamp string `json:"creation_timestamp"`
  330. CreateTime string
  331. LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  332. LatestUpdateTime string
  333. Profile struct {
  334. ID string `json:"id"`
  335. Name string `json:"name"`
  336. Description string `json:"description"`
  337. DeType string `json:"de_type"`
  338. FlavorType string `json:"flavor_type"`
  339. } `json:"profile"`
  340. Flavor string `json:"flavor"`
  341. FlavorDetails struct{
  342. Name string `json:"name"`
  343. Status string `json:"status"`
  344. QueuingNum int `json:"queuing_num"`
  345. QueueLeftTime int `json:"queue_left_time"` //s
  346. Duration int `json:"duration"` //auto_stop_time s
  347. } `json:"flavor_details"`
  348. QueuingInfo struct{
  349. ID string `json:"id"`
  350. Name string `json:"name"`
  351. Flavor string `json:"flavor"`
  352. DeType string `json:"de_type"`
  353. Status string `json:"status"`
  354. BeginTimestamp int `json:"begin_timestamp"`//time of instance begin in queue
  355. BeginTime string
  356. RemainTime int `json:"remain_time"` //remain time of instance
  357. EndTimestamp int `json:"end_timestamp"` //
  358. EndTime string
  359. Rank int `json:"rank"` //rank of instance in queue
  360. } `json:"queuing_info"`
  361. Spec struct{
  362. Annotations struct{
  363. TargetDomain string `json:"target_domain"`
  364. Url string `json:"url"`
  365. } `json:"annotations"`
  366. } `json:"spec"`
  367. }
  368. type GetTokenParams struct {
  369. Auth Auth `json:"auth"`
  370. }
  371. type Auth struct {
  372. Identity Identity `json:"identity"`
  373. Scope Scope `json:"scope"`
  374. }
  375. type Scope struct {
  376. Project Project `json:"project"`
  377. }
  378. type Project struct {
  379. Name string `json:"name"`
  380. }
  381. type Identity struct {
  382. Methods []string `json:"methods"`
  383. Password Password `json:"password"`
  384. }
  385. type Password struct {
  386. User NotebookUser `json:"user"`
  387. }
  388. type NotebookUser struct {
  389. Name string `json:"name"`
  390. Password string `json:"password"`
  391. Domain Domain `json:"domain"`
  392. }
  393. type Domain struct {
  394. Name string `json:"name"`
  395. }
  396. const (
  397. ActionStart = "start"
  398. ActionStop = "stop"
  399. ActionRestart = "restart"
  400. ActionQueue = "queue"
  401. ActionDequeue = "dequeue"
  402. )
  403. type NotebookAction struct {
  404. Action string `json:"action"`
  405. }
  406. type NotebookActionResult struct {
  407. ErrorCode string `json:"error_code"`
  408. ErrorMsg string `json:"error_msg"`
  409. CurrentStatus string `json:"current_status"`
  410. PreviousState string `json:"previous_state"`
  411. }
  412. type NotebookGetJobTokenResult struct {
  413. ErrorCode string `json:"error_code"`
  414. ErrorMsg string `json:"error_msg"`
  415. Token string `json:"token"`
  416. }
  417. type NotebookDelResult struct {
  418. InstanceID string `json:"instance_id"`
  419. }
  420. func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
  421. sess := x.NewSession()
  422. defer sess.Close()
  423. var cond = builder.NewCond()
  424. if opts.RepoID > 0 {
  425. cond = cond.And(
  426. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  427. )
  428. }
  429. if opts.UserID > 0 {
  430. cond = cond.And(
  431. builder.Eq{"cloudbrain.user_id": opts.UserID},
  432. )
  433. }
  434. if (opts.JobID) > 0 {
  435. cond = cond.And(
  436. builder.Eq{"cloudbrain.job_id": opts.JobID},
  437. )
  438. }
  439. if (opts.Type) >= 0 {
  440. cond = cond.And(
  441. builder.Eq{"cloudbrain.type": opts.Type},
  442. )
  443. }
  444. // switch opts.JobStatus {
  445. // case JobWaiting:
  446. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  447. // case JobFailed:
  448. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  449. // case JobStopped:
  450. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  451. // case JobSucceeded:
  452. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  453. // }
  454. if len(opts.CloudbrainIDs) > 0 {
  455. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  456. }
  457. count, err := sess.Where(cond).Count(new(Cloudbrain))
  458. if err != nil {
  459. return nil, 0, fmt.Errorf("Count: %v", err)
  460. }
  461. if opts.Page >= 0 && opts.PageSize > 0 {
  462. var start int
  463. if opts.Page == 0 {
  464. start = 0
  465. } else {
  466. start = (opts.Page - 1) * opts.PageSize
  467. }
  468. sess.Limit(opts.PageSize, start)
  469. }
  470. sess.OrderBy("cloudbrain.created_unix DESC")
  471. cloudbrains := make([]*Cloudbrain, 0, setting.UI.IssuePagingNum)
  472. if err := sess.Where(cond).Find(&cloudbrains); err != nil {
  473. return nil, 0, fmt.Errorf("Find: %v", err)
  474. }
  475. sess.Close()
  476. return cloudbrains, count, nil
  477. }
  478. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  479. if _, err = x.Insert(cloudbrain); err != nil {
  480. return err
  481. }
  482. return nil
  483. }
  484. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  485. has, err := x.Get(cb)
  486. if err != nil {
  487. return nil, err
  488. } else if !has {
  489. return nil, errors.New("cloudbrain task is not found")
  490. }
  491. return cb, nil
  492. }
  493. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  494. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  495. return getRepoCloudBrain(cb)
  496. }
  497. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  498. cb := &Cloudbrain{JobID: jobID}
  499. return getRepoCloudBrain(cb)
  500. }
  501. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  502. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  503. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  504. return
  505. }
  506. func UpdateJob(job *Cloudbrain) error {
  507. return updateJob(x, job)
  508. }
  509. func updateJob(e Engine, job *Cloudbrain) error {
  510. var sess *xorm.Session
  511. sess = e.Where("job_id = ?", job.JobID)
  512. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  513. return err
  514. }
  515. func DeleteJob(job *Cloudbrain) error {
  516. return deleteJob(x, job)
  517. }
  518. func deleteJob(e Engine, job *Cloudbrain) error {
  519. _, err := e.ID(job.ID).Delete(job)
  520. return err
  521. }