You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

cloudbrain.go 27 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago

  1. package models
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "time"
  7. "xorm.io/xorm"
  8. "code.gitea.io/gitea/modules/setting"
  9. "code.gitea.io/gitea/modules/timeutil"
  10. "xorm.io/builder"
  11. )
// CloudbrainStatus is the string-typed state of a cloudbrain job; its values
// are the Job* constants declared in this file.
type CloudbrainStatus string

// JobType is the string-typed kind of a cloudbrain job; its values are the
// JobType* constants declared in this file.
type JobType string

// ModelArtsJobStatus is the string-typed state of a ModelArts job; its values
// are the ModelArts* constants declared in this file.
type ModelArtsJobStatus string
// Known values for CloudbrainStatus, JobType and ModelArtsJobStatus.
const (
	// Cloudbrain job states.
	JobWaiting   CloudbrainStatus = "WAITING"
	JobStopped   CloudbrainStatus = "STOPPED"
	JobSucceeded CloudbrainStatus = "SUCCEEDED"
	JobFailed    CloudbrainStatus = "FAILED"
	JobRunning   CloudbrainStatus = "RUNNING"

	// Cloudbrain job kinds.
	JobTypeDebug         JobType = "DEBUG"
	JobTypeBenchmark     JobType = "BENCHMARK"
	JobTypeSnn4imagenet  JobType = "SNN4IMAGENET"

	// ModelArts job states.
	ModelArtsCreateQueue  ModelArtsJobStatus = "CREATE_QUEUING" // queued for creation on free resources
	ModelArtsCreating     ModelArtsJobStatus = "CREATING"       // being created
	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED"  // creation failed
	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING"  // queued for start on free resources
	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resources, waiting to start
	ModelArtsStarting     ModelArtsJobStatus = "STARTING"       // starting
	ModelArtsRestarting   ModelArtsJobStatus = "RESTARTING"     // restarting
	ModelArtsStartFailed  ModelArtsJobStatus = "START_FAILED"   // start failed
	ModelArtsRunning      ModelArtsJobStatus = "RUNNING"        // running
	ModelArtsStopping     ModelArtsJobStatus = "STOPPING"       // stopping
	ModelArtsStopped      ModelArtsJobStatus = "STOPPED"        // stopped
	ModelArtsUnavailable  ModelArtsJobStatus = "UNAVAILABLE"    // faulty
	ModelArtsDeleted      ModelArtsJobStatus = "DELETED"        // deleted
	ModelArtsResizing     ModelArtsJobStatus = "RESIZING"       // specification change in progress
	ModelArtsResizFailed  ModelArtsJobStatus = "RESIZE_FAILED"  // specification change failed
)
// Cloudbrain is the xorm-mapped record of one cloudbrain job.
//
// Fields tagged `xorm:"-"` (CanDebug, User, Repo) are not persisted and must
// be filled in by the caller.
type Cloudbrain struct {
	ID          int64              `xorm:"pk autoincr"`
	JobID       string             `xorm:"INDEX NOT NULL"`
	JobType     string             `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
	JobName     string             `xorm:"INDEX"`
	Status      string             `xorm:"INDEX"`
	UserID      int64              `xorm:"INDEX"`
	RepoID      int64              `xorm:"INDEX"`
	SubTaskName string             `xorm:"INDEX"`
	ContainerID string
	ContainerIp string
	CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
	UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
	// NOTE(review): the "deleted" tag presumably enables xorm soft deletion — confirm.
	DeletedAt   time.Time          `xorm:"deleted"`
	CanDebug    bool               `xorm:"-"`
	Type        int                `xorm:"INDEX DEFAULT 0"`

	VersionID   int64              `xorm:"INDEX DEFAULT 0"`
	VersionName string
	Uuid        string

	User *User       `xorm:"-"`
	Repo *Repository `xorm:"-"`
}
// CloudBrainLoginResult carries a code, a message and a free-form payload.
// It has no JSON tags, so encoding/json matches these fields by name.
type CloudBrainLoginResult struct {
	Code    string
	Msg     string
	Payload map[string]interface{}
}
// TaskRole is the JSON description of one task role inside CreateJobParams:
// its name, task counts, CPU/GPU/memory sizing, command and role flags.
type TaskRole struct {
	Name                  string `json:"name"`
	TaskNumber            int8   `json:"taskNumber"`
	MinSucceededTaskCount int8   `json:"minSucceededTaskCount"`
	MinFailedTaskCount    int8   `json:"minFailedTaskCount"`
	CPUNumber             int8   `json:"cpuNumber"`
	GPUNumber             int8   `json:"gpuNumber"`
	MemoryMB              int    `json:"memoryMB"`
	ShmMB                 int    `json:"shmMB"`
	Command               string `json:"command"`
	NeedIBDevice          bool   `json:"needIBDevice"`
	IsMainRole            bool   `json:"isMainRole"`
	UseNNI                bool   `json:"useNNI"`
}
// StHostPath is the JSON form of a host-path mount: source path, mount path
// and a read-only flag.
type StHostPath struct {
	Path      string `json:"path"`
	MountPath string `json:"mountPath"`
	ReadOnly  bool   `json:"readOnly"`
}
// Volume wraps a single StHostPath under the "hostPath" JSON key.
type Volume struct {
	HostPath StHostPath `json:"hostPath"`
}
// CreateJobParams is the JSON body describing a job to create: name, retry
// count, GPU type, image, task roles and volumes.
type CreateJobParams struct {
	JobName    string     `json:"jobName"`
	RetryCount int8       `json:"retryCount"`
	GpuType    string     `json:"gpuType"`
	Image      string     `json:"image"`
	TaskRoles  []TaskRole `json:"taskRoles"`
	Volumes    []Volume   `json:"volumes"`
}
// CreateJobResult is the JSON envelope (code, msg, free-form payload)
// decoded from a create-job response.
type CreateJobResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}
// GetJobResult is the JSON envelope (code, msg, free-form payload) decoded
// from a get-job response.
type GetJobResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}
// GetImagesResult is the JSON envelope of an image listing; unlike the other
// result envelopes its payload is the typed GetImagesPayload.
type GetImagesResult struct {
	Code    string           `json:"code"`
	Msg     string           `json:"msg"`
	Payload GetImagesPayload `json:"payload"`
}
// GetImagesPayload holds a count and the image entries, which are decoded
// from the "rows" JSON key.
type GetImagesPayload struct {
	Count     int          `json:"count"`
	ImageInfo []*ImageInfo `json:"rows"`
}
// CloudbrainsOptions holds the filters and paging settings read by
// Cloudbrains. RepoID, UserID and JobID are applied only when > 0, Type only
// when >= 0, and CloudbrainIDs only when non-empty.
type CloudbrainsOptions struct {
	ListOptions
	RepoID        int64 // include all repos if empty
	UserID        int64
	JobID         int64
	SortType      string
	CloudbrainIDs []int64
	// JobStatus CloudbrainStatus
	Type int
}
  126. type TaskPod struct {
  127. TaskRoleStatus struct {
  128. Name string `json:"name"`
  129. } `json:"taskRoleStatus"`
  130. TaskStatuses []struct {
  131. TaskIndex int `json:"taskIndex"`
  132. PodUID string `json:"podUid"`
  133. PodIP string `json:"podIp"`
  134. PodName string `json:"podName"`
  135. ContainerID string `json:"containerId"`
  136. ContainerIP string `json:"containerIp"`
  137. ContainerGpus string `json:"containerGpus"`
  138. State string `json:"state"`
  139. StartAt time.Time `json:"startAt"`
  140. FinishedAt time.Time `json:"finishedAt"`
  141. ExitCode int `json:"exitCode"`
  142. ExitDiagnostics string `json:"exitDiagnostics"`
  143. RetriedCount int `json:"retriedCount"`
  144. StartTime string
  145. FinishedTime string
  146. } `json:"taskStatuses"`
  147. }
  148. type TaskInfo struct {
  149. Username string `json:"username"`
  150. TaskName string `json:"task_name"`
  151. CodeName string `json:"code_name"`
  152. BenchmarkCategory []string `json:"selected_category"`
  153. CodeLink string `json:"code_link"`
  154. }
  155. func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  156. data, _ := json.Marshal(input)
  157. var taskPod TaskPod
  158. err := json.Unmarshal(data, &taskPod)
  159. taskPod.TaskStatuses[0].StartTime = time.Unix(taskPod.TaskStatuses[0].StartAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  160. taskPod.TaskStatuses[0].FinishedTime = time.Unix(taskPod.TaskStatuses[0].FinishedAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  161. return taskPod, err
  162. }
  163. type JobResultPayload struct {
  164. ID string `json:"id"`
  165. Name string `json:"name"`
  166. Platform string `json:"platform"`
  167. JobStatus struct {
  168. Username string `json:"username"`
  169. State string `json:"state"`
  170. SubState string `json:"subState"`
  171. ExecutionType string `json:"executionType"`
  172. Retries int `json:"retries"`
  173. CreatedTime int64 `json:"createdTime"`
  174. CompletedTime int64 `json:"completedTime"`
  175. AppID string `json:"appId"`
  176. AppProgress string `json:"appProgress"`
  177. AppTrackingURL string `json:"appTrackingUrl"`
  178. AppLaunchedTime int64 `json:"appLaunchedTime"`
  179. AppCompletedTime interface{} `json:"appCompletedTime"`
  180. AppExitCode int `json:"appExitCode"`
  181. AppExitDiagnostics string `json:"appExitDiagnostics"`
  182. AppExitType interface{} `json:"appExitType"`
  183. VirtualCluster string `json:"virtualCluster"`
  184. StartTime string
  185. EndTime string
  186. } `json:"jobStatus"`
  187. TaskRoles map[string]interface{} `json:"taskRoles"`
  188. Resource struct {
  189. CPU int `json:"cpu"`
  190. Memory string `json:"memory"`
  191. NvidiaComGpu int `json:"nvidia.com/gpu"`
  192. } `json:"resource"`
  193. Config struct {
  194. Image string `json:"image"`
  195. JobID string `json:"jobId"`
  196. GpuType string `json:"gpuType"`
  197. JobName string `json:"jobName"`
  198. JobType string `json:"jobType"`
  199. TaskRoles []struct {
  200. Name string `json:"name"`
  201. ShmMB int `json:"shmMB"`
  202. Command string `json:"command"`
  203. MemoryMB int `json:"memoryMB"`
  204. CPUNumber int `json:"cpuNumber"`
  205. GpuNumber int `json:"gpuNumber"`
  206. IsMainRole bool `json:"isMainRole"`
  207. TaskNumber int `json:"taskNumber"`
  208. NeedIBDevice bool `json:"needIBDevice"`
  209. MinFailedTaskCount int `json:"minFailedTaskCount"`
  210. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  211. } `json:"taskRoles"`
  212. RetryCount int `json:"retryCount"`
  213. } `json:"config"`
  214. Userinfo struct {
  215. User string `json:"user"`
  216. OrgID string `json:"org_id"`
  217. } `json:"userinfo"`
  218. }
  219. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  220. data, _ := json.Marshal(input)
  221. var jobResultPayload JobResultPayload
  222. err := json.Unmarshal(data, &jobResultPayload)
  223. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  224. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  225. return jobResultPayload, err
  226. }
// ImagesResultPayload holds a list of image descriptions.
//
// NOTE(review): Images is decoded from the "taskStatuses" JSON key, which
// looks like a copy-paste from TaskPod — confirm against the producing API.
type ImagesResultPayload struct {
	Images []struct {
		ID          int    `json:"id"`
		Name        string `json:"name"`
		Place       string `json:"place"`
		Description string `json:"description"`
		Provider    string `json:"provider"`
		Createtime  string `json:"createtime"`
		Remark      string `json:"remark"`
	} `json:"taskStatuses"`
}
// ImageInfo is the JSON description of one image. PlaceView has no JSON tag;
// nothing in this file sets it.
type ImageInfo struct {
	ID          int    `json:"id"`
	Name        string `json:"name"`
	Place       string `json:"place"`
	Description string `json:"description"`
	Provider    string `json:"provider"`
	Createtime  string `json:"createtime"`
	Remark      string `json:"remark"`
	IsPublic    int    `json:"isPublic"`
	PlaceView   string
}
// Categories wraps a list of Category entries under the "category" JSON key.
type Categories struct {
	Category []*Category `json:"category"`
}
// Category is one id/value pair of a Categories list.
type Category struct {
	Id    int    `json:"id"`
	Value string `json:"value"`
}
// CommitImageParams is the JSON body of an image-commit request: the IP and
// container ID to commit from, and the tag and description of the new image.
type CommitImageParams struct {
	Ip               string `json:"ip"`
	TaskContainerId  string `json:"taskContainerId"`
	ImageTag         string `json:"imageTag"`
	ImageDescription string `json:"imageDescription"`
}
// CommitImageResult is the JSON envelope (code, msg, free-form payload)
// decoded from an image-commit response.
type CommitImageResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}
// StopJobResult is the JSON envelope (code and msg only) decoded from a
// stop-job response.
type StopJobResult struct {
	Code string `json:"code"`
	Msg  string `json:"msg"`
}
// CreateNotebookParams is the JSON body describing a notebook to create.
// JobName is serialized under the "name" key.
type CreateNotebookParams struct {
	JobName     string    `json:"name"`
	Description string    `json:"description"`
	ProfileID   string    `json:"profile_id"`
	Flavor      string    `json:"flavor"`
	Spec        Spec      `json:"spec"`
	Workspace   Workspace `json:"workspace"`
	Pool        Pool      `json:"pool"`
}
// Pool identifies a resource pool by id, name and type
// (see CreateNotebookParams.Pool).
type Pool struct {
	ID   string `json:"id"`
	Name string `json:"name"`
	Type string `json:"type"`
}
// Workspace identifies a workspace by id (see CreateNotebookParams.Workspace).
type Workspace struct {
	ID string `json:"id"`
}
// Spec groups the storage and auto-stop settings of a notebook
// (see CreateNotebookParams.Spec).
type Spec struct {
	Storage  Storage  `json:"storage"`
	AutoStop AutoStop `json:"auto_stop"`
}
// AutoStop is the auto-stop setting of a Spec: an enable flag and a duration.
// NOTE(review): the unit of Duration is not stated here — presumably seconds; confirm.
type AutoStop struct {
	Enable   bool `json:"enable"`
	Duration int  `json:"duration"`
}
// Storage is the storage setting of a Spec: a type and a Location.
type Storage struct {
	Type     string   `json:"type"`
	Location Location `json:"location"`
}
// Location holds the path of a Storage.
type Location struct {
	Path string `json:"path"`
}
// NotebookResult carries only the error code and message of a notebook
// response.
type NotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
}
// CreateNotebookResult is the JSON response to a notebook creation: error
// fields, identity and status of the notebook, its profile and flavor details.
type CreateNotebookResult struct {
	ErrorCode             string `json:"error_code"`
	ErrorMsg              string `json:"error_msg"`
	ID                    string `json:"id"`
	Name                  string `json:"name"`
	Description           string `json:"description"`
	Status                string `json:"status"`
	CreationTimestamp     string `json:"creation_timestamp"`
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	Profile               struct {
		ID          string `json:"id"`
		Name        string `json:"name"`
		Description string `json:"description"`
		DeType      string `json:"de_type"`
		FlavorType  string `json:"flavor_type"`
	} `json:"profile"`
	Flavor        string `json:"flavor"`
	FlavorDetails struct {
		Name          string `json:"name"`
		Status        string `json:"status"`
		QueuingNum    int    `json:"queuing_num"`
		QueueLeftTime int    `json:"queue_left_time"` // seconds
		Duration      int    `json:"duration"`        // auto-stop time, in seconds
	} `json:"flavor_details"`
}
// GetNotebookResult is the JSON response describing an existing notebook.
//
// CreateTime, LatestUpdateTime, QueuingInfo.BeginTime and QueuingInfo.EndTime
// carry no JSON tag; nothing in this file sets them, so they are presumably
// display strings derived from the matching *Timestamp fields by the caller.
type GetNotebookResult struct {
	ErrorCode             string `json:"error_code"`
	ErrorMsg              string `json:"error_msg"`
	ID                    string `json:"id"`
	Name                  string `json:"name"`
	Description           string `json:"description"`
	Status                string `json:"status"`
	CreationTimestamp     string `json:"creation_timestamp"`
	CreateTime            string
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	LatestUpdateTime      string
	Profile               struct {
		ID          string `json:"id"`
		Name        string `json:"name"`
		Description string `json:"description"`
		DeType      string `json:"de_type"`
		FlavorType  string `json:"flavor_type"`
	} `json:"profile"`
	Flavor        string `json:"flavor"`
	FlavorDetails struct {
		Name          string `json:"name"`
		Status        string `json:"status"`
		QueuingNum    int    `json:"queuing_num"`
		QueueLeftTime int    `json:"queue_left_time"` // seconds
		Duration      int    `json:"duration"`        // auto-stop time, in seconds
	} `json:"flavor_details"`
	QueuingInfo struct {
		ID             string `json:"id"`
		Name           string `json:"name"`
		Flavor         string `json:"flavor"`
		DeType         string `json:"de_type"`
		Status         string `json:"status"`
		BeginTimestamp int    `json:"begin_timestamp"` // time the instance entered the queue
		BeginTime      string
		RemainTime     int    `json:"remain_time"` // remaining time of the instance
		EndTimestamp   int    `json:"end_timestamp"`
		EndTime        string
		Rank           int    `json:"rank"` // rank of the instance in the queue
	} `json:"queuing_info"`
	Spec struct {
		Annotations struct {
			TargetDomain string `json:"target_domain"`
			Url          string `json:"url"`
		} `json:"annotations"`
	} `json:"spec"`
}
// GetTokenParams is the JSON body of a token request; it wraps a single Auth
// object under the "auth" key.
type GetTokenParams struct {
	Auth Auth `json:"auth"`
}
// Auth combines the Identity to authenticate with and the Scope requested.
type Auth struct {
	Identity Identity `json:"identity"`
	Scope    Scope    `json:"scope"`
}
// Scope names the Project an Auth request is scoped to.
type Scope struct {
	Project Project `json:"project"`
}
// Project identifies a project by name.
type Project struct {
	Name string `json:"name"`
}
// Identity lists the authentication methods and carries the Password
// credentials of an Auth request.
type Identity struct {
	Methods  []string `json:"methods"`
	Password Password `json:"password"`
}
// Password wraps the NotebookUser whose credentials are sent in an Identity.
type Password struct {
	User NotebookUser `json:"user"`
}
// NotebookUser holds a user name, a password and the Domain the user belongs
// to, as serialized inside Password.
type NotebookUser struct {
	Name     string `json:"name"`
	Password string `json:"password"`
	Domain   Domain `json:"domain"`
}
// Domain identifies a domain by name.
type Domain struct {
	Name string `json:"name"`
}
// Action names; by their grouping they are the values intended for
// NotebookAction.Action.
const (
	ActionStart   = "start"
	ActionStop    = "stop"
	ActionRestart = "restart"
	ActionQueue   = "queue"
	ActionDequeue = "dequeue"
)
// NotebookAction is the JSON body naming an action to perform on a notebook.
type NotebookAction struct {
	Action string `json:"action"`
}
// NotebookActionResult is the JSON response to a notebook action: error
// fields plus the current status and the previous state.
type NotebookActionResult struct {
	ErrorCode     string `json:"error_code"`
	ErrorMsg      string `json:"error_msg"`
	CurrentStatus string `json:"current_status"`
	PreviousState string `json:"previous_state"`
}
// NotebookGetJobTokenResult is the JSON response carrying a job token, plus
// error fields.
type NotebookGetJobTokenResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	Token     string `json:"token"`
}
// NotebookDelResult is the JSON response to a notebook deletion; it carries
// only the instance ID.
type NotebookDelResult struct {
	InstanceID string `json:"instance_id"`
}
// CreateTrainJobParams is the JSON body describing a training job to create:
// name, description, run Config and workspace ID.
type CreateTrainJobParams struct {
	JobName     string `json:"job_name"`
	Description string `json:"job_desc"`
	Config      Config `json:"config"`
	WorkspaceID string `json:"workspace_id"`
}
// Config is the run configuration embedded in CreateTrainJobParams. The
// commented-out fields are not sent.
type Config struct {
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// CreateConfigParams is the JSON body describing a training-job configuration
// to save. It has the same run fields as Config plus a name and description,
// and without CreateVersion. The commented-out fields are not sent.
type CreateConfigParams struct {
	ConfigName    string      `json:"config_name"`
	Description   string      `json:"config_desc"`
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// Parameter is one label/value pair of a training-job parameter list.
type Parameter struct {
	Label string `json:"label"`
	Value string `json:"value"`
}
// Parameters wraps a list of Parameter entries under the "parameter" JSON key.
type Parameters struct {
	Parameter []Parameter `json:"parameter"`
}
// DataSource describes a dataset source by dataset ID, version, type and URL.
// Within this file it is referenced only from commented-out fields.
type DataSource struct {
	DatasetID      string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`
	Type           string `json:"type"`
	DataUrl        string `json:"data_url"`
}
// Volumes pairs an Nfs mount with a HostPath mount. Within this file it is
// referenced only from commented-out fields.
type Volumes struct {
	Nfs      Nfs      `json:"nfs"`
	HostPath HostPath `json:"host_path"`
}
// Nfs describes an NFS mount: id, source path, destination path and a
// read-only flag.
type Nfs struct {
	ID         string `json:"id"`
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}
// HostPath describes a host-path mount: source path, destination path and a
// read-only flag.
type HostPath struct {
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}
// Flavor identifies a resource flavor by its code.
type Flavor struct {
	Code string `json:"code"`
}
// CreateTrainJobResult is the JSON response to a training-job creation:
// error fields, a success flag and the identity of the created job/version.
type CreateTrainJobResult struct {
	ErrorCode   string `json:"error_code"`
	ErrorMsg    string `json:"error_msg"`
	IsSuccess   bool   `json:"is_success"`
	JobName     string `json:"job_name"`
	JobID       int64  `json:"job_id"`
	Status      int    `json:"status"`
	CreateTime  int64  `json:"create_time"`
	VersionID   int64  `json:"version_id"`
	ResourceID  string `json:"resource_id"`
	VersionName string `json:"version_name"`
}
// CreateTrainJobConfigResult is the JSON response to saving a training-job
// configuration: error fields and a success flag.
type CreateTrainJobConfigResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
}
// GetResourceSpecsResult is the JSON response listing resource
// specifications: error fields, a success flag, a total count and the specs.
type GetResourceSpecsResult struct {
	ErrorCode      string  `json:"error_code"`
	ErrorMsg       string  `json:"error_msg"`
	IsSuccess      bool    `json:"is_success"`
	SpecTotalCount int     `json:"spec_total_count"`
	Specs          []Specs `json:"specs"`
}
// Specs is one resource specification entry of GetResourceSpecsResult.
// IsNoResource is decoded from the "no_resource" JSON key.
type Specs struct {
	Core          string `json:"core"`
	Cpu           string `json:"cpu"`
	IsNoResource  bool   `json:"no_resource"`
	GpuType       string `json:"gpu_type"`
	SpecID        int64  `json:"spec_id"`
	GpuNum        int    `json:"gpu_num"`
	SpecCode      string `json:"spec_code"`
	Storage       string `json:"storage"`
	MaxNum        int    `json:"max_num"`
	UnitNum       int    `json:"unit_num"`
	InterfaceType int    `json:"interface_type"`
}
// ErrorResult is a JSON error response.
//
// NOTE(review): ErrorMsg is decoded from "error_message" here, while every
// other result type in this file uses "error_msg" — confirm this is intended.
type ErrorResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_message"`
	IsSuccess bool   `json:"is_success"`
}
// GetTrainJobResult is the JSON response describing a training job version.
//
// Status, CreateTime and DatasetName carry no JSON tag; nothing in this file
// sets them, so they are presumably filled in by the caller (Status and
// CreateTime look like display forms of IntStatus and LongCreateTime).
// The commented-out fields are not decoded.
type GetTrainJobResult struct {
	IsSuccess      bool        `json:"is_success"`
	JobName        string      `json:"job_name"`
	JobID          int64       `json:"job_id"`
	Description    string      `json:"job_desc"`
	IntStatus      int         `json:"status"`
	Status         string
	LongCreateTime int64       `json:"create_time"`
	CreateTime     string
	Duration       int64       `json:"duration"` // run time of the training job, in milliseconds
	VersionID      int64       `json:"version_id"`
	ResourceID     string      `json:"resource_id"`
	VersionName    string      `json:"version_name"`
	PreVersionID   int64       `json:"pre_version_id"`
	WorkServerNum  int         `json:"worker_server_num"`
	AppUrl         string      `json:"app_url"`       // code directory of the training job
	BootFileUrl    string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter      []Parameter `json:"parameter"`
	DataUrl        string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID      int64  `json:"engine_id"`
	EngineName    string `json:"engine_name"`
	EngineVersion string `json:"engine_version"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//Volumes []Volumes `json:"volumes"`
	Flavor       Flavor `json:"flavor"`
	PoolID       string `json:"pool_id"`
	PoolName     string `json:"pool_name"`
	NasMountPath string `json:"nas_mount_path"`
	NasShareAddr string `json:"nas_share_addr"`
	DatasetName  string
}
// GetTrainJobLogResult is the JSON response carrying a slice of a training
// job's log: the content, its line count and the start/end line markers.
type GetTrainJobLogResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
	Content   string `json:"content"`
	Lines     int    `json:"lines"`
	StartLine string `json:"start_line"`
	EndLine   string `json:"end_line"`
}
// GetTrainJobLogFileNamesResult is the JSON response listing the log file
// names of a training job.
type GetTrainJobLogFileNamesResult struct {
	ErrorCode   string   `json:"error_code"`
	ErrorMsg    string   `json:"error_msg"`
	IsSuccess   bool     `json:"is_success"`
	LogFileList []string `json:"log_file_list"`
}
// TrainJobResult is a generic training-job JSON response: error fields and a
// success flag.
type TrainJobResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
}
  612. func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
  613. sess := x.NewSession()
  614. defer sess.Close()
  615. var cond = builder.NewCond()
  616. if opts.RepoID > 0 {
  617. cond = cond.And(
  618. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  619. )
  620. }
  621. if opts.UserID > 0 {
  622. cond = cond.And(
  623. builder.Eq{"cloudbrain.user_id": opts.UserID},
  624. )
  625. }
  626. if (opts.JobID) > 0 {
  627. cond = cond.And(
  628. builder.Eq{"cloudbrain.job_id": opts.JobID},
  629. )
  630. }
  631. if (opts.Type) >= 0 {
  632. cond = cond.And(
  633. builder.Eq{"cloudbrain.type": opts.Type},
  634. )
  635. }
  636. // switch opts.JobStatus {
  637. // case JobWaiting:
  638. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  639. // case JobFailed:
  640. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  641. // case JobStopped:
  642. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  643. // case JobSucceeded:
  644. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  645. // }
  646. if len(opts.CloudbrainIDs) > 0 {
  647. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  648. }
  649. count, err := sess.Where(cond).Count(new(Cloudbrain))
  650. if err != nil {
  651. return nil, 0, fmt.Errorf("Count: %v", err)
  652. }
  653. if opts.Page >= 0 && opts.PageSize > 0 {
  654. var start int
  655. if opts.Page == 0 {
  656. start = 0
  657. } else {
  658. start = (opts.Page - 1) * opts.PageSize
  659. }
  660. sess.Limit(opts.PageSize, start)
  661. }
  662. sess.OrderBy("cloudbrain.created_unix DESC")
  663. cloudbrains := make([]*Cloudbrain, 0, setting.UI.IssuePagingNum)
  664. if err := sess.Where(cond).Find(&cloudbrains); err != nil {
  665. return nil, 0, fmt.Errorf("Find: %v", err)
  666. }
  667. sess.Close()
  668. return cloudbrains, count, nil
  669. }
  670. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  671. if _, err = x.Insert(cloudbrain); err != nil {
  672. return err
  673. }
  674. return nil
  675. }
  676. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  677. has, err := x.Get(cb)
  678. if err != nil {
  679. return nil, err
  680. } else if !has {
  681. return nil, errors.New("cloudbrain task is not found")
  682. }
  683. return cb, nil
  684. }
  685. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  686. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  687. return getRepoCloudBrain(cb)
  688. }
  689. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  690. cb := &Cloudbrain{JobID: jobID}
  691. return getRepoCloudBrain(cb)
  692. }
  693. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  694. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  695. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  696. return
  697. }
// UpdateJob persists job through the package engine x; see updateJob for the
// columns that are written.
func UpdateJob(job *Cloudbrain) error {
	return updateJob(x, job)
}
  701. func updateJob(e Engine, job *Cloudbrain) error {
  702. var sess *xorm.Session
  703. sess = e.Where("job_id = ?", job.JobID)
  704. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  705. return err
  706. }
// DeleteJob deletes job through the package engine x; see deleteJob.
func DeleteJob(job *Cloudbrain) error {
	return deleteJob(x, job)
}
  710. func deleteJob(e Engine, job *Cloudbrain) error {
  711. _, err := e.ID(job.ID).Delete(job)
  712. return err
  713. }