You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 32 kB

3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package models
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "strings"
  6. "time"
  7. "xorm.io/builder"
  8. "xorm.io/xorm"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. )
  // CloudbrainStatus is a job state string; its values are written to
  // Cloudbrain.Status after conversion with string().
  type CloudbrainStatus string

  // JobType names the kind of workload a job runs (see the JobType* constants).
  type JobType string

  // ModelArtsJobStatus is a job state string from the ModelArts* constant set.
  type ModelArtsJobStatus string

  // Job states of type CloudbrainStatus.
  const (
  	JobWaiting   CloudbrainStatus = "WAITING"
  	JobStopped   CloudbrainStatus = "STOPPED"
  	JobSucceeded CloudbrainStatus = "SUCCEEDED"
  	JobFailed    CloudbrainStatus = "FAILED"
  	JobRunning   CloudbrainStatus = "RUNNING"
  )

  // Known job types.
  const (
  	JobTypeDebug         JobType = "DEBUG"
  	JobTypeBenchmark     JobType = "BENCHMARK"
  	JobTypeSnn4imagenet  JobType = "SNN4IMAGENET"
  	JobTypeBrainScore    JobType = "BRAINSCORE"
  )

  // Job states of type ModelArtsJobStatus.
  const (
  	ModelArtsCreateQueue  ModelArtsJobStatus = "CREATE_QUEUING" // queued for creation on free resources
  	ModelArtsCreating     ModelArtsJobStatus = "CREATING"       // being created
  	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED"  // creation failed
  	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING"  // queued for start on free resources
  	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resources, waiting to start
  	ModelArtsStarting     ModelArtsJobStatus = "STARTING"       // starting
  	ModelArtsRestarting   ModelArtsJobStatus = "RESTARTING"     // restarting
  	ModelArtsStartFailed  ModelArtsJobStatus = "START_FAILED"   // start failed
  	ModelArtsRunning      ModelArtsJobStatus = "RUNNING"        // running
  	ModelArtsStopping     ModelArtsJobStatus = "STOPPING"       // stopping
  	ModelArtsStopped      ModelArtsJobStatus = "STOPPED"        // stopped
  	ModelArtsUnavailable  ModelArtsJobStatus = "UNAVAILABLE"    // faulty
  	ModelArtsDeleted      ModelArtsJobStatus = "DELETED"        // deleted
  	ModelArtsResizing     ModelArtsJobStatus = "RESIZING"       // specification change in progress
  	ModelArtsResizFailed  ModelArtsJobStatus = "RESIZE_FAILED"  // specification change failed
  )
  42. type Cloudbrain struct {
  43. ID int64 `xorm:"pk autoincr"`
  44. JobID string `xorm:"INDEX NOT NULL"`
  45. JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
  46. JobName string `xorm:"INDEX"`
  47. Status string `xorm:"INDEX"`
  48. UserID int64 `xorm:"INDEX"`
  49. RepoID int64 `xorm:"INDEX"`
  50. SubTaskName string `xorm:"INDEX"`
  51. ContainerID string
  52. ContainerIp string
  53. CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
  54. UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
  55. Duration int64 `xorm:"INDEX duration"`
  56. TrainJobDuration string
  57. DeletedAt time.Time `xorm:"deleted"`
  58. CanDebug bool `xorm:"-"`
  59. CanDel bool `xorm:"-"`
  60. Type int `xorm:"INDEX DEFAULT 0"`
  61. VersionID int64 `xorm:"INDEX DEFAULT 0"`
  62. VersionName string
  63. Uuid string
  64. User *User `xorm:"-"`
  65. Repo *Repository `xorm:"-"`
  66. }
  67. type CloudbrainInfo struct {
  68. Cloudbrain `xorm:"extends"`
  69. User `xorm:"extends"`
  70. }
  // CloudBrainLoginResult is a code/message/payload envelope. It has no JSON
  // tags, so encoding/json uses the Go field names as keys.
  type CloudBrainLoginResult struct {
  	Code    string
  	Msg     string
  	Payload map[string]interface{}
  }
  // TaskRole is one entry of CreateJobParams.TaskRoles: the resources and
  // command of a single task role.
  type TaskRole struct {
  	Name                  string `json:"name"`
  	TaskNumber            int    `json:"taskNumber"`
  	MinSucceededTaskCount int    `json:"minSucceededTaskCount"`
  	MinFailedTaskCount    int    `json:"minFailedTaskCount"`
  	CPUNumber             int    `json:"cpuNumber"`
  	GPUNumber             int    `json:"gpuNumber"`
  	MemoryMB              int    `json:"memoryMB"`
  	ShmMB                 int    `json:"shmMB"`
  	Command               string `json:"command"`
  	NeedIBDevice          bool   `json:"needIBDevice"`
  	IsMainRole            bool   `json:"isMainRole"`
  	UseNNI                bool   `json:"useNNI"`
  }

  // StHostPath is a host path together with its mount path and a
  // read-only flag.
  type StHostPath struct {
  	Path      string `json:"path"`
  	MountPath string `json:"mountPath"`
  	ReadOnly  bool   `json:"readOnly"`
  }

  // Volume wraps a StHostPath under the "hostPath" JSON key.
  type Volume struct {
  	HostPath StHostPath `json:"hostPath"`
  }

  // CreateJobParams is the JSON document describing a job to create: its name,
  // image, GPU type, task roles and volumes.
  type CreateJobParams struct {
  	JobName    string     `json:"jobName"`
  	RetryCount int8       `json:"retryCount"`
  	GpuType    string     `json:"gpuType"`
  	Image      string     `json:"image"`
  	TaskRoles  []TaskRole `json:"taskRoles"`
  	Volumes    []Volume   `json:"volumes"`
  }
  // CreateJobResult is a code/msg/payload JSON envelope with a free-form payload.
  type CreateJobResult struct {
  	Code    string                 `json:"code"`
  	Msg     string                 `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }

  // GetJobResult has the same envelope shape as CreateJobResult.
  type GetJobResult struct {
  	Code    string                 `json:"code"`
  	Msg     string                 `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }
  116. type GetImagesResult struct {
  117. Code string `json:"code"`
  118. Msg string `json:"msg"`
  119. Payload GetImagesPayload `json:"payload"`
  120. }
  121. type GetImagesPayload struct {
  122. Count int `json:"count"`
  123. TotalPages int `json:"totalPages,omitempty"`
  124. ImageInfo []*ImageInfo `json:"rows"`
  125. }
  126. type CloudbrainsOptions struct {
  127. ListOptions
  128. RepoID int64 // include all repos if empty
  129. UserID int64
  130. JobID int64
  131. SortType string
  132. CloudbrainIDs []int64
  133. // JobStatus CloudbrainStatus
  134. Type int
  135. }
  // TaskPod is the decoded form of one task role entry: the role name plus the
  // status of each of its tasks.
  type TaskPod struct {
  	TaskRoleStatus struct {
  		Name string `json:"name"`
  	} `json:"taskRoleStatus"`
  	TaskStatuses []TaskStatuses `json:"taskStatuses"`
  }

  // TaskStatuses is the status of a single task. StartTime and FinishedTime
  // have no JSON tag; ConvertToTaskPod derives them from StartAt and FinishedAt.
  type TaskStatuses struct {
  	TaskIndex       int       `json:"taskIndex"`
  	PodUID          string    `json:"podUid"`
  	PodIP           string    `json:"podIp"`
  	PodName         string    `json:"podName"`
  	ContainerID     string    `json:"containerId"`
  	ContainerIP     string    `json:"containerIp"`
  	ContainerGpus   string    `json:"containerGpus"`
  	State           string    `json:"state"`
  	StartAt         time.Time `json:"startAt"`
  	FinishedAt      time.Time `json:"finishedAt"`
  	ExitCode        int       `json:"exitCode"`
  	ExitDiagnostics string    `json:"exitDiagnostics"`
  	RetriedCount    int       `json:"retriedCount"`
  	StartTime       string
  	FinishedTime    string
  }

  // TaskInfo is a JSON document naming a user, task, code link, GPU type and the
  // selected benchmark categories.
  type TaskInfo struct {
  	Username          string   `json:"username"`
  	TaskName          string   `json:"task_name"`
  	CodeName          string   `json:"code_name"`
  	BenchmarkCategory []string `json:"selected_category"`
  	CodeLink          string   `json:"code_link"`
  	GpuType           string   `json:"gpu_type"`
  }

  // ConvertToTaskPod decodes a generic JSON object into a TaskPod by
  // round-tripping it through encoding/json, then fills the display strings
  // StartTime and FinishedTime of the first task status.
  //
  // Both strings are formatted as "2006-01-02 15:04:05" after shifting the
  // timestamp by +8 hours. A FinishedTime whose year is 0001 (the task has not
  // finished or was stopped) is replaced with "-". Only TaskStatuses[0] is
  // processed; further entries keep empty StartTime/FinishedTime.
  //
  // An error is returned, together with a zero TaskPod, when the input cannot
  // be encoded or decoded, or when it contains no task statuses.
  func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  	const (
  		layout      = "2006-01-02 15:04:05"
  		shiftSecond = 8 * 3600 // display offset applied to the UTC timestamps
  	)

  	data, err := json.Marshal(input)
  	if err != nil {
  		return TaskPod{}, err
  	}

  	var taskPod TaskPod
  	if err := json.Unmarshal(data, &taskPod); err != nil {
  		return TaskPod{}, err
  	}

  	// Guard the index below: a payload without statuses used to panic here.
  	if len(taskPod.TaskStatuses) == 0 {
  		return TaskPod{}, fmt.Errorf("ConvertToTaskPod: no task statuses in payload (%d keys)", len(input))
  	}

  	first := &taskPod.TaskStatuses[0]
  	first.StartTime = time.Unix(first.StartAt.Unix()+shiftSecond, 0).UTC().Format(layout)
  	first.FinishedTime = time.Unix(first.FinishedAt.Unix()+shiftSecond, 0).UTC().Format(layout)

  	// A zero FinishedAt renders as 0001-01-01 08:00:00; show "-" instead.
  	if strings.HasPrefix(first.FinishedTime, "0001") {
  		first.FinishedTime = "-"
  	}
  	return taskPod, nil
  }
  196. type JobResultPayload struct {
  197. ID string `json:"id"`
  198. Name string `json:"name"`
  199. Platform string `json:"platform"`
  200. JobStatus struct {
  201. Username string `json:"username"`
  202. State string `json:"state"`
  203. SubState string `json:"subState"`
  204. ExecutionType string `json:"executionType"`
  205. Retries int `json:"retries"`
  206. CreatedTime int64 `json:"createdTime"`
  207. CompletedTime int64 `json:"completedTime"`
  208. AppID string `json:"appId"`
  209. AppProgress string `json:"appProgress"`
  210. AppTrackingURL string `json:"appTrackingUrl"`
  211. AppLaunchedTime int64 `json:"appLaunchedTime"`
  212. AppCompletedTime interface{} `json:"appCompletedTime"`
  213. AppExitCode int `json:"appExitCode"`
  214. AppExitDiagnostics string `json:"appExitDiagnostics"`
  215. AppExitType interface{} `json:"appExitType"`
  216. VirtualCluster string `json:"virtualCluster"`
  217. StartTime string
  218. EndTime string
  219. } `json:"jobStatus"`
  220. TaskRoles map[string]interface{} `json:"taskRoles"`
  221. Resource struct {
  222. CPU int `json:"cpu"`
  223. Memory string `json:"memory"`
  224. NvidiaComGpu int `json:"nvidia.com/gpu"`
  225. } `json:"resource"`
  226. Config struct {
  227. Image string `json:"image"`
  228. JobID string `json:"jobId"`
  229. GpuType string `json:"gpuType"`
  230. JobName string `json:"jobName"`
  231. JobType string `json:"jobType"`
  232. TaskRoles []struct {
  233. Name string `json:"name"`
  234. ShmMB int `json:"shmMB"`
  235. Command string `json:"command"`
  236. MemoryMB int `json:"memoryMB"`
  237. CPUNumber int `json:"cpuNumber"`
  238. GpuNumber int `json:"gpuNumber"`
  239. IsMainRole bool `json:"isMainRole"`
  240. TaskNumber int `json:"taskNumber"`
  241. NeedIBDevice bool `json:"needIBDevice"`
  242. MinFailedTaskCount int `json:"minFailedTaskCount"`
  243. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  244. } `json:"taskRoles"`
  245. RetryCount int `json:"retryCount"`
  246. } `json:"config"`
  247. Userinfo struct {
  248. User string `json:"user"`
  249. OrgID string `json:"org_id"`
  250. } `json:"userinfo"`
  251. }
  252. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  253. data, _ := json.Marshal(input)
  254. var jobResultPayload JobResultPayload
  255. err := json.Unmarshal(data, &jobResultPayload)
  256. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  257. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  258. if jobResultPayload.JobStatus.State == string(JobWaiting) {
  259. jobResultPayload.JobStatus.StartTime = "-"
  260. jobResultPayload.JobStatus.EndTime = "-"
  261. }
  262. return jobResultPayload, err
  263. }
  // ImagesResultPayload is a list of image descriptions.
  // NOTE(review): the list is decoded from the "taskStatuses" JSON key, which
  // looks copied from TaskPod — confirm against the producing API.
  type ImagesResultPayload struct {
  	Images []struct {
  		ID          int    `json:"id"`
  		Name        string `json:"name"`
  		Place       string `json:"place"`
  		Description string `json:"description"`
  		Provider    string `json:"provider"`
  		Createtime  string `json:"createtime"`
  		Remark      string `json:"remark"`
  	} `json:"taskStatuses"`
  }

  // ImageInfo describes one image. PlaceView has no JSON tag.
  type ImageInfo struct {
  	ID          int    `json:"id"`
  	Name        string `json:"name"`
  	Place       string `json:"place"`
  	Description string `json:"description"`
  	Provider    string `json:"provider"`
  	Createtime  string `json:"createtime"`
  	Remark      string `json:"remark"`
  	IsPublic    int    `json:"isPublic"`
  	PlaceView   string
  }
  // Categories is a list of Category entries under the "category" key.
  type Categories struct {
  	Category []*Category `json:"category"`
  }

  // Category is an id/value pair.
  type Category struct {
  	Id    int    `json:"id"`
  	Value string `json:"value"`
  }

  // GpuInfos is a list of GpuInfo entries under the "gpu_type" key.
  type GpuInfos struct {
  	GpuInfo []*GpuInfo `json:"gpu_type"`
  }

  // GpuInfo is an id/value pair with an associated queue name.
  type GpuInfo struct {
  	Id    int    `json:"id"`
  	Value string `json:"value"`
  	Queue string `json:"queue"`
  }

  // ResourceSpecs is a list of ResourceSpec entries. The JSON key is spelled
  // "resorce_specs"; it is part of the data format and must not be "corrected"
  // here without changing the data it reads.
  type ResourceSpecs struct {
  	ResourceSpec []*ResourceSpec `json:"resorce_specs"`
  }

  // ResourceSpec is one resource specification: CPU and GPU counts plus memory
  // and shared memory in MiB.
  type ResourceSpec struct {
  	Id          int `json:"id"`
  	CpuNum      int `json:"cpu"`
  	GpuNum      int `json:"gpu"`
  	MemMiB      int `json:"memMiB"`
  	ShareMemMiB int `json:"shareMemMiB"`
  }

  // FlavorInfos is a list of FlavorInfo entries under the "flavor_info" key.
  type FlavorInfos struct {
  	FlavorInfo []*FlavorInfo `json:"flavor_info"`
  }

  // FlavorInfo is an id/value pair.
  type FlavorInfo struct {
  	Id    int    `json:"id"`
  	Value string `json:"value"`
  }

  // PoolInfos is a list of PoolInfo entries under the "pool_info" key.
  type PoolInfos struct {
  	PoolInfo []*PoolInfo `json:"pool_info"`
  }

  // PoolInfo identifies a resource pool by id, name and type.
  type PoolInfo struct {
  	PoolId   string `json:"pool_id"`
  	PoolName string `json:"pool_name"`
  	PoolType string `json:"pool_type"`
  }
  // CommitImageParams is the JSON document naming a container (by IP and
  // container id) and the tag and description of the image to commit from it.
  type CommitImageParams struct {
  	Ip               string `json:"ip"`
  	TaskContainerId  string `json:"taskContainerId"`
  	ImageTag         string `json:"imageTag"`
  	ImageDescription string `json:"imageDescription"`
  }

  // CommitImageResult is a code/msg/payload JSON envelope with a free-form payload.
  type CommitImageResult struct {
  	Code    string                 `json:"code"`
  	Msg     string                 `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }

  // CloudBrainResult is a bare code/msg JSON envelope.
  type CloudBrainResult struct {
  	Code string `json:"code"`
  	Msg  string `json:"msg"`
  }
  // CreateNotebookParams is the JSON document describing a notebook to create.
  // JobName is sent under the "name" key.
  type CreateNotebookParams struct {
  	JobName     string    `json:"name"`
  	Description string    `json:"description"`
  	ProfileID   string    `json:"profile_id"`
  	Flavor      string    `json:"flavor"`
  	Spec        Spec      `json:"spec"`
  	Workspace   Workspace `json:"workspace"`
  	Pool        Pool      `json:"pool"`
  }

  // Pool identifies a resource pool by id, name and type.
  type Pool struct {
  	ID   string `json:"id"`
  	Name string `json:"name"`
  	Type string `json:"type"`
  }

  // Workspace identifies a workspace by id.
  type Workspace struct {
  	ID string `json:"id"`
  }

  // Spec groups the storage and auto-stop settings of a notebook.
  type Spec struct {
  	Storage  Storage  `json:"storage"`
  	AutoStop AutoStop `json:"auto_stop"`
  }

  // AutoStop is an enable flag plus a duration.
  type AutoStop struct {
  	Enable   bool `json:"enable"`
  	Duration int  `json:"duration"`
  }

  // Storage is a storage type together with its location.
  type Storage struct {
  	Type     string   `json:"type"`
  	Location Location `json:"location"`
  }

  // Location is a storage path.
  type Location struct {
  	Path string `json:"path"`
  }
  // NotebookResult is a bare error_code/error_msg JSON envelope.
  type NotebookResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg  string `json:"error_msg"`
  }

  // CreateNotebookResult is the JSON document describing a notebook instance:
  // error fields, identity, status, timestamps, profile and flavor details.
  type CreateNotebookResult struct {
  	ErrorCode             string `json:"error_code"`
  	ErrorMsg              string `json:"error_msg"`
  	ID                    string `json:"id"`
  	Name                  string `json:"name"`
  	Description           string `json:"description"`
  	Status                string `json:"status"`
  	CreationTimestamp     string `json:"creation_timestamp"`
  	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  	Profile               struct {
  		ID          string `json:"id"`
  		Name        string `json:"name"`
  		Description string `json:"description"`
  		DeType      string `json:"de_type"`
  		FlavorType  string `json:"flavor_type"`
  	} `json:"profile"`
  	Flavor        string `json:"flavor"`
  	FlavorDetails struct {
  		Name          string `json:"name"`
  		Status        string `json:"status"`
  		QueuingNum    int    `json:"queuing_num"`
  		QueueLeftTime int    `json:"queue_left_time"` // seconds
  		Duration      int    `json:"duration"`        // auto-stop time, seconds
  	} `json:"flavor_details"`
  }
  // GetNotebookResult is the JSON document describing a notebook instance in
  // detail. Besides the fields of CreateNotebookResult it carries queuing
  // information and the access annotations under spec.
  //
  // CreateTime, LatestUpdateTime, QueuingInfo.BeginTime and QueuingInfo.EndTime
  // have no JSON tag; they are presumably display strings filled by callers
  // from the neighbouring timestamp fields — TODO confirm.
  type GetNotebookResult struct {
  	ErrorCode             string `json:"error_code"`
  	ErrorMsg              string `json:"error_msg"`
  	ID                    string `json:"id"`
  	Name                  string `json:"name"`
  	Description           string `json:"description"`
  	Status                string `json:"status"`
  	CreationTimestamp     string `json:"creation_timestamp"`
  	CreateTime            string
  	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  	LatestUpdateTime      string
  	Profile               struct {
  		ID          string `json:"id"`
  		Name        string `json:"name"`
  		Description string `json:"description"`
  		DeType      string `json:"de_type"`
  		FlavorType  string `json:"flavor_type"`
  	} `json:"profile"`
  	Flavor        string `json:"flavor"`
  	FlavorDetails struct {
  		Name          string `json:"name"`
  		Status        string `json:"status"`
  		QueuingNum    int    `json:"queuing_num"`
  		QueueLeftTime int    `json:"queue_left_time"` // seconds
  		Duration      int    `json:"duration"`        // auto-stop time, seconds
  	} `json:"flavor_details"`
  	QueuingInfo struct {
  		ID             string `json:"id"`
  		Name           string `json:"name"`
  		Flavor         string `json:"flavor"`
  		DeType         string `json:"de_type"`
  		Status         string `json:"status"`
  		BeginTimestamp int    `json:"begin_timestamp"` // time the instance entered the queue
  		BeginTime      string
  		RemainTime     int    `json:"remain_time"`   // remaining time of the instance
  		EndTimestamp   int    `json:"end_timestamp"` //
  		EndTime        string
  		Rank           int `json:"rank"` // rank of the instance in the queue
  	} `json:"queuing_info"`
  	Spec struct {
  		Annotations struct {
  			TargetDomain string `json:"target_domain"`
  			Url          string `json:"url"`
  		} `json:"annotations"`
  	} `json:"spec"`
  }
  // GetTokenParams is the root of the token request JSON document:
  // {"auth": {"identity": ..., "scope": ...}}.
  type GetTokenParams struct {
  	Auth Auth `json:"auth"`
  }

  // Auth pairs the caller's identity with the scope being requested.
  type Auth struct {
  	Identity Identity `json:"identity"`
  	Scope    Scope    `json:"scope"`
  }

  // Scope names the project the request applies to.
  type Scope struct {
  	Project Project `json:"project"`
  }

  // Project is a project name.
  type Project struct {
  	Name string `json:"name"`
  }

  // Identity lists the authentication methods and the password credentials.
  type Identity struct {
  	Methods  []string `json:"methods"`
  	Password Password `json:"password"`
  }

  // Password wraps the user whose credentials are presented.
  type Password struct {
  	User NotebookUser `json:"user"`
  }

  // NotebookUser is a user name, password and domain.
  type NotebookUser struct {
  	Name     string `json:"name"`
  	Password string `json:"password"`
  	Domain   Domain `json:"domain"`
  }

  // Domain is a domain name.
  type Domain struct {
  	Name string `json:"name"`
  }
  // Values accepted in NotebookAction.Action.
  const (
  	ActionStart   = "start"
  	ActionStop    = "stop"
  	ActionRestart = "restart"
  	ActionQueue   = "queue"
  	ActionDequeue = "dequeue"
  )

  // NotebookAction is the JSON document naming an action to perform.
  type NotebookAction struct {
  	Action string `json:"action"`
  }

  // NotebookActionResult reports the outcome of an action: error fields plus
  // the current status and the previous state.
  type NotebookActionResult struct {
  	ErrorCode     string `json:"error_code"`
  	ErrorMsg      string `json:"error_msg"`
  	CurrentStatus string `json:"current_status"`
  	PreviousState string `json:"previous_state"`
  }

  // NotebookGetJobTokenResult carries a token together with error fields.
  type NotebookGetJobTokenResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg  string `json:"error_msg"`
  	Token     string `json:"token"`
  }

  // NotebookDelResult carries the id of an instance.
  type NotebookDelResult struct {
  	InstanceID string `json:"instance_id"`
  }
  // CreateTrainJobParams is the JSON document describing a training job to
  // create: name, description, workspace and the job Config.
  type CreateTrainJobParams struct {
  	JobName     string `json:"job_name"`
  	Description string `json:"job_desc"`
  	Config      Config `json:"config"`
  	WorkspaceID string `json:"workspace_id"`
  }

  // Config is the "config" section of CreateTrainJobParams.
  //
  // Fields left commented out in the original declaration, and therefore not
  // part of the document: dataset_id, dataset_version_id, data_source, spec_id,
  // model_id, user_image_url, user_command, volumes.
  type Config struct {
  	WorkServerNum int         `json:"worker_server_num"`
  	AppUrl        string      `json:"app_url"`       // code directory of the training job
  	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be inside the code directory
  	Parameter     []Parameter `json:"parameter"`
  	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
  	EngineID      int64       `json:"engine_id"`
  	TrainUrl      string      `json:"train_url"` // OBS path URL for the training job's output files
  	LogUrl        string      `json:"log_url"`
  	CreateVersion bool        `json:"create_version"`
  	Flavor        Flavor      `json:"flavor"`
  	PoolID        string      `json:"pool_id"`
  }

  // CreateConfigParams is the JSON document describing a saved training
  // configuration. It has the fields of Config (without create_version) plus a
  // configuration name and description.
  //
  // Fields left commented out in the original declaration, and therefore not
  // part of the document: dataset_id, dataset_version_id, data_source, spec_id,
  // model_id, user_image_url, user_command, create_version, volumes.
  type CreateConfigParams struct {
  	ConfigName    string      `json:"config_name"`
  	Description   string      `json:"config_desc"`
  	WorkServerNum int         `json:"worker_server_num"`
  	AppUrl        string      `json:"app_url"`       // code directory of the training job
  	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be inside the code directory
  	Parameter     []Parameter `json:"parameter"`
  	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
  	EngineID      int64       `json:"engine_id"`
  	TrainUrl      string      `json:"train_url"` // OBS path URL for the training job's output files
  	LogUrl        string      `json:"log_url"`
  	Flavor        Flavor      `json:"flavor"`
  	PoolID        string      `json:"pool_id"`
  }

  // Parameter is one label/value run parameter.
  type Parameter struct {
  	Label string `json:"label"`
  	Value string `json:"value"`
  }

  // Parameters is a list of Parameter entries under the "parameter" key.
  type Parameters struct {
  	Parameter []Parameter `json:"parameter"`
  }

  // DataSource describes a dataset by id and version, or by type and URL.
  type DataSource struct {
  	DatasetID      string `json:"dataset_id"`
  	DatasetVersion string `json:"dataset_version"`
  	Type           string `json:"type"`
  	DataUrl        string `json:"data_url"`
  }

  // Volumes pairs an NFS mount with a host-path mount.
  type Volumes struct {
  	Nfs      Nfs      `json:"nfs"`
  	HostPath HostPath `json:"host_path"`
  }

  // Nfs is an NFS mount: id, source and destination paths, read-only flag.
  type Nfs struct {
  	ID         string `json:"id"`
  	SourcePath string `json:"src_path"`
  	DestPath   string `json:"dest_path"`
  	ReadOnly   bool   `json:"read_only"`
  }

  // HostPath is a host-path mount: source and destination paths, read-only flag.
  type HostPath struct {
  	SourcePath string `json:"src_path"`
  	DestPath   string `json:"dest_path"`
  	ReadOnly   bool   `json:"read_only"`
  }

  // Flavor identifies a resource flavor by its code.
  type Flavor struct {
  	Code string `json:"code"`
  }
  // CreateTrainJobResult is the JSON document returned for a created training
  // job: error fields, success flag, and the job and version identifiers.
  type CreateTrainJobResult struct {
  	ErrorCode   string `json:"error_code"`
  	ErrorMsg    string `json:"error_msg"`
  	IsSuccess   bool   `json:"is_success"`
  	JobName     string `json:"job_name"`
  	JobID       int64  `json:"job_id"`
  	Status      int    `json:"status"`
  	CreateTime  int64  `json:"create_time"`
  	VersionID   int64  `json:"version_id"`
  	ResourceID  string `json:"resource_id"`
  	VersionName string `json:"version_name"`
  }

  // CreateTrainJobConfigResult is an error_code/error_msg/is_success envelope.
  type CreateTrainJobConfigResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg  string `json:"error_msg"`
  	IsSuccess bool   `json:"is_success"`
  }

  // GetResourceSpecsResult is a list of resource specifications with its total
  // count, wrapped in the usual error/success fields.
  type GetResourceSpecsResult struct {
  	ErrorCode      string  `json:"error_code"`
  	ErrorMsg       string  `json:"error_msg"`
  	IsSuccess      bool    `json:"is_success"`
  	SpecTotalCount int     `json:"spec_total_count"`
  	Specs          []Specs `json:"specs"`
  }

  // Specs is one resource specification entry of GetResourceSpecsResult.
  type Specs struct {
  	Core          string `json:"core"`
  	Cpu           string `json:"cpu"`
  	IsNoResource  bool   `json:"no_resource"`
  	GpuType       string `json:"gpu_type"`
  	SpecID        int64  `json:"spec_id"`
  	GpuNum        int    `json:"gpu_num"`
  	SpecCode      string `json:"spec_code"`
  	Storage       string `json:"storage"`
  	MaxNum        int    `json:"max_num"`
  	UnitNum       int    `json:"unit_num"`
  	InterfaceType int    `json:"interface_type"`
  }
  618. type GetConfigListResult struct {
  619. ErrorCode string `json:"error_code"`
  620. ErrorMsg string `json:"error_msg"`
  621. IsSuccess bool `json:"is_success"`
  622. ConfigTotalCount int `json:"config_total_count"`
  623. ParaConfigs []ParaConfig `json:"configs"`
  624. }
  625. type ParaConfig struct {
  626. ConfigName string `json:"config_name"`
  627. ConfigDesc string `json:"config_desc"`
  628. CreateTime int64 `json:"create_time"`
  629. EngineType int `json:"engine_type"`
  630. EngineName string `json:"engine_name"`
  631. EngineId int64 `json:"engine_id"`
  632. EngineVersion string `json:"engine_version"`
  633. UserImageUrl string `json:"user_image_url"`
  634. UserCommand string `json:"user_command"`
  635. Result GetConfigResult
  636. }
  637. type GetConfigResult struct {
  638. ErrorCode string `json:"error_code"`
  639. ErrorMsg string `json:"error_msg"`
  640. IsSuccess bool `json:"is_success"`
  641. ConfigName string `json:"config_name"`
  642. Description string `json:"config_desc"`
  643. WorkServerNum int `json:"worker_server_num"`
  644. AppUrl string `json:"app_url"` //训练作业的代码目录
  645. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  646. Parameter []Parameter `json:"parameter"`
  647. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  648. //DatasetID string `json:"dataset_id"`
  649. //DataVersionID string `json:"dataset_version_id"`
  650. //DataSource []DataSource `json:"data_source"`
  651. //SpecID int64 `json:"spec_id"`
  652. EngineID int64 `json:"engine_id"`
  653. //ModelID int64 `json:"model_id"`
  654. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  655. LogUrl string `json:"log_url"`
  656. //UserImageUrl string `json:"user_image_url"`
  657. //UserCommand string `json:"user_command"`
  658. //CreateVersion bool `json:"create_version"`
  659. //Volumes []Volumes `json:"volumes"`
  660. Flavor Flavor `json:"flavor"`
  661. PoolID string `json:"pool_id"`
  662. }
  663. type ErrorResult struct {
  664. ErrorCode string `json:"error_code"`
  665. ErrorMsg string `json:"error_message"`
  666. IsSuccess bool `json:"is_success"`
  667. }
  668. type GetTrainJobResult struct {
  669. IsSuccess bool `json:"is_success"`
  670. JobName string `json:"job_name"`
  671. JobID int64 `json:"job_id"`
  672. Description string `json:"job_desc"`
  673. IntStatus int `json:"status"`
  674. Status string
  675. LongCreateTime int64 `json:"create_time"`
  676. CreateTime string
  677. Duration int64 `json:"duration"` //训练作业的运行时间,单位为毫秒
  678. TrainJobDuration string //训练作业的运行时间,格式为hh:mm:ss
  679. VersionID int64 `json:"version_id"`
  680. ResourceID string `json:"resource_id"`
  681. VersionName string `json:"version_name"`
  682. PreVersionID int64 `json:"pre_version_id"`
  683. WorkServerNum int `json:"worker_server_num"`
  684. AppUrl string `json:"app_url"` //训练作业的代码目录
  685. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  686. Parameter []Parameter `json:"parameter"`
  687. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  688. //DatasetID string `json:"dataset_id"`
  689. //DataVersionID string `json:"dataset_version_id"`
  690. //DataSource []DataSource `json:"data_source"`
  691. //SpecID int64 `json:"spec_id"`
  692. EngineID int64 `json:"engine_id"`
  693. EngineName string `json:"engine_name"`
  694. EngineVersion string `json:"engine_version"`
  695. //ModelID int64 `json:"model_id"`
  696. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  697. LogUrl string `json:"log_url"`
  698. //UserImageUrl string `json:"user_image_url"`
  699. //UserCommand string `json:"user_command"`
  700. //Volumes []Volumes `json:"volumes"`
  701. Flavor Flavor `json:"flavor"`
  702. PoolID string `json:"pool_id"`
  703. PoolName string `json:"pool_name"`
  704. NasMountPath string `json:"nas_mount_path"`
  705. NasShareAddr string `json:"nas_share_addr"`
  706. DatasetName string
  707. }
  // GetTrainJobLogResult is one chunk of a training job log: the content, its
  // line count and the start/end line markers, with the usual error fields.
  type GetTrainJobLogResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg  string `json:"error_msg"`
  	IsSuccess bool   `json:"is_success"`
  	Content   string `json:"content"`
  	Lines     int    `json:"lines"`
  	StartLine string `json:"start_line"`
  	EndLine   string `json:"end_line"`
  }

  // GetTrainJobLogFileNamesResult lists log file names, with the usual error
  // fields.
  type GetTrainJobLogFileNamesResult struct {
  	ErrorCode   string   `json:"error_code"`
  	ErrorMsg    string   `json:"error_msg"`
  	IsSuccess   bool     `json:"is_success"`
  	LogFileList []string `json:"log_file_list"`
  }

  // TrainJobResult is an error_code/error_msg/is_success envelope.
  type TrainJobResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg  string `json:"error_msg"`
  	IsSuccess bool   `json:"is_success"`
  }

  // LogFile is a log file name.
  type LogFile struct {
  	Name string
  }
  731. func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  732. sess := x.NewSession()
  733. defer sess.Close()
  734. var cond = builder.NewCond()
  735. if opts.RepoID > 0 {
  736. cond = cond.And(
  737. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  738. )
  739. }
  740. if opts.UserID > 0 {
  741. cond = cond.And(
  742. builder.Eq{"cloudbrain.user_id": opts.UserID},
  743. )
  744. }
  745. if (opts.JobID) > 0 {
  746. cond = cond.And(
  747. builder.Eq{"cloudbrain.job_id": opts.JobID},
  748. )
  749. }
  750. if (opts.Type) >= 0 {
  751. cond = cond.And(
  752. builder.Eq{"cloudbrain.type": opts.Type},
  753. )
  754. }
  755. // switch opts.JobStatus {
  756. // case JobWaiting:
  757. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  758. // case JobFailed:
  759. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  760. // case JobStopped:
  761. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  762. // case JobSucceeded:
  763. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  764. // }
  765. if len(opts.CloudbrainIDs) > 0 {
  766. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  767. }
  768. count, err := sess.Where(cond).Count(new(Cloudbrain))
  769. if err != nil {
  770. return nil, 0, fmt.Errorf("Count: %v", err)
  771. }
  772. if opts.Page >= 0 && opts.PageSize > 0 {
  773. var start int
  774. if opts.Page == 0 {
  775. start = 0
  776. } else {
  777. start = (opts.Page - 1) * opts.PageSize
  778. }
  779. sess.Limit(opts.PageSize, start)
  780. }
  781. sess.OrderBy("cloudbrain.created_unix DESC")
  782. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  783. if err := sess.Table(&Cloudbrain{}).Where(cond).
  784. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  785. Find(&cloudbrains); err != nil {
  786. return nil, 0, fmt.Errorf("Find: %v", err)
  787. }
  788. sess.Close()
  789. return cloudbrains, count, nil
  790. }
  791. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  792. if _, err = x.Insert(cloudbrain); err != nil {
  793. return err
  794. }
  795. return nil
  796. }
  797. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  798. has, err := x.Get(cb)
  799. if err != nil {
  800. return nil, err
  801. } else if !has {
  802. return nil, ErrJobNotExist{}
  803. }
  804. return cb, nil
  805. }
  806. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  807. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  808. return getRepoCloudBrain(cb)
  809. }
  810. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  811. cb := &Cloudbrain{JobID: jobID}
  812. return getRepoCloudBrain(cb)
  813. }
  814. func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
  815. cloudBrains := make([]*Cloudbrain, 0)
  816. err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
  817. return cloudBrains, err
  818. }
  819. func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
  820. cloudBrains := make([]*Cloudbrain, 0)
  821. err := x.Cols("job_id", "status", "type").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
  822. return cloudBrains, err
  823. }
  824. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  825. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  826. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  827. return
  828. }
  829. func SetTrainJobStatusByJobID(jobID string, status string, duration int64, trainjobduration string) (err error) {
  830. cb := &Cloudbrain{JobID: jobID, Status: string(status), Duration: duration, TrainJobDuration: trainjobduration}
  831. _, err = x.Cols("status", "duration", "train_job_duration").Where("cloudbrain.job_id=?", jobID).Update(cb)
  832. return
  833. }
  834. func UpdateJob(job *Cloudbrain) error {
  835. return updateJob(x, job)
  836. }
  837. func updateJob(e Engine, job *Cloudbrain) error {
  838. var sess *xorm.Session
  839. sess = e.Where("job_id = ?", job.JobID)
  840. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  841. return err
  842. }
  843. // func UpdateTrainJob(job *CloudbrainInfo) error {
  844. // return updateTrainJob(x, job)
  845. // }
  846. // func updateTrainJob(e Engine, job *CloudbrainInfo) error {
  847. // var sess *xorm.Session
  848. // sess = e.Where("job_id = ?", job.Cloudbrain.JobID)
  849. // _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  850. // return err
  851. // }
  852. func DeleteJob(job *Cloudbrain) error {
  853. return deleteJob(x, job)
  854. }
  855. func deleteJob(e Engine, job *Cloudbrain) error {
  856. _, err := e.ID(job.ID).Delete(job)
  857. return err
  858. }
  859. func GetCloudbrainByName(jobName string) (*Cloudbrain, error) {
  860. cb := &Cloudbrain{JobName: jobName}
  861. return getRepoCloudBrain(cb)
  862. }
  863. func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
  864. if !isSigned || (job.Status != string(JobStopped) && job.Status != string(JobFailed) && job.Status != string(ModelArtsStartFailed) && job.Status != string(ModelArtsCreateFailed)) {
  865. return false
  866. }
  867. repo, err := GetRepositoryByID(job.RepoID)
  868. if err != nil {
  869. log.Error("GetRepositoryByID failed:%v", err.Error())
  870. return false
  871. }
  872. permission, _ := GetUserRepoPermission(repo, user)
  873. if err != nil {
  874. log.Error("GetUserRepoPermission failed:%v", err.Error())
  875. return false
  876. }
  877. if (user.ID == job.UserID && permission.AccessMode >= AccessModeWrite) || user.IsAdmin || permission.AccessMode >= AccessModeAdmin {
  878. return true
  879. }
  880. return false
  881. }