You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 36 kB

3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
5 years ago
4 years ago
3 years ago
5 years ago
3 years ago
5 years ago
3 years ago
3 years ago
3 years ago
5 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package models
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "strings"
  6. "time"
  7. "xorm.io/builder"
  8. "xorm.io/xorm"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. )
// CloudbrainStatus is the string type of the Job* state constants
// (JobWaiting, JobStopped, ...).
type CloudbrainStatus string

// JobType is the string type of the JobType* constants (JobTypeDebug, ...).
type JobType string

// ModelArtsJobStatus is the string type of the ModelArts* state constants.
type ModelArtsJobStatus string
const (
	// Job states, typed as CloudbrainStatus.
	JobWaiting   CloudbrainStatus = "WAITING"
	JobStopped   CloudbrainStatus = "STOPPED"
	JobSucceeded CloudbrainStatus = "SUCCEEDED"
	JobFailed    CloudbrainStatus = "FAILED"
	JobRunning   CloudbrainStatus = "RUNNING"

	// Job kinds, typed as JobType.
	JobTypeDebug         JobType = "DEBUG"
	JobTypeBenchmark     JobType = "BENCHMARK"
	JobTypeSnn4imagenet  JobType = "SNN4IMAGENET"
	JobTypeBrainScore    JobType = "BRAINSCORE"
	JobTypeTrain         JobType = "TRAIN"

	// ModelArts job states, typed as ModelArtsJobStatus.
	ModelArtsCreateQueue  ModelArtsJobStatus = "CREATE_QUEUING" // queuing for free resources during creation
	ModelArtsCreating     ModelArtsJobStatus = "CREATING"       // being created
	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED"  // creation failed
	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING"  // queuing for free resources during start
	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resources are waiting to start
	ModelArtsStarting     ModelArtsJobStatus = "STARTING"       // starting
	ModelArtsRestarting   ModelArtsJobStatus = "RESTARTING"     // restarting
	ModelArtsStartFailed  ModelArtsJobStatus = "START_FAILED"   // start failed
	ModelArtsRunning      ModelArtsJobStatus = "RUNNING"        // running
	ModelArtsStopping     ModelArtsJobStatus = "STOPPING"       // stopping
	ModelArtsStopped      ModelArtsJobStatus = "STOPPED"        // stopped
	ModelArtsUnavailable  ModelArtsJobStatus = "UNAVAILABLE"    // faulty
	ModelArtsDeleted      ModelArtsJobStatus = "DELETED"        // deleted
	ModelArtsResizing     ModelArtsJobStatus = "RESIZING"       // flavor change in progress
	ModelArtsResizFailed  ModelArtsJobStatus = "RESIZE_FAILED"  // flavor change failed
)
// Cloudbrain is the xorm-mapped record of a cloudbrain job.
//
// Fields tagged `xorm:"-"` (CanDebug, CanDel, User, Repo) are not mapped to
// columns. DeletedAt carries the xorm "deleted" tag, CreatedUnix and
// UpdatedUnix the "created"/"updated" tags.
type Cloudbrain struct {
	ID               int64              `xorm:"pk autoincr"`
	JobID            string             `xorm:"INDEX NOT NULL"`
	JobType          string             `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
	JobName          string
	Status           string
	UserID           int64
	RepoID           int64
	SubTaskName      string
	ContainerID      string
	ContainerIp      string
	CreatedUnix      timeutil.TimeStamp `xorm:"INDEX created"`
	UpdatedUnix      timeutil.TimeStamp `xorm:"INDEX updated"`
	Duration         int64
	TrainJobDuration string
	DeletedAt        time.Time          `xorm:"deleted"`
	CanDebug         bool               `xorm:"-"`
	CanDel           bool               `xorm:"-"`
	Type             int

	VersionID         int64  // version ID
	VersionName       string `xorm:"INDEX"` // current version
	Uuid              string // dataset ID
	DatasetName       string
	VersionCount      int    // number of current versions of the task, excluding deleted ones
	IsLatestVersion   string // whether this is the latest version: "1" yes, "0" no
	CommitID          string // commit ID of the submitted repository code
	PreVersionName    string // parent version name
	ComputeResource   string // compute resource, e.g. npu
	EngineID          int64  // engine ID
	TrainUrl          string // OBS path for output
	BranchName        string // branch name
	Parameters        string // parameters passed to ModelArts
	BootFile          string // boot file
	DataUrl           string // OBS path of the dataset
	LogUrl            string // OBS path for log output
	PreVersionId      int64  // version ID of the parent version
	FlavorCode        string // flavor ID on ModelArts
	Description       string // description
	WorkServerNumber  int    // number of nodes
	FlavorName        string // flavor name
	EngineName        string // engine name
	TotalVersionCount int    // total number of versions of the task, including deleted ones

	User *User       `xorm:"-"`
	Repo *Repository `xorm:"-"`
}
// CloudbrainInfo combines a Cloudbrain row with a User row; both are embedded
// with the xorm "extends" tag. It is the element type returned by the
// cloudbrain list queries in this file.
type CloudbrainInfo struct {
	Cloudbrain `xorm:"extends"`
	User       `xorm:"extends"`
}
// CloudBrainLoginResult holds a code, a message and a free-form payload map.
// Its fields carry no json tags.
type CloudBrainLoginResult struct {
	Code    string
	Msg     string
	Payload map[string]interface{}
}
// TaskRole is one entry of CreateJobParams.TaskRoles, serialized to JSON with
// camelCase keys.
type TaskRole struct {
	Name                  string `json:"name"`
	TaskNumber            int    `json:"taskNumber"`
	MinSucceededTaskCount int    `json:"minSucceededTaskCount"`
	MinFailedTaskCount    int    `json:"minFailedTaskCount"`
	CPUNumber             int    `json:"cpuNumber"`
	GPUNumber             int    `json:"gpuNumber"`
	MemoryMB              int    `json:"memoryMB"`
	ShmMB                 int    `json:"shmMB"`
	Command               string `json:"command"`
	NeedIBDevice          bool   `json:"needIBDevice"`
	IsMainRole            bool   `json:"isMainRole"`
	UseNNI                bool   `json:"useNNI"`
}
// StHostPath is the value type of Volume.HostPath: a path, its mount path and
// a read-only flag.
type StHostPath struct {
	Path      string `json:"path"`
	MountPath string `json:"mountPath"`
	ReadOnly  bool   `json:"readOnly"`
}
// Volume is one entry of CreateJobParams.Volumes; it wraps a single StHostPath
// under the JSON key "hostPath".
type Volume struct {
	HostPath StHostPath `json:"hostPath"`
}
// CreateJobParams is the JSON body describing a job to create: its name,
// retry count, GPU type, image, task roles and volumes.
type CreateJobParams struct {
	JobName    string     `json:"jobName"`
	RetryCount int8       `json:"retryCount"`
	GpuType    string     `json:"gpuType"`
	Image      string     `json:"image"`
	TaskRoles  []TaskRole `json:"taskRoles"`
	Volumes    []Volume   `json:"volumes"`
}
// CreateJobResult is a code/msg/payload JSON envelope with an untyped payload
// map.
type CreateJobResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}
// GetJobResult is a code/msg/payload JSON envelope with an untyped payload
// map. ConvertToJobResultPayload and ConvertToTaskPod accept maps of this
// shape.
type GetJobResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}
// GetImagesResult is a code/msg/payload JSON envelope whose payload is a
// GetImagesPayload.
type GetImagesResult struct {
	Code    string           `json:"code"`
	Msg     string           `json:"msg"`
	Payload GetImagesPayload `json:"payload"`
}
// GetImagesPayload is the payload of GetImagesResult: a count, an optional
// page total, and the image entries found under the JSON key "rows".
type GetImagesPayload struct {
	Count      int          `json:"count"`
	TotalPages int          `json:"totalPages,omitempty"`
	ImageInfo  []*ImageInfo `json:"rows"`
}
// CloudbrainsOptions carries the filters and paging (via the embedded
// ListOptions) used by the cloudbrain list queries in this file.
type CloudbrainsOptions struct {
	ListOptions
	RepoID        int64 // include all repos if empty
	UserID        int64
	JobID         string
	SortType      string
	CloudbrainIDs []int64
	// JobStatus CloudbrainStatus
	Type            int
	JobType         string
	VersionName     string
	IsLatestVersion string
}
// TaskPod is the decoded form of a task-role entry: the role's name plus the
// list of its task statuses. ConvertToTaskPod builds it from an untyped map.
type TaskPod struct {
	TaskRoleStatus struct {
		Name string `json:"name"`
	} `json:"taskRoleStatus"`
	// The task statuses were previously declared here as an inline anonymous
	// struct with the same fields as the named TaskStatuses type, which is
	// used instead.
	TaskStatuses []TaskStatuses `json:"taskStatuses"`
}
// TaskStatuses is a single task status entry of a TaskPod.
//
// StartTime and FinishedTime carry no json tags; ConvertToTaskPod fills them
// (for the first entry only) with formatted renderings of StartAt and
// FinishedAt.
type TaskStatuses struct {
	TaskIndex       int       `json:"taskIndex"`
	PodUID          string    `json:"podUid"`
	PodIP           string    `json:"podIp"`
	PodName         string    `json:"podName"`
	ContainerID     string    `json:"containerId"`
	ContainerIP     string    `json:"containerIp"`
	ContainerGpus   string    `json:"containerGpus"`
	State           string    `json:"state"`
	StartAt         time.Time `json:"startAt"`
	FinishedAt      time.Time `json:"finishedAt"`
	ExitCode        int       `json:"exitCode"`
	ExitDiagnostics string    `json:"exitDiagnostics"`
	RetriedCount    int       `json:"retriedCount"`
	StartTime       string
	FinishedTime    string
}
// TaskInfo is a JSON record with snake_case keys naming a user, a task, its
// code (name and link), the selected benchmark categories and a GPU type.
type TaskInfo struct {
	Username          string   `json:"username"`
	TaskName          string   `json:"task_name"`
	CodeName          string   `json:"code_name"`
	BenchmarkCategory []string `json:"selected_category"`
	CodeLink          string   `json:"code_link"`
	GpuType           string   `json:"gpu_type"`
}
  208. func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  209. data, _ := json.Marshal(input)
  210. var taskPod TaskPod
  211. err := json.Unmarshal(data, &taskPod)
  212. taskPod.TaskStatuses[0].StartTime = time.Unix(taskPod.TaskStatuses[0].StartAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  213. taskPod.TaskStatuses[0].FinishedTime = time.Unix(taskPod.TaskStatuses[0].FinishedAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  214. //if the task is not finished or stopped,the cloudbrain renturns 0001-01-01 08:00:00, the finishedTime shows with -
  215. if strings.HasPrefix(taskPod.TaskStatuses[0].FinishedTime, "0001") {
  216. taskPod.TaskStatuses[0].FinishedTime = "-"
  217. }
  218. return taskPod, err
  219. }
// JobResultPayload is the typed form of a job-detail payload, built from an
// untyped map by ConvertToJobResultPayload.
//
// JobStatus.StartTime and JobStatus.EndTime carry no json tags; they are
// filled by ConvertToJobResultPayload from CreatedTime and CompletedTime.
type JobResultPayload struct {
	ID        string `json:"id"`
	Name      string `json:"name"`
	Platform  string `json:"platform"`
	JobStatus struct {
		Username           string      `json:"username"`
		State              string      `json:"state"`
		SubState           string      `json:"subState"`
		ExecutionType      string      `json:"executionType"`
		Retries            int         `json:"retries"`
		CreatedTime        int64       `json:"createdTime"`
		CompletedTime      int64       `json:"completedTime"`
		AppID              string      `json:"appId"`
		AppProgress        string      `json:"appProgress"`
		AppTrackingURL     string      `json:"appTrackingUrl"`
		AppLaunchedTime    int64       `json:"appLaunchedTime"`
		AppCompletedTime   interface{} `json:"appCompletedTime"`
		AppExitCode        int         `json:"appExitCode"`
		AppExitDiagnostics string      `json:"appExitDiagnostics"`
		AppExitType        interface{} `json:"appExitType"`
		VirtualCluster     string      `json:"virtualCluster"`
		StartTime          string
		EndTime            string
	} `json:"jobStatus"`
	// TaskRoles is left untyped; its entries have the shape that
	// ConvertToTaskPod accepts — presumably one per task role, verify against
	// callers.
	TaskRoles map[string]interface{} `json:"taskRoles"`
	Resource  struct {
		CPU          int    `json:"cpu"`
		Memory       string `json:"memory"`
		NvidiaComGpu int    `json:"nvidia.com/gpu"`
	} `json:"resource"`
	Config struct {
		Image     string `json:"image"`
		JobID     string `json:"jobId"`
		GpuType   string `json:"gpuType"`
		JobName   string `json:"jobName"`
		JobType   string `json:"jobType"`
		TaskRoles []struct {
			Name                  string `json:"name"`
			ShmMB                 int    `json:"shmMB"`
			Command               string `json:"command"`
			MemoryMB              int    `json:"memoryMB"`
			CPUNumber             int    `json:"cpuNumber"`
			GpuNumber             int    `json:"gpuNumber"`
			IsMainRole            bool   `json:"isMainRole"`
			TaskNumber            int    `json:"taskNumber"`
			NeedIBDevice          bool   `json:"needIBDevice"`
			MinFailedTaskCount    int    `json:"minFailedTaskCount"`
			MinSucceededTaskCount int    `json:"minSucceededTaskCount"`
		} `json:"taskRoles"`
		RetryCount int `json:"retryCount"`
	} `json:"config"`
	Userinfo struct {
		User  string `json:"user"`
		OrgID string `json:"org_id"`
	} `json:"userinfo"`
}
  276. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  277. data, _ := json.Marshal(input)
  278. var jobResultPayload JobResultPayload
  279. err := json.Unmarshal(data, &jobResultPayload)
  280. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  281. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  282. if jobResultPayload.JobStatus.State == string(JobWaiting) {
  283. jobResultPayload.JobStatus.StartTime = "-"
  284. jobResultPayload.JobStatus.EndTime = "-"
  285. }
  286. return jobResultPayload, err
  287. }
// ImagesResultPayload holds a list of image descriptions.
//
// NOTE(review): the Images field is tagged `json:"taskStatuses"`, which looks
// copied from TaskPod — confirm the intended JSON key before relying on it.
type ImagesResultPayload struct {
	Images []struct {
		ID          int    `json:"id"`
		Name        string `json:"name"`
		Place       string `json:"place"`
		Description string `json:"description"`
		Provider    string `json:"provider"`
		Createtime  string `json:"createtime"`
		Remark      string `json:"remark"`
	} `json:"taskStatuses"`
}
// ImageInfo describes one image; it is the element type of
// GetImagesPayload.ImageInfo. PlaceView carries no json tag.
type ImageInfo struct {
	ID          int    `json:"id"`
	Name        string `json:"name"`
	Place       string `json:"place"`
	Description string `json:"description"`
	Provider    string `json:"provider"`
	Createtime  string `json:"createtime"`
	Remark      string `json:"remark"`
	IsPublic    int    `json:"isPublic"`
	PlaceView   string
}
// Categories wraps a list of Category entries under the JSON key "category".
type Categories struct {
	Category []*Category `json:"category"`
}
// Category is an id/value pair; it is the element type of Categories.
type Category struct {
	Id    int    `json:"id"`
	Value string `json:"value"`
}
// GpuInfos wraps a list of GpuInfo entries under the JSON key "gpu_type".
type GpuInfos struct {
	GpuInfo []*GpuInfo `json:"gpu_type"`
}
// GpuInfo is an id/value pair with an associated queue name; it is the
// element type of GpuInfos.
type GpuInfo struct {
	Id    int    `json:"id"`
	Value string `json:"value"`
	Queue string `json:"queue"`
}
// ResourceSpecs wraps a list of ResourceSpec entries.
//
// NOTE(review): the JSON key is spelled "resorce_specs" (sic). It is a runtime
// key, so it must match whatever produces this JSON — confirm before changing.
type ResourceSpecs struct {
	ResourceSpec []*ResourceSpec `json:"resorce_specs"`
}
// ResourceSpec is one resource specification: an id plus CPU count, GPU count,
// memory and shared memory (the latter two in MiB, per their names).
type ResourceSpec struct {
	Id          int `json:"id"`
	CpuNum      int `json:"cpu"`
	GpuNum      int `json:"gpu"`
	MemMiB      int `json:"memMiB"`
	ShareMemMiB int `json:"shareMemMiB"`
}
// FlavorInfos wraps a list of FlavorInfo entries under the JSON key
// "flavor_info".
type FlavorInfos struct {
	FlavorInfo []*FlavorInfo `json:"flavor_info"`
}
// FlavorInfo is an id/value pair with a description; it is the element type
// of FlavorInfos.
type FlavorInfo struct {
	Id    int    `json:"id"`
	Value string `json:"value"`
	Desc  string `json:"desc"`
}
// PoolInfos wraps a list of PoolInfo entries under the JSON key "pool_info".
type PoolInfos struct {
	PoolInfo []*PoolInfo `json:"pool_info"`
}
// PoolInfo identifies a pool by id, name and type; it is the element type of
// PoolInfos.
type PoolInfo struct {
	PoolId   string `json:"pool_id"`
	PoolName string `json:"pool_name"`
	PoolType string `json:"pool_type"`
}
// CommitImageParams is the JSON body for committing an image: the IP and
// container ID of the task, plus the tag and description of the image.
type CommitImageParams struct {
	Ip               string `json:"ip"`
	TaskContainerId  string `json:"taskContainerId"`
	ImageTag         string `json:"imageTag"`
	ImageDescription string `json:"imageDescription"`
}
// CommitImageResult is a code/msg/payload JSON envelope with an untyped
// payload map.
type CommitImageResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}
// CloudBrainResult is a code/msg JSON envelope without a payload.
type CloudBrainResult struct {
	Code string `json:"code"`
	Msg  string `json:"msg"`
}
// CreateNotebookParams is the JSON body describing a notebook to create.
// JobName is serialized under the key "name".
type CreateNotebookParams struct {
	JobName     string    `json:"name"`
	Description string    `json:"description"`
	ProfileID   string    `json:"profile_id"`
	Flavor      string    `json:"flavor"`
	Spec        Spec      `json:"spec"`
	Workspace   Workspace `json:"workspace"`
	Pool        Pool      `json:"pool"`
}
// Pool identifies a pool by id, name and type; used by
// CreateNotebookParams.Pool.
type Pool struct {
	ID   string `json:"id"`
	Name string `json:"name"`
	Type string `json:"type"`
}
// Workspace identifies a workspace by id; used by
// CreateNotebookParams.Workspace.
type Workspace struct {
	ID string `json:"id"`
}
// Spec groups the storage and auto-stop settings of CreateNotebookParams.
type Spec struct {
	Storage  Storage  `json:"storage"`
	AutoStop AutoStop `json:"auto_stop"`
}
// AutoStop is the auto-stop setting of a Spec: an enable flag and a duration.
// The duration's unit is not stated here — presumably seconds, as for the
// "auto_stop_time" duration in GetNotebookResult.FlavorDetails; TODO confirm.
type AutoStop struct {
	Enable   bool `json:"enable"`
	Duration int  `json:"duration"`
}
// Storage is the storage setting of a Spec: a type and a location.
type Storage struct {
	Type     string   `json:"type"`
	Location Location `json:"location"`
}
// Location is the path of a Storage.
type Location struct {
	Path string `json:"path"`
}
// NotebookResult is an error_code/error_msg JSON envelope.
type NotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
}
// CreateNotebookResult is the JSON result of a notebook creation: the error
// envelope plus the notebook's identity, status, timestamps, profile and
// flavor details.
type CreateNotebookResult struct {
	ErrorCode             string `json:"error_code"`
	ErrorMsg              string `json:"error_msg"`
	ID                    string `json:"id"`
	Name                  string `json:"name"`
	Description           string `json:"description"`
	Status                string `json:"status"`
	CreationTimestamp     string `json:"creation_timestamp"`
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	Profile               struct {
		ID          string `json:"id"`
		Name        string `json:"name"`
		Description string `json:"description"`
		DeType      string `json:"de_type"`
		FlavorType  string `json:"flavor_type"`
	} `json:"profile"`
	Flavor        string `json:"flavor"`
	FlavorDetails struct {
		Name          string `json:"name"`
		Status        string `json:"status"`
		QueuingNum    int    `json:"queuing_num"`
		QueueLeftTime int    `json:"queue_left_time"` // seconds
		Duration      int    `json:"duration"`        // auto-stop time, in seconds
	} `json:"flavor_details"`
}
// GetNotebookResult is the JSON result of a notebook query: the error envelope
// plus the notebook's identity, status, timestamps, profile, flavor details,
// queuing information and spec annotations.
//
// CreateTime, LatestUpdateTime, QueuingInfo.BeginTime and QueuingInfo.EndTime
// carry no json tags; presumably callers derive them from the corresponding
// *Timestamp fields — TODO confirm.
type GetNotebookResult struct {
	ErrorCode             string `json:"error_code"`
	ErrorMsg              string `json:"error_msg"`
	ID                    string `json:"id"`
	Name                  string `json:"name"`
	Description           string `json:"description"`
	Status                string `json:"status"`
	CreationTimestamp     string `json:"creation_timestamp"`
	CreateTime            string
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	LatestUpdateTime      string
	Profile               struct {
		ID          string `json:"id"`
		Name        string `json:"name"`
		Description string `json:"description"`
		DeType      string `json:"de_type"`
		FlavorType  string `json:"flavor_type"`
	} `json:"profile"`
	Flavor        string `json:"flavor"`
	FlavorDetails struct {
		Name          string `json:"name"`
		Status        string `json:"status"`
		QueuingNum    int    `json:"queuing_num"`
		QueueLeftTime int    `json:"queue_left_time"` // seconds
		Duration      int    `json:"duration"`        // auto-stop time, in seconds
	} `json:"flavor_details"`
	QueuingInfo struct {
		ID             string `json:"id"`
		Name           string `json:"name"`
		Flavor         string `json:"flavor"`
		DeType         string `json:"de_type"`
		Status         string `json:"status"`
		BeginTimestamp int    `json:"begin_timestamp"` // time the instance began queuing
		BeginTime      string
		RemainTime     int    `json:"remain_time"`   // remaining time of the instance
		EndTimestamp   int    `json:"end_timestamp"` //
		EndTime        string
		Rank           int    `json:"rank"` // rank of the instance in the queue
	} `json:"queuing_info"`
	Spec struct {
		Annotations struct {
			TargetDomain string `json:"target_domain"`
			Url          string `json:"url"`
		} `json:"annotations"`
	} `json:"spec"`
}
// GetTokenParams is the JSON body of a token request; it wraps a single Auth
// under the key "auth".
type GetTokenParams struct {
	Auth Auth `json:"auth"`
}
// Auth is the "auth" section of GetTokenParams: an identity and a scope.
type Auth struct {
	Identity Identity `json:"identity"`
	Scope    Scope    `json:"scope"`
}
// Scope is the "scope" section of Auth; it names a project.
type Scope struct {
	Project Project `json:"project"`
}
// Project is the "project" section of Scope, identified by name.
type Project struct {
	Name string `json:"name"`
}
// Identity is the "identity" section of Auth: the list of authentication
// methods and the password credentials.
type Identity struct {
	Methods  []string `json:"methods"`
	Password Password `json:"password"`
}
// Password is the "password" section of Identity; it wraps the user
// credentials.
type Password struct {
	User NotebookUser `json:"user"`
}
// NotebookUser is the "user" section of Password: a user name, its password
// and the domain it belongs to.
type NotebookUser struct {
	Name     string `json:"name"`
	Password string `json:"password"`
	Domain   Domain `json:"domain"`
}
// Domain is the "domain" section of NotebookUser, identified by name.
type Domain struct {
	Name string `json:"name"`
}
// Action names (untyped string constants); presumably the values placed in
// NotebookAction.Action — verify against callers.
const (
	ActionStart   = "start"
	ActionStop    = "stop"
	ActionRestart = "restart"
	ActionQueue   = "queue"
	ActionDequeue = "dequeue"
)
// NotebookAction is a JSON body carrying a single action name under the key
// "action".
type NotebookAction struct {
	Action string `json:"action"`
}
// NotebookActionResult is the JSON result of a notebook action: the error
// envelope plus the current status and the previous state.
type NotebookActionResult struct {
	ErrorCode     string `json:"error_code"`
	ErrorMsg      string `json:"error_msg"`
	CurrentStatus string `json:"current_status"`
	PreviousState string `json:"previous_state"`
}
// NotebookGetJobTokenResult is the JSON result of a job-token request: the
// error envelope plus the token.
type NotebookGetJobTokenResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	Token     string `json:"token"`
}
// NotebookDelResult is the JSON result of a notebook deletion; it carries the
// instance ID.
type NotebookDelResult struct {
	InstanceID string `json:"instance_id"`
}
// CreateTrainJobParams is the JSON body describing a training job to create:
// its name, description, configuration and workspace ID.
type CreateTrainJobParams struct {
	JobName     string `json:"job_name"`
	Description string `json:"job_desc"`
	Config      Config `json:"config"`
	WorkspaceID string `json:"workspace_id"`
}
// Config is the "config" section of CreateTrainJobParams.
type Config struct {
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	EngineID      int64       `json:"engine_id"`
	TrainUrl      string      `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl        string      `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	CreateVersion bool   `json:"create_version"`
	Flavor        Flavor `json:"flavor"`
	PoolID        string `json:"pool_id"`
}
// CreateTrainJobVersionParams is the JSON body describing a new version of a
// training job: a description and a TrainJobVersionConfig.
type CreateTrainJobVersionParams struct {
	Description string                `json:"job_desc"`
	Config      TrainJobVersionConfig `json:"config"`
}
// TrainJobVersionConfig is the "config" section of
// CreateTrainJobVersionParams. Compared with Config it has no CreateVersion
// field and adds PreVersionId.
type TrainJobVersionConfig struct {
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	EngineID      int64       `json:"engine_id"`
	TrainUrl      string      `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl        string      `json:"log_url"`
	Flavor        Flavor      `json:"flavor"`
	PoolID        string      `json:"pool_id"`
	PreVersionId  int64       `json:"pre_version_id"`
}
// CreateConfigParams is the JSON body describing a named training-job
// configuration to create.
type CreateConfigParams struct {
	ConfigName    string      `json:"config_name"`
	Description   string      `json:"config_desc"`
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	EngineID      int64       `json:"engine_id"`
	TrainUrl      string      `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl        string      `json:"log_url"`
	Flavor        Flavor      `json:"flavor"`
	PoolID        string      `json:"pool_id"`
}
// Parameter is a label/value pair; it is the element type of the Parameter
// lists in the training-job configuration structs.
type Parameter struct {
	Label string `json:"label"`
	Value string `json:"value"`
}
// Parameters wraps a list of Parameter entries under the JSON key "parameter".
type Parameters struct {
	Parameter []Parameter `json:"parameter"`
}
// DataSource describes a data source by dataset ID, dataset version, type and
// data URL.
type DataSource struct {
	DatasetID      string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`
	Type           string `json:"type"`
	DataUrl        string `json:"data_url"`
}
// Volumes groups an NFS mount and a host-path mount.
type Volumes struct {
	Nfs      Nfs      `json:"nfs"`
	HostPath HostPath `json:"host_path"`
}
// Nfs is the NFS entry of Volumes: an id, source and destination paths and a
// read-only flag.
type Nfs struct {
	ID         string `json:"id"`
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}
// HostPath is the host-path entry of Volumes: source and destination paths
// and a read-only flag.
type HostPath struct {
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}
// Flavor identifies a flavor by its code.
type Flavor struct {
	Code string `json:"code"`
}
// CreateTrainJobResult is the JSON result of a training-job creation: the
// error envelope and success flag plus the job's and version's identifiers.
type CreateTrainJobResult struct {
	ErrorCode   string `json:"error_code"`
	ErrorMsg    string `json:"error_msg"`
	IsSuccess   bool   `json:"is_success"`
	JobName     string `json:"job_name"`
	JobID       int64  `json:"job_id"`
	Status      int    `json:"status"`
	CreateTime  int64  `json:"create_time"`
	VersionID   int64  `json:"version_id"`
	ResourceID  string `json:"resource_id"`
	VersionName string `json:"version_name"`
}
// CreateTrainJobConfigResult is an error_code/error_msg/is_success JSON
// envelope.
type CreateTrainJobConfigResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
}
// GetResourceSpecsResult is the JSON result of a resource-spec query: the
// error envelope and success flag plus the total count and the list of Specs.
type GetResourceSpecsResult struct {
	ErrorCode      string  `json:"error_code"`
	ErrorMsg       string  `json:"error_msg"`
	IsSuccess      bool    `json:"is_success"`
	SpecTotalCount int     `json:"spec_total_count"`
	Specs          []Specs `json:"specs"`
}
// Specs is one resource specification entry of GetResourceSpecsResult.
// IsNoResource is serialized under the key "no_resource".
type Specs struct {
	Core          string `json:"core"`
	Cpu           string `json:"cpu"`
	IsNoResource  bool   `json:"no_resource"`
	GpuType       string `json:"gpu_type"`
	SpecID        int64  `json:"spec_id"`
	GpuNum        int    `json:"gpu_num"`
	SpecCode      string `json:"spec_code"`
	Storage       string `json:"storage"`
	MaxNum        int    `json:"max_num"`
	UnitNum       int    `json:"unit_num"`
	InterfaceType int    `json:"interface_type"`
}
// GetConfigListResult is the JSON result of a configuration list query: the
// error envelope and success flag plus the total count and the configurations
// found under the key "configs".
type GetConfigListResult struct {
	ErrorCode        string       `json:"error_code"`
	ErrorMsg         string       `json:"error_msg"`
	IsSuccess        bool         `json:"is_success"`
	ConfigTotalCount int          `json:"config_total_count"`
	ParaConfigs      []ParaConfig `json:"configs"`
}
// ParaConfig is one configuration entry of GetConfigListResult. Result carries
// no json tag; presumably callers fill it with the configuration's details —
// TODO confirm.
type ParaConfig struct {
	ConfigName    string `json:"config_name"`
	ConfigDesc    string `json:"config_desc"`
	CreateTime    int64  `json:"create_time"`
	EngineType    int    `json:"engine_type"`
	EngineName    string `json:"engine_name"`
	EngineId      int64  `json:"engine_id"`
	EngineVersion string `json:"engine_version"`
	UserImageUrl  string `json:"user_image_url"`
	UserCommand   string `json:"user_command"`
	Result        GetConfigResult
}
// GetConfigResult is the JSON result of a configuration query: the error
// envelope and success flag plus the same fields as CreateConfigParams.
type GetConfigResult struct {
	ErrorCode     string      `json:"error_code"`
	ErrorMsg      string      `json:"error_msg"`
	IsSuccess     bool        `json:"is_success"`
	ConfigName    string      `json:"config_name"`
	Description   string      `json:"config_desc"`
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	EngineID      int64       `json:"engine_id"`
	TrainUrl      string      `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl        string      `json:"log_url"`
	Flavor        Flavor      `json:"flavor"`
	PoolID        string      `json:"pool_id"`
}
// ErrorResult is an error envelope with a success flag.
//
// NOTE(review): ErrorMsg is read from the key "error_message" here, whereas
// the other result types in this file use "error_msg" — confirm this
// difference is intentional.
type ErrorResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_message"`
	IsSuccess bool   `json:"is_success"`
}
// GetTrainJobResult is the JSON result of a training-job query.
//
// Status, CreateTime, TrainJobDuration and DatasetName carry no json tags;
// presumably callers derive them (e.g. from IntStatus, LongCreateTime and
// Duration) — TODO confirm.
type GetTrainJobResult struct {
	IsSuccess        bool        `json:"is_success"`
	JobName          string      `json:"job_name"`
	JobID            int64       `json:"job_id"`
	Description      string      `json:"job_desc"`
	IntStatus        int         `json:"status"`
	Status           string
	LongCreateTime   int64       `json:"create_time"`
	CreateTime       string
	Duration         int64       `json:"duration"` // running time of the training job, in milliseconds
	TrainJobDuration string      // running time of the training job, formatted as hh:mm:ss
	VersionID        int64       `json:"version_id"`
	ResourceID       string      `json:"resource_id"`
	VersionName      string      `json:"version_name"`
	PreVersionID     int64       `json:"pre_version_id"`
	WorkServerNum    int         `json:"worker_server_num"`
	AppUrl           string      `json:"app_url"`       // code directory of the training job
	BootFileUrl      string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter        []Parameter `json:"parameter"`
	DataUrl          string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	EngineID         int64       `json:"engine_id"`
	EngineName       string      `json:"engine_name"`
	EngineVersion    string      `json:"engine_version"`
	TrainUrl         string      `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl           string      `json:"log_url"`
	Flavor           Flavor      `json:"flavor"`
	PoolID           string      `json:"pool_id"`
	PoolName         string      `json:"pool_name"`
	NasMountPath     string      `json:"nas_mount_path"`
	NasShareAddr     string      `json:"nas_share_addr"`
	DatasetName      string
	ModelMetricList  string      `json:"model_metric_list"` // the list contains f1_score, recall, precision and accuracy, if available
}
// GetTrainJobLogResult is the JSON result of a training-job log query: the
// error envelope and success flag plus the log content, its line count and
// the start/end line markers.
type GetTrainJobLogResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
	Content   string `json:"content"`
	Lines     int    `json:"lines"`
	StartLine string `json:"start_line"`
	EndLine   string `json:"end_line"`
}
// GetTrainJobLogFileNamesResult is the JSON result of a log-file-name query:
// the error envelope and success flag plus the list of log file names.
type GetTrainJobLogFileNamesResult struct {
	ErrorCode   string   `json:"error_code"`
	ErrorMsg    string   `json:"error_msg"`
	IsSuccess   bool     `json:"is_success"`
	LogFileList []string `json:"log_file_list"`
}
// TrainJobResult is an error_code/error_msg/is_success JSON envelope.
type TrainJobResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
}
// LogFile holds the name of a log file.
type LogFile struct {
	Name string
}
  742. func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  743. sess := x.NewSession()
  744. defer sess.Close()
  745. var cond = builder.NewCond()
  746. if opts.RepoID > 0 {
  747. cond = cond.And(
  748. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  749. )
  750. }
  751. if opts.UserID > 0 {
  752. cond = cond.And(
  753. builder.Eq{"cloudbrain.user_id": opts.UserID},
  754. )
  755. }
  756. if (opts.JobID) != "" {
  757. cond = cond.And(
  758. builder.Eq{"cloudbrain.job_id": opts.JobID},
  759. )
  760. }
  761. if (opts.Type) >= 0 {
  762. cond = cond.And(
  763. builder.Eq{"cloudbrain.type": opts.Type},
  764. )
  765. }
  766. if (opts.JobType) != "" {
  767. cond = cond.And(
  768. builder.Eq{"cloudbrain.job_type": opts.JobType},
  769. )
  770. }
  771. if (opts.IsLatestVersion) != "" {
  772. cond = cond.And(
  773. builder.Eq{"cloudbrain.is_latest_version": opts.IsLatestVersion},
  774. )
  775. }
  776. if len(opts.CloudbrainIDs) > 0 {
  777. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  778. }
  779. count, err := sess.Where(cond).Count(new(Cloudbrain))
  780. if err != nil {
  781. return nil, 0, fmt.Errorf("Count: %v", err)
  782. }
  783. if opts.Page >= 0 && opts.PageSize > 0 {
  784. var start int
  785. if opts.Page == 0 {
  786. start = 0
  787. } else {
  788. start = (opts.Page - 1) * opts.PageSize
  789. }
  790. sess.Limit(opts.PageSize, start)
  791. }
  792. sess.OrderBy("cloudbrain.created_unix DESC")
  793. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  794. if err := sess.Table(&Cloudbrain{}).Where(cond).
  795. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  796. Find(&cloudbrains); err != nil {
  797. return nil, 0, fmt.Errorf("Find: %v", err)
  798. }
  799. return cloudbrains, count, nil
  800. }
  801. func QueryModelTrainJobVersionList(jobId string) ([]*CloudbrainInfo, int, error) {
  802. sess := x.NewSession()
  803. defer sess.Close()
  804. var cond = builder.NewCond()
  805. cond = cond.And(
  806. builder.Eq{"cloudbrain.job_id": jobId},
  807. )
  808. cond = cond.And(
  809. builder.Eq{"cloudbrain.Status": "COMPLETED"},
  810. )
  811. sess.OrderBy("cloudbrain.created_unix DESC")
  812. cloudbrains := make([]*CloudbrainInfo, 0)
  813. if err := sess.Table(&Cloudbrain{}).Where(cond).
  814. Find(&cloudbrains); err != nil {
  815. return nil, 0, fmt.Errorf("Find: %v", err)
  816. }
  817. return cloudbrains, int(len(cloudbrains)), nil
  818. }
  819. func QueryModelTrainJobList(repoId int64) ([]*CloudbrainInfo, int, error) {
  820. sess := x.NewSession()
  821. defer sess.Close()
  822. var cond = builder.NewCond()
  823. cond = cond.And(
  824. builder.Eq{"cloudbrain.repo_id": repoId},
  825. )
  826. cond = cond.And(
  827. builder.Eq{"cloudbrain.Status": "COMPLETED"},
  828. )
  829. sess.OrderBy("cloudbrain.created_unix DESC")
  830. cloudbrains := make([]*CloudbrainInfo, 0)
  831. if err := sess.Distinct("job_id").Table(&Cloudbrain{}).Where(cond).
  832. Find(&cloudbrains); err != nil {
  833. return nil, 0, fmt.Errorf("Find: %v", err)
  834. }
  835. return cloudbrains, int(len(cloudbrains)), nil
  836. }
  837. func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, error) {
  838. sess := x.NewSession()
  839. defer sess.Close()
  840. var cond = builder.NewCond()
  841. if opts.RepoID > 0 {
  842. cond = cond.And(
  843. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  844. )
  845. }
  846. if opts.UserID > 0 {
  847. cond = cond.And(
  848. builder.Eq{"cloudbrain.user_id": opts.UserID},
  849. )
  850. }
  851. if (opts.Type) >= 0 {
  852. cond = cond.And(
  853. builder.Eq{"cloudbrain.type": opts.Type},
  854. )
  855. }
  856. if (opts.JobID) != "" {
  857. cond = cond.And(
  858. builder.Eq{"cloudbrain.job_id": opts.JobID},
  859. )
  860. }
  861. if (opts.JobType) != "" {
  862. cond = cond.And(
  863. builder.Eq{"cloudbrain.job_type": opts.JobType},
  864. )
  865. }
  866. if len(opts.CloudbrainIDs) > 0 {
  867. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  868. }
  869. count, err := sess.Where(cond).Count(new(Cloudbrain))
  870. if err != nil {
  871. return nil, 0, fmt.Errorf("Count: %v", err)
  872. }
  873. if opts.Page >= 0 && opts.PageSize > 0 {
  874. var start int
  875. if opts.Page == 0 {
  876. start = 0
  877. } else {
  878. start = (opts.Page - 1) * opts.PageSize
  879. }
  880. sess.Limit(opts.PageSize, start)
  881. }
  882. sess.OrderBy("cloudbrain.created_unix DESC")
  883. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  884. if err := sess.Table(&Cloudbrain{}).Where(cond).
  885. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  886. Find(&cloudbrains); err != nil {
  887. return nil, 0, fmt.Errorf("Find: %v", err)
  888. }
  889. return cloudbrains, int(count), nil
  890. }
  891. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  892. if _, err = x.Insert(cloudbrain); err != nil {
  893. return err
  894. }
  895. return nil
  896. }
  897. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  898. has, err := x.Get(cb)
  899. if err != nil {
  900. return nil, err
  901. } else if !has {
  902. return nil, ErrJobNotExist{}
  903. }
  904. return cb, nil
  905. }
  906. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  907. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  908. return getRepoCloudBrain(cb)
  909. }
  910. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  911. cb := &Cloudbrain{JobID: jobID}
  912. return getRepoCloudBrain(cb)
  913. }
  914. func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) {
  915. cb := &Cloudbrain{JobID: jobID, VersionName: versionName}
  916. return getRepoCloudBrain(cb)
  917. }
  918. func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string) (*Cloudbrain, error) {
  919. cb := &Cloudbrain{JobID: jobID, IsLatestVersion: isLatestVersion}
  920. return getRepoCloudBrain(cb)
  921. }
  922. func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
  923. cloudBrains := make([]*Cloudbrain, 0)
  924. err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
  925. return cloudBrains, err
  926. }
  927. func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
  928. cloudBrains := make([]*Cloudbrain, 0)
  929. err := x.Cols("job_id", "status", "type").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
  930. return cloudBrains, err
  931. }
  932. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  933. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  934. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  935. return
  936. }
  937. func SetTrainJobStatusByJobID(jobID string, status string, duration int64, trainjobduration string) (err error) {
  938. cb := &Cloudbrain{JobID: jobID, Status: string(status), Duration: duration, TrainJobDuration: trainjobduration}
  939. _, err = x.Cols("status", "duration", "train_job_duration").Where("cloudbrain.job_id=?", jobID).Update(cb)
  940. return
  941. }
  942. func SetVersionCountAndLatestVersion(jobID string, versionName string, versionCount int, isLatestVersion string, totalVersionCount int) (err error) {
  943. cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion, TotalVersionCount: totalVersionCount}
  944. _, err = x.Cols("version_Count", "is_latest_version", "total_version_count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb)
  945. return
  946. }
  947. func UpdateJob(job *Cloudbrain) error {
  948. return updateJob(x, job)
  949. }
  950. func updateJob(e Engine, job *Cloudbrain) error {
  951. var sess *xorm.Session
  952. sess = e.Where("job_id = ?", job.JobID)
  953. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  954. return err
  955. }
  956. func UpdateTrainJobVersion(job *Cloudbrain) error {
  957. return updateJobTrainVersion(x, job)
  958. }
  959. func updateJobTrainVersion(e Engine, job *Cloudbrain) error {
  960. var sess *xorm.Session
  961. sess = e.Where("job_id = ? AND version_name=?", job.JobID, job.VersionName)
  962. _, err := sess.Cols("status", "train_job_duration").Update(job)
  963. return err
  964. }
  965. func DeleteJob(job *Cloudbrain) error {
  966. return deleteJob(x, job)
  967. }
  968. func deleteJob(e Engine, job *Cloudbrain) error {
  969. _, err := e.ID(job.ID).Delete(job)
  970. return err
  971. }
  972. func GetCloudbrainByName(jobName string) (*Cloudbrain, error) {
  973. cb := &Cloudbrain{JobName: jobName}
  974. return getRepoCloudBrain(cb)
  975. }
  976. func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
  977. if !isSigned || (job.Status != string(JobStopped) && job.Status != string(JobFailed) && job.Status != string(ModelArtsStartFailed) && job.Status != string(ModelArtsCreateFailed)) {
  978. return false
  979. }
  980. repo, err := GetRepositoryByID(job.RepoID)
  981. if err != nil {
  982. log.Error("GetRepositoryByID failed:%v", err.Error())
  983. return false
  984. }
  985. permission, _ := GetUserRepoPermission(repo, user)
  986. if err != nil {
  987. log.Error("GetUserRepoPermission failed:%v", err.Error())
  988. return false
  989. }
  990. if (user.ID == job.UserID && permission.AccessMode >= AccessModeWrite) || user.IsAdmin || permission.AccessMode >= AccessModeAdmin {
  991. return true
  992. }
  993. return false
  994. }