You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 27 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790
  1. package models
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "time"
  7. "xorm.io/xorm"
  8. "code.gitea.io/gitea/modules/setting"
  9. "code.gitea.io/gitea/modules/timeutil"
  10. "xorm.io/builder"
  11. )
// CloudbrainStatus is the string-typed status of a cloudbrain job.
type CloudbrainStatus string

// JobType is the string-typed kind of a cloudbrain job.
type JobType string

// ModelArtsJobStatus is the string-typed status of a ModelArts job.
type ModelArtsJobStatus string

const (
	// Cloudbrain job statuses.
	JobWaiting   CloudbrainStatus = "WAITING"
	JobStopped   CloudbrainStatus = "STOPPED"
	JobSucceeded CloudbrainStatus = "SUCCEEDED"
	JobFailed    CloudbrainStatus = "FAILED"
	JobRunning   CloudbrainStatus = "RUNNING"

	// Job types. JobTypeDebug matches the column default declared on
	// Cloudbrain.JobType.
	JobTypeDebug        JobType = "DEBUG"
	JobTypeBenchmark    JobType = "BENCHMARK"
	JobTypeSnn4imagenet JobType = "SNN4IMAGENET"

	// ModelArts job statuses.
	ModelArtsCreateQueue  ModelArtsJobStatus = "CREATE_QUEUING" // queued for creation on free resources
	ModelArtsCreating     ModelArtsJobStatus = "CREATING"       // being created
	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED"  // creation failed
	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING"  // queued for start on free resources
	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resources, waiting to start
	ModelArtsStarting     ModelArtsJobStatus = "STARTING"       // starting
	ModelArtsRestarting   ModelArtsJobStatus = "RESTARTING"     // restarting
	ModelArtsStartFailed  ModelArtsJobStatus = "START_FAILED"   // start failed
	ModelArtsRunning      ModelArtsJobStatus = "RUNNING"        // running
	ModelArtsStopping     ModelArtsJobStatus = "STOPPING"       // stopping
	ModelArtsStopped      ModelArtsJobStatus = "STOPPED"        // stopped
	ModelArtsUnavailable  ModelArtsJobStatus = "UNAVAILABLE"    // faulty
	ModelArtsDeleted      ModelArtsJobStatus = "DELETED"        // deleted
	ModelArtsResizing     ModelArtsJobStatus = "RESIZING"       // flavor/spec change in progress
	ModelArtsResizFailed  ModelArtsJobStatus = "RESIZE_FAILED"  // flavor/spec change failed
)
// Cloudbrain is the xorm-mapped record of a cloudbrain job.
//
// Fields tagged `xorm:"-"` (CanDebug, User, Repo) are not mapped to columns
// and have to be filled in by the caller.
type Cloudbrain struct {
	ID          int64  `xorm:"pk autoincr"`
	JobID       string `xorm:"INDEX NOT NULL"`
	JobType     string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
	JobName     string `xorm:"INDEX"`
	Status      string `xorm:"INDEX"`
	UserID      int64  `xorm:"INDEX"`
	RepoID      int64  `xorm:"INDEX"`
	SubTaskName string `xorm:"INDEX"`
	ContainerID string
	ContainerIp string
	// CreatedUnix and UpdatedUnix carry xorm's `created` / `updated` tags.
	CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
	UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
	// DeletedAt carries xorm's `deleted` tag.
	DeletedAt time.Time `xorm:"deleted"`
	CanDebug  bool      `xorm:"-"`
	// Type is the value Cloudbrains filters on via CloudbrainsOptions.Type.
	Type        int   `xorm:"INDEX DEFAULT 0"`
	VersionID   int64 `xorm:"INDEX DEFAULT 0"`
	VersionName string

	User *User       `xorm:"-"`
	Repo *Repository `xorm:"-"`
}
// CloudBrainLoginResult holds the Code, Msg and Payload of a login response.
// The fields carry no JSON tags, so when decoded with encoding/json the keys
// are matched against the field names.
type CloudBrainLoginResult struct {
	Code    string
	Msg     string
	Payload map[string]interface{}
}
// TaskRole describes one task role of a job as serialized in
// CreateJobParams.TaskRoles.
type TaskRole struct {
	Name                  string `json:"name"`
	TaskNumber            int8   `json:"taskNumber"`
	MinSucceededTaskCount int8   `json:"minSucceededTaskCount"`
	MinFailedTaskCount    int8   `json:"minFailedTaskCount"`
	CPUNumber             int8   `json:"cpuNumber"`
	GPUNumber             int8   `json:"gpuNumber"`
	MemoryMB              int    `json:"memoryMB"`
	ShmMB                 int    `json:"shmMB"`
	Command               string `json:"command"`
	NeedIBDevice          bool   `json:"needIBDevice"`
	IsMainRole            bool   `json:"isMainRole"`
	UseNNI                bool   `json:"useNNI"`
}
// StHostPath is the host-path entry of a Volume: a path, the path it is
// mounted at, and a read-only flag.
type StHostPath struct {
	Path      string `json:"path"`
	MountPath string `json:"mountPath"`
	ReadOnly  bool   `json:"readOnly"`
}

// Volume wraps a single StHostPath under the "hostPath" JSON key.
type Volume struct {
	HostPath StHostPath `json:"hostPath"`
}
// CreateJobParams is the JSON body describing a job to create: its name,
// retry count, GPU type, image, task roles and volumes.
type CreateJobParams struct {
	JobName    string     `json:"jobName"`
	RetryCount int8       `json:"retryCount"`
	GpuType    string     `json:"gpuType"`
	Image      string     `json:"image"`
	TaskRoles  []TaskRole `json:"taskRoles"`
	Volumes    []Volume   `json:"volumes"`
}
// CreateJobResult is the code/msg/payload JSON envelope of a create-job
// response. Payload is left as an untyped map.
type CreateJobResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// GetJobResult is the code/msg/payload JSON envelope of a get-job response.
// Payload is left as an untyped map; ConvertToJobResultPayload and
// ConvertToTaskPod accept maps of this shape.
type GetJobResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}
// GetImagesResult is the code/msg/payload JSON envelope of an image-list
// response, with the payload typed as GetImagesPayload.
type GetImagesResult struct {
	Code    string           `json:"code"`
	Msg     string           `json:"msg"`
	Payload GetImagesPayload `json:"payload"`
}

// GetImagesPayload carries a count and the image entries, which are read
// from the "rows" JSON key.
type GetImagesPayload struct {
	Count     int          `json:"count"`
	ImageInfo []*ImageInfo `json:"rows"`
}
// CloudbrainsOptions are the filter and paging options accepted by
// Cloudbrains.
//
// Cloudbrains applies RepoID, UserID and JobID only when they are > 0,
// Type when it is >= 0, and CloudbrainIDs when the slice is non-empty.
// SortType is not read by Cloudbrains.
type CloudbrainsOptions struct {
	ListOptions
	RepoID        int64 // include all repos if empty
	UserID        int64
	JobID         int64
	SortType      string
	CloudbrainIDs []int64
	// JobStatus CloudbrainStatus
	Type int
}
  125. type TaskPod struct {
  126. TaskRoleStatus struct {
  127. Name string `json:"name"`
  128. } `json:"taskRoleStatus"`
  129. TaskStatuses []struct {
  130. TaskIndex int `json:"taskIndex"`
  131. PodUID string `json:"podUid"`
  132. PodIP string `json:"podIp"`
  133. PodName string `json:"podName"`
  134. ContainerID string `json:"containerId"`
  135. ContainerIP string `json:"containerIp"`
  136. ContainerGpus string `json:"containerGpus"`
  137. State string `json:"state"`
  138. StartAt time.Time `json:"startAt"`
  139. FinishedAt time.Time `json:"finishedAt"`
  140. ExitCode int `json:"exitCode"`
  141. ExitDiagnostics string `json:"exitDiagnostics"`
  142. RetriedCount int `json:"retriedCount"`
  143. StartTime string
  144. FinishedTime string
  145. } `json:"taskStatuses"`
  146. }
  147. type TaskInfo struct {
  148. Username string `json:"username"`
  149. TaskName string `json:"task_name"`
  150. CodeName string `json:"code_name"`
  151. BenchmarkCategory []string `json:"selected_category"`
  152. CodeLink string `json:"code_link"`
  153. }
  154. func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  155. data, _ := json.Marshal(input)
  156. var taskPod TaskPod
  157. err := json.Unmarshal(data, &taskPod)
  158. taskPod.TaskStatuses[0].StartTime = time.Unix(taskPod.TaskStatuses[0].StartAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  159. taskPod.TaskStatuses[0].FinishedTime = time.Unix(taskPod.TaskStatuses[0].FinishedAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  160. return taskPod, err
  161. }
  162. type JobResultPayload struct {
  163. ID string `json:"id"`
  164. Name string `json:"name"`
  165. Platform string `json:"platform"`
  166. JobStatus struct {
  167. Username string `json:"username"`
  168. State string `json:"state"`
  169. SubState string `json:"subState"`
  170. ExecutionType string `json:"executionType"`
  171. Retries int `json:"retries"`
  172. CreatedTime int64 `json:"createdTime"`
  173. CompletedTime int64 `json:"completedTime"`
  174. AppID string `json:"appId"`
  175. AppProgress string `json:"appProgress"`
  176. AppTrackingURL string `json:"appTrackingUrl"`
  177. AppLaunchedTime int64 `json:"appLaunchedTime"`
  178. AppCompletedTime interface{} `json:"appCompletedTime"`
  179. AppExitCode int `json:"appExitCode"`
  180. AppExitDiagnostics string `json:"appExitDiagnostics"`
  181. AppExitType interface{} `json:"appExitType"`
  182. VirtualCluster string `json:"virtualCluster"`
  183. StartTime string
  184. EndTime string
  185. } `json:"jobStatus"`
  186. TaskRoles map[string]interface{} `json:"taskRoles"`
  187. Resource struct {
  188. CPU int `json:"cpu"`
  189. Memory string `json:"memory"`
  190. NvidiaComGpu int `json:"nvidia.com/gpu"`
  191. } `json:"resource"`
  192. Config struct {
  193. Image string `json:"image"`
  194. JobID string `json:"jobId"`
  195. GpuType string `json:"gpuType"`
  196. JobName string `json:"jobName"`
  197. JobType string `json:"jobType"`
  198. TaskRoles []struct {
  199. Name string `json:"name"`
  200. ShmMB int `json:"shmMB"`
  201. Command string `json:"command"`
  202. MemoryMB int `json:"memoryMB"`
  203. CPUNumber int `json:"cpuNumber"`
  204. GpuNumber int `json:"gpuNumber"`
  205. IsMainRole bool `json:"isMainRole"`
  206. TaskNumber int `json:"taskNumber"`
  207. NeedIBDevice bool `json:"needIBDevice"`
  208. MinFailedTaskCount int `json:"minFailedTaskCount"`
  209. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  210. } `json:"taskRoles"`
  211. RetryCount int `json:"retryCount"`
  212. } `json:"config"`
  213. Userinfo struct {
  214. User string `json:"user"`
  215. OrgID string `json:"org_id"`
  216. } `json:"userinfo"`
  217. }
  218. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  219. data, _ := json.Marshal(input)
  220. var jobResultPayload JobResultPayload
  221. err := json.Unmarshal(data, &jobResultPayload)
  222. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  223. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  224. return jobResultPayload, err
  225. }
// ImagesResultPayload holds a list of image descriptions.
//
// NOTE(review): Images is read from the JSON key "taskStatuses", the same
// key TaskPod uses for its task list. This looks like a copy-paste leftover;
// confirm against the actual response before relying on it.
type ImagesResultPayload struct {
	Images []struct {
		ID          int    `json:"id"`
		Name        string `json:"name"`
		Place       string `json:"place"`
		Description string `json:"description"`
		Provider    string `json:"provider"`
		Createtime  string `json:"createtime"`
		Remark      string `json:"remark"`
	} `json:"taskStatuses"`
}
// ImageInfo describes one image entry of GetImagesPayload.
type ImageInfo struct {
	ID          int    `json:"id"`
	Name        string `json:"name"`
	Place       string `json:"place"`
	Description string `json:"description"`
	Provider    string `json:"provider"`
	Createtime  string `json:"createtime"`
	Remark      string `json:"remark"`
	IsPublic    int    `json:"isPublic"`
	// PlaceView has no JSON tag; presumably a display form of Place filled
	// in by callers. TODO confirm.
	PlaceView string
}
// Categories is a list of Category entries under the "category" JSON key.
type Categories struct {
	Category []*Category `json:"category"`
}

// Category is an id/value pair.
type Category struct {
	Id    int    `json:"id"`
	Value string `json:"value"`
}
// CommitImageParams is the JSON body of a commit-image request: the IP and
// container ID of the task plus the tag and description of the new image.
type CommitImageParams struct {
	Ip               string `json:"ip"`
	TaskContainerId  string `json:"taskContainerId"`
	ImageTag         string `json:"imageTag"`
	ImageDescription string `json:"imageDescription"`
}

// CommitImageResult is the code/msg/payload JSON envelope of a commit-image
// response. Payload is left as an untyped map.
type CommitImageResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// StopJobResult is the code/msg JSON envelope of a stop-job response.
type StopJobResult struct {
	Code string `json:"code"`
	Msg  string `json:"msg"`
}
// CreateNotebookParams is the JSON body describing a notebook to create.
// JobName is serialized under the "name" key.
type CreateNotebookParams struct {
	JobName     string    `json:"name"`
	Description string    `json:"description"`
	ProfileID   string    `json:"profile_id"`
	Flavor      string    `json:"flavor"`
	Spec        Spec      `json:"spec"`
	Workspace   Workspace `json:"workspace"`
	Pool        Pool      `json:"pool"`
}
// Pool identifies a resource pool by id, name and type
// (see CreateNotebookParams.Pool).
type Pool struct {
	ID   string `json:"id"`
	Name string `json:"name"`
	Type string `json:"type"`
}

// Workspace identifies a workspace by id (see CreateNotebookParams.Workspace).
type Workspace struct {
	ID string `json:"id"`
}

// Spec groups the storage and auto-stop settings of a notebook
// (see CreateNotebookParams.Spec).
type Spec struct {
	Storage  Storage  `json:"storage"`
	AutoStop AutoStop `json:"auto_stop"`
}

// AutoStop is the auto-stop setting of a Spec: an enable flag and a duration.
// The unit of Duration is not stated in this file.
type AutoStop struct {
	Enable   bool `json:"enable"`
	Duration int  `json:"duration"`
}

// Storage is the storage setting of a Spec: a type and a location.
type Storage struct {
	Type     string   `json:"type"`
	Location Location `json:"location"`
}

// Location is the path of a Storage.
type Location struct {
	Path string `json:"path"`
}
// NotebookResult carries the error_code / error_msg pair of a notebook
// response.
type NotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
}
// CreateNotebookResult is the JSON response of a create-notebook call: the
// error_code / error_msg pair plus the description of the notebook.
type CreateNotebookResult struct {
	ErrorCode             string `json:"error_code"`
	ErrorMsg              string `json:"error_msg"`
	ID                    string `json:"id"`
	Name                  string `json:"name"`
	Description           string `json:"description"`
	Status                string `json:"status"`
	CreationTimestamp     string `json:"creation_timestamp"`
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	Profile               struct {
		ID          string `json:"id"`
		Name        string `json:"name"`
		Description string `json:"description"`
		DeType      string `json:"de_type"`
		FlavorType  string `json:"flavor_type"`
	} `json:"profile"`
	Flavor        string `json:"flavor"`
	FlavorDetails struct {
		Name          string `json:"name"`
		Status        string `json:"status"`
		QueuingNum    int    `json:"queuing_num"`
		QueueLeftTime int    `json:"queue_left_time"` // in seconds
		Duration      int    `json:"duration"`        // auto-stop time, in seconds
	} `json:"flavor_details"`
}
// GetNotebookResult is the JSON response of a get-notebook call: the
// error_code / error_msg pair plus the notebook description, flavor details,
// queuing information and spec annotations.
//
// CreateTime, LatestUpdateTime, BeginTime and EndTime have no JSON tags;
// presumably they are display strings that callers derive from the matching
// *Timestamp fields. TODO confirm against callers.
type GetNotebookResult struct {
	ErrorCode             string `json:"error_code"`
	ErrorMsg              string `json:"error_msg"`
	ID                    string `json:"id"`
	Name                  string `json:"name"`
	Description           string `json:"description"`
	Status                string `json:"status"`
	CreationTimestamp     string `json:"creation_timestamp"`
	CreateTime            string
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	LatestUpdateTime      string
	Profile               struct {
		ID          string `json:"id"`
		Name        string `json:"name"`
		Description string `json:"description"`
		DeType      string `json:"de_type"`
		FlavorType  string `json:"flavor_type"`
	} `json:"profile"`
	Flavor        string `json:"flavor"`
	FlavorDetails struct {
		Name          string `json:"name"`
		Status        string `json:"status"`
		QueuingNum    int    `json:"queuing_num"`
		QueueLeftTime int    `json:"queue_left_time"` // in seconds
		Duration      int    `json:"duration"`        // auto-stop time, in seconds
	} `json:"flavor_details"`
	QueuingInfo struct {
		ID             string `json:"id"`
		Name           string `json:"name"`
		Flavor         string `json:"flavor"`
		DeType         string `json:"de_type"`
		Status         string `json:"status"`
		BeginTimestamp int    `json:"begin_timestamp"` // time the instance entered the queue
		BeginTime      string
		RemainTime     int `json:"remain_time"`   // remaining time of the instance
		EndTimestamp   int `json:"end_timestamp"` //
		EndTime        string
		Rank           int `json:"rank"` // rank of the instance in the queue
	} `json:"queuing_info"`
	Spec struct {
		Annotations struct {
			TargetDomain string `json:"target_domain"`
			Url          string `json:"url"`
		} `json:"annotations"`
	} `json:"spec"`
}
// GetTokenParams is the JSON body of a token request; everything is nested
// under the "auth" key.
type GetTokenParams struct {
	Auth Auth `json:"auth"`
}

// Auth combines the identity to authenticate and the scope of the token.
type Auth struct {
	Identity Identity `json:"identity"`
	Scope    Scope    `json:"scope"`
}

// Scope names the project the token is requested for.
type Scope struct {
	Project Project `json:"project"`
}

// Project is a project name.
type Project struct {
	Name string `json:"name"`
}

// Identity lists the authentication methods and carries the password
// credentials.
type Identity struct {
	Methods  []string `json:"methods"`
	Password Password `json:"password"`
}

// Password wraps the user whose credentials are sent.
type Password struct {
	User NotebookUser `json:"user"`
}

// NotebookUser is a user name, password and domain.
type NotebookUser struct {
	Name     string `json:"name"`
	Password string `json:"password"`
	Domain   Domain `json:"domain"`
}

// Domain is a domain name.
type Domain struct {
	Name string `json:"name"`
}
// Action values for NotebookAction.Action.
const (
	ActionStart   = "start"
	ActionStop    = "stop"
	ActionRestart = "restart"
	ActionQueue   = "queue"
	ActionDequeue = "dequeue"
)
// NotebookAction is the JSON body of a notebook action request. Action is
// one of the Action* constants.
type NotebookAction struct {
	Action string `json:"action"`
}

// NotebookActionResult is the JSON response of a notebook action: the
// error_code / error_msg pair plus the current status and previous state.
type NotebookActionResult struct {
	ErrorCode     string `json:"error_code"`
	ErrorMsg      string `json:"error_msg"`
	CurrentStatus string `json:"current_status"`
	PreviousState string `json:"previous_state"`
}

// NotebookGetJobTokenResult is the JSON response carrying a job token, or an
// error_code / error_msg pair.
type NotebookGetJobTokenResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	Token     string `json:"token"`
}

// NotebookDelResult is the JSON response of a notebook deletion; it carries
// the instance id.
type NotebookDelResult struct {
	InstanceID string `json:"instance_id"`
}
// CreateTrainJobParams is the JSON body describing a training job to create:
// name, description, configuration and workspace id.
type CreateTrainJobParams struct {
	JobName     string `json:"job_name"`
	Description string `json:"job_desc"`
	Config      Config `json:"config"`
	WorkspaceID string `json:"workspace_id"`
}
// Config is the configuration section of CreateTrainJobParams. The
// commented-out fields are kept from the original source and are not
// serialized.
type Config struct {
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// CreateConfigParams is the JSON body describing a training-job
// configuration to create. It has the same active fields as Config except
// for CreateVersion, plus a name and description. The commented-out fields
// are kept from the original source and are not serialized.
type CreateConfigParams struct {
	ConfigName    string      `json:"config_name"`
	Description   string      `json:"config_desc"`
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// Parameter is a label/value pair passed to a training job
// (see Config.Parameter).
type Parameter struct {
	Label string `json:"label"`
	Value string `json:"value"`
}

// DataSource describes a data source by dataset id, dataset version, type
// and data URL. It is only referenced from commented-out fields in this file.
type DataSource struct {
	DatasetID      string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`
	Type           string `json:"type"`
	DataUrl        string `json:"data_url"`
}
// Volumes combines an NFS mount and a host-path mount. It is only referenced
// from commented-out fields in this file.
type Volumes struct {
	Nfs      Nfs      `json:"nfs"`
	HostPath HostPath `json:"host_path"`
}

// Nfs describes an NFS mount: id, source path, destination path and a
// read-only flag.
type Nfs struct {
	ID         string `json:"id"`
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}

// HostPath describes a host-path mount: source path, destination path and a
// read-only flag.
type HostPath struct {
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}
// Flavor identifies a resource flavor by its code.
type Flavor struct {
	Code string `json:"code"`
}
// CreateTrainJobResult is the JSON response of a create-training-job call:
// the error fields and success flag plus the identifiers of the created job
// and version.
type CreateTrainJobResult struct {
	ErrorCode   string `json:"error_code"`
	ErrorMsg    string `json:"error_msg"`
	IsSuccess   bool   `json:"is_success"`
	JobName     string `json:"job_name"`
	JobID       int64  `json:"job_id"`
	Status      int    `json:"status"`
	CreateTime  int64  `json:"create_time"`
	VersionID   int64  `json:"version_id"`
	ResourceID  string `json:"resource_id"`
	VersionName string `json:"version_name"`
}

// CreateTrainJobConfigResult is the JSON response of a create-configuration
// call: the error fields and success flag only.
type CreateTrainJobConfigResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
}
// GetResourceSpecsResult is the JSON response listing resource
// specifications: the error fields and success flag, a total count and the
// specs.
type GetResourceSpecsResult struct {
	ErrorCode      string  `json:"error_code"`
	ErrorMsg       string  `json:"error_msg"`
	IsSuccess      bool    `json:"is_success"`
	SpecTotalCount int     `json:"spec_total_count"`
	Specs          []Specs `json:"specs"`
}

// Specs describes one resource specification of GetResourceSpecsResult.
// IsNoResource is read from the "no_resource" JSON key.
type Specs struct {
	Core          string `json:"core"`
	Cpu           string `json:"cpu"`
	IsNoResource  bool   `json:"no_resource"`
	GpuType       string `json:"gpu_type"`
	SpecID        int64  `json:"spec_id"`
	GpuNum        int    `json:"gpu_num"`
	SpecCode      string `json:"spec_code"`
	Storage       string `json:"storage"`
	MaxNum        int    `json:"max_num"`
	UnitNum       int    `json:"unit_num"`
	InterfaceType int    `json:"interface_type"`
}
// ErrorResult is a generic error response. Unlike the other result types in
// this file, ErrorMsg is read from "error_message" rather than "error_msg".
type ErrorResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_message"`
	IsSuccess bool   `json:"is_success"`
}
// GetTrainJobResult is the JSON response describing a training job version.
// The commented-out fields are kept from the original source and are not
// decoded.
type GetTrainJobResult struct {
	IsSuccess   bool   `json:"is_success"`
	JobName     string `json:"job_name"`
	JobID       int64  `json:"job_id"`
	Description string `json:"job_desc"`
	Status      int    `json:"status"`
	// LongCreateTime is the raw "create_time" value. CreateTime has no JSON
	// tag; presumably a display string callers derive from it. TODO confirm.
	LongCreateTime int64 `json:"create_time"`
	CreateTime     string
	Duration       int64       `json:"duration"` // running time of the training job, in milliseconds
	VersionID      int64       `json:"version_id"`
	ResourceID     string      `json:"resource_id"`
	VersionName    string      `json:"version_name"`
	PreVersionID   int64       `json:"pre_version_id"`
	WorkServerNum  int         `json:"worker_server_num"`
	AppUrl         string      `json:"app_url"`       // code directory of the training job
	BootFileUrl    string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter      []Parameter `json:"parameter"`
	DataUrl        string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor       Flavor `json:"flavor"`
	PoolID       string `json:"pool_id"`
	PoolName     string `json:"pool_name"`
	NasMountPath string `json:"nas_mount_path"`
	NasShareAddr string `json:"nas_share_addr"`
}
// GetTrainJobLogResult is the JSON response carrying a chunk of training-job
// log: the error fields and success flag, the log content, its line count and
// the start/end line markers (both strings).
type GetTrainJobLogResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
	Content   string `json:"content"`
	Lines     int    `json:"lines"`
	StartLine string `json:"start_line"`
	EndLine   string `json:"end_line"`
}

// GetTrainJobLogFileNamesResult is the JSON response listing the log file
// names of a training job.
type GetTrainJobLogFileNamesResult struct {
	ErrorCode   string   `json:"error_code"`
	ErrorMsg    string   `json:"error_msg"`
	IsSuccess   bool     `json:"is_success"`
	LogFileList []string `json:"log_file_list"`
}
  600. func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
  601. sess := x.NewSession()
  602. defer sess.Close()
  603. var cond = builder.NewCond()
  604. if opts.RepoID > 0 {
  605. cond = cond.And(
  606. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  607. )
  608. }
  609. if opts.UserID > 0 {
  610. cond = cond.And(
  611. builder.Eq{"cloudbrain.user_id": opts.UserID},
  612. )
  613. }
  614. if (opts.JobID) > 0 {
  615. cond = cond.And(
  616. builder.Eq{"cloudbrain.job_id": opts.JobID},
  617. )
  618. }
  619. if (opts.Type) >= 0 {
  620. cond = cond.And(
  621. builder.Eq{"cloudbrain.type": opts.Type},
  622. )
  623. }
  624. // switch opts.JobStatus {
  625. // case JobWaiting:
  626. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  627. // case JobFailed:
  628. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  629. // case JobStopped:
  630. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  631. // case JobSucceeded:
  632. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  633. // }
  634. if len(opts.CloudbrainIDs) > 0 {
  635. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  636. }
  637. count, err := sess.Where(cond).Count(new(Cloudbrain))
  638. if err != nil {
  639. return nil, 0, fmt.Errorf("Count: %v", err)
  640. }
  641. if opts.Page >= 0 && opts.PageSize > 0 {
  642. var start int
  643. if opts.Page == 0 {
  644. start = 0
  645. } else {
  646. start = (opts.Page - 1) * opts.PageSize
  647. }
  648. sess.Limit(opts.PageSize, start)
  649. }
  650. sess.OrderBy("cloudbrain.created_unix DESC")
  651. cloudbrains := make([]*Cloudbrain, 0, setting.UI.IssuePagingNum)
  652. if err := sess.Where(cond).Find(&cloudbrains); err != nil {
  653. return nil, 0, fmt.Errorf("Find: %v", err)
  654. }
  655. sess.Close()
  656. return cloudbrains, count, nil
  657. }
  658. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  659. if _, err = x.Insert(cloudbrain); err != nil {
  660. return err
  661. }
  662. return nil
  663. }
  664. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  665. has, err := x.Get(cb)
  666. if err != nil {
  667. return nil, err
  668. } else if !has {
  669. return nil, errors.New("cloudbrain task is not found")
  670. }
  671. return cb, nil
  672. }
  673. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  674. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  675. return getRepoCloudBrain(cb)
  676. }
  677. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  678. cb := &Cloudbrain{JobID: jobID}
  679. return getRepoCloudBrain(cb)
  680. }
  681. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  682. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  683. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  684. return
  685. }
  686. func UpdateJob(job *Cloudbrain) error {
  687. return updateJob(x, job)
  688. }
  689. func updateJob(e Engine, job *Cloudbrain) error {
  690. var sess *xorm.Session
  691. sess = e.Where("job_id = ?", job.JobID)
  692. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  693. return err
  694. }
  695. func DeleteJob(job *Cloudbrain) error {
  696. return deleteJob(x, job)
  697. }
  698. func deleteJob(e Engine, job *Cloudbrain) error {
  699. _, err := e.ID(job.ID).Delete(job)
  700. return err
  701. }