You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 35 kB

3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093
  1. package models
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "strings"
  6. "time"
  7. "xorm.io/builder"
  8. "xorm.io/xorm"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. )
  13. type CloudbrainStatus string
  14. type JobType string
  15. type ModelArtsJobStatus string
  16. const (
  17. JobWaiting CloudbrainStatus = "WAITING"
  18. JobStopped CloudbrainStatus = "STOPPED"
  19. JobSucceeded CloudbrainStatus = "SUCCEEDED"
  20. JobFailed CloudbrainStatus = "FAILED"
  21. JobRunning CloudbrainStatus = "RUNNING"
  22. JobTypeDebug JobType = "DEBUG"
  23. JobTypeBenchmark JobType = "BENCHMARK"
  24. JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
  25. JobTypeBrainScore JobType = "BRAINSCORE"
  26. JobTypeTrain JobType = "TRAIN"
  27. ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中
  28. ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中
  29. ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" //创建失败
  30. ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING" //免费资源启动排队中
  31. ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" //免费资源等待启动
  32. ModelArtsStarting ModelArtsJobStatus = "STARTING" //启动中
  33. ModelArtsRestarting ModelArtsJobStatus = "RESTARTING" //重启中
  34. ModelArtsStartFailed ModelArtsJobStatus = "START_FAILED" //启动失败
  35. ModelArtsRunning ModelArtsJobStatus = "RUNNING" //运行中
  36. ModelArtsStopping ModelArtsJobStatus = "STOPPING" //停止中
  37. ModelArtsStopped ModelArtsJobStatus = "STOPPED" //停止
  38. ModelArtsUnavailable ModelArtsJobStatus = "UNAVAILABLE" //故障
  39. ModelArtsDeleted ModelArtsJobStatus = "DELETED" //已删除
  40. ModelArtsResizing ModelArtsJobStatus = "RESIZING" //规格变更中
  41. ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" //规格变更失败
  42. )
  43. type Cloudbrain struct {
  44. ID int64 `xorm:"pk autoincr"`
  45. JobID string `xorm:"INDEX NOT NULL"`
  46. JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
  47. JobName string
  48. Status string
  49. UserID int64
  50. RepoID int64
  51. SubTaskName string
  52. ContainerID string
  53. ContainerIp string
  54. CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
  55. UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
  56. Duration int64
  57. TrainJobDuration string
  58. DeletedAt time.Time `xorm:"deleted"`
  59. CanDebug bool `xorm:"-"`
  60. CanDel bool `xorm:"-"`
  61. Type int
  62. VersionID int64 //版本id
  63. VersionName string `xorm:"INDEX"` //当前版本
  64. Uuid string //数据集id
  65. DatasetName string
  66. VersionCount int //任务的当前版本数量,不包括删除的
  67. IsLatestVersion string //是否是最新版本,1是,0否
  68. CommitID string //提交的仓库代码id
  69. PreVersionName string //父版本名称
  70. ComputeResource string //计算资源,例如npu
  71. EngineID int64 //引擎id
  72. TrainUrl string //输出的obs路径
  73. BranchName string //分支名称
  74. Parameters string //传给modelarts的param参数
  75. BootFile string //启动文件
  76. DataUrl string //数据集的obs路径
  77. LogUrl string //日志输出的obs路径
  78. PreVersionId int64 //父版本的版本id
  79. FlavorCode string //modelarts上的规格id
  80. Description string `xorm:"varchar(256)"` //描述
  81. WorkServerNumber int //节点数
  82. FlavorName string //规格名称
  83. EngineName string //引擎名称
  84. TotalVersionCount int //任务的所有版本数量,包括删除的
  85. User *User `xorm:"-"`
  86. Repo *Repository `xorm:"-"`
  87. }
  88. type CloudbrainInfo struct {
  89. Cloudbrain `xorm:"extends"`
  90. User `xorm:"extends"`
  91. }
  92. type CloudBrainLoginResult struct {
  93. Code string
  94. Msg string
  95. Payload map[string]interface{}
  96. }
  97. type TaskRole struct {
  98. Name string `json:"name"`
  99. TaskNumber int `json:"taskNumber"`
  100. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  101. MinFailedTaskCount int `json:"minFailedTaskCount"`
  102. CPUNumber int `json:"cpuNumber"`
  103. GPUNumber int `json:"gpuNumber"`
  104. MemoryMB int `json:"memoryMB"`
  105. ShmMB int `json:"shmMB"`
  106. Command string `json:"command"`
  107. NeedIBDevice bool `json:"needIBDevice"`
  108. IsMainRole bool `json:"isMainRole"`
  109. UseNNI bool `json:"useNNI"`
  110. }
  111. type StHostPath struct {
  112. Path string `json:"path"`
  113. MountPath string `json:"mountPath"`
  114. ReadOnly bool `json:"readOnly"`
  115. }
  116. type Volume struct {
  117. HostPath StHostPath `json:"hostPath"`
  118. }
  119. type CreateJobParams struct {
  120. JobName string `json:"jobName"`
  121. RetryCount int8 `json:"retryCount"`
  122. GpuType string `json:"gpuType"`
  123. Image string `json:"image"`
  124. TaskRoles []TaskRole `json:"taskRoles"`
  125. Volumes []Volume `json:"volumes"`
  126. }
  127. type CreateJobResult struct {
  128. Code string `json:"code"`
  129. Msg string `json:"msg"`
  130. Payload map[string]interface{} `json:"payload"`
  131. }
  132. type GetJobResult struct {
  133. Code string `json:"code"`
  134. Msg string `json:"msg"`
  135. Payload map[string]interface{} `json:"payload"`
  136. }
  137. type GetImagesResult struct {
  138. Code string `json:"code"`
  139. Msg string `json:"msg"`
  140. Payload GetImagesPayload `json:"payload"`
  141. }
  142. type GetImagesPayload struct {
  143. Count int `json:"count"`
  144. TotalPages int `json:"totalPages,omitempty"`
  145. ImageInfo []*ImageInfo `json:"rows"`
  146. }
  147. type CloudbrainsOptions struct {
  148. ListOptions
  149. RepoID int64 // include all repos if empty
  150. UserID int64
  151. JobID string
  152. SortType string
  153. CloudbrainIDs []int64
  154. // JobStatus CloudbrainStatus
  155. Type int
  156. JobType string
  157. VersionName string
  158. IsLatestVersion string
  159. }
  160. type TaskPod struct {
  161. TaskRoleStatus struct {
  162. Name string `json:"name"`
  163. } `json:"taskRoleStatus"`
  164. //TaskStatuses []struct {
  165. // TaskIndex int `json:"taskIndex"`
  166. // PodUID string `json:"podUid"`
  167. // PodIP string `json:"podIp"`
  168. // PodName string `json:"podName"`
  169. // ContainerID string `json:"containerId"`
  170. // ContainerIP string `json:"containerIp"`
  171. // ContainerGpus string `json:"containerGpus"`
  172. // State string `json:"state"`
  173. // StartAt time.Time `json:"startAt"`
  174. // FinishedAt time.Time `json:"finishedAt"`
  175. // ExitCode int `json:"exitCode"`
  176. // ExitDiagnostics string `json:"exitDiagnostics"`
  177. // RetriedCount int `json:"retriedCount"`
  178. // StartTime string
  179. // FinishedTime string
  180. //} `json:"taskStatuses"`
  181. TaskStatuses []TaskStatuses `json:"taskStatuses"`
  182. }
  183. type TaskStatuses struct {
  184. TaskIndex int `json:"taskIndex"`
  185. PodUID string `json:"podUid"`
  186. PodIP string `json:"podIp"`
  187. PodName string `json:"podName"`
  188. ContainerID string `json:"containerId"`
  189. ContainerIP string `json:"containerIp"`
  190. ContainerGpus string `json:"containerGpus"`
  191. State string `json:"state"`
  192. StartAt time.Time `json:"startAt"`
  193. FinishedAt time.Time `json:"finishedAt"`
  194. ExitCode int `json:"exitCode"`
  195. ExitDiagnostics string `json:"exitDiagnostics"`
  196. RetriedCount int `json:"retriedCount"`
  197. StartTime string
  198. FinishedTime string
  199. }
  200. type TaskInfo struct {
  201. Username string `json:"username"`
  202. TaskName string `json:"task_name"`
  203. CodeName string `json:"code_name"`
  204. BenchmarkCategory []string `json:"selected_category"`
  205. CodeLink string `json:"code_link"`
  206. GpuType string `json:"gpu_type"`
  207. }
  208. func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  209. data, _ := json.Marshal(input)
  210. var taskPod TaskPod
  211. err := json.Unmarshal(data, &taskPod)
  212. taskPod.TaskStatuses[0].StartTime = time.Unix(taskPod.TaskStatuses[0].StartAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  213. taskPod.TaskStatuses[0].FinishedTime = time.Unix(taskPod.TaskStatuses[0].FinishedAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  214. //if the task is not finished or stopped,the cloudbrain renturns 0001-01-01 08:00:00, the finishedTime shows with -
  215. if strings.HasPrefix(taskPod.TaskStatuses[0].FinishedTime, "0001") {
  216. taskPod.TaskStatuses[0].FinishedTime = "-"
  217. }
  218. return taskPod, err
  219. }
  220. type JobResultPayload struct {
  221. ID string `json:"id"`
  222. Name string `json:"name"`
  223. Platform string `json:"platform"`
  224. JobStatus struct {
  225. Username string `json:"username"`
  226. State string `json:"state"`
  227. SubState string `json:"subState"`
  228. ExecutionType string `json:"executionType"`
  229. Retries int `json:"retries"`
  230. CreatedTime int64 `json:"createdTime"`
  231. CompletedTime int64 `json:"completedTime"`
  232. AppID string `json:"appId"`
  233. AppProgress string `json:"appProgress"`
  234. AppTrackingURL string `json:"appTrackingUrl"`
  235. AppLaunchedTime int64 `json:"appLaunchedTime"`
  236. AppCompletedTime interface{} `json:"appCompletedTime"`
  237. AppExitCode int `json:"appExitCode"`
  238. AppExitDiagnostics string `json:"appExitDiagnostics"`
  239. AppExitType interface{} `json:"appExitType"`
  240. VirtualCluster string `json:"virtualCluster"`
  241. StartTime string
  242. EndTime string
  243. } `json:"jobStatus"`
  244. TaskRoles map[string]interface{} `json:"taskRoles"`
  245. Resource struct {
  246. CPU int `json:"cpu"`
  247. Memory string `json:"memory"`
  248. NvidiaComGpu int `json:"nvidia.com/gpu"`
  249. } `json:"resource"`
  250. Config struct {
  251. Image string `json:"image"`
  252. JobID string `json:"jobId"`
  253. GpuType string `json:"gpuType"`
  254. JobName string `json:"jobName"`
  255. JobType string `json:"jobType"`
  256. TaskRoles []struct {
  257. Name string `json:"name"`
  258. ShmMB int `json:"shmMB"`
  259. Command string `json:"command"`
  260. MemoryMB int `json:"memoryMB"`
  261. CPUNumber int `json:"cpuNumber"`
  262. GpuNumber int `json:"gpuNumber"`
  263. IsMainRole bool `json:"isMainRole"`
  264. TaskNumber int `json:"taskNumber"`
  265. NeedIBDevice bool `json:"needIBDevice"`
  266. MinFailedTaskCount int `json:"minFailedTaskCount"`
  267. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  268. } `json:"taskRoles"`
  269. RetryCount int `json:"retryCount"`
  270. } `json:"config"`
  271. Userinfo struct {
  272. User string `json:"user"`
  273. OrgID string `json:"org_id"`
  274. } `json:"userinfo"`
  275. }
  276. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  277. data, _ := json.Marshal(input)
  278. var jobResultPayload JobResultPayload
  279. err := json.Unmarshal(data, &jobResultPayload)
  280. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  281. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  282. if jobResultPayload.JobStatus.State == string(JobWaiting) {
  283. jobResultPayload.JobStatus.StartTime = "-"
  284. jobResultPayload.JobStatus.EndTime = "-"
  285. }
  286. return jobResultPayload, err
  287. }
  288. type ImagesResultPayload struct {
  289. Images []struct {
  290. ID int `json:"id"`
  291. Name string `json:"name"`
  292. Place string `json:"place"`
  293. Description string `json:"description"`
  294. Provider string `json:"provider"`
  295. Createtime string `json:"createtime"`
  296. Remark string `json:"remark"`
  297. } `json:"taskStatuses"`
  298. }
  299. type ImageInfo struct {
  300. ID int `json:"id"`
  301. Name string `json:"name"`
  302. Place string `json:"place"`
  303. Description string `json:"description"`
  304. Provider string `json:"provider"`
  305. Createtime string `json:"createtime"`
  306. Remark string `json:"remark"`
  307. IsPublic int `json:"isPublic"`
  308. PlaceView string
  309. }
  310. type Categories struct {
  311. Category []*Category `json:"category"`
  312. }
  313. type Category struct {
  314. Id int `json:"id"`
  315. Value string `json:"value"`
  316. }
  317. type GpuInfos struct {
  318. GpuInfo []*GpuInfo `json:"gpu_type"`
  319. }
  320. type GpuInfo struct {
  321. Id int `json:"id"`
  322. Value string `json:"value"`
  323. Queue string `json:"queue"`
  324. }
  325. type ResourceSpecs struct {
  326. ResourceSpec []*ResourceSpec `json:"resorce_specs"`
  327. }
  328. type ResourceSpec struct {
  329. Id int `json:"id"`
  330. CpuNum int `json:"cpu"`
  331. GpuNum int `json:"gpu"`
  332. MemMiB int `json:"memMiB"`
  333. ShareMemMiB int `json:"shareMemMiB"`
  334. }
  335. type FlavorInfos struct {
  336. FlavorInfo []*FlavorInfo `json:"flavor_info"`
  337. }
  338. type FlavorInfo struct {
  339. Id int `json:"id"`
  340. Value string `json:"value"`
  341. Desc string `json:"desc"`
  342. }
  343. type PoolInfos struct {
  344. PoolInfo []*PoolInfo `json:"pool_info"`
  345. }
  346. type PoolInfo struct {
  347. PoolId string `json:"pool_id"`
  348. PoolName string `json:"pool_name"`
  349. PoolType string `json:"pool_type"`
  350. }
  351. type CommitImageParams struct {
  352. Ip string `json:"ip"`
  353. TaskContainerId string `json:"taskContainerId"`
  354. ImageTag string `json:"imageTag"`
  355. ImageDescription string `json:"imageDescription"`
  356. }
  357. type CommitImageResult struct {
  358. Code string `json:"code"`
  359. Msg string `json:"msg"`
  360. Payload map[string]interface{} `json:"payload"`
  361. }
  362. type CloudBrainResult struct {
  363. Code string `json:"code"`
  364. Msg string `json:"msg"`
  365. }
  366. type CreateNotebookParams struct {
  367. JobName string `json:"name"`
  368. Description string `json:"description"`
  369. ProfileID string `json:"profile_id"`
  370. Flavor string `json:"flavor"`
  371. Spec Spec `json:"spec"`
  372. Workspace Workspace `json:"workspace"`
  373. Pool Pool `json:"pool"`
  374. }
  375. type Pool struct {
  376. ID string `json:"id"`
  377. Name string `json:"name"`
  378. Type string `json:"type"`
  379. }
  380. type Workspace struct {
  381. ID string `json:"id"`
  382. }
  383. type Spec struct {
  384. Storage Storage `json:"storage"`
  385. AutoStop AutoStop `json:"auto_stop"`
  386. }
  387. type AutoStop struct {
  388. Enable bool `json:"enable"`
  389. Duration int `json:"duration"`
  390. }
  391. type Storage struct {
  392. Type string `json:"type"`
  393. Location Location `json:"location"`
  394. }
  395. type Location struct {
  396. Path string `json:"path"`
  397. }
  398. type NotebookResult struct {
  399. ErrorCode string `json:"error_code"`
  400. ErrorMsg string `json:"error_msg"`
  401. }
  402. type CreateNotebookResult struct {
  403. ErrorCode string `json:"error_code"`
  404. ErrorMsg string `json:"error_msg"`
  405. ID string `json:"id"`
  406. Name string `json:"name"`
  407. Description string `json:"description"`
  408. Status string `json:"status"`
  409. CreationTimestamp string `json:"creation_timestamp"`
  410. LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  411. Profile struct {
  412. ID string `json:"id"`
  413. Name string `json:"name"`
  414. Description string `json:"description"`
  415. DeType string `json:"de_type"`
  416. FlavorType string `json:"flavor_type"`
  417. } `json:"profile"`
  418. Flavor string `json:"flavor"`
  419. FlavorDetails struct {
  420. Name string `json:"name"`
  421. Status string `json:"status"`
  422. QueuingNum int `json:"queuing_num"`
  423. QueueLeftTime int `json:"queue_left_time"` //s
  424. Duration int `json:"duration"` //auto_stop_time s
  425. } `json:"flavor_details"`
  426. }
  427. type GetNotebookResult struct {
  428. ErrorCode string `json:"error_code"`
  429. ErrorMsg string `json:"error_msg"`
  430. ID string `json:"id"`
  431. Name string `json:"name"`
  432. Description string `json:"description"`
  433. Status string `json:"status"`
  434. CreationTimestamp string `json:"creation_timestamp"`
  435. CreateTime string
  436. LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  437. LatestUpdateTime string
  438. Profile struct {
  439. ID string `json:"id"`
  440. Name string `json:"name"`
  441. Description string `json:"description"`
  442. DeType string `json:"de_type"`
  443. FlavorType string `json:"flavor_type"`
  444. } `json:"profile"`
  445. Flavor string `json:"flavor"`
  446. FlavorDetails struct {
  447. Name string `json:"name"`
  448. Status string `json:"status"`
  449. QueuingNum int `json:"queuing_num"`
  450. QueueLeftTime int `json:"queue_left_time"` //s
  451. Duration int `json:"duration"` //auto_stop_time s
  452. } `json:"flavor_details"`
  453. QueuingInfo struct {
  454. ID string `json:"id"`
  455. Name string `json:"name"`
  456. Flavor string `json:"flavor"`
  457. DeType string `json:"de_type"`
  458. Status string `json:"status"`
  459. BeginTimestamp int `json:"begin_timestamp"` //time of instance begin in queue
  460. BeginTime string
  461. RemainTime int `json:"remain_time"` //remain time of instance
  462. EndTimestamp int `json:"end_timestamp"` //
  463. EndTime string
  464. Rank int `json:"rank"` //rank of instance in queue
  465. } `json:"queuing_info"`
  466. Spec struct {
  467. Annotations struct {
  468. TargetDomain string `json:"target_domain"`
  469. Url string `json:"url"`
  470. } `json:"annotations"`
  471. } `json:"spec"`
  472. }
  473. type GetTokenParams struct {
  474. Auth Auth `json:"auth"`
  475. }
  476. type Auth struct {
  477. Identity Identity `json:"identity"`
  478. Scope Scope `json:"scope"`
  479. }
  480. type Scope struct {
  481. Project Project `json:"project"`
  482. }
  483. type Project struct {
  484. Name string `json:"name"`
  485. }
  486. type Identity struct {
  487. Methods []string `json:"methods"`
  488. Password Password `json:"password"`
  489. }
  490. type Password struct {
  491. User NotebookUser `json:"user"`
  492. }
  493. type NotebookUser struct {
  494. Name string `json:"name"`
  495. Password string `json:"password"`
  496. Domain Domain `json:"domain"`
  497. }
  498. type Domain struct {
  499. Name string `json:"name"`
  500. }
  501. const (
  502. ActionStart = "start"
  503. ActionStop = "stop"
  504. ActionRestart = "restart"
  505. ActionQueue = "queue"
  506. ActionDequeue = "dequeue"
  507. )
  508. type NotebookAction struct {
  509. Action string `json:"action"`
  510. }
  511. type NotebookActionResult struct {
  512. ErrorCode string `json:"error_code"`
  513. ErrorMsg string `json:"error_msg"`
  514. CurrentStatus string `json:"current_status"`
  515. PreviousState string `json:"previous_state"`
  516. }
  517. type NotebookGetJobTokenResult struct {
  518. ErrorCode string `json:"error_code"`
  519. ErrorMsg string `json:"error_msg"`
  520. Token string `json:"token"`
  521. }
  522. type NotebookDelResult struct {
  523. InstanceID string `json:"instance_id"`
  524. }
  525. type CreateTrainJobParams struct {
  526. JobName string `json:"job_name"`
  527. Description string `json:"job_desc"`
  528. Config Config `json:"config"`
  529. WorkspaceID string `json:"workspace_id"`
  530. }
  531. type Config struct {
  532. WorkServerNum int `json:"worker_server_num"`
  533. AppUrl string `json:"app_url"` //训练作业的代码目录
  534. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  535. Parameter []Parameter `json:"parameter"`
  536. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  537. EngineID int64 `json:"engine_id"`
  538. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  539. LogUrl string `json:"log_url"`
  540. //UserImageUrl string `json:"user_image_url"`
  541. //UserCommand string `json:"user_command"`
  542. CreateVersion bool `json:"create_version"`
  543. Flavor Flavor `json:"flavor"`
  544. PoolID string `json:"pool_id"`
  545. }
  546. type CreateTrainJobVersionParams struct {
  547. Description string `json:"job_desc"`
  548. Config TrainJobVersionConfig `json:"config"`
  549. }
  550. type TrainJobVersionConfig struct {
  551. WorkServerNum int `json:"worker_server_num"`
  552. AppUrl string `json:"app_url"` //训练作业的代码目录
  553. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  554. Parameter []Parameter `json:"parameter"`
  555. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  556. EngineID int64 `json:"engine_id"`
  557. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  558. LogUrl string `json:"log_url"`
  559. Flavor Flavor `json:"flavor"`
  560. PoolID string `json:"pool_id"`
  561. PreVersionId int64 `json:"pre_version_id"`
  562. }
  563. type CreateConfigParams struct {
  564. ConfigName string `json:"config_name"`
  565. Description string `json:"config_desc"`
  566. WorkServerNum int `json:"worker_server_num"`
  567. AppUrl string `json:"app_url"` //训练作业的代码目录
  568. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  569. Parameter []Parameter `json:"parameter"`
  570. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  571. EngineID int64 `json:"engine_id"`
  572. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  573. LogUrl string `json:"log_url"`
  574. Flavor Flavor `json:"flavor"`
  575. PoolID string `json:"pool_id"`
  576. }
  577. type Parameter struct {
  578. Label string `json:"label"`
  579. Value string `json:"value"`
  580. }
  581. type Parameters struct {
  582. Parameter []Parameter `json:"parameter"`
  583. }
  584. type DataSource struct {
  585. DatasetID string `json:"dataset_id"`
  586. DatasetVersion string `json:"dataset_version"`
  587. Type string `json:"type"`
  588. DataUrl string `json:"data_url"`
  589. }
  590. type Volumes struct {
  591. Nfs Nfs `json:"nfs"`
  592. HostPath HostPath `json:"host_path"`
  593. }
  594. type Nfs struct {
  595. ID string `json:"id"`
  596. SourcePath string `json:"src_path"`
  597. DestPath string `json:"dest_path"`
  598. ReadOnly bool `json:"read_only"`
  599. }
  600. type HostPath struct {
  601. SourcePath string `json:"src_path"`
  602. DestPath string `json:"dest_path"`
  603. ReadOnly bool `json:"read_only"`
  604. }
  605. type Flavor struct {
  606. Code string `json:"code"`
  607. }
  608. type CreateTrainJobResult struct {
  609. ErrorCode string `json:"error_code"`
  610. ErrorMsg string `json:"error_msg"`
  611. IsSuccess bool `json:"is_success"`
  612. JobName string `json:"job_name"`
  613. JobID int64 `json:"job_id"`
  614. Status int `json:"status"`
  615. CreateTime int64 `json:"create_time"`
  616. VersionID int64 `json:"version_id"`
  617. ResourceID string `json:"resource_id"`
  618. VersionName string `json:"version_name"`
  619. }
  620. type CreateTrainJobConfigResult struct {
  621. ErrorCode string `json:"error_code"`
  622. ErrorMsg string `json:"error_msg"`
  623. IsSuccess bool `json:"is_success"`
  624. }
  625. type GetResourceSpecsResult struct {
  626. ErrorCode string `json:"error_code"`
  627. ErrorMsg string `json:"error_msg"`
  628. IsSuccess bool `json:"is_success"`
  629. SpecTotalCount int `json:"spec_total_count"`
  630. Specs []Specs `json:"specs"`
  631. }
  632. type Specs struct {
  633. Core string `json:"core"`
  634. Cpu string `json:"cpu"`
  635. IsNoResource bool `json:"no_resource"`
  636. GpuType string `json:"gpu_type"`
  637. SpecID int64 `json:"spec_id"`
  638. GpuNum int `json:"gpu_num"`
  639. SpecCode string `json:"spec_code"`
  640. Storage string `json:"storage"`
  641. MaxNum int `json:"max_num"`
  642. UnitNum int `json:"unit_num"`
  643. InterfaceType int `json:"interface_type"`
  644. }
  645. type GetConfigListResult struct {
  646. ErrorCode string `json:"error_code"`
  647. ErrorMsg string `json:"error_msg"`
  648. IsSuccess bool `json:"is_success"`
  649. ConfigTotalCount int `json:"config_total_count"`
  650. ParaConfigs []ParaConfig `json:"configs"`
  651. }
  652. type ParaConfig struct {
  653. ConfigName string `json:"config_name"`
  654. ConfigDesc string `json:"config_desc"`
  655. CreateTime int64 `json:"create_time"`
  656. EngineType int `json:"engine_type"`
  657. EngineName string `json:"engine_name"`
  658. EngineId int64 `json:"engine_id"`
  659. EngineVersion string `json:"engine_version"`
  660. UserImageUrl string `json:"user_image_url"`
  661. UserCommand string `json:"user_command"`
  662. Result GetConfigResult
  663. }
  664. type GetConfigResult struct {
  665. ErrorCode string `json:"error_code"`
  666. ErrorMsg string `json:"error_msg"`
  667. IsSuccess bool `json:"is_success"`
  668. ConfigName string `json:"config_name"`
  669. Description string `json:"config_desc"`
  670. WorkServerNum int `json:"worker_server_num"`
  671. AppUrl string `json:"app_url"` //训练作业的代码目录
  672. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  673. Parameter []Parameter `json:"parameter"`
  674. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  675. EngineID int64 `json:"engine_id"`
  676. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  677. LogUrl string `json:"log_url"`
  678. Flavor Flavor `json:"flavor"`
  679. PoolID string `json:"pool_id"`
  680. }
  681. type ErrorResult struct {
  682. ErrorCode string `json:"error_code"`
  683. ErrorMsg string `json:"error_message"`
  684. IsSuccess bool `json:"is_success"`
  685. }
  686. type GetTrainJobResult struct {
  687. IsSuccess bool `json:"is_success"`
  688. JobName string `json:"job_name"`
  689. JobID int64 `json:"job_id"`
  690. Description string `json:"job_desc"`
  691. IntStatus int `json:"status"`
  692. Status string
  693. LongCreateTime int64 `json:"create_time"`
  694. CreateTime string
  695. Duration int64 `json:"duration"` //训练作业的运行时间,单位为毫秒
  696. TrainJobDuration string //训练作业的运行时间,格式为hh:mm:ss
  697. VersionID int64 `json:"version_id"`
  698. ResourceID string `json:"resource_id"`
  699. VersionName string `json:"version_name"`
  700. PreVersionID int64 `json:"pre_version_id"`
  701. WorkServerNum int `json:"worker_server_num"`
  702. AppUrl string `json:"app_url"` //训练作业的代码目录
  703. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  704. Parameter []Parameter `json:"parameter"`
  705. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  706. EngineID int64 `json:"engine_id"`
  707. EngineName string `json:"engine_name"`
  708. EngineVersion string `json:"engine_version"`
  709. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  710. LogUrl string `json:"log_url"`
  711. Flavor Flavor `json:"flavor"`
  712. PoolID string `json:"pool_id"`
  713. PoolName string `json:"pool_name"`
  714. NasMountPath string `json:"nas_mount_path"`
  715. NasShareAddr string `json:"nas_share_addr"`
  716. DatasetName string
  717. ModelMetricList string `json:"model_metric_list"` //列表里包含f1_score,recall,precision,accuracy,若有的话
  718. }
  719. type GetTrainJobLogResult struct {
  720. ErrorCode string `json:"error_code"`
  721. ErrorMsg string `json:"error_msg"`
  722. IsSuccess bool `json:"is_success"`
  723. Content string `json:"content"`
  724. Lines int `json:"lines"`
  725. StartLine string `json:"start_line"`
  726. EndLine string `json:"end_line"`
  727. }
  728. type GetTrainJobLogFileNamesResult struct {
  729. ErrorCode string `json:"error_code"`
  730. ErrorMsg string `json:"error_msg"`
  731. IsSuccess bool `json:"is_success"`
  732. LogFileList []string `json:"log_file_list"`
  733. }
  734. type TrainJobResult struct {
  735. ErrorCode string `json:"error_code"`
  736. ErrorMsg string `json:"error_msg"`
  737. IsSuccess bool `json:"is_success"`
  738. }
  739. type LogFile struct {
  740. Name string
  741. }
  742. func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  743. sess := x.NewSession()
  744. defer sess.Close()
  745. var cond = builder.NewCond()
  746. if opts.RepoID > 0 {
  747. cond = cond.And(
  748. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  749. )
  750. }
  751. if opts.UserID > 0 {
  752. cond = cond.And(
  753. builder.Eq{"cloudbrain.user_id": opts.UserID},
  754. )
  755. }
  756. if (opts.JobID) != "" {
  757. cond = cond.And(
  758. builder.Eq{"cloudbrain.job_id": opts.JobID},
  759. )
  760. }
  761. if (opts.Type) >= 0 {
  762. cond = cond.And(
  763. builder.Eq{"cloudbrain.type": opts.Type},
  764. )
  765. }
  766. if (opts.JobType) != "" {
  767. cond = cond.And(
  768. builder.Eq{"cloudbrain.job_type": opts.JobType},
  769. )
  770. }
  771. if (opts.IsLatestVersion) != "" {
  772. cond = cond.And(
  773. builder.Eq{"cloudbrain.is_latest_version": opts.IsLatestVersion},
  774. )
  775. }
  776. if len(opts.CloudbrainIDs) > 0 {
  777. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  778. }
  779. count, err := sess.Where(cond).Count(new(Cloudbrain))
  780. if err != nil {
  781. return nil, 0, fmt.Errorf("Count: %v", err)
  782. }
  783. if opts.Page >= 0 && opts.PageSize > 0 {
  784. var start int
  785. if opts.Page == 0 {
  786. start = 0
  787. } else {
  788. start = (opts.Page - 1) * opts.PageSize
  789. }
  790. sess.Limit(opts.PageSize, start)
  791. }
  792. sess.OrderBy("cloudbrain.created_unix DESC")
  793. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  794. if err := sess.Table(&Cloudbrain{}).Where(cond).
  795. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  796. Find(&cloudbrains); err != nil {
  797. return nil, 0, fmt.Errorf("Find: %v", err)
  798. }
  799. return cloudbrains, count, nil
  800. }
  801. func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, error) {
  802. sess := x.NewSession()
  803. defer sess.Close()
  804. var cond = builder.NewCond()
  805. if opts.RepoID > 0 {
  806. cond = cond.And(
  807. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  808. )
  809. }
  810. if opts.UserID > 0 {
  811. cond = cond.And(
  812. builder.Eq{"cloudbrain.user_id": opts.UserID},
  813. )
  814. }
  815. if (opts.Type) >= 0 {
  816. cond = cond.And(
  817. builder.Eq{"cloudbrain.type": opts.Type},
  818. )
  819. }
  820. if (opts.JobID) != "" {
  821. cond = cond.And(
  822. builder.Eq{"cloudbrain.job_id": opts.JobID},
  823. )
  824. }
  825. if (opts.JobType) != "" {
  826. cond = cond.And(
  827. builder.Eq{"cloudbrain.job_type": opts.JobType},
  828. )
  829. }
  830. if len(opts.CloudbrainIDs) > 0 {
  831. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  832. }
  833. count, err := sess.Where(cond).Count(new(Cloudbrain))
  834. if err != nil {
  835. return nil, 0, fmt.Errorf("Count: %v", err)
  836. }
  837. if opts.Page >= 0 && opts.PageSize > 0 {
  838. var start int
  839. if opts.Page == 0 {
  840. start = 0
  841. } else {
  842. start = (opts.Page - 1) * opts.PageSize
  843. }
  844. sess.Limit(opts.PageSize, start)
  845. }
  846. sess.OrderBy("cloudbrain.created_unix DESC")
  847. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  848. if err := sess.Table(&Cloudbrain{}).Where(cond).
  849. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  850. Find(&cloudbrains); err != nil {
  851. return nil, 0, fmt.Errorf("Find: %v", err)
  852. }
  853. return cloudbrains, int(count), nil
  854. }
  855. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  856. if _, err = x.Insert(cloudbrain); err != nil {
  857. return err
  858. }
  859. return nil
  860. }
  861. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  862. has, err := x.Get(cb)
  863. if err != nil {
  864. return nil, err
  865. } else if !has {
  866. return nil, ErrJobNotExist{}
  867. }
  868. return cb, nil
  869. }
  870. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  871. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  872. return getRepoCloudBrain(cb)
  873. }
  874. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  875. cb := &Cloudbrain{JobID: jobID}
  876. return getRepoCloudBrain(cb)
  877. }
  878. func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) {
  879. cb := &Cloudbrain{JobID: jobID, VersionName: versionName}
  880. return getRepoCloudBrain(cb)
  881. }
  882. func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string) (*Cloudbrain, error) {
  883. cb := &Cloudbrain{JobID: jobID, IsLatestVersion: isLatestVersion}
  884. return getRepoCloudBrain(cb)
  885. }
  886. func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
  887. cloudBrains := make([]*Cloudbrain, 0)
  888. err := x.Cols("job_id", "status", "type", "job_type", "version_id").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
  889. return cloudBrains, err
  890. }
  891. func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
  892. cloudBrains := make([]*Cloudbrain, 0)
  893. err := x.Cols("job_id", "status", "type", "job_type", "version_id").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
  894. return cloudBrains, err
  895. }
  896. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  897. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  898. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  899. return
  900. }
  901. func SetTrainJobStatusByJobID(jobID string, status string, duration int64, trainjobduration string) (err error) {
  902. cb := &Cloudbrain{JobID: jobID, Status: string(status), Duration: duration, TrainJobDuration: trainjobduration}
  903. _, err = x.Cols("status", "duration", "train_job_duration").Where("cloudbrain.job_id=?", jobID).Update(cb)
  904. return
  905. }
  906. func SetVersionCountAndLatestVersion(jobID string, versionName string, versionCount int, isLatestVersion string, totalVersionCount int) (err error) {
  907. cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion, TotalVersionCount: totalVersionCount}
  908. _, err = x.Cols("version_Count", "is_latest_version", "total_version_count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb)
  909. return
  910. }
  911. func UpdateJob(job *Cloudbrain) error {
  912. return updateJob(x, job)
  913. }
  914. func updateJob(e Engine, job *Cloudbrain) error {
  915. var sess *xorm.Session
  916. sess = e.Where("job_id = ?", job.JobID)
  917. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  918. return err
  919. }
  920. func UpdateTrainJobVersion(job *Cloudbrain) error {
  921. return updateJobTrainVersion(x, job)
  922. }
  923. func updateJobTrainVersion(e Engine, job *Cloudbrain) error {
  924. var sess *xorm.Session
  925. sess = e.Where("job_id = ? AND version_name=?", job.JobID, job.VersionName)
  926. _, err := sess.Cols("status", "train_job_duration").Update(job)
  927. return err
  928. }
  929. func DeleteJob(job *Cloudbrain) error {
  930. return deleteJob(x, job)
  931. }
  932. func deleteJob(e Engine, job *Cloudbrain) error {
  933. _, err := e.ID(job.ID).Delete(job)
  934. return err
  935. }
  936. func GetCloudbrainByName(jobName string) (*Cloudbrain, error) {
  937. cb := &Cloudbrain{JobName: jobName}
  938. return getRepoCloudBrain(cb)
  939. }
  940. func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
  941. if !isSigned || (job.Status != string(JobStopped) && job.Status != string(JobFailed) && job.Status != string(ModelArtsStartFailed) && job.Status != string(ModelArtsCreateFailed)) {
  942. return false
  943. }
  944. repo, err := GetRepositoryByID(job.RepoID)
  945. if err != nil {
  946. log.Error("GetRepositoryByID failed:%v", err.Error())
  947. return false
  948. }
  949. permission, _ := GetUserRepoPermission(repo, user)
  950. if err != nil {
  951. log.Error("GetUserRepoPermission failed:%v", err.Error())
  952. return false
  953. }
  954. if (user.ID == job.UserID && permission.AccessMode >= AccessModeWrite) || user.IsAdmin || permission.AccessMode >= AccessModeAdmin {
  955. return true
  956. }
  957. return false
  958. }