You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 18 kB

3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667
  1. package cloudbrain
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "os"
  6. "strconv"
  7. "code.gitea.io/gitea/modules/timeutil"
  8. "code.gitea.io/gitea/modules/storage"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. )
  15. const (
  16. Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
  17. //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"`
  18. CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"`
  19. CodeMountPath = "/code"
  20. DataSetMountPath = "/dataset"
  21. ModelMountPath = "/model"
  22. LogFile = "log.txt"
  23. BenchMarkMountPath = "/benchmark"
  24. BenchMarkResourceID = 1
  25. Snn4imagenetMountPath = "/snn4imagenet"
  26. BrainScoreMountPath = "/brainscore"
  27. TaskInfoName = "/taskInfo"
  28. Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s'`
  29. BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s'`
  30. SubTaskName = "task1"
  31. Success = "S000"
  32. DefaultBranchName = "master"
  33. ResultPath = "/result"
  34. )
  35. var (
  36. ResourceSpecs *models.ResourceSpecs
  37. TrainResourceSpecs *models.ResourceSpecs
  38. SpecialPools *models.SpecialPools
  39. )
  40. type GenerateCloudBrainTaskReq struct {
  41. Ctx *context.Context
  42. DisplayJobName string
  43. JobName string
  44. Image string
  45. Command string
  46. CodePath string
  47. ModelPath string
  48. BenchmarkPath string
  49. Snn4ImageNetPath string
  50. BrainScorePath string
  51. JobType string
  52. GpuQueue string
  53. Description string
  54. BranchName string
  55. BootFile string
  56. Params string
  57. CommitID string
  58. Uuids string
  59. DatasetNames string
  60. DatasetInfos map[string]models.DatasetInfo
  61. BenchmarkTypeID int
  62. BenchmarkChildTypeID int
  63. ResourceSpecId int
  64. ResultPath string
  65. TrainUrl string
  66. ModelName string
  67. ModelVersion string
  68. CkptName string
  69. }
  70. func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  71. if !ctx.IsSigned {
  72. return false
  73. }
  74. if err != nil {
  75. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
  76. } else {
  77. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  78. }
  79. }
  80. func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
  81. return isAdminOrOwnerOrJobCreater(ctx, job, nil)
  82. }
  83. func CanCreateOrDebugJob(ctx *context.Context) bool {
  84. if !ctx.IsSigned {
  85. return false
  86. }
  87. return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
  88. }
  89. func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
  90. return isAdminOrJobCreater(ctx, job, nil)
  91. }
  92. func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  93. if !ctx.IsSigned {
  94. return false
  95. }
  96. if err != nil {
  97. return ctx.IsUserSiteAdmin()
  98. } else {
  99. return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  100. }
  101. }
  102. func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
  103. if !ctx.IsSigned {
  104. return false
  105. }
  106. if err != nil {
  107. return ctx.IsUserSiteAdmin()
  108. } else {
  109. return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
  110. }
  111. }
  112. func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
  113. var ID = ctx.Params(":id")
  114. job, err := models.GetCloudbrainByID(ID)
  115. if err != nil {
  116. log.Error("GetCloudbrainByID failed:%v", err.Error())
  117. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  118. }
  119. ctx.Cloudbrain = job
  120. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  121. log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
  122. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  123. }
  124. }
  125. func AdminOrJobCreaterRight(ctx *context.Context) {
  126. var ID = ctx.Params(":id")
  127. job, err := models.GetCloudbrainByID(ID)
  128. if err != nil {
  129. log.Error("GetCloudbrainByID failed:%v", err.Error())
  130. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  131. }
  132. ctx.Cloudbrain = job
  133. if !isAdminOrJobCreater(ctx, job, err) {
  134. log.Error("!isAdminOrJobCreater error:%v", err.Error())
  135. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  136. }
  137. }
  138. func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
  139. var jobID = ctx.Params(":jobid")
  140. job, err := models.GetCloudbrainByJobID(jobID)
  141. if err != nil {
  142. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  143. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  144. }
  145. ctx.Cloudbrain = job
  146. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  147. log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
  148. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  149. }
  150. }
  151. func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
  152. var jobID = ctx.Params(":jobid")
  153. job, err := models.GetCloudbrainByJobID(jobID)
  154. if err != nil {
  155. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  156. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  157. }
  158. ctx.Cloudbrain = job
  159. if !isAdminOrJobCreater(ctx, job, err) {
  160. log.Error("!isAdminOrJobCreater errot:%v", err.Error())
  161. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  162. }
  163. }
  164. func AdminOrImageCreaterRight(ctx *context.Context) {
  165. id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
  166. var image *models.Image
  167. if err != nil {
  168. log.Error("Get Image by ID failed:%v", err.Error())
  169. } else {
  170. image, err = models.GetImageByID(id)
  171. if err != nil {
  172. log.Error("Get Image by ID failed:%v", err.Error())
  173. return
  174. }
  175. }
  176. if !isAdminOrImageCreater(ctx, image, err) {
  177. log.Error("!isAdminOrImageCreater error:%v", err.Error())
  178. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  179. }
  180. }
  181. func GenerateTask(req GenerateCloudBrainTaskReq) error {
  182. var resourceSpec *models.ResourceSpec
  183. var versionCount int
  184. if req.JobType == string(models.JobTypeTrain) || req.JobType == string(models.JobTypeInference) {
  185. versionCount = 1
  186. if TrainResourceSpecs == nil {
  187. json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
  188. }
  189. for _, spec := range TrainResourceSpecs.ResourceSpec {
  190. if req.ResourceSpecId == spec.Id {
  191. resourceSpec = spec
  192. break
  193. }
  194. }
  195. } else {
  196. if ResourceSpecs == nil {
  197. json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
  198. }
  199. for _, spec := range ResourceSpecs.ResourceSpec {
  200. if req.ResourceSpecId == spec.Id {
  201. resourceSpec = spec
  202. break
  203. }
  204. }
  205. }
  206. //如果没有匹配到spec信息,尝试从专属资源池获取
  207. if resourceSpec == nil && SpecialPools != nil {
  208. for _, specialPool := range SpecialPools.Pools {
  209. if resourceSpec != nil {
  210. break
  211. }
  212. if specialPool.ResourceSpec != nil {
  213. if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) {
  214. for _, spec := range specialPool.ResourceSpec {
  215. if req.ResourceSpecId == spec.Id {
  216. resourceSpec = spec
  217. break
  218. }
  219. }
  220. }
  221. }
  222. }
  223. }
  224. if resourceSpec == nil {
  225. log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"])
  226. return errors.New("no such resourceSpec")
  227. }
  228. volumes := []models.Volume{
  229. {
  230. HostPath: models.StHostPath{
  231. Path: req.CodePath,
  232. MountPath: CodeMountPath,
  233. ReadOnly: false,
  234. },
  235. },
  236. {
  237. HostPath: models.StHostPath{
  238. Path: req.ModelPath,
  239. MountPath: ModelMountPath,
  240. ReadOnly: false,
  241. },
  242. },
  243. {
  244. HostPath: models.StHostPath{
  245. Path: req.BenchmarkPath,
  246. MountPath: BenchMarkMountPath,
  247. ReadOnly: true,
  248. },
  249. },
  250. {
  251. HostPath: models.StHostPath{
  252. Path: req.Snn4ImageNetPath,
  253. MountPath: Snn4imagenetMountPath,
  254. ReadOnly: true,
  255. },
  256. },
  257. {
  258. HostPath: models.StHostPath{
  259. Path: req.BrainScorePath,
  260. MountPath: BrainScoreMountPath,
  261. ReadOnly: true,
  262. },
  263. },
  264. {
  265. HostPath: models.StHostPath{
  266. Path: req.ResultPath,
  267. MountPath: ResultPath,
  268. ReadOnly: false,
  269. },
  270. },
  271. }
  272. if len(req.DatasetInfos) == 1 {
  273. volumes = append(volumes, models.Volume{
  274. HostPath: models.StHostPath{
  275. Path: req.DatasetInfos[req.Uuids].DataLocalPath,
  276. MountPath: DataSetMountPath,
  277. ReadOnly: true,
  278. },
  279. })
  280. } else {
  281. for _, dataset := range req.DatasetInfos {
  282. volumes = append(volumes, models.Volume{
  283. HostPath: models.StHostPath{
  284. Path: dataset.DataLocalPath,
  285. MountPath: DataSetMountPath + "/" + dataset.Name,
  286. ReadOnly: true,
  287. },
  288. })
  289. }
  290. }
  291. createTime := timeutil.TimeStampNow()
  292. jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
  293. JobName: req.JobName,
  294. RetryCount: 1,
  295. GpuType: req.GpuQueue,
  296. Image: req.Image,
  297. TaskRoles: []models.TaskRole{
  298. {
  299. Name: SubTaskName,
  300. TaskNumber: 1,
  301. MinSucceededTaskCount: 1,
  302. MinFailedTaskCount: 1,
  303. CPUNumber: resourceSpec.CpuNum,
  304. GPUNumber: resourceSpec.GpuNum,
  305. MemoryMB: resourceSpec.MemMiB,
  306. ShmMB: resourceSpec.ShareMemMiB,
  307. Command: req.Command,
  308. NeedIBDevice: false,
  309. IsMainRole: false,
  310. UseNNI: false,
  311. },
  312. },
  313. Volumes: volumes,
  314. })
  315. if err != nil {
  316. log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
  317. return err
  318. }
  319. if jobResult.Code != Success {
  320. log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
  321. return errors.New(jobResult.Msg)
  322. }
  323. var jobID = jobResult.Payload["jobId"].(string)
  324. err = models.CreateCloudbrain(&models.Cloudbrain{
  325. Status: string(models.JobWaiting),
  326. UserID: req.Ctx.User.ID,
  327. RepoID: req.Ctx.Repo.Repository.ID,
  328. JobID: jobID,
  329. JobName: req.JobName,
  330. DisplayJobName: req.DisplayJobName,
  331. SubTaskName: SubTaskName,
  332. JobType: req.JobType,
  333. Type: models.TypeCloudBrainOne,
  334. Uuid: req.Uuids,
  335. Image: req.Image,
  336. GpuQueue: req.GpuQueue,
  337. ResourceSpecId: req.ResourceSpecId,
  338. ComputeResource: models.GPUResource,
  339. BenchmarkTypeID: req.BenchmarkTypeID,
  340. BenchmarkChildTypeID: req.BenchmarkChildTypeID,
  341. Description: req.Description,
  342. IsLatestVersion: "1",
  343. VersionCount: versionCount,
  344. BranchName: req.BranchName,
  345. BootFile: req.BootFile,
  346. DatasetName: req.DatasetNames,
  347. Parameters: req.Params,
  348. TrainUrl: req.TrainUrl,
  349. ModelName: req.ModelName,
  350. ModelVersion: req.ModelVersion,
  351. CkptName: req.CkptName,
  352. ResultUrl: req.ResultPath,
  353. CreatedUnix: createTime,
  354. UpdatedUnix: createTime,
  355. CommitID: req.CommitID,
  356. })
  357. if err != nil {
  358. return err
  359. }
  360. task, err := models.GetCloudbrainByJobID(jobID)
  361. if err != nil {
  362. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  363. return err
  364. }
  365. stringId := strconv.FormatInt(task.ID, 10)
  366. if IsBenchmarkJob(req.JobType) {
  367. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
  368. } else if string(models.JobTypeTrain) == req.JobType {
  369. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
  370. } else if string(models.JobTypeInference) == req.JobType {
  371. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  372. } else {
  373. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
  374. }
  375. return nil
  376. }
  377. func IsBenchmarkJob(jobType string) bool {
  378. return string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType
  379. }
  380. func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...models.JobType) int64 {
  381. num, err := models.GetWaitingCloudbrainCount(cloudbrainType, computeResource, jobTypes...)
  382. if err != nil {
  383. log.Warn("Get waiting count err", err)
  384. num = 0
  385. }
  386. return num
  387. }
  388. func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
  389. jobName := task.JobName
  390. var resourceSpec *models.ResourceSpec
  391. if ResourceSpecs == nil {
  392. json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
  393. }
  394. for _, spec := range ResourceSpecs.ResourceSpec {
  395. if task.ResourceSpecId == spec.Id {
  396. resourceSpec = spec
  397. }
  398. }
  399. if resourceSpec == nil {
  400. log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"])
  401. return errors.New("no such resourceSpec")
  402. }
  403. datasetInfos, _, err := models.GetDatasetInfo(task.Uuid)
  404. if err != nil {
  405. log.Error("GetDatasetInfo failed:%v", err, ctx.Data["MsgID"])
  406. return err
  407. }
  408. volumes := []models.Volume{
  409. {
  410. HostPath: models.StHostPath{
  411. Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
  412. MountPath: CodeMountPath,
  413. ReadOnly: false,
  414. },
  415. },
  416. {
  417. HostPath: models.StHostPath{
  418. Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
  419. MountPath: ModelMountPath,
  420. ReadOnly: false,
  421. },
  422. },
  423. {
  424. HostPath: models.StHostPath{
  425. Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
  426. MountPath: BenchMarkMountPath,
  427. ReadOnly: true,
  428. },
  429. },
  430. {
  431. HostPath: models.StHostPath{
  432. Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
  433. MountPath: Snn4imagenetMountPath,
  434. ReadOnly: true,
  435. },
  436. },
  437. {
  438. HostPath: models.StHostPath{
  439. Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
  440. MountPath: BrainScoreMountPath,
  441. ReadOnly: true,
  442. },
  443. },
  444. }
  445. if len(datasetInfos) == 1 {
  446. volumes = append(volumes, models.Volume{
  447. HostPath: models.StHostPath{
  448. Path: datasetInfos[task.Uuid].DataLocalPath,
  449. MountPath: DataSetMountPath,
  450. ReadOnly: true,
  451. },
  452. })
  453. } else {
  454. for _, dataset := range datasetInfos {
  455. volumes = append(volumes, models.Volume{
  456. HostPath: models.StHostPath{
  457. Path: dataset.DataLocalPath,
  458. MountPath: DataSetMountPath + "/" + dataset.Name,
  459. ReadOnly: true,
  460. },
  461. })
  462. }
  463. }
  464. createTime := timeutil.TimeStampNow()
  465. jobResult, err := CreateJob(jobName, models.CreateJobParams{
  466. JobName: jobName,
  467. RetryCount: 1,
  468. GpuType: task.GpuQueue,
  469. Image: task.Image,
  470. TaskRoles: []models.TaskRole{
  471. {
  472. Name: SubTaskName,
  473. TaskNumber: 1,
  474. MinSucceededTaskCount: 1,
  475. MinFailedTaskCount: 1,
  476. CPUNumber: resourceSpec.CpuNum,
  477. GPUNumber: resourceSpec.GpuNum,
  478. MemoryMB: resourceSpec.MemMiB,
  479. ShmMB: resourceSpec.ShareMemMiB,
  480. Command: Command,
  481. NeedIBDevice: false,
  482. IsMainRole: false,
  483. UseNNI: false,
  484. },
  485. },
  486. Volumes: volumes,
  487. })
  488. if err != nil {
  489. log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
  490. return err
  491. }
  492. if jobResult.Code != Success {
  493. log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
  494. return errors.New(jobResult.Msg)
  495. }
  496. var jobID = jobResult.Payload["jobId"].(string)
  497. newTask := &models.Cloudbrain{
  498. Status: string(models.JobWaiting),
  499. UserID: task.UserID,
  500. RepoID: task.RepoID,
  501. JobID: jobID,
  502. JobName: task.JobName,
  503. DisplayJobName: task.DisplayJobName,
  504. SubTaskName: task.SubTaskName,
  505. JobType: task.JobType,
  506. Type: task.Type,
  507. Uuid: task.Uuid,
  508. DatasetName: task.DatasetName,
  509. Image: task.Image,
  510. GpuQueue: task.GpuQueue,
  511. ResourceSpecId: task.ResourceSpecId,
  512. ComputeResource: task.ComputeResource,
  513. CreatedUnix: createTime,
  514. UpdatedUnix: createTime,
  515. BranchName: task.BranchName,
  516. }
  517. err = models.RestartCloudbrain(task, newTask)
  518. if err != nil {
  519. log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
  520. return err
  521. }
  522. stringId := strconv.FormatInt(newTask.ID, 10)
  523. *newID = stringId
  524. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
  525. return nil
  526. }
  527. func DelCloudBrainJob(jobId string) string {
  528. task, err := models.GetCloudbrainByJobID(jobId)
  529. if err != nil {
  530. log.Error("get cloud brain err:", err)
  531. return "cloudbrain.Delete_failed"
  532. }
  533. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  534. log.Error("the job(%s) has not been stopped", task.JobName)
  535. return "cloudbrain.Not_Stopped"
  536. }
  537. err = models.DeleteJob(task)
  538. if err != nil {
  539. log.Error("DeleteJob failed:", err)
  540. return "cloudbrain.Delete_failed"
  541. }
  542. deleteJobStorage(task.JobName)
  543. return ""
  544. }
  545. func deleteJobStorage(jobName string) error {
  546. //delete local
  547. localJobPath := setting.JobPath + jobName
  548. err := os.RemoveAll(localJobPath)
  549. if err != nil {
  550. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  551. }
  552. dirPath := setting.CBCodePathPrefix + jobName + "/"
  553. err = storage.Attachments.DeleteDir(dirPath)
  554. if err != nil {
  555. log.Error("DeleteDir(%s) failed:%v", localJobPath, err)
  556. }
  557. return nil
  558. }
  559. func InitSpecialPool() {
  560. if SpecialPools == nil && setting.SpecialPools != "" {
  561. json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools)
  562. }
  563. }
  564. func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool {
  565. if resourceSpecs == nil || len(resourceSpecs) == 0 {
  566. return true
  567. }
  568. for _, v := range resourceSpecs {
  569. if v.Id == resourceSpecId {
  570. return true
  571. }
  572. }
  573. return false
  574. }
  575. func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool {
  576. for _, v := range pool {
  577. if v.Queue == queue {
  578. return true
  579. }
  580. }
  581. return false
  582. }
  583. func IsElementExist(s []string, str string) bool {
  584. for _, v := range s {
  585. if v == str {
  586. return true
  587. }
  588. }
  589. return false
  590. }