You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 19 kB

2 years ago
4 years ago
4 years ago
4 years ago
3 years ago
2 years ago
4 years ago
4 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
4 years ago
2 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654
  1. package cloudbrain
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "os"
  6. "strconv"
  7. "code.gitea.io/gitea/modules/timeutil"
  8. "code.gitea.io/gitea/modules/storage"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. )
  // Container mount points, shell command templates and fixed identifiers used
  // when building cloudbrain jobs in this package.
  const (
  	//Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
  	//CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"`

  	// Mount points inside the job container.
  	CodeMountPath          = "/code"
  	DataSetMountPath       = "/dataset"
  	ModelMountPath         = "/model"
  	PretrainModelMountPath = "/pretrainmodel"
  	BenchMarkMountPath     = "/benchmark"
  	Snn4imagenetMountPath  = "/snn4imagenet"
  	BrainScoreMountPath    = "/brainscore"
  	ResultPath             = "/result"

  	// Command templates; the %s placeholders are filled in by callers
  	// outside this block. All of them redirect output to /model/benchmark-log.txt.
  	CommandBenchmark    = `cd /benchmark && bash run_bk.sh >/model/benchmark-log.txt`
  	Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s' >/model/benchmark-log.txt`
  	BrainScoreCommand   = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s' >/model/benchmark-log.txt`

  	// Miscellaneous names and identifiers.
  	LogFile             = "log.txt"
  	TaskInfoName        = "/taskInfo"
  	BenchMarkResourceID = 1
  	// SubTaskName is the name given to the single task role of every job.
  	SubTaskName = "task1"
  	// Success is the result code that CreateJob responses are compared against.
  	Success           = "S000"
  	DefaultBranchName = "master"
  )
  36. var (
  37. ResourceSpecs *models.ResourceSpecs
  38. TrainResourceSpecs *models.ResourceSpecs
  39. InferenceResourceSpecs *models.ResourceSpecs
  40. SpecialPools *models.SpecialPools
  41. )
  42. type GenerateCloudBrainTaskReq struct {
  43. Ctx *context.Context
  44. DisplayJobName string
  45. JobName string
  46. Image string
  47. Command string
  48. CodePath string
  49. ModelPath string
  50. BenchmarkPath string
  51. Snn4ImageNetPath string
  52. BrainScorePath string
  53. JobType string
  54. Description string
  55. BranchName string
  56. BootFile string
  57. Params string
  58. CommitID string
  59. Uuids string
  60. DatasetNames string
  61. DatasetInfos map[string]models.DatasetInfo
  62. BenchmarkTypeID int
  63. BenchmarkChildTypeID int
  64. ResultPath string
  65. TrainUrl string
  66. ModelName string
  67. ModelVersion string
  68. CkptName string
  69. LabelName string
  70. PreTrainModelPath string
  71. PreTrainModelUrl string
  72. Spec *models.Specification
  73. }
  74. func GetCloudbrainDebugCommand() string {
  75. var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;pip3 install -U "nbclassic>=0.2.8" -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --LabApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" `
  76. return command
  77. }
  78. func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  79. if !ctx.IsSigned {
  80. return false
  81. }
  82. if err != nil {
  83. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
  84. } else {
  85. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  86. }
  87. }
  88. func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
  89. return isAdminOrOwnerOrJobCreater(ctx, job, nil)
  90. }
  91. func CanCreateOrDebugJob(ctx *context.Context) bool {
  92. if !ctx.IsSigned {
  93. return false
  94. }
  95. return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
  96. }
  97. func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
  98. return isAdminOrJobCreater(ctx, job, nil)
  99. }
  100. func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  101. if !ctx.IsSigned {
  102. return false
  103. }
  104. if err != nil {
  105. return ctx.IsUserSiteAdmin()
  106. } else {
  107. return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  108. }
  109. }
  110. func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
  111. if !ctx.IsSigned {
  112. return false
  113. }
  114. if err != nil {
  115. return ctx.IsUserSiteAdmin()
  116. } else {
  117. return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
  118. }
  119. }
  120. func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
  121. var id = ctx.Params(":id")
  122. job, err := models.GetCloudbrainByID(id)
  123. if err != nil {
  124. log.Error("GetCloudbrainByID failed:%v", err.Error())
  125. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  126. }
  127. ctx.Cloudbrain = job
  128. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  129. log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
  130. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  131. }
  132. }
  133. func AdminOrJobCreaterRight(ctx *context.Context) {
  134. var id = ctx.Params(":id")
  135. job, err := models.GetCloudbrainByID(id)
  136. if err != nil {
  137. log.Error("GetCloudbrainByID failed:%v", err.Error())
  138. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  139. }
  140. ctx.Cloudbrain = job
  141. if !isAdminOrJobCreater(ctx, job, err) {
  142. log.Error("!isAdminOrJobCreater error:%v", err.Error())
  143. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  144. }
  145. }
  146. func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
  147. var jobID = ctx.Params(":jobid")
  148. job, err := models.GetCloudbrainByJobID(jobID)
  149. if err != nil {
  150. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  151. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  152. }
  153. ctx.Cloudbrain = job
  154. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  155. log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
  156. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  157. }
  158. }
  159. func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
  160. var jobID = ctx.Params(":jobid")
  161. job, err := models.GetCloudbrainByJobID(jobID)
  162. if err != nil {
  163. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  164. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  165. }
  166. ctx.Cloudbrain = job
  167. if !isAdminOrJobCreater(ctx, job, err) {
  168. log.Error("!isAdminOrJobCreater errot:%v", err.Error())
  169. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  170. }
  171. }
  172. func AdminOrImageCreaterRight(ctx *context.Context) {
  173. id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
  174. var image *models.Image
  175. if err != nil {
  176. log.Error("Get Image by ID failed:%v", err.Error())
  177. } else {
  178. image, err = models.GetImageByID(id)
  179. if err != nil {
  180. log.Error("Get Image by ID failed:%v", err.Error())
  181. return
  182. }
  183. }
  184. if !isAdminOrImageCreater(ctx, image, err) {
  185. log.Error("!isAdminOrImageCreater error:%v", err.Error())
  186. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  187. }
  188. }
  189. func GenerateTask(req GenerateCloudBrainTaskReq) error {
  190. var versionCount int
  191. if req.JobType == string(models.JobTypeTrain) {
  192. versionCount = 1
  193. }
  194. volumes := []models.Volume{
  195. {
  196. HostPath: models.StHostPath{
  197. Path: req.CodePath,
  198. MountPath: CodeMountPath,
  199. ReadOnly: false,
  200. },
  201. },
  202. {
  203. HostPath: models.StHostPath{
  204. Path: req.ModelPath,
  205. MountPath: ModelMountPath,
  206. ReadOnly: false,
  207. },
  208. },
  209. {
  210. HostPath: models.StHostPath{
  211. Path: req.BenchmarkPath,
  212. MountPath: BenchMarkMountPath,
  213. ReadOnly: true,
  214. },
  215. },
  216. {
  217. HostPath: models.StHostPath{
  218. Path: req.Snn4ImageNetPath,
  219. MountPath: Snn4imagenetMountPath,
  220. ReadOnly: true,
  221. },
  222. },
  223. {
  224. HostPath: models.StHostPath{
  225. Path: req.BrainScorePath,
  226. MountPath: BrainScoreMountPath,
  227. ReadOnly: true,
  228. },
  229. },
  230. {
  231. HostPath: models.StHostPath{
  232. Path: req.ResultPath,
  233. MountPath: ResultPath,
  234. ReadOnly: false,
  235. },
  236. },
  237. }
  238. if req.PreTrainModelUrl != "" { //预训练
  239. volumes = append(volumes, models.Volume{
  240. HostPath: models.StHostPath{
  241. Path: req.PreTrainModelPath,
  242. MountPath: PretrainModelMountPath,
  243. ReadOnly: true,
  244. },
  245. })
  246. }
  247. if len(req.DatasetInfos) == 1 {
  248. volumes = append(volumes, models.Volume{
  249. HostPath: models.StHostPath{
  250. Path: req.DatasetInfos[req.Uuids].DataLocalPath,
  251. MountPath: DataSetMountPath,
  252. ReadOnly: true,
  253. },
  254. })
  255. } else if len(req.DatasetInfos) > 1 {
  256. for _, dataset := range req.DatasetInfos {
  257. volumes = append(volumes, models.Volume{
  258. HostPath: models.StHostPath{
  259. Path: dataset.DataLocalPath,
  260. MountPath: DataSetMountPath + "/" + dataset.Name,
  261. ReadOnly: true,
  262. },
  263. })
  264. }
  265. }
  266. createTime := timeutil.TimeStampNow()
  267. jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
  268. JobName: req.JobName,
  269. RetryCount: 1,
  270. GpuType: req.Spec.QueueCode,
  271. Image: req.Image,
  272. TaskRoles: []models.TaskRole{
  273. {
  274. Name: SubTaskName,
  275. TaskNumber: 1,
  276. MinSucceededTaskCount: 1,
  277. MinFailedTaskCount: 1,
  278. CPUNumber: req.Spec.CpuCores,
  279. GPUNumber: req.Spec.AccCardsNum,
  280. MemoryMB: int(req.Spec.MemGiB * 1024),
  281. ShmMB: int(req.Spec.ShareMemGiB * 1024),
  282. Command: req.Command,
  283. NeedIBDevice: false,
  284. IsMainRole: false,
  285. UseNNI: false,
  286. },
  287. },
  288. Volumes: volumes,
  289. })
  290. if err != nil {
  291. log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
  292. return err
  293. }
  294. if jobResult.Code != Success {
  295. log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
  296. return errors.New(jobResult.Msg)
  297. }
  298. var jobID = jobResult.Payload["jobId"].(string)
  299. err = models.CreateCloudbrain(&models.Cloudbrain{
  300. Status: string(models.JobWaiting),
  301. UserID: req.Ctx.User.ID,
  302. RepoID: req.Ctx.Repo.Repository.ID,
  303. JobID: jobID,
  304. JobName: req.JobName,
  305. DisplayJobName: req.DisplayJobName,
  306. SubTaskName: SubTaskName,
  307. JobType: req.JobType,
  308. Type: models.TypeCloudBrainOne,
  309. Uuid: req.Uuids,
  310. Image: req.Image,
  311. GpuQueue: req.Spec.QueueCode,
  312. ComputeResource: models.GPUResource,
  313. BenchmarkTypeID: req.BenchmarkTypeID,
  314. BenchmarkChildTypeID: req.BenchmarkChildTypeID,
  315. Description: req.Description,
  316. IsLatestVersion: "1",
  317. VersionCount: versionCount,
  318. BranchName: req.BranchName,
  319. BootFile: req.BootFile,
  320. DatasetName: req.DatasetNames,
  321. Parameters: req.Params,
  322. TrainUrl: req.TrainUrl,
  323. ModelName: req.ModelName,
  324. ModelVersion: req.ModelVersion,
  325. CkptName: req.CkptName,
  326. ResultUrl: req.ResultPath,
  327. LabelName: req.LabelName,
  328. PreTrainModelUrl: req.PreTrainModelUrl,
  329. CreatedUnix: createTime,
  330. UpdatedUnix: createTime,
  331. CommitID: req.CommitID,
  332. Spec: req.Spec,
  333. })
  334. if err != nil {
  335. return err
  336. }
  337. task, err := models.GetCloudbrainByJobID(jobID)
  338. if err != nil {
  339. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  340. return err
  341. }
  342. stringId := strconv.FormatInt(task.ID, 10)
  343. if IsBenchmarkJob(req.JobType) {
  344. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
  345. } else if string(models.JobTypeTrain) == req.JobType {
  346. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
  347. } else if string(models.JobTypeInference) == req.JobType {
  348. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  349. } else {
  350. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
  351. }
  352. return nil
  353. }
  354. func IsBenchmarkJob(jobType string) bool {
  355. return string(models.JobTypeModelSafety) == jobType || string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType
  356. }
  357. func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...models.JobType) int64 {
  358. num, err := models.GetWaitingCloudbrainCount(cloudbrainType, computeResource, jobTypes...)
  359. if err != nil {
  360. log.Warn("Get waiting count err", err)
  361. num = 0
  362. }
  363. return num
  364. }
  365. func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
  366. jobName := task.JobName
  367. spec := task.Spec
  368. var datasetInfos map[string]models.DatasetInfo
  369. if task.Uuid != "" {
  370. var err error
  371. datasetInfos, _, err = models.GetDatasetInfo(task.Uuid)
  372. if err != nil {
  373. log.Error("GetDatasetInfo failed:%v", err, ctx.Data["MsgID"])
  374. return err
  375. }
  376. }
  377. volumes := []models.Volume{
  378. {
  379. HostPath: models.StHostPath{
  380. Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
  381. MountPath: CodeMountPath,
  382. ReadOnly: false,
  383. },
  384. },
  385. {
  386. HostPath: models.StHostPath{
  387. Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
  388. MountPath: ModelMountPath,
  389. ReadOnly: false,
  390. },
  391. },
  392. {
  393. HostPath: models.StHostPath{
  394. Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
  395. MountPath: BenchMarkMountPath,
  396. ReadOnly: true,
  397. },
  398. },
  399. {
  400. HostPath: models.StHostPath{
  401. Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
  402. MountPath: Snn4imagenetMountPath,
  403. ReadOnly: true,
  404. },
  405. },
  406. {
  407. HostPath: models.StHostPath{
  408. Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
  409. MountPath: BrainScoreMountPath,
  410. ReadOnly: true,
  411. },
  412. },
  413. }
  414. if datasetInfos != nil {
  415. if len(datasetInfos) == 1 {
  416. volumes = append(volumes, models.Volume{
  417. HostPath: models.StHostPath{
  418. Path: datasetInfos[task.Uuid].DataLocalPath,
  419. MountPath: DataSetMountPath,
  420. ReadOnly: true,
  421. },
  422. })
  423. } else {
  424. for _, dataset := range datasetInfos {
  425. volumes = append(volumes, models.Volume{
  426. HostPath: models.StHostPath{
  427. Path: dataset.DataLocalPath,
  428. MountPath: DataSetMountPath + "/" + dataset.Name,
  429. ReadOnly: true,
  430. },
  431. })
  432. }
  433. }
  434. }
  435. createTime := timeutil.TimeStampNow()
  436. jobResult, err := CreateJob(jobName, models.CreateJobParams{
  437. JobName: jobName,
  438. RetryCount: 1,
  439. GpuType: task.GpuQueue,
  440. Image: task.Image,
  441. TaskRoles: []models.TaskRole{
  442. {
  443. Name: SubTaskName,
  444. TaskNumber: 1,
  445. MinSucceededTaskCount: 1,
  446. MinFailedTaskCount: 1,
  447. CPUNumber: spec.CpuCores,
  448. GPUNumber: spec.AccCardsNum,
  449. MemoryMB: int(spec.MemGiB * 1024),
  450. ShmMB: int(spec.ShareMemGiB * 1024),
  451. Command: GetCloudbrainDebugCommand(), //Command,
  452. NeedIBDevice: false,
  453. IsMainRole: false,
  454. UseNNI: false,
  455. },
  456. },
  457. Volumes: volumes,
  458. })
  459. if err != nil {
  460. log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
  461. return err
  462. }
  463. if jobResult.Code != Success {
  464. log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
  465. return errors.New(jobResult.Msg)
  466. }
  467. var jobID = jobResult.Payload["jobId"].(string)
  468. newTask := &models.Cloudbrain{
  469. Status: string(models.JobWaiting),
  470. UserID: task.UserID,
  471. RepoID: task.RepoID,
  472. JobID: jobID,
  473. JobName: task.JobName,
  474. DisplayJobName: task.DisplayJobName,
  475. SubTaskName: task.SubTaskName,
  476. JobType: task.JobType,
  477. Type: task.Type,
  478. Uuid: task.Uuid,
  479. DatasetName: task.DatasetName,
  480. Image: task.Image,
  481. GpuQueue: task.GpuQueue,
  482. ResourceSpecId: task.ResourceSpecId,
  483. ComputeResource: task.ComputeResource,
  484. CreatedUnix: createTime,
  485. UpdatedUnix: createTime,
  486. BranchName: task.BranchName,
  487. Spec: spec,
  488. }
  489. err = models.RestartCloudbrain(task, newTask)
  490. if err != nil {
  491. log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
  492. return err
  493. }
  494. stringId := strconv.FormatInt(newTask.ID, 10)
  495. *newID = stringId
  496. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
  497. return nil
  498. }
  499. func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec {
  500. for _, specialPool := range SpecialPools.Pools {
  501. if specialPool.ResourceSpec != nil {
  502. if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) {
  503. for _, spec := range specialPool.ResourceSpec {
  504. if resourceSpecId == spec.Id {
  505. return spec
  506. }
  507. }
  508. }
  509. }
  510. }
  511. return nil
  512. }
  513. func DelCloudBrainJob(jobId string) string {
  514. task, err := models.GetCloudbrainByJobID(jobId)
  515. if err != nil {
  516. log.Error("get cloud brain err:", err)
  517. return "cloudbrain.Delete_failed"
  518. }
  519. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  520. log.Error("the job(%s) has not been stopped", task.JobName)
  521. return "cloudbrain.Not_Stopped"
  522. }
  523. err = models.DeleteJob(task)
  524. if err != nil {
  525. log.Error("DeleteJob failed:", err)
  526. return "cloudbrain.Delete_failed"
  527. }
  528. deleteJobStorage(task.JobName)
  529. return ""
  530. }
  531. func deleteJobStorage(jobName string) error {
  532. //delete local
  533. localJobPath := setting.JobPath + jobName
  534. err := os.RemoveAll(localJobPath)
  535. if err != nil {
  536. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  537. }
  538. dirPath := setting.CBCodePathPrefix + jobName + "/"
  539. err = storage.Attachments.DeleteDir(dirPath)
  540. if err != nil {
  541. log.Error("DeleteDir(%s) failed:%v", localJobPath, err)
  542. }
  543. return nil
  544. }
  545. func InitSpecialPool() {
  546. if SpecialPools == nil && setting.SpecialPools != "" {
  547. json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools)
  548. }
  549. }
  550. func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool {
  551. if resourceSpecs == nil || len(resourceSpecs) == 0 {
  552. return true
  553. }
  554. for _, v := range resourceSpecs {
  555. if v.Id == resourceSpecId {
  556. return true
  557. }
  558. }
  559. return false
  560. }
  561. func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool {
  562. for _, v := range pool {
  563. if v.Queue == queue {
  564. return true
  565. }
  566. }
  567. return false
  568. }
  // IsElementExist reports whether str occurs in s. A nil or empty slice
  // yields false.
  func IsElementExist(s []string, str string) bool {
  	for i := 0; i < len(s); i++ {
  		if s[i] == str {
  			return true
  		}
  	}
  	return false
  }