You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 18 kB

2 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
4 years ago
2 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640
  1. package cloudbrain
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "os"
  6. "strconv"
  7. "code.gitea.io/gitea/modules/timeutil"
  8. "code.gitea.io/gitea/modules/storage"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. )
  15. const (
  16. //Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
  17. //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"`
  18. CommandBenchmark = `cd /benchmark && bash run_bk.sh >/model/benchmark-log.txt`
  19. CodeMountPath = "/code"
  20. DataSetMountPath = "/dataset"
  21. ModelMountPath = "/model"
  22. LogFile = "log.txt"
  23. BenchMarkMountPath = "/benchmark"
  24. BenchMarkResourceID = 1
  25. Snn4imagenetMountPath = "/snn4imagenet"
  26. BrainScoreMountPath = "/brainscore"
  27. TaskInfoName = "/taskInfo"
  28. Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s' >/model/benchmark-log.txt`
  29. BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s' >/model/benchmark-log.txt`
  30. SubTaskName = "task1"
  31. Success = "S000"
  32. DefaultBranchName = "master"
  33. ResultPath = "/result"
  34. )
// Package-level resource configuration.
var (
	// ResourceSpecs, TrainResourceSpecs and InferenceResourceSpecs are not
	// assigned anywhere in this file.
	// NOTE(review): presumably populated elsewhere in the package - confirm.
	ResourceSpecs          *models.ResourceSpecs
	TrainResourceSpecs     *models.ResourceSpecs
	InferenceResourceSpecs *models.ResourceSpecs
	// SpecialPools is decoded from setting.SpecialPools by InitSpecialPool and
	// stays nil when that setting is empty.
	SpecialPools *models.SpecialPools
)
// GenerateCloudBrainTaskReq carries everything GenerateTask needs to submit a
// job and to persist the matching Cloudbrain record.
type GenerateCloudBrainTaskReq struct {
	// Ctx is the request context; GenerateTask reads the user, the repository
	// and Data["MsgID"] from it.
	Ctx *context.Context
	// DisplayJobName is stored on the record and used in the notification.
	DisplayJobName string
	// JobName is the name passed to CreateJob and stored on the record.
	JobName string
	// Image is the image passed to CreateJob.
	Image string
	// Command is the command of the job's single task role.
	Command string
	// CodePath is the host path mounted at CodeMountPath.
	CodePath string
	// ModelPath is the host path mounted at ModelMountPath.
	ModelPath string
	// BenchmarkPath is the host path mounted read-only at BenchMarkMountPath.
	BenchmarkPath string
	// Snn4ImageNetPath is the host path mounted read-only at Snn4imagenetMountPath.
	Snn4ImageNetPath string
	// BrainScorePath is the host path mounted read-only at BrainScoreMountPath.
	BrainScorePath string
	// JobType is compared against models.JobType values to pick the version
	// count and the notification action.
	JobType string
	// Description, BranchName, BootFile, Params and CommitID are copied onto
	// the stored record.
	Description string
	BranchName string
	BootFile string
	Params string
	CommitID string
	// Uuids is stored as the record's Uuid and is also the key looked up in
	// DatasetInfos when exactly one dataset is attached.
	Uuids string
	// DatasetNames is stored as the record's DatasetName.
	DatasetNames string
	// DatasetInfos lists the datasets to mount under DataSetMountPath.
	DatasetInfos map[string]models.DatasetInfo
	// BenchmarkTypeID and BenchmarkChildTypeID are copied onto the stored record.
	BenchmarkTypeID int
	BenchmarkChildTypeID int
	// ResultPath is the host path mounted at the ResultPath constant and is
	// stored as the record's ResultUrl.
	ResultPath string
	// TrainUrl, ModelName, ModelVersion, CkptName and LabelName are copied
	// onto the stored record.
	TrainUrl string
	ModelName string
	ModelVersion string
	CkptName string
	LabelName string
	// Spec supplies the queue code, CPU cores, accelerator count, memory and
	// shared memory of the job.
	Spec *models.Specification
}
  71. func GetCloudbrainDebugCommand() string {
  72. var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;pip3 install -U "nbclassic>=0.2.8" -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --LabApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" `
  73. return command
  74. }
  75. func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  76. if !ctx.IsSigned {
  77. return false
  78. }
  79. if err != nil {
  80. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
  81. } else {
  82. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  83. }
  84. }
  85. func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
  86. return isAdminOrOwnerOrJobCreater(ctx, job, nil)
  87. }
  88. func CanCreateOrDebugJob(ctx *context.Context) bool {
  89. if !ctx.IsSigned {
  90. return false
  91. }
  92. return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
  93. }
  94. func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
  95. return isAdminOrJobCreater(ctx, job, nil)
  96. }
  97. func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  98. if !ctx.IsSigned {
  99. return false
  100. }
  101. if err != nil {
  102. return ctx.IsUserSiteAdmin()
  103. } else {
  104. return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  105. }
  106. }
  107. func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
  108. if !ctx.IsSigned {
  109. return false
  110. }
  111. if err != nil {
  112. return ctx.IsUserSiteAdmin()
  113. } else {
  114. return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
  115. }
  116. }
  117. func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
  118. var id = ctx.Params(":id")
  119. job, err := models.GetCloudbrainByID(id)
  120. if err != nil {
  121. log.Error("GetCloudbrainByID failed:%v", err.Error())
  122. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  123. }
  124. ctx.Cloudbrain = job
  125. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  126. log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
  127. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  128. }
  129. }
  130. func AdminOrJobCreaterRight(ctx *context.Context) {
  131. var id = ctx.Params(":id")
  132. job, err := models.GetCloudbrainByID(id)
  133. if err != nil {
  134. log.Error("GetCloudbrainByID failed:%v", err.Error())
  135. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  136. }
  137. ctx.Cloudbrain = job
  138. if !isAdminOrJobCreater(ctx, job, err) {
  139. log.Error("!isAdminOrJobCreater error:%v", err.Error())
  140. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  141. }
  142. }
  143. func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
  144. var jobID = ctx.Params(":jobid")
  145. job, err := models.GetCloudbrainByJobID(jobID)
  146. if err != nil {
  147. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  148. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  149. }
  150. ctx.Cloudbrain = job
  151. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  152. log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
  153. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  154. }
  155. }
  156. func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
  157. var jobID = ctx.Params(":jobid")
  158. job, err := models.GetCloudbrainByJobID(jobID)
  159. if err != nil {
  160. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  161. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  162. }
  163. ctx.Cloudbrain = job
  164. if !isAdminOrJobCreater(ctx, job, err) {
  165. log.Error("!isAdminOrJobCreater errot:%v", err.Error())
  166. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  167. }
  168. }
  169. func AdminOrImageCreaterRight(ctx *context.Context) {
  170. id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
  171. var image *models.Image
  172. if err != nil {
  173. log.Error("Get Image by ID failed:%v", err.Error())
  174. } else {
  175. image, err = models.GetImageByID(id)
  176. if err != nil {
  177. log.Error("Get Image by ID failed:%v", err.Error())
  178. return
  179. }
  180. }
  181. if !isAdminOrImageCreater(ctx, image, err) {
  182. log.Error("!isAdminOrImageCreater error:%v", err.Error())
  183. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  184. }
  185. }
  186. func GenerateTask(req GenerateCloudBrainTaskReq) error {
  187. var versionCount int
  188. if req.JobType == string(models.JobTypeTrain) {
  189. versionCount = 1
  190. }
  191. volumes := []models.Volume{
  192. {
  193. HostPath: models.StHostPath{
  194. Path: req.CodePath,
  195. MountPath: CodeMountPath,
  196. ReadOnly: false,
  197. },
  198. },
  199. {
  200. HostPath: models.StHostPath{
  201. Path: req.ModelPath,
  202. MountPath: ModelMountPath,
  203. ReadOnly: false,
  204. },
  205. },
  206. {
  207. HostPath: models.StHostPath{
  208. Path: req.BenchmarkPath,
  209. MountPath: BenchMarkMountPath,
  210. ReadOnly: true,
  211. },
  212. },
  213. {
  214. HostPath: models.StHostPath{
  215. Path: req.Snn4ImageNetPath,
  216. MountPath: Snn4imagenetMountPath,
  217. ReadOnly: true,
  218. },
  219. },
  220. {
  221. HostPath: models.StHostPath{
  222. Path: req.BrainScorePath,
  223. MountPath: BrainScoreMountPath,
  224. ReadOnly: true,
  225. },
  226. },
  227. {
  228. HostPath: models.StHostPath{
  229. Path: req.ResultPath,
  230. MountPath: ResultPath,
  231. ReadOnly: false,
  232. },
  233. },
  234. }
  235. if len(req.DatasetInfos) == 1 {
  236. volumes = append(volumes, models.Volume{
  237. HostPath: models.StHostPath{
  238. Path: req.DatasetInfos[req.Uuids].DataLocalPath,
  239. MountPath: DataSetMountPath,
  240. ReadOnly: true,
  241. },
  242. })
  243. } else if len(req.DatasetInfos) > 1 {
  244. for _, dataset := range req.DatasetInfos {
  245. volumes = append(volumes, models.Volume{
  246. HostPath: models.StHostPath{
  247. Path: dataset.DataLocalPath,
  248. MountPath: DataSetMountPath + "/" + dataset.Name,
  249. ReadOnly: true,
  250. },
  251. })
  252. }
  253. }
  254. createTime := timeutil.TimeStampNow()
  255. jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
  256. JobName: req.JobName,
  257. RetryCount: 1,
  258. GpuType: req.Spec.QueueCode,
  259. Image: req.Image,
  260. TaskRoles: []models.TaskRole{
  261. {
  262. Name: SubTaskName,
  263. TaskNumber: 1,
  264. MinSucceededTaskCount: 1,
  265. MinFailedTaskCount: 1,
  266. CPUNumber: req.Spec.CpuCores,
  267. GPUNumber: req.Spec.AccCardsNum,
  268. MemoryMB: int(req.Spec.MemGiB * 1024),
  269. ShmMB: int(req.Spec.ShareMemGiB * 1024),
  270. Command: req.Command,
  271. NeedIBDevice: false,
  272. IsMainRole: false,
  273. UseNNI: false,
  274. },
  275. },
  276. Volumes: volumes,
  277. })
  278. if err != nil {
  279. log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
  280. return err
  281. }
  282. if jobResult.Code != Success {
  283. log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
  284. return errors.New(jobResult.Msg)
  285. }
  286. var jobID = jobResult.Payload["jobId"].(string)
  287. err = models.CreateCloudbrain(&models.Cloudbrain{
  288. Status: string(models.JobWaiting),
  289. UserID: req.Ctx.User.ID,
  290. RepoID: req.Ctx.Repo.Repository.ID,
  291. JobID: jobID,
  292. JobName: req.JobName,
  293. DisplayJobName: req.DisplayJobName,
  294. SubTaskName: SubTaskName,
  295. JobType: req.JobType,
  296. Type: models.TypeCloudBrainOne,
  297. Uuid: req.Uuids,
  298. Image: req.Image,
  299. GpuQueue: req.Spec.QueueCode,
  300. ComputeResource: models.GPUResource,
  301. BenchmarkTypeID: req.BenchmarkTypeID,
  302. BenchmarkChildTypeID: req.BenchmarkChildTypeID,
  303. Description: req.Description,
  304. IsLatestVersion: "1",
  305. VersionCount: versionCount,
  306. BranchName: req.BranchName,
  307. BootFile: req.BootFile,
  308. DatasetName: req.DatasetNames,
  309. Parameters: req.Params,
  310. TrainUrl: req.TrainUrl,
  311. ModelName: req.ModelName,
  312. ModelVersion: req.ModelVersion,
  313. CkptName: req.CkptName,
  314. ResultUrl: req.ResultPath,
  315. LabelName: req.LabelName,
  316. CreatedUnix: createTime,
  317. UpdatedUnix: createTime,
  318. CommitID: req.CommitID,
  319. Spec: req.Spec,
  320. })
  321. if err != nil {
  322. return err
  323. }
  324. task, err := models.GetCloudbrainByJobID(jobID)
  325. if err != nil {
  326. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  327. return err
  328. }
  329. stringId := strconv.FormatInt(task.ID, 10)
  330. if IsBenchmarkJob(req.JobType) {
  331. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
  332. } else if string(models.JobTypeTrain) == req.JobType {
  333. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
  334. } else if string(models.JobTypeInference) == req.JobType {
  335. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  336. } else {
  337. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
  338. }
  339. return nil
  340. }
  341. func IsBenchmarkJob(jobType string) bool {
  342. return string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType
  343. }
  344. func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...models.JobType) int64 {
  345. num, err := models.GetWaitingCloudbrainCount(cloudbrainType, computeResource, jobTypes...)
  346. if err != nil {
  347. log.Warn("Get waiting count err", err)
  348. num = 0
  349. }
  350. return num
  351. }
  352. func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
  353. jobName := task.JobName
  354. spec := task.Spec
  355. var datasetInfos map[string]models.DatasetInfo
  356. if task.Uuid != "" {
  357. var err error
  358. datasetInfos, _, err = models.GetDatasetInfo(task.Uuid)
  359. if err != nil {
  360. log.Error("GetDatasetInfo failed:%v", err, ctx.Data["MsgID"])
  361. return err
  362. }
  363. }
  364. volumes := []models.Volume{
  365. {
  366. HostPath: models.StHostPath{
  367. Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
  368. MountPath: CodeMountPath,
  369. ReadOnly: false,
  370. },
  371. },
  372. {
  373. HostPath: models.StHostPath{
  374. Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
  375. MountPath: ModelMountPath,
  376. ReadOnly: false,
  377. },
  378. },
  379. {
  380. HostPath: models.StHostPath{
  381. Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
  382. MountPath: BenchMarkMountPath,
  383. ReadOnly: true,
  384. },
  385. },
  386. {
  387. HostPath: models.StHostPath{
  388. Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
  389. MountPath: Snn4imagenetMountPath,
  390. ReadOnly: true,
  391. },
  392. },
  393. {
  394. HostPath: models.StHostPath{
  395. Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
  396. MountPath: BrainScoreMountPath,
  397. ReadOnly: true,
  398. },
  399. },
  400. }
  401. if datasetInfos != nil {
  402. if len(datasetInfos) == 1 {
  403. volumes = append(volumes, models.Volume{
  404. HostPath: models.StHostPath{
  405. Path: datasetInfos[task.Uuid].DataLocalPath,
  406. MountPath: DataSetMountPath,
  407. ReadOnly: true,
  408. },
  409. })
  410. } else {
  411. for _, dataset := range datasetInfos {
  412. volumes = append(volumes, models.Volume{
  413. HostPath: models.StHostPath{
  414. Path: dataset.DataLocalPath,
  415. MountPath: DataSetMountPath + "/" + dataset.Name,
  416. ReadOnly: true,
  417. },
  418. })
  419. }
  420. }
  421. }
  422. createTime := timeutil.TimeStampNow()
  423. jobResult, err := CreateJob(jobName, models.CreateJobParams{
  424. JobName: jobName,
  425. RetryCount: 1,
  426. GpuType: task.GpuQueue,
  427. Image: task.Image,
  428. TaskRoles: []models.TaskRole{
  429. {
  430. Name: SubTaskName,
  431. TaskNumber: 1,
  432. MinSucceededTaskCount: 1,
  433. MinFailedTaskCount: 1,
  434. CPUNumber: spec.CpuCores,
  435. GPUNumber: spec.AccCardsNum,
  436. MemoryMB: int(spec.MemGiB * 1024),
  437. ShmMB: int(spec.ShareMemGiB * 1024),
  438. Command: GetCloudbrainDebugCommand(), //Command,
  439. NeedIBDevice: false,
  440. IsMainRole: false,
  441. UseNNI: false,
  442. },
  443. },
  444. Volumes: volumes,
  445. })
  446. if err != nil {
  447. log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
  448. return err
  449. }
  450. if jobResult.Code != Success {
  451. log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
  452. return errors.New(jobResult.Msg)
  453. }
  454. var jobID = jobResult.Payload["jobId"].(string)
  455. newTask := &models.Cloudbrain{
  456. Status: string(models.JobWaiting),
  457. UserID: task.UserID,
  458. RepoID: task.RepoID,
  459. JobID: jobID,
  460. JobName: task.JobName,
  461. DisplayJobName: task.DisplayJobName,
  462. SubTaskName: task.SubTaskName,
  463. JobType: task.JobType,
  464. Type: task.Type,
  465. Uuid: task.Uuid,
  466. DatasetName: task.DatasetName,
  467. Image: task.Image,
  468. GpuQueue: task.GpuQueue,
  469. ResourceSpecId: task.ResourceSpecId,
  470. ComputeResource: task.ComputeResource,
  471. CreatedUnix: createTime,
  472. UpdatedUnix: createTime,
  473. BranchName: task.BranchName,
  474. Spec: spec,
  475. }
  476. err = models.RestartCloudbrain(task, newTask)
  477. if err != nil {
  478. log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
  479. return err
  480. }
  481. stringId := strconv.FormatInt(newTask.ID, 10)
  482. *newID = stringId
  483. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
  484. return nil
  485. }
  486. func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec {
  487. for _, specialPool := range SpecialPools.Pools {
  488. if specialPool.ResourceSpec != nil {
  489. if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) {
  490. for _, spec := range specialPool.ResourceSpec {
  491. if resourceSpecId == spec.Id {
  492. return spec
  493. }
  494. }
  495. }
  496. }
  497. }
  498. return nil
  499. }
  500. func DelCloudBrainJob(jobId string) string {
  501. task, err := models.GetCloudbrainByJobID(jobId)
  502. if err != nil {
  503. log.Error("get cloud brain err:", err)
  504. return "cloudbrain.Delete_failed"
  505. }
  506. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  507. log.Error("the job(%s) has not been stopped", task.JobName)
  508. return "cloudbrain.Not_Stopped"
  509. }
  510. err = models.DeleteJob(task)
  511. if err != nil {
  512. log.Error("DeleteJob failed:", err)
  513. return "cloudbrain.Delete_failed"
  514. }
  515. deleteJobStorage(task.JobName)
  516. return ""
  517. }
  518. func deleteJobStorage(jobName string) error {
  519. //delete local
  520. localJobPath := setting.JobPath + jobName
  521. err := os.RemoveAll(localJobPath)
  522. if err != nil {
  523. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  524. }
  525. dirPath := setting.CBCodePathPrefix + jobName + "/"
  526. err = storage.Attachments.DeleteDir(dirPath)
  527. if err != nil {
  528. log.Error("DeleteDir(%s) failed:%v", localJobPath, err)
  529. }
  530. return nil
  531. }
  532. func InitSpecialPool() {
  533. if SpecialPools == nil && setting.SpecialPools != "" {
  534. json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools)
  535. }
  536. }
  537. func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool {
  538. if resourceSpecs == nil || len(resourceSpecs) == 0 {
  539. return true
  540. }
  541. for _, v := range resourceSpecs {
  542. if v.Id == resourceSpecId {
  543. return true
  544. }
  545. }
  546. return false
  547. }
  548. func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool {
  549. for _, v := range pool {
  550. if v.Queue == queue {
  551. return true
  552. }
  553. }
  554. return false
  555. }
  556. func IsElementExist(s []string, str string) bool {
  557. for _, v := range s {
  558. if v == str {
  559. return true
  560. }
  561. }
  562. return false
  563. }