You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 20 kB

2 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698
  1. package cloudbrain
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "os"
  6. "strconv"
  7. "code.gitea.io/gitea/modules/timeutil"
  8. "code.gitea.io/gitea/modules/storage"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. )
// Package-level constants: container commands, container mount points and
// fixed identifiers used when building cloudbrain jobs.
const (
	// Older command variants, kept commented out for reference.
	//Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
	//CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"`

	// CommandBenchmark is the shell command line that runs run_bk.sh from
	// /benchmark, bracketed by start/end echo markers.
	CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"`

	// Mount points inside the job container; GenerateTask and RestartTask use
	// them as the MountPath of the corresponding host volumes.
	CodeMountPath    = "/code"
	DataSetMountPath = "/dataset"
	ModelMountPath   = "/model"

	// LogFile is a log file name; it is not referenced in this part of the
	// file (presumably consumed by callers elsewhere — confirm).
	LogFile = "log.txt"

	// BenchMarkMountPath is the container mount point of the benchmark volume.
	BenchMarkMountPath = "/benchmark"
	// BenchMarkResourceID is not referenced in this part of the file.
	BenchMarkResourceID = 1

	// Container mount points of the Snn4imagenet and BrainScore volumes.
	Snn4imagenetMountPath = "/snn4imagenet"
	BrainScoreMountPath   = "/brainscore"

	// TaskInfoName is not referenced in this part of the file.
	TaskInfoName = "/taskInfo"

	// Snn4imagenetCommand is a format template with two %s placeholders
	// (--modelname and --modeldescription).
	Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s'`
	// BrainScoreCommand is a format template with three %s placeholders
	// (-b, -n and -d).
	BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s'`

	// SubTaskName is the name given to the single task role of every job
	// created by GenerateTask and RestartTask.
	SubTaskName = "task1"

	// Success is the result code that CreateJob responses are compared
	// against; any other code is treated as a failure.
	Success = "S000"

	// DefaultBranchName is not referenced in this part of the file.
	DefaultBranchName = "master"

	// ResultPath is the container mount point of the result volume.
	ResultPath = "/result"
)
// Package-level caches of resource configuration. Each one starts nil and is
// filled on first use by decoding the matching JSON string from the setting
// package (see GenerateTask, RestartTask and InitSpecialPool).
var (
	// ResourceSpecs is the spec table used for job types other than train
	// and inference, and by RestartTask.
	ResourceSpecs *models.ResourceSpecs
	// TrainResourceSpecs is the spec table used for train jobs.
	TrainResourceSpecs *models.ResourceSpecs
	// InferenceResourceSpecs is the spec table used for inference jobs.
	InferenceResourceSpecs *models.ResourceSpecs
	// SpecialPools is the fallback consulted when no spec matches in the
	// regular tables; it is populated by InitSpecialPool.
	SpecialPools *models.SpecialPools
)
// GenerateCloudBrainTaskReq bundles everything GenerateTask needs to submit a
// job through CreateJob and to persist the matching models.Cloudbrain record.
type GenerateCloudBrainTaskReq struct {
	// Ctx is the request context; GenerateTask reads the user, the
	// repository and Data["MsgID"] from it.
	Ctx            *context.Context
	DisplayJobName string
	JobName        string
	Image          string
	// Command is the command line run by the job's single task role.
	Command string

	// Host paths mounted into the container at the corresponding
	// *MountPath constants.
	CodePath         string
	ModelPath        string
	BenchmarkPath    string
	Snn4ImageNetPath string
	BrainScorePath   string

	// JobType selects which resource spec table GenerateTask searches and
	// which notification action it emits.
	JobType string
	// GpuQueue is passed to CreateJob as the GPU type and is also used to
	// match special resource pools.
	GpuQueue    string
	Description string
	BranchName  string
	BootFile    string
	Params      string
	CommitID    string

	// Uuids is stored as the record's Uuid and, when DatasetInfos holds
	// exactly one entry, is used as the key into DatasetInfos.
	Uuids        string
	DatasetNames string
	// DatasetInfos lists the datasets to mount read-only under
	// DataSetMountPath.
	DatasetInfos map[string]models.DatasetInfo

	BenchmarkTypeID      int
	BenchmarkChildTypeID int
	// ResourceSpecId is the id looked up in the resource spec tables.
	ResourceSpecId int
	// ResultPath is the host path mounted at the ResultPath constant and
	// stored as the record's ResultUrl.
	ResultPath   string
	TrainUrl     string
	ModelName    string
	ModelVersion string
	CkptName     string
	LabelName    string
}
  72. func GetCloudbrainDebugCommand() string {
  73. var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;pip3 install -U "nbclassic>=0.2.8" -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --LabApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" `
  74. return command
  75. }
  76. func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  77. if !ctx.IsSigned {
  78. return false
  79. }
  80. if err != nil {
  81. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
  82. } else {
  83. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  84. }
  85. }
  86. func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
  87. return isAdminOrOwnerOrJobCreater(ctx, job, nil)
  88. }
  89. func CanCreateOrDebugJob(ctx *context.Context) bool {
  90. if !ctx.IsSigned {
  91. return false
  92. }
  93. return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
  94. }
  95. func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
  96. return isAdminOrJobCreater(ctx, job, nil)
  97. }
  98. func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  99. if !ctx.IsSigned {
  100. return false
  101. }
  102. if err != nil {
  103. return ctx.IsUserSiteAdmin()
  104. } else {
  105. return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  106. }
  107. }
  108. func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
  109. if !ctx.IsSigned {
  110. return false
  111. }
  112. if err != nil {
  113. return ctx.IsUserSiteAdmin()
  114. } else {
  115. return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
  116. }
  117. }
  118. func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
  119. var id = ctx.Params(":id")
  120. job, err := models.GetCloudbrainByID(id)
  121. if err != nil {
  122. log.Error("GetCloudbrainByID failed:%v", err.Error())
  123. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  124. }
  125. ctx.Cloudbrain = job
  126. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  127. log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
  128. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  129. }
  130. }
  131. func AdminOrJobCreaterRight(ctx *context.Context) {
  132. var id = ctx.Params(":id")
  133. job, err := models.GetCloudbrainByID(id)
  134. if err != nil {
  135. log.Error("GetCloudbrainByID failed:%v", err.Error())
  136. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  137. }
  138. ctx.Cloudbrain = job
  139. if !isAdminOrJobCreater(ctx, job, err) {
  140. log.Error("!isAdminOrJobCreater error:%v", err.Error())
  141. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  142. }
  143. }
  144. func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
  145. var jobID = ctx.Params(":jobid")
  146. job, err := models.GetCloudbrainByJobID(jobID)
  147. if err != nil {
  148. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  149. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  150. }
  151. ctx.Cloudbrain = job
  152. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  153. log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
  154. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  155. }
  156. }
  157. func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
  158. var jobID = ctx.Params(":jobid")
  159. job, err := models.GetCloudbrainByJobID(jobID)
  160. if err != nil {
  161. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  162. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  163. }
  164. ctx.Cloudbrain = job
  165. if !isAdminOrJobCreater(ctx, job, err) {
  166. log.Error("!isAdminOrJobCreater errot:%v", err.Error())
  167. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  168. }
  169. }
  170. func AdminOrImageCreaterRight(ctx *context.Context) {
  171. id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
  172. var image *models.Image
  173. if err != nil {
  174. log.Error("Get Image by ID failed:%v", err.Error())
  175. } else {
  176. image, err = models.GetImageByID(id)
  177. if err != nil {
  178. log.Error("Get Image by ID failed:%v", err.Error())
  179. return
  180. }
  181. }
  182. if !isAdminOrImageCreater(ctx, image, err) {
  183. log.Error("!isAdminOrImageCreater error:%v", err.Error())
  184. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  185. }
  186. }
  187. func GenerateTask(req GenerateCloudBrainTaskReq) error {
  188. var resourceSpec *models.ResourceSpec
  189. var versionCount int
  190. if req.JobType == string(models.JobTypeTrain) {
  191. versionCount = 1
  192. if TrainResourceSpecs == nil {
  193. json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
  194. }
  195. for _, spec := range TrainResourceSpecs.ResourceSpec {
  196. if req.ResourceSpecId == spec.Id {
  197. resourceSpec = spec
  198. break
  199. }
  200. }
  201. } else if req.JobType == string(models.JobTypeInference) {
  202. if InferenceResourceSpecs == nil {
  203. json.Unmarshal([]byte(setting.InferenceResourceSpecs), &InferenceResourceSpecs)
  204. }
  205. for _, spec := range InferenceResourceSpecs.ResourceSpec {
  206. if req.ResourceSpecId == spec.Id {
  207. resourceSpec = spec
  208. break
  209. }
  210. }
  211. } else {
  212. if ResourceSpecs == nil {
  213. json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
  214. }
  215. for _, spec := range ResourceSpecs.ResourceSpec {
  216. if req.ResourceSpecId == spec.Id {
  217. resourceSpec = spec
  218. break
  219. }
  220. }
  221. }
  222. //如果没有匹配到spec信息,尝试从专属资源池获取
  223. if resourceSpec == nil && SpecialPools != nil {
  224. resourceSpec = geMatchResourceSpec(req.JobType, req.GpuQueue, req.ResourceSpecId)
  225. }
  226. if resourceSpec == nil {
  227. log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"])
  228. return errors.New("no such resourceSpec")
  229. }
  230. volumes := []models.Volume{
  231. {
  232. HostPath: models.StHostPath{
  233. Path: req.CodePath,
  234. MountPath: CodeMountPath,
  235. ReadOnly: false,
  236. },
  237. },
  238. {
  239. HostPath: models.StHostPath{
  240. Path: req.ModelPath,
  241. MountPath: ModelMountPath,
  242. ReadOnly: false,
  243. },
  244. },
  245. {
  246. HostPath: models.StHostPath{
  247. Path: req.BenchmarkPath,
  248. MountPath: BenchMarkMountPath,
  249. ReadOnly: true,
  250. },
  251. },
  252. {
  253. HostPath: models.StHostPath{
  254. Path: req.Snn4ImageNetPath,
  255. MountPath: Snn4imagenetMountPath,
  256. ReadOnly: true,
  257. },
  258. },
  259. {
  260. HostPath: models.StHostPath{
  261. Path: req.BrainScorePath,
  262. MountPath: BrainScoreMountPath,
  263. ReadOnly: true,
  264. },
  265. },
  266. {
  267. HostPath: models.StHostPath{
  268. Path: req.ResultPath,
  269. MountPath: ResultPath,
  270. ReadOnly: false,
  271. },
  272. },
  273. }
  274. if len(req.DatasetInfos) == 1 {
  275. volumes = append(volumes, models.Volume{
  276. HostPath: models.StHostPath{
  277. Path: req.DatasetInfos[req.Uuids].DataLocalPath,
  278. MountPath: DataSetMountPath,
  279. ReadOnly: true,
  280. },
  281. })
  282. } else if len(req.DatasetInfos) > 1 {
  283. for _, dataset := range req.DatasetInfos {
  284. volumes = append(volumes, models.Volume{
  285. HostPath: models.StHostPath{
  286. Path: dataset.DataLocalPath,
  287. MountPath: DataSetMountPath + "/" + dataset.Name,
  288. ReadOnly: true,
  289. },
  290. })
  291. }
  292. }
  293. createTime := timeutil.TimeStampNow()
  294. jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
  295. JobName: req.JobName,
  296. RetryCount: 1,
  297. GpuType: req.GpuQueue,
  298. Image: req.Image,
  299. TaskRoles: []models.TaskRole{
  300. {
  301. Name: SubTaskName,
  302. TaskNumber: 1,
  303. MinSucceededTaskCount: 1,
  304. MinFailedTaskCount: 1,
  305. CPUNumber: resourceSpec.CpuNum,
  306. GPUNumber: resourceSpec.GpuNum,
  307. MemoryMB: resourceSpec.MemMiB,
  308. ShmMB: resourceSpec.ShareMemMiB,
  309. Command: req.Command,
  310. NeedIBDevice: false,
  311. IsMainRole: false,
  312. UseNNI: false,
  313. },
  314. },
  315. Volumes: volumes,
  316. })
  317. if err != nil {
  318. log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
  319. return err
  320. }
  321. if jobResult.Code != Success {
  322. log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
  323. return errors.New(jobResult.Msg)
  324. }
  325. var jobID = jobResult.Payload["jobId"].(string)
  326. err = models.CreateCloudbrain(&models.Cloudbrain{
  327. Status: string(models.JobWaiting),
  328. UserID: req.Ctx.User.ID,
  329. RepoID: req.Ctx.Repo.Repository.ID,
  330. JobID: jobID,
  331. JobName: req.JobName,
  332. DisplayJobName: req.DisplayJobName,
  333. SubTaskName: SubTaskName,
  334. JobType: req.JobType,
  335. Type: models.TypeCloudBrainOne,
  336. Uuid: req.Uuids,
  337. Image: req.Image,
  338. GpuQueue: req.GpuQueue,
  339. ResourceSpecId: req.ResourceSpecId,
  340. ComputeResource: models.GPUResource,
  341. BenchmarkTypeID: req.BenchmarkTypeID,
  342. BenchmarkChildTypeID: req.BenchmarkChildTypeID,
  343. Description: req.Description,
  344. IsLatestVersion: "1",
  345. VersionCount: versionCount,
  346. BranchName: req.BranchName,
  347. BootFile: req.BootFile,
  348. DatasetName: req.DatasetNames,
  349. Parameters: req.Params,
  350. TrainUrl: req.TrainUrl,
  351. ModelName: req.ModelName,
  352. ModelVersion: req.ModelVersion,
  353. CkptName: req.CkptName,
  354. ResultUrl: req.ResultPath,
  355. LabelName: req.LabelName,
  356. CreatedUnix: createTime,
  357. UpdatedUnix: createTime,
  358. CommitID: req.CommitID,
  359. })
  360. if err != nil {
  361. return err
  362. }
  363. task, err := models.GetCloudbrainByJobID(jobID)
  364. if err != nil {
  365. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  366. return err
  367. }
  368. stringId := strconv.FormatInt(task.ID, 10)
  369. if IsBenchmarkJob(req.JobType) {
  370. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
  371. } else if string(models.JobTypeTrain) == req.JobType {
  372. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
  373. } else if string(models.JobTypeInference) == req.JobType {
  374. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  375. } else {
  376. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
  377. }
  378. return nil
  379. }
  380. func IsBenchmarkJob(jobType string) bool {
  381. return string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType
  382. }
  383. func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...models.JobType) int64 {
  384. num, err := models.GetWaitingCloudbrainCount(cloudbrainType, computeResource, jobTypes...)
  385. if err != nil {
  386. log.Warn("Get waiting count err", err)
  387. num = 0
  388. }
  389. return num
  390. }
  391. func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
  392. jobName := task.JobName
  393. var resourceSpec *models.ResourceSpec
  394. if ResourceSpecs == nil {
  395. json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
  396. }
  397. for _, spec := range ResourceSpecs.ResourceSpec {
  398. if task.ResourceSpecId == spec.Id {
  399. resourceSpec = spec
  400. }
  401. }
  402. //如果没有匹配到spec信息,尝试从专属资源池获取
  403. if resourceSpec == nil && SpecialPools != nil {
  404. resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId)
  405. }
  406. if resourceSpec == nil {
  407. log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"])
  408. return errors.New("no such resourceSpec")
  409. }
  410. var datasetInfos map[string]models.DatasetInfo
  411. if task.Uuid != "" {
  412. var err error
  413. datasetInfos, _, err = models.GetDatasetInfo(task.Uuid)
  414. if err != nil {
  415. log.Error("GetDatasetInfo failed:%v", err, ctx.Data["MsgID"])
  416. return err
  417. }
  418. }
  419. volumes := []models.Volume{
  420. {
  421. HostPath: models.StHostPath{
  422. Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
  423. MountPath: CodeMountPath,
  424. ReadOnly: false,
  425. },
  426. },
  427. {
  428. HostPath: models.StHostPath{
  429. Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
  430. MountPath: ModelMountPath,
  431. ReadOnly: false,
  432. },
  433. },
  434. {
  435. HostPath: models.StHostPath{
  436. Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
  437. MountPath: BenchMarkMountPath,
  438. ReadOnly: true,
  439. },
  440. },
  441. {
  442. HostPath: models.StHostPath{
  443. Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
  444. MountPath: Snn4imagenetMountPath,
  445. ReadOnly: true,
  446. },
  447. },
  448. {
  449. HostPath: models.StHostPath{
  450. Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
  451. MountPath: BrainScoreMountPath,
  452. ReadOnly: true,
  453. },
  454. },
  455. }
  456. if datasetInfos != nil {
  457. if len(datasetInfos) == 1 {
  458. volumes = append(volumes, models.Volume{
  459. HostPath: models.StHostPath{
  460. Path: datasetInfos[task.Uuid].DataLocalPath,
  461. MountPath: DataSetMountPath,
  462. ReadOnly: true,
  463. },
  464. })
  465. } else {
  466. for _, dataset := range datasetInfos {
  467. volumes = append(volumes, models.Volume{
  468. HostPath: models.StHostPath{
  469. Path: dataset.DataLocalPath,
  470. MountPath: DataSetMountPath + "/" + dataset.Name,
  471. ReadOnly: true,
  472. },
  473. })
  474. }
  475. }
  476. }
  477. createTime := timeutil.TimeStampNow()
  478. jobResult, err := CreateJob(jobName, models.CreateJobParams{
  479. JobName: jobName,
  480. RetryCount: 1,
  481. GpuType: task.GpuQueue,
  482. Image: task.Image,
  483. TaskRoles: []models.TaskRole{
  484. {
  485. Name: SubTaskName,
  486. TaskNumber: 1,
  487. MinSucceededTaskCount: 1,
  488. MinFailedTaskCount: 1,
  489. CPUNumber: resourceSpec.CpuNum,
  490. GPUNumber: resourceSpec.GpuNum,
  491. MemoryMB: resourceSpec.MemMiB,
  492. ShmMB: resourceSpec.ShareMemMiB,
  493. Command: GetCloudbrainDebugCommand(), //Command,
  494. NeedIBDevice: false,
  495. IsMainRole: false,
  496. UseNNI: false,
  497. },
  498. },
  499. Volumes: volumes,
  500. })
  501. if err != nil {
  502. log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
  503. return err
  504. }
  505. if jobResult.Code != Success {
  506. log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
  507. return errors.New(jobResult.Msg)
  508. }
  509. var jobID = jobResult.Payload["jobId"].(string)
  510. newTask := &models.Cloudbrain{
  511. Status: string(models.JobWaiting),
  512. UserID: task.UserID,
  513. RepoID: task.RepoID,
  514. JobID: jobID,
  515. JobName: task.JobName,
  516. DisplayJobName: task.DisplayJobName,
  517. SubTaskName: task.SubTaskName,
  518. JobType: task.JobType,
  519. Type: task.Type,
  520. Uuid: task.Uuid,
  521. DatasetName: task.DatasetName,
  522. Image: task.Image,
  523. GpuQueue: task.GpuQueue,
  524. ResourceSpecId: task.ResourceSpecId,
  525. ComputeResource: task.ComputeResource,
  526. CreatedUnix: createTime,
  527. UpdatedUnix: createTime,
  528. BranchName: task.BranchName,
  529. }
  530. err = models.RestartCloudbrain(task, newTask)
  531. if err != nil {
  532. log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
  533. return err
  534. }
  535. stringId := strconv.FormatInt(newTask.ID, 10)
  536. *newID = stringId
  537. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
  538. return nil
  539. }
  540. func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec {
  541. for _, specialPool := range SpecialPools.Pools {
  542. if specialPool.ResourceSpec != nil {
  543. if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) {
  544. for _, spec := range specialPool.ResourceSpec {
  545. if resourceSpecId == spec.Id {
  546. return spec
  547. }
  548. }
  549. }
  550. }
  551. }
  552. return nil
  553. }
  554. func DelCloudBrainJob(jobId string) string {
  555. task, err := models.GetCloudbrainByJobID(jobId)
  556. if err != nil {
  557. log.Error("get cloud brain err:", err)
  558. return "cloudbrain.Delete_failed"
  559. }
  560. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  561. log.Error("the job(%s) has not been stopped", task.JobName)
  562. return "cloudbrain.Not_Stopped"
  563. }
  564. err = models.DeleteJob(task)
  565. if err != nil {
  566. log.Error("DeleteJob failed:", err)
  567. return "cloudbrain.Delete_failed"
  568. }
  569. deleteJobStorage(task.JobName)
  570. return ""
  571. }
  572. func deleteJobStorage(jobName string) error {
  573. //delete local
  574. localJobPath := setting.JobPath + jobName
  575. err := os.RemoveAll(localJobPath)
  576. if err != nil {
  577. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  578. }
  579. dirPath := setting.CBCodePathPrefix + jobName + "/"
  580. err = storage.Attachments.DeleteDir(dirPath)
  581. if err != nil {
  582. log.Error("DeleteDir(%s) failed:%v", localJobPath, err)
  583. }
  584. return nil
  585. }
  586. func InitSpecialPool() {
  587. if SpecialPools == nil && setting.SpecialPools != "" {
  588. json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools)
  589. }
  590. }
  591. func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool {
  592. if resourceSpecs == nil || len(resourceSpecs) == 0 {
  593. return true
  594. }
  595. for _, v := range resourceSpecs {
  596. if v.Id == resourceSpecId {
  597. return true
  598. }
  599. }
  600. return false
  601. }
  602. func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool {
  603. for _, v := range pool {
  604. if v.Queue == queue {
  605. return true
  606. }
  607. }
  608. return false
  609. }
  610. func IsElementExist(s []string, str string) bool {
  611. for _, v := range s {
  612. if v == str {
  613. return true
  614. }
  615. }
  616. return false
  617. }