You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 20 kB

2 years ago
4 years ago
4 years ago
4 years ago
3 years ago
2 years ago
4 years ago
4 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
4 years ago
2 years ago
3 years ago
4 years ago
3 years ago
2 years ago
4 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681
  1. package cloudbrain
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "os"
  6. "strconv"
  7. "code.gitea.io/gitea/modules/timeutil"
  8. "code.gitea.io/gitea/modules/storage"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. )
// Constants shared by the cloudbrain task helpers in this package: container
// mount points, benchmark command templates and well-known values used when
// talking to the job service.
const (
	//Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
	//CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"`

	// CommandBenchmark is the shell command for a benchmark run; its output is
	// redirected to /model/benchmark-log.txt.
	CommandBenchmark = `cd /benchmark && bash run_bk.sh >/model/benchmark-log.txt`
	// CodeMountPath is the in-container mount point of the code volume.
	CodeMountPath = "/code"
	// DataSetMountPath is the in-container mount point of the dataset volume(s).
	DataSetMountPath = "/dataset"
	// ModelMountPath is the in-container mount point of the model volume.
	ModelMountPath = "/model"
	// PretrainModelMountPath is the in-container mount point of the pre-trained model volume.
	PretrainModelMountPath = "/pretrainmodel"
	// LogFile is a log file name (not referenced in the visible part of this file).
	LogFile = "log.txt"
	// BenchMarkMountPath is the in-container mount point of the benchmark volume.
	BenchMarkMountPath = "/benchmark"
	// BenchMarkResourceID is presumably the resource spec id used for benchmark jobs — TODO confirm against callers.
	BenchMarkResourceID = 1
	// Snn4imagenetMountPath is the in-container mount point of the snn4imagenet volume.
	Snn4imagenetMountPath = "/snn4imagenet"
	// BrainScoreMountPath is the in-container mount point of the brainscore volume.
	BrainScoreMountPath = "/brainscore"
	// Snn4EcosetMountPath is the in-container mount point of the snn4ecoset volume.
	Snn4EcosetMountPath = "/snn4ecoset"
	// TaskInfoName is a path-like name (not referenced in the visible part of this file).
	TaskInfoName = "/taskInfo"

	// Snn4imagenetCommand is a format template with two %s placeholders
	// (model name, model description).
	Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/pretrainmodel' --modeldescription '%s' >/model/benchmark-log.txt`
	// BrainScoreCommand is a format template with three %s placeholders
	// (the -b, -n and -d arguments of the script).
	BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/pretrainmodel' -d '%s' >/model/benchmark-log.txt`

	// SubTaskName is the name given to the single task role of every job created here.
	SubTaskName = "task1"
	// Success is the result code that CreateJob responses are compared against.
	Success = "S000"
	// DefaultBranchName is a branch name (not referenced in the visible part of this file).
	DefaultBranchName = "master"
	// ResultPath is the in-container mount point of the result volume.
	ResultPath = "/result"
)
// Package-level resource configuration. Only SpecialPools is populated in the
// visible part of this file (by InitSpecialPool); the other variables are
// presumably filled in elsewhere — TODO confirm.
var (
	// ResourceSpecs holds resource specifications (populated outside this view).
	ResourceSpecs *models.ResourceSpecs
	// TrainResourceSpecs holds resource specifications for train jobs (populated outside this view).
	TrainResourceSpecs *models.ResourceSpecs
	// InferenceResourceSpecs holds resource specifications for inference jobs (populated outside this view).
	InferenceResourceSpecs *models.ResourceSpecs
	// SpecialPools is decoded from setting.SpecialPools by InitSpecialPool and
	// stays nil when that setting is empty.
	SpecialPools *models.SpecialPools
)
// GenerateCloudBrainTaskReq bundles everything GenerateTask needs to submit a
// job and to persist the matching models.Cloudbrain record.
type GenerateCloudBrainTaskReq struct {
	// Ctx supplies the acting user, the repository and the MsgID used in logs.
	Ctx *context.Context
	// DisplayJobName is stored on the record and used in notifications.
	DisplayJobName string
	// JobName is the name the job is submitted under.
	JobName string
	// Image is the container image for the job.
	Image string
	// Command is the command run by the job's task role.
	Command string
	// CodePath is the host path mounted at CodeMountPath.
	CodePath string
	// ModelPath is the host path mounted at ModelMountPath.
	ModelPath string
	// BenchmarkPath is the host path mounted read-only at BenchMarkMountPath.
	BenchmarkPath string
	// Snn4ImageNetPath is the host path mounted read-only at Snn4imagenetMountPath.
	Snn4ImageNetPath string
	// Snn4EcosetPath, when non-empty, is mounted read-only at Snn4EcosetMountPath.
	Snn4EcosetPath string
	// BrainScorePath is the host path mounted read-only at BrainScoreMountPath.
	BrainScorePath string
	// JobType selects the version count and the notification action.
	JobType string
	// Description is stored on the record.
	Description string
	// BranchName is stored on the record.
	BranchName string
	// BootFile is stored on the record.
	BootFile string
	// Params is stored on the record as Parameters.
	Params string
	// CommitID is stored on the record.
	CommitID string
	// Uuids is stored on the record as Uuid and is the lookup key into
	// DatasetInfos when exactly one dataset is attached.
	Uuids string
	// DatasetNames is stored on the record as DatasetName.
	DatasetNames string
	// DatasetInfos describes the datasets to mount under DataSetMountPath.
	DatasetInfos map[string]models.DatasetInfo
	// BenchmarkTypeID is stored on the record.
	BenchmarkTypeID int
	// BenchmarkChildTypeID is stored on the record.
	BenchmarkChildTypeID int
	// ResultPath is the host path mounted at the ResultPath mount point; it is
	// also stored on the record as ResultUrl.
	ResultPath string
	// TrainUrl is stored on the record.
	TrainUrl string
	// ModelName is stored on the record.
	ModelName string
	// ModelVersion is stored on the record.
	ModelVersion string
	// CkptName is stored on the record.
	CkptName string
	// LabelName is stored on the record.
	LabelName string
	// PreTrainModelPath is mounted read-only at PretrainModelMountPath when
	// PreTrainModelUrl is non-empty.
	PreTrainModelPath string
	// PreTrainModelUrl enables the pre-trained model mount and is stored on the record.
	PreTrainModelUrl string
	// Spec provides the queue code, CPU, accelerator and memory settings of the job.
	Spec *models.Specification
}
  76. func GetCloudbrainDebugCommand() string {
  77. var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;pip3 install -U "nbclassic>=0.2.8" -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --LabApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" `
  78. return command
  79. }
  80. func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  81. if !ctx.IsSigned {
  82. return false
  83. }
  84. if err != nil {
  85. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
  86. } else {
  87. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  88. }
  89. }
  90. func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
  91. return isAdminOrOwnerOrJobCreater(ctx, job, nil)
  92. }
  93. func CanCreateOrDebugJob(ctx *context.Context) bool {
  94. if !ctx.IsSigned {
  95. return false
  96. }
  97. return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
  98. }
  99. func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
  100. return isAdminOrJobCreater(ctx, job, nil)
  101. }
  102. func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  103. if !ctx.IsSigned {
  104. return false
  105. }
  106. if err != nil {
  107. return ctx.IsUserSiteAdmin()
  108. } else {
  109. return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  110. }
  111. }
  112. func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
  113. if !ctx.IsSigned {
  114. return false
  115. }
  116. if err != nil {
  117. return ctx.IsUserSiteAdmin()
  118. } else {
  119. return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
  120. }
  121. }
  122. func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
  123. var id = ctx.Params(":id")
  124. job, err := GetCloudBrainByIdOrJobId(id)
  125. if err != nil {
  126. log.Error("GetCloudbrainByID failed:%v", err.Error())
  127. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  128. }
  129. ctx.Cloudbrain = job
  130. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  131. log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
  132. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  133. }
  134. }
  135. func AdminOrJobCreaterRight(ctx *context.Context) {
  136. var id = ctx.Params(":id")
  137. job, err := GetCloudBrainByIdOrJobId(id)
  138. if err != nil {
  139. log.Error("GetCloudbrainByID failed:%v", err.Error())
  140. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  141. }
  142. ctx.Cloudbrain = job
  143. if !isAdminOrJobCreater(ctx, job, err) {
  144. log.Error("!isAdminOrJobCreater error:%v", err.Error())
  145. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  146. }
  147. }
  148. func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
  149. var jobID = ctx.Params(":jobid")
  150. job, err := GetCloudBrainByIdOrJobId(jobID)
  151. if err != nil {
  152. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  153. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  154. }
  155. ctx.Cloudbrain = job
  156. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  157. log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
  158. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  159. }
  160. }
  161. func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
  162. var jobID = ctx.Params(":jobid")
  163. job, err := GetCloudBrainByIdOrJobId(jobID)
  164. if err != nil {
  165. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  166. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  167. }
  168. ctx.Cloudbrain = job
  169. if !isAdminOrJobCreater(ctx, job, err) {
  170. log.Error("!isAdminOrJobCreater errot:%v", err.Error())
  171. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  172. }
  173. }
  174. func AdminOrImageCreaterRight(ctx *context.Context) {
  175. id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
  176. var image *models.Image
  177. if err != nil {
  178. log.Error("Get Image by ID failed:%v", err.Error())
  179. } else {
  180. image, err = models.GetImageByID(id)
  181. if err != nil {
  182. log.Error("Get Image by ID failed:%v", err.Error())
  183. return
  184. }
  185. }
  186. if !isAdminOrImageCreater(ctx, image, err) {
  187. log.Error("!isAdminOrImageCreater error:%v", err.Error())
  188. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  189. }
  190. }
  191. func GenerateTask(req GenerateCloudBrainTaskReq) (string, error) {
  192. var versionCount int
  193. if req.JobType == string(models.JobTypeTrain) {
  194. versionCount = 1
  195. }
  196. volumes := []models.Volume{
  197. {
  198. HostPath: models.StHostPath{
  199. Path: req.CodePath,
  200. MountPath: CodeMountPath,
  201. ReadOnly: false,
  202. },
  203. },
  204. {
  205. HostPath: models.StHostPath{
  206. Path: req.ModelPath,
  207. MountPath: ModelMountPath,
  208. ReadOnly: false,
  209. },
  210. },
  211. {
  212. HostPath: models.StHostPath{
  213. Path: req.BenchmarkPath,
  214. MountPath: BenchMarkMountPath,
  215. ReadOnly: true,
  216. },
  217. },
  218. {
  219. HostPath: models.StHostPath{
  220. Path: req.Snn4ImageNetPath,
  221. MountPath: Snn4imagenetMountPath,
  222. ReadOnly: true,
  223. },
  224. },
  225. {
  226. HostPath: models.StHostPath{
  227. Path: req.BrainScorePath,
  228. MountPath: BrainScoreMountPath,
  229. ReadOnly: true,
  230. },
  231. },
  232. {
  233. HostPath: models.StHostPath{
  234. Path: req.ResultPath,
  235. MountPath: ResultPath,
  236. ReadOnly: false,
  237. },
  238. },
  239. }
  240. if req.PreTrainModelUrl != "" { //预训练
  241. volumes = append(volumes, models.Volume{
  242. HostPath: models.StHostPath{
  243. Path: req.PreTrainModelPath,
  244. MountPath: PretrainModelMountPath,
  245. ReadOnly: true,
  246. },
  247. })
  248. }
  249. if req.Snn4EcosetPath != "" { //ecoset benchmark
  250. volumes = append(volumes, models.Volume{
  251. HostPath: models.StHostPath{
  252. Path: req.Snn4EcosetPath,
  253. MountPath: Snn4EcosetMountPath,
  254. ReadOnly: true,
  255. },
  256. })
  257. }
  258. if len(req.DatasetInfos) == 1 {
  259. volumes = append(volumes, models.Volume{
  260. HostPath: models.StHostPath{
  261. Path: req.DatasetInfos[req.Uuids].DataLocalPath,
  262. MountPath: DataSetMountPath,
  263. ReadOnly: true,
  264. },
  265. })
  266. } else if len(req.DatasetInfos) > 1 {
  267. for _, dataset := range req.DatasetInfos {
  268. volumes = append(volumes, models.Volume{
  269. HostPath: models.StHostPath{
  270. Path: dataset.DataLocalPath,
  271. MountPath: DataSetMountPath + "/" + dataset.Name,
  272. ReadOnly: true,
  273. },
  274. })
  275. }
  276. }
  277. createTime := timeutil.TimeStampNow()
  278. jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
  279. JobName: req.JobName,
  280. RetryCount: 1,
  281. GpuType: req.Spec.QueueCode,
  282. Image: req.Image,
  283. TaskRoles: []models.TaskRole{
  284. {
  285. Name: SubTaskName,
  286. TaskNumber: 1,
  287. MinSucceededTaskCount: 1,
  288. MinFailedTaskCount: 1,
  289. CPUNumber: req.Spec.CpuCores,
  290. GPUNumber: req.Spec.AccCardsNum,
  291. MemoryMB: int(req.Spec.MemGiB * 1024),
  292. ShmMB: int(req.Spec.ShareMemGiB * 1024),
  293. Command: req.Command,
  294. NeedIBDevice: false,
  295. IsMainRole: false,
  296. UseNNI: false,
  297. },
  298. },
  299. Volumes: volumes,
  300. })
  301. if err != nil {
  302. log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
  303. return "", err
  304. }
  305. if jobResult.Code != Success {
  306. log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
  307. return "", errors.New(jobResult.Msg)
  308. }
  309. var jobID = jobResult.Payload["jobId"].(string)
  310. err = models.CreateCloudbrain(&models.Cloudbrain{
  311. Status: string(models.JobWaiting),
  312. UserID: req.Ctx.User.ID,
  313. RepoID: req.Ctx.Repo.Repository.ID,
  314. JobID: jobID,
  315. JobName: req.JobName,
  316. DisplayJobName: req.DisplayJobName,
  317. SubTaskName: SubTaskName,
  318. JobType: req.JobType,
  319. Type: models.TypeCloudBrainOne,
  320. Uuid: req.Uuids,
  321. Image: req.Image,
  322. GpuQueue: req.Spec.QueueCode,
  323. ComputeResource: models.GPUResource,
  324. BenchmarkTypeID: req.BenchmarkTypeID,
  325. BenchmarkChildTypeID: req.BenchmarkChildTypeID,
  326. Description: req.Description,
  327. IsLatestVersion: "1",
  328. VersionCount: versionCount,
  329. BranchName: req.BranchName,
  330. BootFile: req.BootFile,
  331. DatasetName: req.DatasetNames,
  332. Parameters: req.Params,
  333. TrainUrl: req.TrainUrl,
  334. ModelName: req.ModelName,
  335. ModelVersion: req.ModelVersion,
  336. CkptName: req.CkptName,
  337. ResultUrl: req.ResultPath,
  338. LabelName: req.LabelName,
  339. PreTrainModelUrl: req.PreTrainModelUrl,
  340. CreatedUnix: createTime,
  341. UpdatedUnix: createTime,
  342. CommitID: req.CommitID,
  343. Spec: req.Spec,
  344. })
  345. if err != nil {
  346. return "", err
  347. }
  348. task, err := models.GetCloudbrainByJobID(jobID)
  349. if err != nil {
  350. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  351. return "", err
  352. }
  353. stringId := strconv.FormatInt(task.ID, 10)
  354. if IsBenchmarkJob(req.JobType) {
  355. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
  356. } else if string(models.JobTypeTrain) == req.JobType {
  357. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
  358. } else if string(models.JobTypeInference) == req.JobType {
  359. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  360. } else {
  361. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
  362. }
  363. return jobID, nil
  364. }
  365. func IsBenchmarkJob(jobType string) bool {
  366. return string(models.JobTypeModelSafety) == jobType || string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType || string(models.JobTypeSnn4Ecoset) == jobType
  367. }
  368. func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...models.JobType) int64 {
  369. num, err := models.GetWaitingCloudbrainCount(cloudbrainType, computeResource, jobTypes...)
  370. if err != nil {
  371. log.Warn("Get waiting count err", err)
  372. num = 0
  373. }
  374. return num
  375. }
  376. func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
  377. jobName := task.JobName
  378. spec := task.Spec
  379. var datasetInfos map[string]models.DatasetInfo
  380. if task.Uuid != "" {
  381. var err error
  382. datasetInfos, _, err = models.GetDatasetInfo(task.Uuid)
  383. if err != nil {
  384. log.Error("GetDatasetInfo failed:%v", err, ctx.Data["MsgID"])
  385. return err
  386. }
  387. }
  388. volumes := []models.Volume{
  389. {
  390. HostPath: models.StHostPath{
  391. Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
  392. MountPath: CodeMountPath,
  393. ReadOnly: false,
  394. },
  395. },
  396. {
  397. HostPath: models.StHostPath{
  398. Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
  399. MountPath: ModelMountPath,
  400. ReadOnly: false,
  401. },
  402. },
  403. {
  404. HostPath: models.StHostPath{
  405. Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
  406. MountPath: BenchMarkMountPath,
  407. ReadOnly: true,
  408. },
  409. },
  410. {
  411. HostPath: models.StHostPath{
  412. Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
  413. MountPath: Snn4imagenetMountPath,
  414. ReadOnly: true,
  415. },
  416. },
  417. {
  418. HostPath: models.StHostPath{
  419. Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
  420. MountPath: BrainScoreMountPath,
  421. ReadOnly: true,
  422. },
  423. },
  424. }
  425. if datasetInfos != nil {
  426. if len(datasetInfos) == 1 {
  427. volumes = append(volumes, models.Volume{
  428. HostPath: models.StHostPath{
  429. Path: datasetInfos[task.Uuid].DataLocalPath,
  430. MountPath: DataSetMountPath,
  431. ReadOnly: true,
  432. },
  433. })
  434. } else {
  435. for _, dataset := range datasetInfos {
  436. volumes = append(volumes, models.Volume{
  437. HostPath: models.StHostPath{
  438. Path: dataset.DataLocalPath,
  439. MountPath: DataSetMountPath + "/" + dataset.Name,
  440. ReadOnly: true,
  441. },
  442. })
  443. }
  444. }
  445. }
  446. createTime := timeutil.TimeStampNow()
  447. jobResult, err := CreateJob(jobName, models.CreateJobParams{
  448. JobName: jobName,
  449. RetryCount: 1,
  450. GpuType: task.GpuQueue,
  451. Image: task.Image,
  452. TaskRoles: []models.TaskRole{
  453. {
  454. Name: SubTaskName,
  455. TaskNumber: 1,
  456. MinSucceededTaskCount: 1,
  457. MinFailedTaskCount: 1,
  458. CPUNumber: spec.CpuCores,
  459. GPUNumber: spec.AccCardsNum,
  460. MemoryMB: int(spec.MemGiB * 1024),
  461. ShmMB: int(spec.ShareMemGiB * 1024),
  462. Command: GetCloudbrainDebugCommand(), //Command,
  463. NeedIBDevice: false,
  464. IsMainRole: false,
  465. UseNNI: false,
  466. },
  467. },
  468. Volumes: volumes,
  469. })
  470. if err != nil {
  471. log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
  472. return err
  473. }
  474. if jobResult.Code != Success {
  475. log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
  476. return errors.New(jobResult.Msg)
  477. }
  478. var jobID = jobResult.Payload["jobId"].(string)
  479. newTask := &models.Cloudbrain{
  480. Status: string(models.JobWaiting),
  481. UserID: task.UserID,
  482. RepoID: task.RepoID,
  483. JobID: jobID,
  484. JobName: task.JobName,
  485. DisplayJobName: task.DisplayJobName,
  486. SubTaskName: task.SubTaskName,
  487. JobType: task.JobType,
  488. Type: task.Type,
  489. Uuid: task.Uuid,
  490. DatasetName: task.DatasetName,
  491. Image: task.Image,
  492. GpuQueue: task.GpuQueue,
  493. ResourceSpecId: task.ResourceSpecId,
  494. ComputeResource: task.ComputeResource,
  495. CreatedUnix: createTime,
  496. UpdatedUnix: createTime,
  497. BranchName: task.BranchName,
  498. Spec: spec,
  499. }
  500. err = models.RestartCloudbrain(task, newTask)
  501. if err != nil {
  502. log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
  503. return err
  504. }
  505. stringId := strconv.FormatInt(newTask.ID, 10)
  506. *newID = stringId
  507. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
  508. return nil
  509. }
  510. func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec {
  511. for _, specialPool := range SpecialPools.Pools {
  512. if specialPool.ResourceSpec != nil {
  513. if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) {
  514. for _, spec := range specialPool.ResourceSpec {
  515. if resourceSpecId == spec.Id {
  516. return spec
  517. }
  518. }
  519. }
  520. }
  521. }
  522. return nil
  523. }
  524. func DelCloudBrainJob(jobId string) string {
  525. task, err := models.GetCloudbrainByJobID(jobId)
  526. if err != nil {
  527. log.Error("get cloud brain err:", err)
  528. return "cloudbrain.Delete_failed"
  529. }
  530. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  531. log.Error("the job(%s) has not been stopped", task.JobName)
  532. return "cloudbrain.Not_Stopped"
  533. }
  534. err = models.DeleteJob(task)
  535. if err != nil {
  536. log.Error("DeleteJob failed:", err)
  537. return "cloudbrain.Delete_failed"
  538. }
  539. deleteJobStorage(task.JobName)
  540. return ""
  541. }
  542. func deleteJobStorage(jobName string) error {
  543. //delete local
  544. localJobPath := setting.JobPath + jobName
  545. err := os.RemoveAll(localJobPath)
  546. if err != nil {
  547. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  548. }
  549. dirPath := setting.CBCodePathPrefix + jobName + "/"
  550. err = storage.Attachments.DeleteDir(dirPath)
  551. if err != nil {
  552. log.Error("DeleteDir(%s) failed:%v", localJobPath, err)
  553. }
  554. return nil
  555. }
  556. func InitSpecialPool() {
  557. if SpecialPools == nil && setting.SpecialPools != "" {
  558. json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools)
  559. }
  560. }
  561. func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool {
  562. if resourceSpecs == nil || len(resourceSpecs) == 0 {
  563. return true
  564. }
  565. for _, v := range resourceSpecs {
  566. if v.Id == resourceSpecId {
  567. return true
  568. }
  569. }
  570. return false
  571. }
  572. func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool {
  573. for _, v := range pool {
  574. if v.Queue == queue {
  575. return true
  576. }
  577. }
  578. return false
  579. }
  580. func IsElementExist(s []string, str string) bool {
  581. for _, v := range s {
  582. if v == str {
  583. return true
  584. }
  585. }
  586. return false
  587. }
  588. func GetCloudBrainByIdOrJobId(id string) (*models.Cloudbrain, error) {
  589. _, err := strconv.ParseInt(id, 10, 64)
  590. var job *models.Cloudbrain
  591. if err != nil {
  592. job, err = models.GetCloudbrainByJobID(id)
  593. } else {
  594. job, err = models.GetCloudbrainByID(id)
  595. if err != nil {
  596. job, err = models.GetCloudbrainByJobID(id)
  597. }
  598. }
  599. return job, err
  600. }