You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 20 kB

2 years ago
2 years ago
4 years ago
4 years ago
4 years ago
3 years ago
2 years ago
4 years ago
4 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
4 years ago
2 years ago
3 years ago
4 years ago
3 years ago
2 years ago
4 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719
  1. package cloudbrain
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "os"
  6. "strconv"
  7. "strings"
  8. "code.gitea.io/gitea/modules/timeutil"
  9. "code.gitea.io/gitea/modules/storage"
  10. "code.gitea.io/gitea/models"
  11. "code.gitea.io/gitea/modules/context"
  12. "code.gitea.io/gitea/modules/log"
  13. "code.gitea.io/gitea/modules/notification"
  14. "code.gitea.io/gitea/modules/setting"
  15. )
  16. const (
  17. //Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
  18. //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"`
  19. CommandBenchmark = `cd /benchmark && bash run_bk.sh >/model/benchmark-log.txt`
  20. CodeMountPath = "/code"
  21. DataSetMountPath = "/dataset"
  22. ModelMountPath = "/model"
  23. PretrainModelMountPath = "/pretrainmodel"
  24. LogFile = "log.txt"
  25. BenchMarkMountPath = "/benchmark"
  26. BenchMarkResourceID = 1
  27. Snn4imagenetMountPath = "/snn4imagenet"
  28. BrainScoreMountPath = "/brainscore"
  29. TaskInfoName = "/taskInfo"
  30. Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s' >/model/benchmark-log.txt`
  31. BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s' >/model/benchmark-log.txt`
  32. SubTaskName = "task1"
  33. Success = "S000"
  34. DefaultBranchName = "master"
  35. ResultPath = "/result"
  36. )
  37. var (
  38. ResourceSpecs *models.ResourceSpecs
  39. TrainResourceSpecs *models.ResourceSpecs
  40. InferenceResourceSpecs *models.ResourceSpecs
  41. SpecialPools *models.SpecialPools
  42. )
  43. type GenerateCloudBrainTaskReq struct {
  44. Ctx *context.Context
  45. DisplayJobName string
  46. JobName string
  47. Image string
  48. Command string
  49. CodePath string
  50. ModelPath string
  51. BenchmarkPath string
  52. Snn4ImageNetPath string
  53. BrainScorePath string
  54. JobType string
  55. Description string
  56. BranchName string
  57. BootFile string
  58. Params string
  59. CommitID string
  60. Uuids string
  61. DatasetNames string
  62. DatasetInfos map[string]models.DatasetInfo
  63. BenchmarkTypeID int
  64. BenchmarkChildTypeID int
  65. ResultPath string
  66. TrainUrl string
  67. ModelName string
  68. ModelVersion string
  69. CkptName string
  70. LabelName string
  71. PreTrainModelPath string
  72. PreTrainModelUrl string
  73. Spec *models.Specification
  74. }
  75. func GetCloudbrainDebugCommand() string {
  76. var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;pip3 install -U "nbclassic>=0.2.8" -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --LabApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" `
  77. return command
  78. }
  79. func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  80. if !ctx.IsSigned {
  81. return false
  82. }
  83. if err != nil {
  84. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
  85. } else {
  86. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  87. }
  88. }
  89. func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
  90. return isAdminOrOwnerOrJobCreater(ctx, job, nil)
  91. }
  92. func CanCreateOrDebugJob(ctx *context.Context) bool {
  93. if !ctx.IsSigned {
  94. return false
  95. }
  96. return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
  97. }
  98. func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
  99. return isAdminOrJobCreater(ctx, job, nil)
  100. }
  101. func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  102. if !ctx.IsSigned {
  103. return false
  104. }
  105. if err != nil {
  106. return ctx.IsUserSiteAdmin()
  107. } else {
  108. return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  109. }
  110. }
  111. func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
  112. if !ctx.IsSigned {
  113. return false
  114. }
  115. if err != nil {
  116. return ctx.IsUserSiteAdmin()
  117. } else {
  118. return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
  119. }
  120. }
  121. func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
  122. var id = ctx.Params(":id")
  123. job, err := GetCloudBrainByIdOrJobId(id, "id")
  124. if err != nil {
  125. log.Error("GetCloudbrainByID failed:%v", err.Error())
  126. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  127. }
  128. ctx.Cloudbrain = job
  129. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  130. log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
  131. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  132. }
  133. }
  134. func AdminOrJobCreaterRight(ctx *context.Context) {
  135. var id = ctx.Params(":id")
  136. job, err := GetCloudBrainByIdOrJobId(id, "id")
  137. if err != nil {
  138. log.Error("GetCloudbrainByID failed:%v", err.Error())
  139. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  140. }
  141. ctx.Cloudbrain = job
  142. if !isAdminOrJobCreater(ctx, job, err) {
  143. log.Error("!isAdminOrJobCreater error:%v", err.Error())
  144. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  145. }
  146. }
  147. func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
  148. var jobID = ctx.Params(":jobid")
  149. job, err := GetCloudBrainByIdOrJobId(jobID, "jobid")
  150. if err != nil {
  151. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  152. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  153. }
  154. ctx.Cloudbrain = job
  155. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  156. log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
  157. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  158. }
  159. }
  160. func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
  161. var jobID = ctx.Params(":jobid")
  162. job, err := GetCloudBrainByIdOrJobId(jobID, "jobid")
  163. if err != nil {
  164. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  165. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  166. }
  167. ctx.Cloudbrain = job
  168. if !isAdminOrJobCreater(ctx, job, err) {
  169. log.Error("!isAdminOrJobCreater errot:%v", err.Error())
  170. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  171. }
  172. }
  173. func AdminOrImageCreaterRight(ctx *context.Context) {
  174. id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
  175. var image *models.Image
  176. if err != nil {
  177. log.Error("Get Image by ID failed:%v", err.Error())
  178. } else {
  179. image, err = models.GetImageByID(id)
  180. if err != nil {
  181. log.Error("Get Image by ID failed:%v", err.Error())
  182. return
  183. }
  184. }
  185. if !isAdminOrImageCreater(ctx, image, err) {
  186. log.Error("!isAdminOrImageCreater error:%v", err.Error())
  187. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  188. }
  189. }
  190. func GenerateTask(req GenerateCloudBrainTaskReq) (string, error) {
  191. var versionCount int
  192. if req.JobType == string(models.JobTypeTrain) {
  193. versionCount = 1
  194. }
  195. volumes := []models.Volume{
  196. {
  197. HostPath: models.StHostPath{
  198. Path: req.CodePath,
  199. MountPath: CodeMountPath,
  200. ReadOnly: false,
  201. },
  202. },
  203. {
  204. HostPath: models.StHostPath{
  205. Path: req.ModelPath,
  206. MountPath: ModelMountPath,
  207. ReadOnly: false,
  208. },
  209. },
  210. {
  211. HostPath: models.StHostPath{
  212. Path: req.BenchmarkPath,
  213. MountPath: BenchMarkMountPath,
  214. ReadOnly: true,
  215. },
  216. },
  217. {
  218. HostPath: models.StHostPath{
  219. Path: req.Snn4ImageNetPath,
  220. MountPath: Snn4imagenetMountPath,
  221. ReadOnly: true,
  222. },
  223. },
  224. {
  225. HostPath: models.StHostPath{
  226. Path: req.BrainScorePath,
  227. MountPath: BrainScoreMountPath,
  228. ReadOnly: true,
  229. },
  230. },
  231. {
  232. HostPath: models.StHostPath{
  233. Path: req.ResultPath,
  234. MountPath: ResultPath,
  235. ReadOnly: false,
  236. },
  237. },
  238. }
  239. if req.PreTrainModelUrl != "" { //预训练
  240. volumes = append(volumes, models.Volume{
  241. HostPath: models.StHostPath{
  242. Path: req.PreTrainModelPath,
  243. MountPath: PretrainModelMountPath,
  244. ReadOnly: true,
  245. },
  246. })
  247. }
  248. if len(req.DatasetInfos) == 1 {
  249. volumes = append(volumes, models.Volume{
  250. HostPath: models.StHostPath{
  251. Path: req.DatasetInfos[req.Uuids].DataLocalPath,
  252. MountPath: DataSetMountPath,
  253. ReadOnly: true,
  254. },
  255. })
  256. } else if len(req.DatasetInfos) > 1 {
  257. for _, dataset := range req.DatasetInfos {
  258. volumes = append(volumes, models.Volume{
  259. HostPath: models.StHostPath{
  260. Path: dataset.DataLocalPath,
  261. MountPath: DataSetMountPath + "/" + dataset.Name,
  262. ReadOnly: true,
  263. },
  264. })
  265. }
  266. }
  267. createTime := timeutil.TimeStampNow()
  268. jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
  269. JobName: req.JobName,
  270. RetryCount: 1,
  271. GpuType: req.Spec.QueueCode,
  272. Image: req.Image,
  273. TaskRoles: []models.TaskRole{
  274. {
  275. Name: SubTaskName,
  276. TaskNumber: 1,
  277. MinSucceededTaskCount: 1,
  278. MinFailedTaskCount: 1,
  279. CPUNumber: req.Spec.CpuCores,
  280. GPUNumber: req.Spec.AccCardsNum,
  281. MemoryMB: int(req.Spec.MemGiB * 1024),
  282. ShmMB: int(req.Spec.ShareMemGiB * 1024),
  283. Command: req.Command,
  284. NeedIBDevice: false,
  285. IsMainRole: false,
  286. UseNNI: false,
  287. },
  288. },
  289. Volumes: volumes,
  290. })
  291. if err != nil {
  292. log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
  293. return "", err
  294. }
  295. if jobResult.Code != Success {
  296. log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
  297. return "", errors.New(jobResult.Msg)
  298. }
  299. var jobID = jobResult.Payload["jobId"].(string)
  300. err = models.CreateCloudbrain(&models.Cloudbrain{
  301. Status: string(models.JobWaiting),
  302. UserID: req.Ctx.User.ID,
  303. RepoID: req.Ctx.Repo.Repository.ID,
  304. JobID: jobID,
  305. JobName: req.JobName,
  306. DisplayJobName: req.DisplayJobName,
  307. SubTaskName: SubTaskName,
  308. JobType: req.JobType,
  309. Type: models.TypeCloudBrainOne,
  310. Uuid: req.Uuids,
  311. Image: req.Image,
  312. GpuQueue: req.Spec.QueueCode,
  313. ComputeResource: models.GPUResource,
  314. BenchmarkTypeID: req.BenchmarkTypeID,
  315. BenchmarkChildTypeID: req.BenchmarkChildTypeID,
  316. Description: req.Description,
  317. IsLatestVersion: "1",
  318. VersionCount: versionCount,
  319. BranchName: req.BranchName,
  320. BootFile: req.BootFile,
  321. DatasetName: req.DatasetNames,
  322. Parameters: req.Params,
  323. TrainUrl: req.TrainUrl,
  324. ModelName: req.ModelName,
  325. ModelVersion: req.ModelVersion,
  326. CkptName: req.CkptName,
  327. ResultUrl: req.ResultPath,
  328. LabelName: req.LabelName,
  329. PreTrainModelUrl: req.PreTrainModelUrl,
  330. CreatedUnix: createTime,
  331. UpdatedUnix: createTime,
  332. CommitID: req.CommitID,
  333. Spec: req.Spec,
  334. })
  335. if err != nil {
  336. return "", err
  337. }
  338. task, err := models.GetCloudbrainByJobID(jobID)
  339. if err != nil {
  340. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  341. return "", err
  342. }
  343. stringId := strconv.FormatInt(task.ID, 10)
  344. if IsBenchmarkJob(req.JobType) {
  345. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
  346. } else if string(models.JobTypeTrain) == req.JobType {
  347. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
  348. } else if string(models.JobTypeInference) == req.JobType {
  349. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  350. } else {
  351. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
  352. }
  353. return jobID, nil
  354. }
  355. func IsBenchmarkJob(jobType string) bool {
  356. return string(models.JobTypeModelSafety) == jobType || string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType
  357. }
  358. func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...models.JobType) int64 {
  359. num, err := models.GetWaitingCloudbrainCount(cloudbrainType, computeResource, jobTypes...)
  360. if err != nil {
  361. log.Warn("Get waiting count err", err)
  362. num = 0
  363. }
  364. return num
  365. }
  366. func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
  367. jobName := task.JobName
  368. spec := task.Spec
  369. var datasetInfos map[string]models.DatasetInfo
  370. if task.Uuid != "" {
  371. var err error
  372. datasetInfos, _, err = models.GetDatasetInfo(task.Uuid)
  373. if err != nil {
  374. log.Error("GetDatasetInfo failed:%v", err, ctx.Data["MsgID"])
  375. return err
  376. }
  377. }
  378. volumes := []models.Volume{
  379. {
  380. HostPath: models.StHostPath{
  381. Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
  382. MountPath: CodeMountPath,
  383. ReadOnly: false,
  384. },
  385. },
  386. {
  387. HostPath: models.StHostPath{
  388. Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
  389. MountPath: ModelMountPath,
  390. ReadOnly: false,
  391. },
  392. },
  393. {
  394. HostPath: models.StHostPath{
  395. Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
  396. MountPath: BenchMarkMountPath,
  397. ReadOnly: true,
  398. },
  399. },
  400. {
  401. HostPath: models.StHostPath{
  402. Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
  403. MountPath: Snn4imagenetMountPath,
  404. ReadOnly: true,
  405. },
  406. },
  407. {
  408. HostPath: models.StHostPath{
  409. Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
  410. MountPath: BrainScoreMountPath,
  411. ReadOnly: true,
  412. },
  413. },
  414. }
  415. if datasetInfos != nil {
  416. if len(datasetInfos) == 1 {
  417. volumes = append(volumes, models.Volume{
  418. HostPath: models.StHostPath{
  419. Path: datasetInfos[task.Uuid].DataLocalPath,
  420. MountPath: DataSetMountPath,
  421. ReadOnly: true,
  422. },
  423. })
  424. } else {
  425. for _, dataset := range datasetInfos {
  426. volumes = append(volumes, models.Volume{
  427. HostPath: models.StHostPath{
  428. Path: dataset.DataLocalPath,
  429. MountPath: DataSetMountPath + "/" + dataset.Name,
  430. ReadOnly: true,
  431. },
  432. })
  433. }
  434. }
  435. }
  436. if task.PreTrainModelUrl != "" { //预训练
  437. _, err := models.QueryModelByPath(task.PreTrainModelUrl)
  438. if err != nil {
  439. log.Warn("The model may be deleted", err)
  440. } else {
  441. volumes = append(volumes, models.Volume{
  442. HostPath: models.StHostPath{
  443. Path: setting.Attachment.Minio.RealPath + task.PreTrainModelUrl,
  444. MountPath: PretrainModelMountPath,
  445. ReadOnly: true,
  446. },
  447. })
  448. }
  449. }
  450. createTime := timeutil.TimeStampNow()
  451. jobResult, err := CreateJob(jobName, models.CreateJobParams{
  452. JobName: jobName,
  453. RetryCount: 1,
  454. GpuType: task.GpuQueue,
  455. Image: task.Image,
  456. TaskRoles: []models.TaskRole{
  457. {
  458. Name: SubTaskName,
  459. TaskNumber: 1,
  460. MinSucceededTaskCount: 1,
  461. MinFailedTaskCount: 1,
  462. CPUNumber: spec.CpuCores,
  463. GPUNumber: spec.AccCardsNum,
  464. MemoryMB: int(spec.MemGiB * 1024),
  465. ShmMB: int(spec.ShareMemGiB * 1024),
  466. Command: GetCloudbrainDebugCommand(), //Command,
  467. NeedIBDevice: false,
  468. IsMainRole: false,
  469. UseNNI: false,
  470. },
  471. },
  472. Volumes: volumes,
  473. })
  474. if err != nil {
  475. log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
  476. return err
  477. }
  478. if jobResult.Code != Success {
  479. log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
  480. return errors.New(jobResult.Msg)
  481. }
  482. var jobID = jobResult.Payload["jobId"].(string)
  483. newTask := &models.Cloudbrain{
  484. Status: string(models.JobWaiting),
  485. UserID: task.UserID,
  486. RepoID: task.RepoID,
  487. JobID: jobID,
  488. JobName: task.JobName,
  489. DisplayJobName: task.DisplayJobName,
  490. SubTaskName: task.SubTaskName,
  491. JobType: task.JobType,
  492. Type: task.Type,
  493. Uuid: task.Uuid,
  494. DatasetName: task.DatasetName,
  495. Image: task.Image,
  496. GpuQueue: task.GpuQueue,
  497. ResourceSpecId: task.ResourceSpecId,
  498. ComputeResource: task.ComputeResource,
  499. CreatedUnix: createTime,
  500. UpdatedUnix: createTime,
  501. BranchName: task.BranchName,
  502. Spec: spec,
  503. ModelName: task.ModelName,
  504. ModelVersion: task.ModelVersion,
  505. LabelName: task.LabelName,
  506. PreTrainModelUrl: task.PreTrainModelUrl,
  507. CkptName: task.CkptName,
  508. }
  509. err = models.RestartCloudbrain(task, newTask)
  510. if err != nil {
  511. log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
  512. return err
  513. }
  514. stringId := strconv.FormatInt(newTask.ID, 10)
  515. *newID = stringId
  516. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
  517. return nil
  518. }
  519. func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec {
  520. for _, specialPool := range SpecialPools.Pools {
  521. if specialPool.ResourceSpec != nil {
  522. if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) {
  523. for _, spec := range specialPool.ResourceSpec {
  524. if resourceSpecId == spec.Id {
  525. return spec
  526. }
  527. }
  528. }
  529. }
  530. }
  531. return nil
  532. }
  533. func DelCloudBrainJob(jobId string) string {
  534. task, err := models.GetCloudbrainByJobID(jobId)
  535. if err != nil {
  536. log.Error("get cloud brain err:", err)
  537. return "cloudbrain.Delete_failed"
  538. }
  539. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  540. log.Error("the job(%s) has not been stopped", task.JobName)
  541. return "cloudbrain.Not_Stopped"
  542. }
  543. err = models.DeleteJob(task)
  544. if err != nil {
  545. log.Error("DeleteJob failed:", err)
  546. return "cloudbrain.Delete_failed"
  547. }
  548. deleteJobStorage(task.JobName)
  549. return ""
  550. }
  551. func deleteJobStorage(jobName string) error {
  552. //delete local
  553. localJobPath := setting.JobPath + jobName
  554. err := os.RemoveAll(localJobPath)
  555. if err != nil {
  556. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  557. }
  558. dirPath := setting.CBCodePathPrefix + jobName + "/"
  559. err = storage.Attachments.DeleteDir(dirPath)
  560. if err != nil {
  561. log.Error("DeleteDir(%s) failed:%v", localJobPath, err)
  562. }
  563. return nil
  564. }
  565. func InitSpecialPool() {
  566. if SpecialPools == nil && setting.SpecialPools != "" {
  567. json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools)
  568. }
  569. }
  570. func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool {
  571. if resourceSpecs == nil || len(resourceSpecs) == 0 {
  572. return true
  573. }
  574. for _, v := range resourceSpecs {
  575. if v.Id == resourceSpecId {
  576. return true
  577. }
  578. }
  579. return false
  580. }
  581. func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool {
  582. for _, v := range pool {
  583. if v.Queue == queue {
  584. return true
  585. }
  586. }
  587. return false
  588. }
  589. func IsElementExist(s []string, str string) bool {
  590. for _, v := range s {
  591. if v == str {
  592. return true
  593. }
  594. }
  595. return false
  596. }
  597. func GetCloudBrainByIdOrJobId(id string, initialQuery string) (*models.Cloudbrain, error) {
  598. _, err := strconv.ParseInt(id, 10, 64)
  599. var job *models.Cloudbrain
  600. if err != nil {
  601. job, err = models.GetCloudbrainByJobID(id)
  602. } else {
  603. if strings.EqualFold(initialQuery, "id") {
  604. job, err = models.GetCloudbrainByID(id)
  605. if err != nil {
  606. job, err = models.GetCloudbrainByJobID(id)
  607. }
  608. } else {
  609. job, err = models.GetCloudbrainByJobID(id)
  610. if err != nil {
  611. job, err = models.GetCloudbrainByID(id)
  612. }
  613. }
  614. }
  615. return job, err
  616. }
  617. type GenerateModelArtsNotebookReq struct {
  618. JobName string
  619. DisplayJobName string
  620. Uuid string
  621. Description string
  622. BootFile string
  623. ImageId string
  624. AutoStopDurationMs int64
  625. Spec *models.Specification
  626. ModelName string
  627. LabelName string
  628. CkptName string
  629. ModelVersion string
  630. PreTrainModelUrl string
  631. }