You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 20 kB

2 years ago
2 years ago
4 years ago
4 years ago
4 years ago
3 years ago
2 years ago
2 years ago
2 years ago
4 years ago
4 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
4 years ago
2 years ago
3 years ago
4 years ago
3 years ago
2 years ago
4 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707
  1. package cloudbrain
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "os"
  6. "strconv"
  7. "strings"
  8. "code.gitea.io/gitea/modules/timeutil"
  9. "code.gitea.io/gitea/modules/storage"
  10. "code.gitea.io/gitea/models"
  11. "code.gitea.io/gitea/modules/context"
  12. "code.gitea.io/gitea/modules/log"
  13. "code.gitea.io/gitea/modules/notification"
  14. "code.gitea.io/gitea/modules/setting"
  15. )
// Shared constants for cloudbrain jobs: in-container mount points, benchmark
// command lines/templates and a few fixed names and codes.
const (
	// Earlier variants of the debug and benchmark commands, kept for reference only.
	// The debug command currently in use is built by GetCloudbrainDebugCommand.
	//Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
	//CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"`

	// CommandBenchmark runs run_bk.sh from /benchmark and redirects its output
	// to /model/benchmark-log.txt.
	CommandBenchmark = `cd /benchmark && bash run_bk.sh >/model/benchmark-log.txt`

	// Mount points used as Volume.HostPath.MountPath when job volumes are built.
	CodeMountPath = "/code"
	DataSetMountPath = "/dataset"
	ModelMountPath = "/model"
	PretrainModelMountPath = "/pretrainmodel"
	// LogFile is not referenced in this file; presumably the job log file name — verify against callers.
	LogFile = "log.txt"
	BenchMarkMountPath = "/benchmark"
	// BenchMarkResourceID is not referenced in this file — meaning to be confirmed against callers.
	BenchMarkResourceID = 1
	Snn4imagenetMountPath = "/snn4imagenet"
	BrainScoreMountPath = "/brainscore"
	// TaskInfoName is not referenced in this file — meaning to be confirmed against callers.
	TaskInfoName = "/taskInfo"

	// Command templates (fmt-style, filled in by callers outside this file).
	// Snn4imagenetCommand takes: model name, model path below /pretrainmodel, model description.
	Snn4imagenetCommand = `/opt/conda/bin/python /benchmark/testSNN_script.py --modelname '%s' --modelpath '/pretrainmodel/%s' --modeldescription '%s' >/model/benchmark-log.txt`
	// BrainScoreCommand takes the values for -b, -n, -p (path below /pretrainmodel) and -d, in that order.
	BrainScoreCommand = `bash /benchmark/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/pretrainmodel/%s' -d '%s' >/model/benchmark-log.txt`
	// Snn4EcosetCommand is like Snn4imagenetCommand but additionally passes --datapath '/dataset'.
	Snn4EcosetCommand = `/opt/conda/bin/python /benchmark/testSNN_script.py --datapath '/dataset' --modelname '%s' --modelpath '/pretrainmodel/%s' --modeldescription '%s' >/model/benchmark-log.txt`

	// SubTaskName is the name given to the single task role of every created job.
	SubTaskName = "task1"
	// Success is the result code that CreateJob responses are compared against.
	Success = "S000"
	// DefaultBranchName is not referenced in this file; presumably the fallback git branch — verify against callers.
	DefaultBranchName = "master"
	// ResultPath is the mount point of the result volume in GenerateTask.
	ResultPath = "/result"
)
// Package-level resource configuration.
var (
	// ResourceSpecs, TrainResourceSpecs and InferenceResourceSpecs are not
	// assigned in this file; presumably they are populated elsewhere in the
	// package — verify against callers.
	ResourceSpecs *models.ResourceSpecs
	TrainResourceSpecs *models.ResourceSpecs
	InferenceResourceSpecs *models.ResourceSpecs
	// SpecialPools is decoded from setting.SpecialPools by InitSpecialPool and
	// stays nil when that setting is empty.
	SpecialPools *models.SpecialPools
)
// GenerateCloudBrainTaskReq bundles the parameters consumed by GenerateTask
// when it submits a job and stores the matching Cloudbrain record.
type GenerateCloudBrainTaskReq struct {
	// Ctx supplies the acting user, the repository and the "MsgID" log tag.
	Ctx *context.Context
	DisplayJobName string
	// JobName is the name the job is submitted under.
	JobName string
	Image string
	// Command is the command line run by the job's single task role.
	Command string

	// Host paths mounted into the container (see the *MountPath constants).
	CodePath string
	ModelPath string
	BenchmarkPath string
	// Snn4ImageNetPath and BrainScorePath are not read by GenerateTask in this file.
	Snn4ImageNetPath string
	BrainScorePath string

	// JobType is compared against the string form of the models.JobType* values.
	JobType string
	Description string
	BranchName string
	BootFile string
	Params string
	CommitID string

	// Uuids is stored on the record and, when exactly one dataset is attached,
	// is also used as the key into DatasetInfos.
	Uuids string
	DatasetNames string
	DatasetInfos map[string]models.DatasetInfo

	BenchmarkTypeID int
	BenchmarkChildTypeID int
	// ResultPath is mounted at the ResultPath constant and stored as ResultUrl.
	ResultPath string
	TrainUrl string
	ModelName string
	ModelVersion string
	CkptName string
	LabelName string
	// PreTrainModelPath is mounted read-only only when PreTrainModelUrl is non-empty.
	PreTrainModelPath string
	PreTrainModelUrl string
	// Spec provides queue code, CPU cores, accelerator cards, memory and shared memory.
	Spec *models.Specification
}
  76. func GetCloudbrainDebugCommand() string {
  77. var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;pip3 install -U "nbclassic>=0.2.8" -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --LabApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" `
  78. return command
  79. }
  80. func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  81. if !ctx.IsSigned {
  82. return false
  83. }
  84. if err != nil {
  85. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
  86. } else {
  87. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  88. }
  89. }
  90. func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
  91. return isAdminOrOwnerOrJobCreater(ctx, job, nil)
  92. }
  93. func CanCreateOrDebugJob(ctx *context.Context) bool {
  94. if !ctx.IsSigned {
  95. return false
  96. }
  97. return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
  98. }
  99. func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
  100. return isAdminOrJobCreater(ctx, job, nil)
  101. }
  102. func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  103. if !ctx.IsSigned {
  104. return false
  105. }
  106. if err != nil {
  107. return ctx.IsUserSiteAdmin()
  108. } else {
  109. return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  110. }
  111. }
  112. func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
  113. if !ctx.IsSigned {
  114. return false
  115. }
  116. if err != nil {
  117. return ctx.IsUserSiteAdmin()
  118. } else {
  119. return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
  120. }
  121. }
  122. func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
  123. var id = ctx.Params(":id")
  124. job, err := GetCloudBrainByIdOrJobId(id, "id")
  125. if err != nil {
  126. log.Error("GetCloudbrainByID failed:%v", err.Error())
  127. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  128. }
  129. ctx.Cloudbrain = job
  130. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  131. log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
  132. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  133. }
  134. }
  135. func AdminOrJobCreaterRight(ctx *context.Context) {
  136. var id = ctx.Params(":id")
  137. job, err := GetCloudBrainByIdOrJobId(id, "id")
  138. if err != nil {
  139. log.Error("GetCloudbrainByID failed:%v", err.Error())
  140. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  141. }
  142. ctx.Cloudbrain = job
  143. if !isAdminOrJobCreater(ctx, job, err) {
  144. log.Error("!isAdminOrJobCreater error:%v", err.Error())
  145. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  146. }
  147. }
  148. func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
  149. var jobID = ctx.Params(":jobid")
  150. job, err := GetCloudBrainByIdOrJobId(jobID, "jobid")
  151. if err != nil {
  152. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  153. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  154. }
  155. ctx.Cloudbrain = job
  156. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  157. log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
  158. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  159. }
  160. }
  161. func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
  162. var jobID = ctx.Params(":jobid")
  163. job, err := GetCloudBrainByIdOrJobId(jobID, "jobid")
  164. if err != nil {
  165. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  166. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  167. }
  168. ctx.Cloudbrain = job
  169. if !isAdminOrJobCreater(ctx, job, err) {
  170. log.Error("!isAdminOrJobCreater errot:%v", err.Error())
  171. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  172. }
  173. }
  174. func AdminOrImageCreaterRight(ctx *context.Context) {
  175. id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
  176. var image *models.Image
  177. if err != nil {
  178. log.Error("Get Image by ID failed:%v", err.Error())
  179. } else {
  180. image, err = models.GetImageByID(id)
  181. if err != nil {
  182. log.Error("Get Image by ID failed:%v", err.Error())
  183. return
  184. }
  185. }
  186. if !isAdminOrImageCreater(ctx, image, err) {
  187. log.Error("!isAdminOrImageCreater error:%v", err.Error())
  188. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  189. }
  190. }
  191. func GenerateTask(req GenerateCloudBrainTaskReq) (string, error) {
  192. var versionCount int
  193. if req.JobType == string(models.JobTypeTrain) {
  194. versionCount = 1
  195. }
  196. volumes := []models.Volume{
  197. {
  198. HostPath: models.StHostPath{
  199. Path: req.CodePath,
  200. MountPath: CodeMountPath,
  201. ReadOnly: false,
  202. },
  203. },
  204. {
  205. HostPath: models.StHostPath{
  206. Path: req.ModelPath,
  207. MountPath: ModelMountPath,
  208. ReadOnly: false,
  209. },
  210. },
  211. {
  212. HostPath: models.StHostPath{
  213. Path: req.BenchmarkPath,
  214. MountPath: BenchMarkMountPath,
  215. ReadOnly: true,
  216. },
  217. },
  218. {
  219. HostPath: models.StHostPath{
  220. Path: req.ResultPath,
  221. MountPath: ResultPath,
  222. ReadOnly: false,
  223. },
  224. },
  225. }
  226. if req.PreTrainModelUrl != "" { //预训练
  227. volumes = append(volumes, models.Volume{
  228. HostPath: models.StHostPath{
  229. Path: req.PreTrainModelPath,
  230. MountPath: PretrainModelMountPath,
  231. ReadOnly: true,
  232. },
  233. })
  234. }
  235. if len(req.DatasetInfos) == 1 {
  236. volumes = append(volumes, models.Volume{
  237. HostPath: models.StHostPath{
  238. Path: req.DatasetInfos[req.Uuids].DataLocalPath,
  239. MountPath: DataSetMountPath,
  240. ReadOnly: true,
  241. },
  242. })
  243. } else if len(req.DatasetInfos) > 1 {
  244. for _, dataset := range req.DatasetInfos {
  245. volumes = append(volumes, models.Volume{
  246. HostPath: models.StHostPath{
  247. Path: dataset.DataLocalPath,
  248. MountPath: DataSetMountPath + "/" + dataset.Name,
  249. ReadOnly: true,
  250. },
  251. })
  252. }
  253. }
  254. createTime := timeutil.TimeStampNow()
  255. jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
  256. JobName: req.JobName,
  257. RetryCount: 1,
  258. GpuType: req.Spec.QueueCode,
  259. Image: req.Image,
  260. TaskRoles: []models.TaskRole{
  261. {
  262. Name: SubTaskName,
  263. TaskNumber: 1,
  264. MinSucceededTaskCount: 1,
  265. MinFailedTaskCount: 1,
  266. CPUNumber: req.Spec.CpuCores,
  267. GPUNumber: req.Spec.AccCardsNum,
  268. MemoryMB: int(req.Spec.MemGiB * 1024),
  269. ShmMB: int(req.Spec.ShareMemGiB * 1024),
  270. Command: req.Command,
  271. NeedIBDevice: false,
  272. IsMainRole: false,
  273. UseNNI: false,
  274. },
  275. },
  276. Volumes: volumes,
  277. })
  278. if err != nil {
  279. log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
  280. return "", err
  281. }
  282. if jobResult.Code != Success {
  283. log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
  284. return "", errors.New(jobResult.Msg)
  285. }
  286. var jobID = jobResult.Payload["jobId"].(string)
  287. err = models.CreateCloudbrain(&models.Cloudbrain{
  288. Status: string(models.JobWaiting),
  289. UserID: req.Ctx.User.ID,
  290. RepoID: req.Ctx.Repo.Repository.ID,
  291. JobID: jobID,
  292. JobName: req.JobName,
  293. DisplayJobName: req.DisplayJobName,
  294. SubTaskName: SubTaskName,
  295. JobType: req.JobType,
  296. Type: models.TypeCloudBrainOne,
  297. Uuid: req.Uuids,
  298. Image: req.Image,
  299. GpuQueue: req.Spec.QueueCode,
  300. ComputeResource: models.GPUResource,
  301. BenchmarkTypeID: req.BenchmarkTypeID,
  302. BenchmarkChildTypeID: req.BenchmarkChildTypeID,
  303. Description: req.Description,
  304. IsLatestVersion: "1",
  305. VersionCount: versionCount,
  306. BranchName: req.BranchName,
  307. BootFile: req.BootFile,
  308. DatasetName: req.DatasetNames,
  309. Parameters: req.Params,
  310. TrainUrl: req.TrainUrl,
  311. ModelName: req.ModelName,
  312. ModelVersion: req.ModelVersion,
  313. CkptName: req.CkptName,
  314. ResultUrl: req.ResultPath,
  315. LabelName: req.LabelName,
  316. PreTrainModelUrl: req.PreTrainModelUrl,
  317. CreatedUnix: createTime,
  318. UpdatedUnix: createTime,
  319. CommitID: req.CommitID,
  320. Spec: req.Spec,
  321. })
  322. if err != nil {
  323. return "", err
  324. }
  325. task, err := models.GetCloudbrainByJobID(jobID)
  326. if err != nil {
  327. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  328. return "", err
  329. }
  330. stringId := strconv.FormatInt(task.ID, 10)
  331. if IsBenchmarkJob(req.JobType) {
  332. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
  333. } else if string(models.JobTypeTrain) == req.JobType {
  334. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
  335. } else if string(models.JobTypeInference) == req.JobType {
  336. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  337. } else {
  338. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
  339. }
  340. return jobID, nil
  341. }
  342. func IsBenchmarkJob(jobType string) bool {
  343. return string(models.JobTypeModelSafety) == jobType || string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType || string(models.JobTypeSnn4Ecoset) == jobType
  344. }
  345. func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...models.JobType) int64 {
  346. num, err := models.GetWaitingCloudbrainCount(cloudbrainType, computeResource, jobTypes...)
  347. if err != nil {
  348. log.Warn("Get waiting count err", err)
  349. num = 0
  350. }
  351. return num
  352. }
  353. func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
  354. jobName := task.JobName
  355. spec := task.Spec
  356. var datasetInfos map[string]models.DatasetInfo
  357. if task.Uuid != "" {
  358. var err error
  359. datasetInfos, _, err = models.GetDatasetInfo(task.Uuid)
  360. if err != nil {
  361. log.Error("GetDatasetInfo failed:%v", err, ctx.Data["MsgID"])
  362. return err
  363. }
  364. }
  365. volumes := []models.Volume{
  366. {
  367. HostPath: models.StHostPath{
  368. Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
  369. MountPath: CodeMountPath,
  370. ReadOnly: false,
  371. },
  372. },
  373. {
  374. HostPath: models.StHostPath{
  375. Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
  376. MountPath: ModelMountPath,
  377. ReadOnly: false,
  378. },
  379. },
  380. {
  381. HostPath: models.StHostPath{
  382. Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
  383. MountPath: BenchMarkMountPath,
  384. ReadOnly: true,
  385. },
  386. },
  387. {
  388. HostPath: models.StHostPath{
  389. Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
  390. MountPath: Snn4imagenetMountPath,
  391. ReadOnly: true,
  392. },
  393. },
  394. {
  395. HostPath: models.StHostPath{
  396. Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
  397. MountPath: BrainScoreMountPath,
  398. ReadOnly: true,
  399. },
  400. },
  401. }
  402. if datasetInfos != nil {
  403. if len(datasetInfos) == 1 {
  404. volumes = append(volumes, models.Volume{
  405. HostPath: models.StHostPath{
  406. Path: datasetInfos[task.Uuid].DataLocalPath,
  407. MountPath: DataSetMountPath,
  408. ReadOnly: true,
  409. },
  410. })
  411. } else {
  412. for _, dataset := range datasetInfos {
  413. volumes = append(volumes, models.Volume{
  414. HostPath: models.StHostPath{
  415. Path: dataset.DataLocalPath,
  416. MountPath: DataSetMountPath + "/" + dataset.Name,
  417. ReadOnly: true,
  418. },
  419. })
  420. }
  421. }
  422. }
  423. if task.PreTrainModelUrl != "" { //预训练
  424. _, err := models.QueryModelByPath(task.PreTrainModelUrl)
  425. if err != nil {
  426. log.Warn("The model may be deleted", err)
  427. } else {
  428. volumes = append(volumes, models.Volume{
  429. HostPath: models.StHostPath{
  430. Path: setting.Attachment.Minio.RealPath + task.PreTrainModelUrl,
  431. MountPath: PretrainModelMountPath,
  432. ReadOnly: true,
  433. },
  434. })
  435. }
  436. }
  437. createTime := timeutil.TimeStampNow()
  438. jobResult, err := CreateJob(jobName, models.CreateJobParams{
  439. JobName: jobName,
  440. RetryCount: 1,
  441. GpuType: task.GpuQueue,
  442. Image: task.Image,
  443. TaskRoles: []models.TaskRole{
  444. {
  445. Name: SubTaskName,
  446. TaskNumber: 1,
  447. MinSucceededTaskCount: 1,
  448. MinFailedTaskCount: 1,
  449. CPUNumber: spec.CpuCores,
  450. GPUNumber: spec.AccCardsNum,
  451. MemoryMB: int(spec.MemGiB * 1024),
  452. ShmMB: int(spec.ShareMemGiB * 1024),
  453. Command: GetCloudbrainDebugCommand(), //Command,
  454. NeedIBDevice: false,
  455. IsMainRole: false,
  456. UseNNI: false,
  457. },
  458. },
  459. Volumes: volumes,
  460. })
  461. if err != nil {
  462. log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
  463. return err
  464. }
  465. if jobResult.Code != Success {
  466. log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
  467. return errors.New(jobResult.Msg)
  468. }
  469. var jobID = jobResult.Payload["jobId"].(string)
  470. newTask := &models.Cloudbrain{
  471. Status: string(models.JobWaiting),
  472. UserID: task.UserID,
  473. RepoID: task.RepoID,
  474. JobID: jobID,
  475. JobName: task.JobName,
  476. DisplayJobName: task.DisplayJobName,
  477. SubTaskName: task.SubTaskName,
  478. JobType: task.JobType,
  479. Type: task.Type,
  480. Uuid: task.Uuid,
  481. DatasetName: task.DatasetName,
  482. Image: task.Image,
  483. GpuQueue: task.GpuQueue,
  484. ResourceSpecId: task.ResourceSpecId,
  485. ComputeResource: task.ComputeResource,
  486. CreatedUnix: createTime,
  487. UpdatedUnix: createTime,
  488. BranchName: task.BranchName,
  489. Spec: spec,
  490. ModelName: task.ModelName,
  491. ModelVersion: task.ModelVersion,
  492. LabelName: task.LabelName,
  493. PreTrainModelUrl: task.PreTrainModelUrl,
  494. CkptName: task.CkptName,
  495. }
  496. err = models.RestartCloudbrain(task, newTask)
  497. if err != nil {
  498. log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
  499. return err
  500. }
  501. stringId := strconv.FormatInt(newTask.ID, 10)
  502. *newID = stringId
  503. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
  504. return nil
  505. }
  506. func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec {
  507. for _, specialPool := range SpecialPools.Pools {
  508. if specialPool.ResourceSpec != nil {
  509. if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) {
  510. for _, spec := range specialPool.ResourceSpec {
  511. if resourceSpecId == spec.Id {
  512. return spec
  513. }
  514. }
  515. }
  516. }
  517. }
  518. return nil
  519. }
  520. func DelCloudBrainJob(jobId string) string {
  521. task, err := models.GetCloudbrainByJobID(jobId)
  522. if err != nil {
  523. log.Error("get cloud brain err:", err)
  524. return "cloudbrain.Delete_failed"
  525. }
  526. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  527. log.Error("the job(%s) has not been stopped", task.JobName)
  528. return "cloudbrain.Not_Stopped"
  529. }
  530. err = models.DeleteJob(task)
  531. if err != nil {
  532. log.Error("DeleteJob failed:", err)
  533. return "cloudbrain.Delete_failed"
  534. }
  535. deleteJobStorage(task.JobName)
  536. return ""
  537. }
  538. func deleteJobStorage(jobName string) error {
  539. //delete local
  540. localJobPath := setting.JobPath + jobName
  541. err := os.RemoveAll(localJobPath)
  542. if err != nil {
  543. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  544. }
  545. dirPath := setting.CBCodePathPrefix + jobName + "/"
  546. err = storage.Attachments.DeleteDir(dirPath)
  547. if err != nil {
  548. log.Error("DeleteDir(%s) failed:%v", localJobPath, err)
  549. }
  550. return nil
  551. }
  552. func InitSpecialPool() {
  553. if SpecialPools == nil && setting.SpecialPools != "" {
  554. json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools)
  555. }
  556. }
  557. func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool {
  558. if resourceSpecs == nil || len(resourceSpecs) == 0 {
  559. return true
  560. }
  561. for _, v := range resourceSpecs {
  562. if v.Id == resourceSpecId {
  563. return true
  564. }
  565. }
  566. return false
  567. }
  568. func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool {
  569. for _, v := range pool {
  570. if v.Queue == queue {
  571. return true
  572. }
  573. }
  574. return false
  575. }
  576. func IsElementExist(s []string, str string) bool {
  577. for _, v := range s {
  578. if v == str {
  579. return true
  580. }
  581. }
  582. return false
  583. }
  584. func GetCloudBrainByIdOrJobId(id string, initialQuery string) (*models.Cloudbrain, error) {
  585. _, err := strconv.ParseInt(id, 10, 64)
  586. var job *models.Cloudbrain
  587. if err != nil {
  588. job, err = models.GetCloudbrainByJobID(id)
  589. } else {
  590. if strings.EqualFold(initialQuery, "id") {
  591. job, err = models.GetCloudbrainByID(id)
  592. if err != nil {
  593. job, err = models.GetCloudbrainByJobID(id)
  594. }
  595. } else {
  596. job, err = models.GetCloudbrainByJobID(id)
  597. if err != nil {
  598. job, err = models.GetCloudbrainByID(id)
  599. }
  600. }
  601. }
  602. return job, err
  603. }
// GenerateModelArtsNotebookReq carries the parameters of a notebook creation
// request. It is not used in this file; presumably it is consumed by the
// ModelArts notebook code elsewhere — verify against callers.
type GenerateModelArtsNotebookReq struct {
	JobName string
	DisplayJobName string
	Uuid string
	Description string
	BootFile string
	ImageId string
	// AutoStopDurationMs: going by the name, an auto-stop duration in milliseconds — TODO confirm.
	AutoStopDurationMs int64
	BranchName string
	Spec *models.Specification
	ModelName string
	LabelName string
	CkptName string
	ModelVersion string
	PreTrainModelUrl string
}