You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 15 kB

4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. package cloudbrain
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "strconv"
  6. "code.gitea.io/gitea/modules/timeutil"
  7. "code.gitea.io/gitea/modules/storage"
  8. "code.gitea.io/gitea/models"
  9. "code.gitea.io/gitea/modules/context"
  10. "code.gitea.io/gitea/modules/log"
  11. "code.gitea.io/gitea/modules/notification"
  12. "code.gitea.io/gitea/modules/setting"
  13. )
  // Shell commands run inside cloudbrain containers.
  const (
  	// Command installs JupyterLab 2.2.5 from the Tsinghua PyPI mirror, stops
  	// the ssh service and serves JupyterLab on port 80 with /code as the
  	// notebook directory.
  	Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`

	// Earlier variant of CommandBenchmark, kept for reference:
	//CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"`

  	// CommandBenchmark runs run_bk.sh from the /benchmark directory.
  	CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"`

  	// Snn4imagenetCommand is a format string taking the model name and the
  	// model description, in that order.
  	Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s'`

  	// BrainScoreCommand is a format string taking three values, passed to
  	// the script as its -b, -n and -d options, in that order.
  	BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s'`
  )

  // Paths at which host directories are mounted inside the container.
  const (
  	CodeMountPath         = "/code"
  	DataSetMountPath      = "/dataset"
  	ModelMountPath        = "/model"
  	BenchMarkMountPath    = "/benchmark"
  	Snn4imagenetMountPath = "/snn4imagenet"
  	BrainScoreMountPath   = "/brainscore"
  )

  // Miscellaneous fixed names and identifiers.
  const (
  	// LogFile is the name of the task log file.
  	LogFile = "log.txt"

  	BenchMarkResourceID = 1

  	TaskInfoName = "/taskInfo"

  	// SubTaskName is the name given to the single task role of every job
  	// created by this package.
  	SubTaskName = "task1"

  	// Success is the result code of a successful CreateJob call.
  	Success = "S000"

  	DefaultBranchName = "master"
  )
  33. var (
  34. ResourceSpecs *models.ResourceSpecs
  35. TrainResourceSpecs *models.ResourceSpecs
  36. )
  37. type GenerateCloudBrainTaskReq struct {
  38. Ctx *context.Context
  39. DisplayJobName string
  40. JobName string
  41. Image string
  42. Command string
  43. CodePath string
  44. ModelPath string
  45. BenchmarkPath string
  46. Snn4ImageNetPath string
  47. BrainScorePath string
  48. JobType string
  49. GpuQueue string
  50. Description string
  51. BranchName string
  52. BootFile string
  53. Params string
  54. CommitID string
  55. Uuids string
  56. DatasetNames string
  57. DatasetInfos map[string]models.DatasetInfo
  58. BenchmarkTypeID int
  59. BenchmarkChildTypeID int
  60. ResourceSpecId int
  61. }
  62. func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  63. if !ctx.IsSigned {
  64. return false
  65. }
  66. if err != nil {
  67. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
  68. } else {
  69. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  70. }
  71. }
  72. func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
  73. return isAdminOrOwnerOrJobCreater(ctx, job, nil)
  74. }
  75. func CanCreateOrDebugJob(ctx *context.Context) bool {
  76. if !ctx.IsSigned {
  77. return false
  78. }
  79. return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
  80. }
  81. func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
  82. return isAdminOrJobCreater(ctx, job, nil)
  83. }
  84. func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  85. if !ctx.IsSigned {
  86. return false
  87. }
  88. if err != nil {
  89. return ctx.IsUserSiteAdmin()
  90. } else {
  91. return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  92. }
  93. }
  94. func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
  95. if !ctx.IsSigned {
  96. return false
  97. }
  98. if err != nil {
  99. return ctx.IsUserSiteAdmin()
  100. } else {
  101. return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
  102. }
  103. }
  104. func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
  105. var ID = ctx.Params(":id")
  106. job, err := models.GetCloudbrainByID(ID)
  107. if err != nil {
  108. log.Error("GetCloudbrainByID failed:%v", err.Error())
  109. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  110. }
  111. ctx.Cloudbrain = job
  112. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  113. log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
  114. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  115. }
  116. }
  117. func AdminOrJobCreaterRight(ctx *context.Context) {
  118. var ID = ctx.Params(":id")
  119. job, err := models.GetCloudbrainByID(ID)
  120. if err != nil {
  121. log.Error("GetCloudbrainByID failed:%v", err.Error())
  122. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  123. }
  124. ctx.Cloudbrain = job
  125. if !isAdminOrJobCreater(ctx, job, err) {
  126. log.Error("!isAdminOrJobCreater error:%v", err.Error())
  127. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  128. }
  129. }
  130. func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
  131. var jobID = ctx.Params(":jobid")
  132. job, err := models.GetCloudbrainByJobID(jobID)
  133. if err != nil {
  134. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  135. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  136. }
  137. ctx.Cloudbrain = job
  138. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  139. log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
  140. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  141. }
  142. }
  143. func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
  144. var jobID = ctx.Params(":jobid")
  145. job, err := models.GetCloudbrainByJobID(jobID)
  146. if err != nil {
  147. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  148. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  149. }
  150. ctx.Cloudbrain = job
  151. if !isAdminOrJobCreater(ctx, job, err) {
  152. log.Error("!isAdminOrJobCreater errot:%v", err.Error())
  153. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  154. }
  155. }
  156. func AdminOrImageCreaterRight(ctx *context.Context) {
  157. id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
  158. var image *models.Image
  159. if err != nil {
  160. log.Error("Get Image by ID failed:%v", err.Error())
  161. } else {
  162. image, err = models.GetImageByID(id)
  163. if err != nil {
  164. log.Error("Get Image by ID failed:%v", err.Error())
  165. return
  166. }
  167. }
  168. if !isAdminOrImageCreater(ctx, image, err) {
  169. log.Error("!isAdminOrImageCreater error:%v", err.Error())
  170. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  171. }
  172. }
  173. func GenerateTask(req GenerateCloudBrainTaskReq) error {
  174. var resourceSpec *models.ResourceSpec
  175. var versionCount int
  176. if req.JobType == string(models.JobTypeTrain) {
  177. versionCount = 1
  178. if TrainResourceSpecs == nil {
  179. json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
  180. }
  181. for _, spec := range TrainResourceSpecs.ResourceSpec {
  182. if req.ResourceSpecId == spec.Id {
  183. resourceSpec = spec
  184. }
  185. }
  186. } else {
  187. if ResourceSpecs == nil {
  188. json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
  189. }
  190. for _, spec := range ResourceSpecs.ResourceSpec {
  191. if req.ResourceSpecId == spec.Id {
  192. resourceSpec = spec
  193. }
  194. }
  195. }
  196. if resourceSpec == nil {
  197. log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"])
  198. return errors.New("no such resourceSpec")
  199. }
  200. volumes := []models.Volume{
  201. {
  202. HostPath: models.StHostPath{
  203. Path: req.CodePath,
  204. MountPath: CodeMountPath,
  205. ReadOnly: false,
  206. },
  207. },
  208. {
  209. HostPath: models.StHostPath{
  210. Path: req.ModelPath,
  211. MountPath: ModelMountPath,
  212. ReadOnly: false,
  213. },
  214. },
  215. {
  216. HostPath: models.StHostPath{
  217. Path: req.BenchmarkPath,
  218. MountPath: BenchMarkMountPath,
  219. ReadOnly: true,
  220. },
  221. },
  222. {
  223. HostPath: models.StHostPath{
  224. Path: req.Snn4ImageNetPath,
  225. MountPath: Snn4imagenetMountPath,
  226. ReadOnly: true,
  227. },
  228. },
  229. {
  230. HostPath: models.StHostPath{
  231. Path: req.BrainScorePath,
  232. MountPath: BrainScoreMountPath,
  233. ReadOnly: true,
  234. },
  235. },
  236. }
  237. if len(req.DatasetInfos) == 1 {
  238. volumes = append(volumes, models.Volume{
  239. HostPath: models.StHostPath{
  240. Path: req.DatasetInfos[req.Uuids].DataLocalPath,
  241. MountPath: DataSetMountPath,
  242. ReadOnly: true,
  243. },
  244. })
  245. } else {
  246. for _, dataset := range req.DatasetInfos {
  247. volumes = append(volumes, models.Volume{
  248. HostPath: models.StHostPath{
  249. Path: dataset.DataLocalPath,
  250. MountPath: DataSetMountPath + "/" + dataset.Name,
  251. ReadOnly: true,
  252. },
  253. })
  254. }
  255. }
  256. createTime := timeutil.TimeStampNow()
  257. jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
  258. JobName: req.JobName,
  259. RetryCount: 1,
  260. GpuType: req.GpuQueue,
  261. Image: req.Image,
  262. TaskRoles: []models.TaskRole{
  263. {
  264. Name: SubTaskName,
  265. TaskNumber: 1,
  266. MinSucceededTaskCount: 1,
  267. MinFailedTaskCount: 1,
  268. CPUNumber: resourceSpec.CpuNum,
  269. GPUNumber: resourceSpec.GpuNum,
  270. MemoryMB: resourceSpec.MemMiB,
  271. ShmMB: resourceSpec.ShareMemMiB,
  272. Command: req.Command,
  273. NeedIBDevice: false,
  274. IsMainRole: false,
  275. UseNNI: false,
  276. },
  277. },
  278. Volumes: volumes,
  279. })
  280. if err != nil {
  281. log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
  282. return err
  283. }
  284. if jobResult.Code != Success {
  285. log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
  286. return errors.New(jobResult.Msg)
  287. }
  288. var jobID = jobResult.Payload["jobId"].(string)
  289. err = models.CreateCloudbrain(&models.Cloudbrain{
  290. Status: string(models.JobWaiting),
  291. UserID: req.Ctx.User.ID,
  292. RepoID: req.Ctx.Repo.Repository.ID,
  293. JobID: jobID,
  294. JobName: req.JobName,
  295. DisplayJobName: req.DisplayJobName,
  296. SubTaskName: SubTaskName,
  297. JobType: req.JobType,
  298. Type: models.TypeCloudBrainOne,
  299. Uuid: req.Uuids,
  300. Image: req.Image,
  301. GpuQueue: req.GpuQueue,
  302. ResourceSpecId: req.ResourceSpecId,
  303. ComputeResource: models.GPUResource,
  304. BenchmarkTypeID: req.BenchmarkTypeID,
  305. BenchmarkChildTypeID: req.BenchmarkChildTypeID,
  306. Description: req.Description,
  307. IsLatestVersion: "1",
  308. VersionCount: versionCount,
  309. BranchName: req.BranchName,
  310. BootFile: req.BootFile,
  311. DatasetName: req.DatasetNames,
  312. Parameters: req.Params,
  313. CreatedUnix: createTime,
  314. UpdatedUnix: createTime,
  315. CommitID: req.CommitID,
  316. })
  317. if err != nil {
  318. return err
  319. }
  320. task, err := models.GetCloudbrainByJobID(jobID)
  321. if err != nil {
  322. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  323. return err
  324. }
  325. stringId := strconv.FormatInt(task.ID, 10)
  326. if IsBenchmarkJob(req.JobType) {
  327. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
  328. } else if string(models.JobTypeTrain) == req.JobType {
  329. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
  330. } else {
  331. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
  332. }
  333. return nil
  334. }
  335. func IsBenchmarkJob(jobType string) bool {
  336. return string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType
  337. }
  338. func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
  339. jobName := task.JobName
  340. var resourceSpec *models.ResourceSpec
  341. if ResourceSpecs == nil {
  342. json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
  343. }
  344. for _, spec := range ResourceSpecs.ResourceSpec {
  345. if task.ResourceSpecId == spec.Id {
  346. resourceSpec = spec
  347. }
  348. }
  349. if resourceSpec == nil {
  350. log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"])
  351. return errors.New("no such resourceSpec")
  352. }
  353. datasetInfos, _, err := models.GetDatasetInfo(task.Uuid)
  354. if err != nil {
  355. log.Error("GetDatasetInfo failed:%v", err, ctx.Data["MsgID"])
  356. return err
  357. }
  358. volumes := []models.Volume{
  359. {
  360. HostPath: models.StHostPath{
  361. Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
  362. MountPath: CodeMountPath,
  363. ReadOnly: false,
  364. },
  365. },
  366. {
  367. HostPath: models.StHostPath{
  368. Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
  369. MountPath: ModelMountPath,
  370. ReadOnly: false,
  371. },
  372. },
  373. {
  374. HostPath: models.StHostPath{
  375. Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
  376. MountPath: BenchMarkMountPath,
  377. ReadOnly: true,
  378. },
  379. },
  380. {
  381. HostPath: models.StHostPath{
  382. Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
  383. MountPath: Snn4imagenetMountPath,
  384. ReadOnly: true,
  385. },
  386. },
  387. {
  388. HostPath: models.StHostPath{
  389. Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
  390. MountPath: BrainScoreMountPath,
  391. ReadOnly: true,
  392. },
  393. },
  394. }
  395. if len(datasetInfos) == 1 {
  396. volumes = append(volumes, models.Volume{
  397. HostPath: models.StHostPath{
  398. Path: datasetInfos[task.Uuid].DataLocalPath,
  399. MountPath: DataSetMountPath,
  400. ReadOnly: true,
  401. },
  402. })
  403. } else {
  404. for _, dataset := range datasetInfos {
  405. volumes = append(volumes, models.Volume{
  406. HostPath: models.StHostPath{
  407. Path: dataset.DataLocalPath,
  408. MountPath: DataSetMountPath + "/" + dataset.Name,
  409. ReadOnly: true,
  410. },
  411. })
  412. }
  413. }
  414. createTime := timeutil.TimeStampNow()
  415. jobResult, err := CreateJob(jobName, models.CreateJobParams{
  416. JobName: jobName,
  417. RetryCount: 1,
  418. GpuType: task.GpuQueue,
  419. Image: task.Image,
  420. TaskRoles: []models.TaskRole{
  421. {
  422. Name: SubTaskName,
  423. TaskNumber: 1,
  424. MinSucceededTaskCount: 1,
  425. MinFailedTaskCount: 1,
  426. CPUNumber: resourceSpec.CpuNum,
  427. GPUNumber: resourceSpec.GpuNum,
  428. MemoryMB: resourceSpec.MemMiB,
  429. ShmMB: resourceSpec.ShareMemMiB,
  430. Command: Command,
  431. NeedIBDevice: false,
  432. IsMainRole: false,
  433. UseNNI: false,
  434. },
  435. },
  436. Volumes: volumes,
  437. })
  438. if err != nil {
  439. log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
  440. return err
  441. }
  442. if jobResult.Code != Success {
  443. log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
  444. return errors.New(jobResult.Msg)
  445. }
  446. var jobID = jobResult.Payload["jobId"].(string)
  447. newTask := &models.Cloudbrain{
  448. Status: string(models.JobWaiting),
  449. UserID: task.UserID,
  450. RepoID: task.RepoID,
  451. JobID: jobID,
  452. JobName: task.JobName,
  453. DisplayJobName: task.DisplayJobName,
  454. SubTaskName: task.SubTaskName,
  455. JobType: task.JobType,
  456. Type: task.Type,
  457. Uuid: task.Uuid,
  458. DatasetName: task.DatasetName,
  459. Image: task.Image,
  460. GpuQueue: task.GpuQueue,
  461. ResourceSpecId: task.ResourceSpecId,
  462. ComputeResource: task.ComputeResource,
  463. CreatedUnix: createTime,
  464. UpdatedUnix: createTime,
  465. BranchName: task.BranchName,
  466. }
  467. err = models.RestartCloudbrain(task, newTask)
  468. if err != nil {
  469. log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
  470. return err
  471. }
  472. stringId := strconv.FormatInt(newTask.ID, 10)
  473. *newID = stringId
  474. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
  475. return nil
  476. }