You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 15 kB

4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528
  1. package cloudbrain
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "strconv"
  6. "code.gitea.io/gitea/modules/timeutil"
  7. "code.gitea.io/gitea/modules/storage"
  8. "code.gitea.io/gitea/models"
  9. "code.gitea.io/gitea/modules/context"
  10. "code.gitea.io/gitea/modules/log"
  11. "code.gitea.io/gitea/modules/notification"
  12. "code.gitea.io/gitea/modules/setting"
  13. )
  // Shell commands handed to a cloudbrain job as its task command.
  const (
  	// Command installs JupyterLab, stops the ssh service and starts JupyterLab
  	// on port 80 with "/code" as the notebook directory.
  	Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`

	// CommandBenchmark runs run_bk.sh from the /benchmark directory.
	// Previous form, kept for reference:
	//   echo "start benchmark";python /code/test.py;echo "end benchmark"
  	CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"`

  	// Snn4imagenetCommand is a format string with two %s placeholders
  	// (model name, model description).
  	Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s'`

  	// BrainScoreCommand is a format string with three %s placeholders
  	// (the values passed to -b, -n and -d).
  	BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s'`
  )

  // Paths inside the job container at which host directories are mounted.
  const (
  	CodeMountPath         = "/code"
  	DataSetMountPath      = "/dataset"
  	ModelMountPath        = "/model"
  	BenchMarkMountPath    = "/benchmark"
  	Snn4imagenetMountPath = "/snn4imagenet"
  	BrainScoreMountPath   = "/brainscore"
  )

  // Miscellaneous names and codes shared by the cloudbrain package.
  const (
  	LogFile             = "log.txt"
  	TaskInfoName        = "/taskInfo"
  	BenchMarkResourceID = 1

  	// SubTaskName is the name given to the single task role of every job.
  	SubTaskName = "task1"

  	// Success is the result code CreateJob responses are compared against.
  	Success = "S000"

  	DefaultBranchName = "master"
  )
  33. var (
  34. ResourceSpecs *models.ResourceSpecs
  35. TrainResourceSpecs *models.ResourceSpecs
  36. )
  // DatasetInfo describes one dataset that gets mounted into a job.
  type DatasetInfo struct {
  	// DataLocalPath is the host path mounted read-only into the job.
  	DataLocalPath string
  	// Name is the sub-directory under DataSetMountPath used when a job has
  	// more than one dataset.
  	Name string
  }
  41. type GenerateCloudBrainTaskReq struct {
  42. Ctx *context.Context
  43. DisplayJobName string
  44. JobName string
  45. Image string
  46. Command string
  47. CodePath string
  48. ModelPath string
  49. BenchmarkPath string
  50. Snn4ImageNetPath string
  51. BrainScorePath string
  52. JobType string
  53. GpuQueue string
  54. Description string
  55. BranchName string
  56. BootFile string
  57. Params string
  58. CommitID string
  59. Uuids string
  60. DatasetNames string
  61. DatasetInfos map[string]DatasetInfo
  62. BenchmarkTypeID int
  63. BenchmarkChildTypeID int
  64. ResourceSpecId int
  65. }
  66. func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  67. if !ctx.IsSigned {
  68. return false
  69. }
  70. if err != nil {
  71. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
  72. } else {
  73. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  74. }
  75. }
  76. func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
  77. return isAdminOrOwnerOrJobCreater(ctx, job, nil)
  78. }
  79. func CanCreateOrDebugJob(ctx *context.Context) bool {
  80. if !ctx.IsSigned {
  81. return false
  82. }
  83. return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
  84. }
  85. func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
  86. return isAdminOrJobCreater(ctx, job, nil)
  87. }
  88. func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  89. if !ctx.IsSigned {
  90. return false
  91. }
  92. if err != nil {
  93. return ctx.IsUserSiteAdmin()
  94. } else {
  95. return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  96. }
  97. }
  98. func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
  99. if !ctx.IsSigned {
  100. return false
  101. }
  102. if err != nil {
  103. return ctx.IsUserSiteAdmin()
  104. } else {
  105. return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
  106. }
  107. }
  108. func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
  109. var ID = ctx.Params(":id")
  110. job, err := models.GetCloudbrainByID(ID)
  111. if err != nil {
  112. log.Error("GetCloudbrainByID failed:%v", err.Error())
  113. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  114. }
  115. ctx.Cloudbrain = job
  116. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  117. log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
  118. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  119. }
  120. }
  121. func AdminOrJobCreaterRight(ctx *context.Context) {
  122. var ID = ctx.Params(":id")
  123. job, err := models.GetCloudbrainByID(ID)
  124. if err != nil {
  125. log.Error("GetCloudbrainByID failed:%v", err.Error())
  126. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  127. }
  128. ctx.Cloudbrain = job
  129. if !isAdminOrJobCreater(ctx, job, err) {
  130. log.Error("!isAdminOrJobCreater error:%v", err.Error())
  131. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  132. }
  133. }
  134. func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
  135. var jobID = ctx.Params(":jobid")
  136. job, err := models.GetCloudbrainByJobID(jobID)
  137. if err != nil {
  138. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  139. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  140. }
  141. ctx.Cloudbrain = job
  142. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  143. log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
  144. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  145. }
  146. }
  147. func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
  148. var jobID = ctx.Params(":jobid")
  149. job, err := models.GetCloudbrainByJobID(jobID)
  150. if err != nil {
  151. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  152. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  153. }
  154. ctx.Cloudbrain = job
  155. if !isAdminOrJobCreater(ctx, job, err) {
  156. log.Error("!isAdminOrJobCreater errot:%v", err.Error())
  157. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  158. }
  159. }
  160. func AdminOrImageCreaterRight(ctx *context.Context) {
  161. id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
  162. var image *models.Image
  163. if err != nil {
  164. log.Error("Get Image by ID failed:%v", err.Error())
  165. } else {
  166. image, err = models.GetImageByID(id)
  167. if err != nil {
  168. log.Error("Get Image by ID failed:%v", err.Error())
  169. return
  170. }
  171. }
  172. if !isAdminOrImageCreater(ctx, image, err) {
  173. log.Error("!isAdminOrImageCreater error:%v", err.Error())
  174. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  175. }
  176. }
  177. func GenerateTask(req GenerateCloudBrainTaskReq) error {
  178. var resourceSpec *models.ResourceSpec
  179. var versionCount int
  180. if req.JobType == string(models.JobTypeTrain) {
  181. versionCount = 1
  182. if TrainResourceSpecs == nil {
  183. json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
  184. }
  185. for _, spec := range TrainResourceSpecs.ResourceSpec {
  186. if req.ResourceSpecId == spec.Id {
  187. resourceSpec = spec
  188. }
  189. }
  190. } else {
  191. if ResourceSpecs == nil {
  192. json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
  193. }
  194. for _, spec := range ResourceSpecs.ResourceSpec {
  195. if req.ResourceSpecId == spec.Id {
  196. resourceSpec = spec
  197. }
  198. }
  199. }
  200. if resourceSpec == nil {
  201. log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"])
  202. return errors.New("no such resourceSpec")
  203. }
  204. volumes := []models.Volume{
  205. {
  206. HostPath: models.StHostPath{
  207. Path: req.CodePath,
  208. MountPath: CodeMountPath,
  209. ReadOnly: false,
  210. },
  211. },
  212. {
  213. HostPath: models.StHostPath{
  214. Path: req.ModelPath,
  215. MountPath: ModelMountPath,
  216. ReadOnly: false,
  217. },
  218. },
  219. {
  220. HostPath: models.StHostPath{
  221. Path: req.BenchmarkPath,
  222. MountPath: BenchMarkMountPath,
  223. ReadOnly: true,
  224. },
  225. },
  226. {
  227. HostPath: models.StHostPath{
  228. Path: req.Snn4ImageNetPath,
  229. MountPath: Snn4imagenetMountPath,
  230. ReadOnly: true,
  231. },
  232. },
  233. {
  234. HostPath: models.StHostPath{
  235. Path: req.BrainScorePath,
  236. MountPath: BrainScoreMountPath,
  237. ReadOnly: true,
  238. },
  239. },
  240. }
  241. if len(req.DatasetInfos) == 1 {
  242. volumes = append(volumes, models.Volume{
  243. HostPath: models.StHostPath{
  244. Path: req.DatasetInfos[req.Uuids].DataLocalPath,
  245. MountPath: DataSetMountPath,
  246. ReadOnly: true,
  247. },
  248. })
  249. } else {
  250. for _, dataset := range req.DatasetInfos {
  251. volumes = append(volumes, models.Volume{
  252. HostPath: models.StHostPath{
  253. Path: dataset.DataLocalPath,
  254. MountPath: DataSetMountPath + "/" + dataset.Name,
  255. ReadOnly: true,
  256. },
  257. })
  258. }
  259. }
  260. createTime := timeutil.TimeStampNow()
  261. jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
  262. JobName: req.JobName,
  263. RetryCount: 1,
  264. GpuType: req.GpuQueue,
  265. Image: req.Image,
  266. TaskRoles: []models.TaskRole{
  267. {
  268. Name: SubTaskName,
  269. TaskNumber: 1,
  270. MinSucceededTaskCount: 1,
  271. MinFailedTaskCount: 1,
  272. CPUNumber: resourceSpec.CpuNum,
  273. GPUNumber: resourceSpec.GpuNum,
  274. MemoryMB: resourceSpec.MemMiB,
  275. ShmMB: resourceSpec.ShareMemMiB,
  276. Command: req.Command,
  277. NeedIBDevice: false,
  278. IsMainRole: false,
  279. UseNNI: false,
  280. },
  281. },
  282. Volumes: volumes,
  283. })
  284. if err != nil {
  285. log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
  286. return err
  287. }
  288. if jobResult.Code != Success {
  289. log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
  290. return errors.New(jobResult.Msg)
  291. }
  292. var jobID = jobResult.Payload["jobId"].(string)
  293. err = models.CreateCloudbrain(&models.Cloudbrain{
  294. Status: string(models.JobWaiting),
  295. UserID: req.Ctx.User.ID,
  296. RepoID: req.Ctx.Repo.Repository.ID,
  297. JobID: jobID,
  298. JobName: req.JobName,
  299. DisplayJobName: req.DisplayJobName,
  300. SubTaskName: SubTaskName,
  301. JobType: req.JobType,
  302. Type: models.TypeCloudBrainOne,
  303. Uuid: req.Uuids,
  304. Image: req.Image,
  305. GpuQueue: req.GpuQueue,
  306. ResourceSpecId: req.ResourceSpecId,
  307. ComputeResource: models.GPUResource,
  308. BenchmarkTypeID: req.BenchmarkTypeID,
  309. BenchmarkChildTypeID: req.BenchmarkChildTypeID,
  310. Description: req.Description,
  311. IsLatestVersion: "1",
  312. VersionCount: versionCount,
  313. BranchName: req.BranchName,
  314. BootFile: req.BootFile,
  315. DatasetName: req.DatasetNames,
  316. Parameters: req.Params,
  317. CreatedUnix: createTime,
  318. UpdatedUnix: createTime,
  319. CommitID: req.CommitID,
  320. })
  321. if err != nil {
  322. return err
  323. }
  324. task, err := models.GetCloudbrainByJobID(jobID)
  325. if err != nil {
  326. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  327. return err
  328. }
  329. stringId := strconv.FormatInt(task.ID, 10)
  330. if IsBenchmarkJob(req.JobType) {
  331. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
  332. } else if string(models.JobTypeTrain) == req.JobType {
  333. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
  334. } else {
  335. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
  336. }
  337. return nil
  338. }
  339. func IsBenchmarkJob(jobType string) bool {
  340. return string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType
  341. }
  342. func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
  343. dataActualPath := setting.Attachment.Minio.RealPath +
  344. setting.Attachment.Minio.Bucket + "/" +
  345. setting.Attachment.Minio.BasePath +
  346. models.AttachmentRelativePath(task.Uuid) +
  347. task.Uuid
  348. jobName := task.JobName
  349. var resourceSpec *models.ResourceSpec
  350. if ResourceSpecs == nil {
  351. json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
  352. }
  353. for _, spec := range ResourceSpecs.ResourceSpec {
  354. if task.ResourceSpecId == spec.Id {
  355. resourceSpec = spec
  356. }
  357. }
  358. if resourceSpec == nil {
  359. log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"])
  360. return errors.New("no such resourceSpec")
  361. }
  362. createTime := timeutil.TimeStampNow()
  363. jobResult, err := CreateJob(jobName, models.CreateJobParams{
  364. JobName: jobName,
  365. RetryCount: 1,
  366. GpuType: task.GpuQueue,
  367. Image: task.Image,
  368. TaskRoles: []models.TaskRole{
  369. {
  370. Name: SubTaskName,
  371. TaskNumber: 1,
  372. MinSucceededTaskCount: 1,
  373. MinFailedTaskCount: 1,
  374. CPUNumber: resourceSpec.CpuNum,
  375. GPUNumber: resourceSpec.GpuNum,
  376. MemoryMB: resourceSpec.MemMiB,
  377. ShmMB: resourceSpec.ShareMemMiB,
  378. Command: Command,
  379. NeedIBDevice: false,
  380. IsMainRole: false,
  381. UseNNI: false,
  382. },
  383. },
  384. Volumes: []models.Volume{
  385. {
  386. HostPath: models.StHostPath{
  387. Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
  388. MountPath: CodeMountPath,
  389. ReadOnly: false,
  390. },
  391. },
  392. {
  393. HostPath: models.StHostPath{
  394. Path: dataActualPath,
  395. MountPath: DataSetMountPath,
  396. ReadOnly: true,
  397. },
  398. },
  399. {
  400. HostPath: models.StHostPath{
  401. Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
  402. MountPath: ModelMountPath,
  403. ReadOnly: false,
  404. },
  405. },
  406. {
  407. HostPath: models.StHostPath{
  408. Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
  409. MountPath: BenchMarkMountPath,
  410. ReadOnly: true,
  411. },
  412. },
  413. {
  414. HostPath: models.StHostPath{
  415. Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
  416. MountPath: Snn4imagenetMountPath,
  417. ReadOnly: true,
  418. },
  419. },
  420. {
  421. HostPath: models.StHostPath{
  422. Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
  423. MountPath: BrainScoreMountPath,
  424. ReadOnly: true,
  425. },
  426. },
  427. },
  428. })
  429. if err != nil {
  430. log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
  431. return err
  432. }
  433. if jobResult.Code != Success {
  434. log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
  435. return errors.New(jobResult.Msg)
  436. }
  437. var jobID = jobResult.Payload["jobId"].(string)
  438. newTask := &models.Cloudbrain{
  439. Status: string(models.JobWaiting),
  440. UserID: task.UserID,
  441. RepoID: task.RepoID,
  442. JobID: jobID,
  443. JobName: task.JobName,
  444. DisplayJobName: task.DisplayJobName,
  445. SubTaskName: task.SubTaskName,
  446. JobType: task.JobType,
  447. Type: task.Type,
  448. Uuid: task.Uuid,
  449. Image: task.Image,
  450. GpuQueue: task.GpuQueue,
  451. ResourceSpecId: task.ResourceSpecId,
  452. ComputeResource: task.ComputeResource,
  453. CreatedUnix: createTime,
  454. UpdatedUnix: createTime,
  455. BranchName: task.BranchName,
  456. }
  457. err = models.RestartCloudbrain(task, newTask)
  458. if err != nil {
  459. log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
  460. return err
  461. }
  462. stringId := strconv.FormatInt(newTask.ID, 10)
  463. *newID = stringId
  464. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
  465. return nil
  466. }