|
- package cloudbrain
-
- import (
- "encoding/json"
- "errors"
- "strconv"
-
- "code.gitea.io/gitea/modules/timeutil"
-
- "code.gitea.io/gitea/modules/storage"
-
- "code.gitea.io/gitea/models"
- "code.gitea.io/gitea/modules/context"
- "code.gitea.io/gitea/modules/log"
- "code.gitea.io/gitea/modules/notification"
- "code.gitea.io/gitea/modules/setting"
- )
-
// Shell commands run inside cloudbrain containers.
const (
	// Command installs JupyterLab 2.2.5, stops the ssh service and starts
	// JupyterLab on port 80 with /code as the notebook directory.
	Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
	// CommandBenchmark runs run_bk.sh from the /benchmark directory.
	// (An earlier variant ran `python /code/test.py` instead.)
	CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"`
	// Snn4imagenetCommand is a format string taking two %s values, passed to
	// the --modelname and --modeldescription flags respectively.
	Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s'`
	// BrainScoreCommand is a format string taking three %s values, passed to
	// the -b, -n and -d flags respectively.
	BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s'`
)

// Paths at which host directories are mounted inside the container.
const (
	CodeMountPath         = "/code"
	DataSetMountPath      = "/dataset"
	ModelMountPath        = "/model"
	BenchMarkMountPath    = "/benchmark"
	Snn4imagenetMountPath = "/snn4imagenet"
	BrainScoreMountPath   = "/brainscore"
)

// Miscellaneous names and codes shared by the cloudbrain helpers.
const (
	// LogFile is the name of the job log file.
	LogFile = "log.txt"
	// BenchMarkResourceID is a fixed numeric ID; its consumer is not in this
	// file. NOTE(review): presumably a resource spec ID for benchmark jobs — confirm.
	BenchMarkResourceID = 1
	// TaskInfoName is the "/taskInfo" path component; its consumer is not in this file.
	TaskInfoName = "/taskInfo"
	// SubTaskName is the name given to the single task role of every job.
	SubTaskName = "task1"
	// Success is the result code CreateJob reports when a job was accepted.
	Success = "S000"
	// DefaultBranchName is the branch name used when none is specified.
	DefaultBranchName = "master"
)
-
- var (
- ResourceSpecs *models.ResourceSpecs
- TrainResourceSpecs *models.ResourceSpecs
- )
-
- func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
- if !ctx.IsSigned {
- return false
- }
- if err != nil {
-
- return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
- } else {
- return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
- }
-
- }
-
- func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
-
- return isAdminOrOwnerOrJobCreater(ctx, job, nil)
- }
-
- func CanCreateOrDebugJob(ctx *context.Context) bool {
- if !ctx.IsSigned {
- return false
- }
- return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
- }
-
- func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
-
- return isAdminOrJobCreater(ctx, job, nil)
- }
-
- func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
- if !ctx.IsSigned {
- return false
- }
- if err != nil {
- return ctx.IsUserSiteAdmin()
- } else {
- return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
- }
-
- }
-
- func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
- if !ctx.IsSigned {
- return false
- }
- if err != nil {
- return ctx.IsUserSiteAdmin()
- } else {
- return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
- }
-
- }
-
- func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
-
- var ID = ctx.Params(":id")
- job, err := models.GetCloudbrainByID(ID)
- if err != nil {
- log.Error("GetCloudbrainByID failed:%v", err.Error())
- ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
- }
- ctx.Cloudbrain = job
- if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
- log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
- ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
- }
-
- }
-
- func AdminOrJobCreaterRight(ctx *context.Context) {
-
- var ID = ctx.Params(":id")
- job, err := models.GetCloudbrainByID(ID)
- if err != nil {
- log.Error("GetCloudbrainByID failed:%v", err.Error())
- ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
- }
- ctx.Cloudbrain = job
- if !isAdminOrJobCreater(ctx, job, err) {
- log.Error("!isAdminOrJobCreater error:%v", err.Error())
- ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
- }
-
- }
-
- func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
-
- var jobID = ctx.Params(":jobid")
- job, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- log.Error("GetCloudbrainByJobID failed:%v", err.Error())
- ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
- }
- ctx.Cloudbrain = job
- if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
- log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
- ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
- }
-
- }
-
- func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
-
- var jobID = ctx.Params(":jobid")
- job, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- log.Error("GetCloudbrainByJobID failed:%v", err.Error())
- ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
- }
- ctx.Cloudbrain = job
- if !isAdminOrJobCreater(ctx, job, err) {
- log.Error("!isAdminOrJobCreater errot:%v", err.Error())
- ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
- }
-
- }
-
- func AdminOrImageCreaterRight(ctx *context.Context) {
-
- id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
- var image *models.Image
- if err != nil {
- log.Error("Get Image by ID failed:%v", err.Error())
-
- } else {
- image, err = models.GetImageByID(id)
- if err != nil {
- log.Error("Get Image by ID failed:%v", err.Error())
- return
- }
- }
-
- if !isAdminOrImageCreater(ctx, image, err) {
- log.Error("!isAdminOrImageCreater error:%v", err.Error())
- ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
- }
-
- }
-
- func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid, codePath, modelPath, benchmarkPath, snn4imagenetPath, brainScorePath, jobType, gpuQueue, description, branchName, bootFile, params, commitID string, benchmarkTypeID, benchmarkChildTypeID, resourceSpecId int) error {
-
- dataActualPath := setting.Attachment.Minio.RealPath +
- setting.Attachment.Minio.Bucket + "/" +
- setting.Attachment.Minio.BasePath +
- models.AttachmentRelativePath(uuid) +
- uuid
-
- var resourceSpec *models.ResourceSpec
- var versionCount int
- if jobType == string(models.JobTypeTrain) {
- versionCount = 1
- if TrainResourceSpecs == nil {
- json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
- }
- for _, spec := range TrainResourceSpecs.ResourceSpec {
- if resourceSpecId == spec.Id {
- resourceSpec = spec
- }
- }
- } else {
- if ResourceSpecs == nil {
- json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
- }
- for _, spec := range ResourceSpecs.ResourceSpec {
- if resourceSpecId == spec.Id {
- resourceSpec = spec
- }
- }
-
- }
-
- if resourceSpec == nil {
- log.Error("no such resourceSpecId(%d)", resourceSpecId, ctx.Data["MsgID"])
- return errors.New("no such resourceSpec")
- }
-
- var datasetName string
- attach, err := models.GetAttachmentByUUID(uuid)
- if err != nil {
- //for benchmark, do not return error
- log.Error("GetAttachmentByUUID failed:%v", err)
- } else {
- datasetName = attach.Name
- }
-
- createTime := timeutil.TimeStampNow()
- jobResult, err := CreateJob(jobName, models.CreateJobParams{
- JobName: jobName,
- RetryCount: 1,
- GpuType: gpuQueue,
- Image: image,
- TaskRoles: []models.TaskRole{
- {
- Name: SubTaskName,
- TaskNumber: 1,
- MinSucceededTaskCount: 1,
- MinFailedTaskCount: 1,
- CPUNumber: resourceSpec.CpuNum,
- GPUNumber: resourceSpec.GpuNum,
- MemoryMB: resourceSpec.MemMiB,
- ShmMB: resourceSpec.ShareMemMiB,
- Command: command,
- NeedIBDevice: false,
- IsMainRole: false,
- UseNNI: false,
- },
- },
- Volumes: []models.Volume{
- {
- HostPath: models.StHostPath{
- Path: codePath,
- MountPath: CodeMountPath,
- ReadOnly: false,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: dataActualPath,
- MountPath: DataSetMountPath,
- ReadOnly: true,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: modelPath,
- MountPath: ModelMountPath,
- ReadOnly: false,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: benchmarkPath,
- MountPath: BenchMarkMountPath,
- ReadOnly: true,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: snn4imagenetPath,
- MountPath: Snn4imagenetMountPath,
- ReadOnly: true,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: brainScorePath,
- MountPath: BrainScoreMountPath,
- ReadOnly: true,
- },
- },
- },
- })
- if err != nil {
- log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
- return err
- }
- if jobResult.Code != Success {
- log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
- return errors.New(jobResult.Msg)
- }
-
- var jobID = jobResult.Payload["jobId"].(string)
- err = models.CreateCloudbrain(&models.Cloudbrain{
- Status: string(models.JobWaiting),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: jobID,
- JobName: jobName,
- DisplayJobName: displayJobName,
- SubTaskName: SubTaskName,
- JobType: jobType,
- Type: models.TypeCloudBrainOne,
- Uuid: uuid,
- Image: image,
- GpuQueue: gpuQueue,
- ResourceSpecId: resourceSpecId,
- ComputeResource: models.GPUResource,
- BenchmarkTypeID: benchmarkTypeID,
- BenchmarkChildTypeID: benchmarkChildTypeID,
- Description: description,
- IsLatestVersion: "1",
- VersionCount: versionCount,
- BranchName: branchName,
- BootFile: bootFile,
- DatasetName: datasetName,
- Parameters: params,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- CommitID: commitID,
- })
-
- if err != nil {
- return err
- }
-
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- log.Error("GetCloudbrainByName failed: %v", err.Error())
- return err
- }
- stringId := strconv.FormatInt(task.ID, 10)
-
- if IsBenchmarkJob(jobType) {
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateBenchMarkTask)
- } else if string(models.JobTypeTrain) == jobType {
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, displayJobName, models.ActionCreateGPUTrainTask)
- } else {
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugGPUTask)
- }
-
- return nil
- }
-
- func IsBenchmarkJob(jobType string) bool {
- return string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType
- }
-
- func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
- dataActualPath := setting.Attachment.Minio.RealPath +
- setting.Attachment.Minio.Bucket + "/" +
- setting.Attachment.Minio.BasePath +
- models.AttachmentRelativePath(task.Uuid) +
- task.Uuid
- jobName := task.JobName
-
- var resourceSpec *models.ResourceSpec
- if ResourceSpecs == nil {
- json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
- }
- for _, spec := range ResourceSpecs.ResourceSpec {
- if task.ResourceSpecId == spec.Id {
- resourceSpec = spec
- }
- }
-
- if resourceSpec == nil {
- log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"])
- return errors.New("no such resourceSpec")
- }
-
- createTime := timeutil.TimeStampNow()
- jobResult, err := CreateJob(jobName, models.CreateJobParams{
- JobName: jobName,
- RetryCount: 1,
- GpuType: task.GpuQueue,
- Image: task.Image,
- TaskRoles: []models.TaskRole{
- {
- Name: SubTaskName,
- TaskNumber: 1,
- MinSucceededTaskCount: 1,
- MinFailedTaskCount: 1,
- CPUNumber: resourceSpec.CpuNum,
- GPUNumber: resourceSpec.GpuNum,
- MemoryMB: resourceSpec.MemMiB,
- ShmMB: resourceSpec.ShareMemMiB,
- Command: Command,
- NeedIBDevice: false,
- IsMainRole: false,
- UseNNI: false,
- },
- },
- Volumes: []models.Volume{
- {
- HostPath: models.StHostPath{
- Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
- MountPath: CodeMountPath,
- ReadOnly: false,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: dataActualPath,
- MountPath: DataSetMountPath,
- ReadOnly: true,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
- MountPath: ModelMountPath,
- ReadOnly: false,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
- MountPath: BenchMarkMountPath,
- ReadOnly: true,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
- MountPath: Snn4imagenetMountPath,
- ReadOnly: true,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
- MountPath: BrainScoreMountPath,
- ReadOnly: true,
- },
- },
- },
- })
- if err != nil {
- log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
- return err
- }
- if jobResult.Code != Success {
- log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
- return errors.New(jobResult.Msg)
- }
-
- var jobID = jobResult.Payload["jobId"].(string)
- newTask := &models.Cloudbrain{
- Status: string(models.JobWaiting),
- UserID: task.UserID,
- RepoID: task.RepoID,
- JobID: jobID,
- JobName: task.JobName,
- DisplayJobName: task.DisplayJobName,
- SubTaskName: task.SubTaskName,
- JobType: task.JobType,
- Type: task.Type,
- Uuid: task.Uuid,
- Image: task.Image,
- GpuQueue: task.GpuQueue,
- ResourceSpecId: task.ResourceSpecId,
- ComputeResource: task.ComputeResource,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- BranchName: task.BranchName,
- }
-
- err = models.RestartCloudbrain(task, newTask)
- if err != nil {
- log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
- return err
- }
-
- stringId := strconv.FormatInt(newTask.ID, 10)
- *newID = stringId
-
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
-
- return nil
- }
|