package repo

import (
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"path"
	"strconv"
	"strings"
	"time"

	"code.gitea.io/gitea/models"
	"code.gitea.io/gitea/modules/auth"
	"code.gitea.io/gitea/modules/base"
	"code.gitea.io/gitea/modules/cloudbrain"
	"code.gitea.io/gitea/modules/context"
	"code.gitea.io/gitea/modules/dataset"
	"code.gitea.io/gitea/modules/git"
	"code.gitea.io/gitea/modules/grampus"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/modelarts"
	"code.gitea.io/gitea/modules/notification"
	"code.gitea.io/gitea/modules/redis/redis_key"
	"code.gitea.io/gitea/modules/redis/redis_lock"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/timeutil"
	"code.gitea.io/gitea/modules/urfs_client/urchin"
	"code.gitea.io/gitea/modules/util"
	cloudbrainService "code.gitea.io/gitea/services/cloudbrain"
	"code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
	"code.gitea.io/gitea/services/cloudbrain/resource"
	"code.gitea.io/gitea/services/reward/point/account"

	"github.com/unknwon/com"
)

const (
	tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show"

	//GPU
	tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"

	//NPU
	tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
)

func GrampusTrainJobGPUNew(ctx *context.Context) {
	ctx.Data["IsCreate"] = true
	err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
	if err != nil {
		ctx.ServerError("get new train-job info failed", err)
		return
	}
	ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
}

func GrampusTrainJobNPUNew(ctx *context.Context) {
	ctx.Data["IsCreate"] = true
	err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
	if err != nil {
		ctx.ServerError("get new train-job info failed", err)
		return
	}
	ctx.HTML(http.StatusOK, tplGrampusTrainJobNPUNew)
}

func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error {
	ctx.Data["PageIsCloudBrain"] = true

	t := time.Now()
	var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
	ctx.Data["display_job_name"] = displayJobName

	//get valid images
	images, err := grampus.GetImages(processType)
	if err != nil {
		log.Error("GetImages failed:", err.Error())
	} else {
		ctx.Data["images"] = images.Infos
	}

	grampus.InitSpecialPool()

	ctx.Data["GPUEnabled"] = true
	ctx.Data["NPUEnabled"] = true
	includeCenters := make(map[string]struct{})
	excludeCenters := make(map[string]struct{})
	if grampus.SpecialPools != nil {
		for _, pool := range grampus.SpecialPools.Pools {
			if pool.IsExclusive {
				if !IsUserInOrgPool(ctx.User.ID, pool) {
					ctx.Data[pool.Type+"Enabled"] = false
				}
			} else {
				if strings.Contains(strings.ToLower(processType), strings.ToLower(pool.Type)) {
					if IsUserInOrgPool(ctx.User.ID, pool) {
						for _, center := range pool.Pool {
							includeCenters[center.Queue] = struct{}{}
						}
					} else {
						for _, center := range pool.Pool {
							excludeCenters[center.Queue] = struct{}{}
						}
					}
				}
			}
		}
	}

	//prepare available specs
	if processType == grampus.ProcessorTypeNPU {
		prepareGrampusTrainSpecs(ctx, models.NPU)
	} else if processType == grampus.ProcessorTypeGPU {
		prepareGrampusTrainSpecs(ctx, models.GPU)
	}

	//get branches
	branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
	if err != nil {
		log.Error("GetBranches error:", err.Error())
	} else {
		ctx.Data["branches"] = branches
	}
	ctx.Data["branchName"] = ctx.Repo.BranchName

	if processType == grampus.ProcessorTypeGPU {
		ctx.Data["datasetType"] = models.TypeCloudBrainOne
		waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.GPUResource, models.JobTypeTrain)
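		// surface the current queue length and the user's unfinished C2Net GPU training tasks so the page can warn before submission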
		ctx.Data["WaitCount"] = waitCount
		NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
		ctx.Data["NotStopTaskCount"] = NotStopTaskCount
	} else if processType == grampus.ProcessorTypeNPU {
		ctx.Data["datasetType"] = models.TypeCloudBrainTwo
		waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.NPUResource, models.JobTypeTrain)
		ctx.Data["WaitCount"] = waitCount
		NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
		ctx.Data["NotStopTaskCount"] = NotStopTaskCount
	}

	if ctx.Cloudbrain != nil {
		uuids, datasetNames := dataset.GetFilterDeletedAttachments(ctx.Cloudbrain.Uuid)
		ctx.Data["attachment"] = uuids
		ctx.Data["boot_file"] = ctx.Cloudbrain.BootFile
		ctx.Data["image_id"] = ctx.Cloudbrain.ImageID
		ctx.Data["run_para_list"] = ctx.Cloudbrain.Parameters
		ctx.Data["description"] = ctx.Cloudbrain.Description
		ctx.Data["branch_name"] = ctx.Cloudbrain.BranchName
		ctx.Data["engine_name"] = ctx.Cloudbrain.EngineName
		ctx.Data["work_server_number"] = ctx.Cloudbrain.WorkServerNumber
		if ctx.Cloudbrain.Image != "" {
			ctx.Data["image"] = ctx.Cloudbrain.Image
		} else {
			ctx.Data["image"] = ctx.Cloudbrain.EngineName
		}
		ctx.Data["dataset_name"] = datasetNames
		ctx.Data["model_name"] = ctx.Cloudbrain.ModelName
		ctx.Data["model_version"] = ctx.Cloudbrain.ModelVersion
		ctx.Data["ckpt_name"] = ctx.Cloudbrain.CkptName
		ctx.Data["label_names"] = ctx.Cloudbrain.LabelName
		ctx.Data["pre_train_model_url"] = ctx.Cloudbrain.PreTrainModelUrl

		spec, _ := resource.GetCloudbrainSpec(ctx.Cloudbrain.ID)
		if spec != nil {
			ctx.Data["spec_id"] = spec.ID
		}
	}

	return nil
}

func GrampusTrainJobVersionNew(ctx *context.Context) {
	task := ctx.Cloudbrain
	ctx.Data["IsCreate"] = false
	if task.ComputeResource == models.GPUResource {
		err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		if err != nil {
			ctx.ServerError("get new train-job version info failed", err)
			return
		}
		ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
	} else if task.ComputeResource == models.NPUResource {
		err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		if err != nil {
			ctx.ServerError("get new train-job version info failed", err)
			return
		}
		ctx.HTML(http.StatusOK, tplGrampusTrainJobNPUNew)
	}
}

func prepareGrampusTrainSpecs(ctx *context.Context, computeResource string) {
	noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
		JobType:         models.JobTypeTrain,
		ComputeResource: computeResource,
		Cluster:         models.C2NetCluster,
	})
	ctx.Data["Specs"] = noteBookSpecs
}

func getFilterSpecBySpecialPool(specs *models.GetGrampusResourceSpecsResult, includeCenters map[string]struct{}, excludeCenters map[string]struct{}) []models.GrampusSpec {
	if len(includeCenters) == 0 && len(excludeCenters) == 0 {
		return specs.Infos
	}
	var grampusSpecs []models.GrampusSpec
	for _, info := range specs.Infos {
		if isInIncludeCenters(info, includeCenters) || (len(excludeCenters) != 0 && isNotAllInExcludeCenters(info, excludeCenters)) {
			grampusSpecs = append(grampusSpecs, info)
		}
	}
	return grampusSpecs
}

func isInIncludeCenters(grampusSpec models.GrampusSpec, centers map[string]struct{}) bool {
	for _, center := range grampusSpec.Centers {
		if _, ok := centers[center.ID]; ok {
			return true
		}
	}
	return false
}

func isNotAllInExcludeCenters(grampusSpec models.GrampusSpec, centers map[string]struct{}) bool {
	for _, center := range grampusSpec.Centers {
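		// a spec stays available as long as at least one of its centers is outside the exclude set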
		if _, ok := centers[center.ID]; !ok {
			return true
		}
	}
	return false
}

func IsUserInOrgPool(userId int64, pool *models.SpecialPool) bool {
	org, _ := models.GetOrgByName(pool.Org)
	if org != nil {
		isOrgMember, _ := models.IsOrganizationMember(org.ID, userId)
		return isOrgMember
	}
	return false
}

func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error {
	if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
		log.Error("the boot file(%s) must be a python file", form.BootFile)
		return errors.New("启动文件必须是python文件") // the boot file must be a python file
	}

	if form.BranchName == "" {
		log.Error("the branch must not be null!", form.BranchName)
		return errors.New("代码分支不能为空!") // the code branch must not be empty
	}

	return nil
}

func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
	ctx.Data["IsCreate"] = true
	grampusTrainJobGpuCreate(ctx, form)
}

func grampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
	displayJobName := form.DisplayJobName
	jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
	uuid := form.Attachment
	description := form.Description
	bootFile := strings.TrimSpace(form.BootFile)
	params := form.Params
	repo := ctx.Repo.Repository
	codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
	codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
	branchName := form.BranchName
	image := strings.TrimSpace(form.Image)
	tpl := tplGrampusTrainJobGPUNew

	lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
	isOk, err := lock.Lock(models.CloudbrainKeyDuration)
	if !isOk {
		log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplGrampusTrainJobGPUNew, &form)
		return
	}
	defer lock.UnLock()

	if !jobNamePattern.MatchString(displayJobName) {
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
		return
	}

	bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
	if err != nil || !bootFileExist {
		log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
		return
	}

	//check count limit
	count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
	if err != nil {
		log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr("system error", tpl, &form)
		return
	} else {
		if count >= 1 {
			log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
			grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
			ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
			return
		}
	}

	//check param
	if err := grampusParamCheckCreateTrainJob(form); err != nil {
		log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(err.Error(), tpl, &form)
		return
	}

	//check whether the task name in the project is duplicated
	tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
	if err == nil {
		if len(tasks) != 0 {
			log.Error("the job name did already exist", ctx.Data["MsgID"])
			grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
			ctx.RenderWithErr("the job name did already exist", tpl, &form)
			return
		}
	} else {
		if !models.IsErrJobNotExist(err) {
			log.Error("system error, %v", err, ctx.Data["MsgID"])
			grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
			ctx.RenderWithErr("system error", tpl, &form)
			return
		}
	}

	//check specification
	spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
		JobType:         models.JobTypeTrain,
		ComputeResource: models.GPU,
		Cluster:         models.C2NetCluster,
	})
	if err != nil || spec == nil {
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr("Resource specification not available", tpl, &form)
		return
	}
	if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
		log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobGPUNew, &form)
		return
	}

	//check dataset
	datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
	if err != nil {
		log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
		return
	}

	//prepare code and out path
	_, err = ioutil.ReadDir(codeLocalPath)
	if err == nil {
		os.RemoveAll(codeLocalPath)
	}

	if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
		log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
		return
	}

	//todo: upload code (send to file_server todo this work?)
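	// the zipped repository code is pushed to MinIO under the job's code path; the generated
	// command later downloads and unzips it on the GPU node before executing the boot file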
	//upload code
	if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
		log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
		return
	}

	modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
	if err := mkModelPath(modelPath); err != nil {
		log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
		return
	}

	//init model readme
	if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
		log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
		return
	}

	var datasetRemotePath, allFileName string
	for _, datasetInfo := range datasetInfos {
		if datasetRemotePath == "" {
			datasetRemotePath = datasetInfo.DataLocalPath
			allFileName = datasetInfo.FullName
		} else {
			datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
			allFileName = allFileName + ";" + datasetInfo.FullName
		}
	}

	//prepare command
	preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)

	command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, form.CkptName, "")
	if err != nil {
		log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
		return
	}

	commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)

	req := &grampus.GenerateTrainJobReq{
		JobName:          jobName,
		DisplayJobName:   displayJobName,
		ComputeResource:  models.GPUResource,
		ProcessType:      grampus.ProcessorTypeGPU,
		Command:          command,
		ImageUrl:         image,
		Description:      description,
		BootFile:         bootFile,
		Uuid:             uuid,
		CommitID:         commitID,
		BranchName:       branchName,
		Params:           form.Params,
		EngineName:       image,
		DatasetNames:     datasetNames,
		DatasetInfos:     datasetInfos,
		IsLatestVersion:  modelarts.IsLatestVersion,
		VersionCount:     modelarts.VersionCountOne,
		WorkServerNumber: 1,
		Spec:             spec,
	}

	if form.ModelName != "" {
		//train with a pre-trained model
		req.ModelName = form.ModelName
		req.LabelName = form.LabelName
		req.CkptName = form.CkptName
		req.ModelVersion = form.ModelVersion
		req.PreTrainModelUrl = form.PreTrainModelUrl
	}

	err = grampus.GenerateTrainJob(ctx, req)
	if err != nil {
		log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
		ctx.RenderWithErr(err.Error(), tpl, &form)
		return
	}
	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
	index := strings.Index(pretrainModelDir, "/")
	if index > 0 {
		filterBucket := pretrainModelDir[index+1:]
		return filterBucket + fileName
	} else {
		return ""
	}
}

func GrampusTrainJobVersionCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
	ctx.Data["IsCreate"] = false
	computeResource := ctx.Query("compute_resource")
	if computeResource == models.GPUResource {
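		// a new version of an existing job is submitted through the same create flow as a fresh job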
		grampusTrainJobGpuCreate(ctx, form)
	} else if computeResource == models.NPUResource {
		grampusTrainJobNpuCreate(ctx, form)
	} else {
		ctx.ServerError("resource error", errors.New("compute resource is not support"))
		return
	}
}

func checkSpecialPool(ctx *context.Context, resourceType string) string {
	grampus.InitSpecialPool()
	if grampus.SpecialPools != nil {
		for _, pool := range grampus.SpecialPools.Pools {
			if pool.IsExclusive && pool.Type == resourceType {
				org, _ := models.GetOrgByName(pool.Org)
				if org != nil {
					isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID)
					if !isOrgMember {
						return ctx.Tr("repo.grampus.no_operate_right")
					}
				}
			}
		}
	}
	return ""
}

func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
	ctx.Data["IsCreate"] = true
	grampusTrainJobNpuCreate(ctx, form)
}

func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
	displayJobName := form.DisplayJobName
	jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
	uuid := form.Attachment
	description := form.Description
	bootFile := strings.TrimSpace(form.BootFile)
	params := form.Params
	repo := ctx.Repo.Repository
	codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
	codeObsPath := grampus.JobPath + jobName + modelarts.CodePath
	//dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
	branchName := form.BranchName
	isLatestVersion := modelarts.IsLatestVersion
	versionCount := modelarts.VersionCountOne
	engineName := form.EngineName
	tpl := tplGrampusTrainJobNPUNew

	lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
	isOk, err := lock.Lock(models.CloudbrainKeyDuration)
	if !isOk {
		log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplGrampusTrainJobNPUNew, &form)
		return
	}
	defer lock.UnLock()

	if !jobNamePattern.MatchString(displayJobName) {
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
		return
	}

	bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
	if err != nil || !bootFileExist {
		log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
		return
	}

	//check count limit
	count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
	if err != nil {
		log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr("system error", tpl, &form)
		return
	} else {
		if count >= 1 {
			log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
			grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
			ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
			return
		}
	}

	//check param
	if err := grampusParamCheckCreateTrainJob(form); err != nil {
		log.Error("paramCheckCreateTrainJob failed:(%v)", err)
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr(err.Error(), tpl, &form)
		return
	}

	//check whether the task name in the project is duplicated
	tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
	if err == nil {
		if len(tasks) != 0 {
			log.Error("the job name did already exist", ctx.Data["MsgID"])
			grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
			ctx.RenderWithErr("the job name did already exist", tpl, &form)
			return
		}
	} else {
		if !models.IsErrJobNotExist(err) {
			log.Error("system error, %v", err, ctx.Data["MsgID"])
			grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
			ctx.RenderWithErr("system error", tpl, &form)
			return
		}
	}

	//check specification
	spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
		JobType:         models.JobTypeTrain,
		ComputeResource: models.NPU,
		Cluster:         models.C2NetCluster,
	})
	if err != nil || spec == nil {
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr("Resource specification not available", tpl, &form)
		return
	}
	if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
		log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobNPUNew, &form)
		return
	}

	//check dataset
	datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.NPU)
	if err != nil {
		log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
		return
	}

	//prepare code and out path
	_, err = ioutil.ReadDir(codeLocalPath)
	if err == nil {
		os.RemoveAll(codeLocalPath)
	}

	if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
		log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
		return
	}

	//todo: upload code (send to file_server todo this work?)
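	// the NPU flow stages everything on OBS: the job's output directory is created first,
	// then the zipped repository code is uploaded so the NPU cluster can read it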
	if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
		log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
		return
	}

	if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
		log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
		return
	}

	var datasetRemotePath, allFileName string
	for _, datasetInfo := range datasetInfos {
		if datasetRemotePath == "" {
			datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
			allFileName = datasetInfo.FullName
		} else {
			datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
			allFileName = allFileName + ";" + datasetInfo.FullName
		}
	}

	//prepare command
	preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)

	command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, form.CkptName, grampus.GetNpuModelRemoteObsUrl(jobName))
	if err != nil {
		log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
		return
	}

	commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)

	req := &grampus.GenerateTrainJobReq{
		JobName:           jobName,
		DisplayJobName:    displayJobName,
		ComputeResource:   models.NPUResource,
		ProcessType:       grampus.ProcessorTypeNPU,
		Command:           command,
		ImageId:           form.ImageID,
		Description:       description,
		CodeObsPath:       codeObsPath,
		BootFileUrl:       codeObsPath + bootFile,
		BootFile:          bootFile,
		WorkServerNumber:  form.WorkServerNumber,
		Uuid:              uuid,
		CommitID:          commitID,
		IsLatestVersion:   isLatestVersion,
		BranchName:        branchName,
		Params:            form.Params,
		EngineName:        engineName,
		VersionCount:      versionCount,
		TotalVersionCount: modelarts.TotalVersionCount,
		DatasetNames:      datasetNames,
		DatasetInfos:      datasetInfos,
		Spec:              spec,
		CodeName:          strings.ToLower(repo.Name),
	}

	if form.ModelName != "" {
		//train with a pre-trained model
		req.ModelName = form.ModelName
		req.LabelName = form.LabelName
		req.CkptName = form.CkptName
		req.ModelVersion = form.ModelVersion
		req.PreTrainModelUrl = form.PreTrainModelUrl
		req.PreTrainModelPath = preTrainModelPath
	}

	err = grampus.GenerateTrainJob(ctx, req)
	if err != nil {
		log.Error("GenerateTrainJob failed:%v", err.Error())
		grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
		ctx.RenderWithErr(err.Error(), tpl, &form)
		return
	}

	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

func GrampusStopJob(ctx *context.Context) {
	var ID = ctx.Params(":jobid")
	var resultCode = "0"
	var errorMsg = ""
	var status = ""

	task := ctx.Cloudbrain
	for {
		if task.Status == string(models.GrampusStatusStopped) || task.Status == string(models.GrampusStatusFailed) || task.Status == string(models.GrampusStatusSucceeded) {
			log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
			resultCode = "-1"
			errorMsg = "system error"
			break
		}

		res, err := grampus.StopJob(task.JobID)
		if err != nil {
			log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
			resultCode = strconv.Itoa(res.ErrorCode)
			errorMsg = res.ErrorMsg
			break
		}

		oldStatus := task.Status
		task.Status = string(models.GrampusStatusStopped)
		if task.EndTime == 0 {
			task.EndTime = timeutil.TimeStampNow()
		}
		task.ComputeAndSetDuration()
		if oldStatus != task.Status {
			notification.NotifyChangeCloudbrainStatus(task, oldStatus)
		}
		err = models.UpdateJob(task)
		if err != nil {
			log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
			resultCode = "-1"
			errorMsg = "system error"
			break
		}

		status = task.Status
		break
	}

	ctx.JSON(http.StatusOK, map[string]interface{}{
		"result_code": resultCode,
		"error_msg":   errorMsg,
		"status":      status,
		"id":          ID,
		"StatusOK":    0,
	})
}

func GrampusTrainJobDel(ctx *context.Context) {
	var listType = ctx.Query("listType")
	if err := deleteGrampusJob(ctx); err != nil {
		log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
		ctx.ServerError(err.Error(), err)
		return
	}

	var isAdminPage = ctx.Query("isadminpage")
	var isHomePage = ctx.Query("ishomepage")
	if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
		ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
	} else if isHomePage == "true" {
		ctx.Redirect(setting.AppSubURL + "/cloudbrains")
	} else {
		ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
	}
}

func deleteGrampusJob(ctx *context.Context) error {
	task := ctx.Cloudbrain

	if task.Status != string(models.GrampusStatusStopped) && task.Status != string(models.GrampusStatusSucceeded) && task.Status != string(models.GrampusStatusFailed) {
		log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
		return errors.New("the job has not been stopped")
	}

	err := models.DeleteJob(task)
	if err != nil {
		log.Error("DeleteJob failed: %v", err, ctx.Data["msgID"])
		return err
	}

	storageType := models.TypeCloudBrainOne
	if task.ComputeResource == models.NPUResource {
		storageType = models.TypeCloudBrainTwo
	}
	DeleteCloudbrainJobStorage(task.JobName, storageType)
	return nil
}

func GrampusTrainJobShow(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true

	var task *models.Cloudbrain
	task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid"))
	if err != nil {
		log.Error("GetCloudbrainByJobID failed:" + err.Error())
		ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
		return
	}
	task.ContainerIp = ""

	if task.DeletedAt.IsZero() { //normal record
		result, err := grampus.GetJob(task.JobID)
		if err != nil {
			log.Error("GetJob failed:" + err.Error())
			ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
			return
		}

		if result != nil {
			if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
				task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
			}
			oldStatus := task.Status
			task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
			if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
				task.Duration = result.JobInfo.RunSec
				if task.Duration < 0 {
					task.Duration = 0
				}
				task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)

				if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
					task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
				}
				if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
					task.EndTime = task.StartTime.Add(task.Duration)
				}
				task.CorrectCreateUnix()

				if oldStatus != task.Status {
					notification.NotifyChangeCloudbrainStatus(task, oldStatus)
					if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
						if len(result.JobInfo.Tasks[0].CenterID) == 1 {
							urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
						}
					}
				}

				err = models.UpdateJob(task)
				if err != nil {
					log.Error("UpdateJob failed:" + err.Error())
				}
			}
		}
	}

	if len(task.Parameters) > 0 {
		var parameters models.Parameters
		err := json.Unmarshal([]byte(task.Parameters), &parameters)
		if err != nil {
			log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
			ctx.ServerError("system error", err)
			return
		}

		if len(parameters.Parameter) > 0 {
			paramTemp := ""
			for _, Parameter := range parameters.Parameter {
				param := Parameter.Label + " = " + Parameter.Value + "; "
				paramTemp = paramTemp + param
			}
			task.Parameters = paramTemp[:len(paramTemp)-2]
		} else {
			task.Parameters = ""
		}
	}

	taskList := make([]*models.Cloudbrain, 0)
	taskList = append(taskList, task)
	prepareSpec4Show(ctx, task)
	ctx.Data["version_list_task"] = taskList
	ctx.Data["datasetDownload"] = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
	ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
	ctx.Data["displayJobName"] = task.DisplayJobName
	ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)

	ctx.HTML(http.StatusOK, tplGrampusTrainJobShow)
}

func GrampusDownloadLog(ctx *context.Context) {
	jobID := ctx.Params(":jobid")
	job, err := models.GetCloudbrainByJobID(jobID)
	if err != nil {
		log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
		ctx.ServerError(err.Error(), err)
		return
	}

	content, err := grampus.GetTrainJobLog(job.JobID)
	if err != nil {
		log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
		content = ""
	}
	fileName := job.JobName + "-log.txt"
	ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+fileName)
	ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
	var b []byte = []byte(content)
	ctx.Resp.Write(b)
}

func GrampusGetLog(ctx *context.Context) {
	jobID := ctx.Params(":jobid")
	job, err := models.GetCloudbrainByJobID(jobID)
	if err != nil {
		log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
		ctx.ServerError(err.Error(), err)
		return
	}

	content, err := grampus.GetTrainJobLog(job.JobID)
	if err != nil {
		log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
		ctx.ServerError(err.Error(), err)
		return
	}
	var canLogDownload bool
	if err != nil {
		canLogDownload = false
	} else {
		canLogDownload = true
	}
	ctx.JSON(http.StatusOK, map[string]interface{}{
		"JobName":        job.JobName,
		"Content":        content,
		"CanLogDownload": canLogDownload,
	})

	return
}

func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName, modelRemoteObsUrl string) (string, error) {
	var command string

	//prepare
	workDir := grampus.NpuWorkDir
	if processorType == grampus.ProcessorTypeNPU {
		command += "pwd;cd " + workDir + grampus.CommandPrepareScriptNpu
	} else if processorType == grampus.ProcessorTypeGPU {
		workDir = grampus.GpuWorkDir
		command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScriptGpu, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject)
	}

	//download code & dataset
	if processorType == grampus.ProcessorTypeNPU {
		//no need to download code & dataset by internet
	} else if processorType == grampus.ProcessorTypeGPU {
		commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
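		// when a pre-trained model was selected, its storage path and checkpoint file name are
		// appended as extra arguments so the downloader also fetches the checkpoint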
		commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
		command += commandDownload
	}

	//unzip code & dataset
	if processorType == grampus.ProcessorTypeNPU {
		//no need to process
	} else if processorType == grampus.ProcessorTypeGPU {
		unZipDatasetCommand := generateDatasetUnzipCommand(datasetName)
		commandUnzip := "cd " + workDir + "code;unzip -q master.zip;rm -f master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
		command += commandUnzip
	}

	command += "echo \"unzip finished;start to exec code;\";"

	// set export
	var commandExport string
	if processorType == grampus.ProcessorTypeNPU {
		commandExport = "export bucket=" + setting.Bucket + " && export remote_path=" + outputRemotePath + ";"
	} else if processorType == grampus.ProcessorTypeGPU {
		commandExport = "export env=" + setting.Grampus.Env + " && export remote_path=" + outputRemotePath + ";"
	}
	command += commandExport

	//exec code
	var parameters models.Parameters
	var paramCode string

	if len(paramSrc) != 0 {
		err := json.Unmarshal([]byte(paramSrc), &parameters)
		if err != nil {
			log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err)
			return command, err
		}

		for _, parameter := range parameters.Parameter {
			paramCode += " --" + parameter.Label + "=" + parameter.Value
		}
	}

	var commandCode string
	if processorType == grampus.ProcessorTypeNPU {
		paramCode += " --model_url=" + modelRemoteObsUrl
		commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py " + grampus.NpuLocalLogUrl + paramCode + ";"
	} else if processorType == grampus.ProcessorTypeGPU {
		if pretrainModelFileName != "" {
			paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName
		}
		commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";"
	}
	command += commandCode

	//get exec result
	commandGetRes := "result=$?;"
	command += commandGetRes

	//upload models
	if processorType == grampus.ProcessorTypeNPU {
		// no need to upload
	} else if processorType == grampus.ProcessorTypeGPU {
		commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;"
		command += commandUpload
	}

	//check exec result
	commandCheckRes := "bash -c \"[[ $result -eq 0 ]] && exit 0 || exit -1\""
	command += commandCheckRes

	return command, nil
}

func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string {
	commandDownloadTemp := commandDownload
	if pretrainModelPath != "" {
		commandDownloadTemp += " '" + pretrainModelPath + "' '" + pretrainModelFileName + "'"
	}
	commandDownloadTemp += ";"
	return commandDownloadTemp
}

func generateDatasetUnzipCommand(datasetName string) string {
	var unZipDatasetCommand string
	datasetNameArray := strings.Split(datasetName, ";")
	if len(datasetNameArray) == 1 {
		//single dataset
		unZipDatasetCommand = "unzip -q '" + datasetName + "';"
		if strings.HasSuffix(datasetNameArray[0], ".tar.gz") {
			unZipDatasetCommand = "tar --strip-components=1 -zxvf '" + datasetName + "';"
		}
		unZipDatasetCommand += "rm -f '" + datasetName + "';"
	} else {
		//multiple datasets
		for _, datasetNameTemp := range datasetNameArray {
			if strings.HasSuffix(datasetNameTemp, ".tar.gz") {
				unZipDatasetCommand = unZipDatasetCommand + "tar -zxvf '" + datasetNameTemp + "';"
			} else {
				unZipDatasetCommand = unZipDatasetCommand + "unzip -q '" + datasetNameTemp + "' -d './" + strings.TrimSuffix(datasetNameTemp, ".zip") + "';"
			}
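			// remove each archive after extraction so only the extracted data stays on the node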
			unZipDatasetCommand += "rm -f '" + datasetNameTemp + "';"
		}
	}

	return unZipDatasetCommand
}

func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
	archiveType := git.ZIP
	archivePath := codePath

	if !com.IsDir(archivePath) {
		if err := os.MkdirAll(archivePath, os.ModePerm); err != nil {
			log.Error("MkdirAll failed:" + err.Error())
			return err
		}
	}

	// Get corresponding commit.
	var (
		commit *git.Commit
		err    error
	)

	gitRepo := ctx.Repo.GitRepo
	if gitRepo.IsBranchExist(branchName) {
		commit, err = gitRepo.GetBranchCommit(branchName)
		if err != nil {
			log.Error("GetBranchCommit failed:" + err.Error())
			return err
		}
	} else {
		log.Error("the branch is not exist: " + branchName)
		return fmt.Errorf("The branch does not exist.")
	}

	archivePath = path.Join(archivePath, grampus.CodeArchiveName)
	if !com.IsFile(archivePath) {
		if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{
			Format: archiveType,
			Prefix: setting.Repository.PrefixArchiveFiles,
		}); err != nil {
			log.Error("CreateArchive failed:" + err.Error())
			return err
		}
	}

	return nil
}