|
- package repo
-
- import (
- "encoding/json"
- "errors"
- "io"
- "io/ioutil"
- "net/http"
- "os"
- "path"
- "strconv"
- "strings"
- "time"
-
- "code.gitea.io/gitea/models"
- "code.gitea.io/gitea/modules/auth"
- "code.gitea.io/gitea/modules/base"
- "code.gitea.io/gitea/modules/context"
- "code.gitea.io/gitea/modules/git"
- "code.gitea.io/gitea/modules/log"
- "code.gitea.io/gitea/modules/modelarts"
- "code.gitea.io/gitea/modules/obs"
- "code.gitea.io/gitea/modules/setting"
- "code.gitea.io/gitea/modules/storage"
-
- "github.com/unknwon/com"
- )
-
- const (
- tplModelArtsNotebookIndex base.TplName = "repo/modelarts/notebook/index"
- tplModelArtsNotebookNew base.TplName = "repo/modelarts/notebook/new"
- tplModelArtsNotebookShow base.TplName = "repo/modelarts/notebook/show"
-
- tplModelArtsTrainJobIndex base.TplName = "repo/modelarts/trainjob/index"
- tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new"
- tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show"
- tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new"
- )
-
- // MustEnableDataset check if repository enable internal cb
- func MustEnableModelArts(ctx *context.Context) {
- if !ctx.Repo.CanRead(models.UnitTypeCloudBrain) {
- ctx.NotFound("MustEnableCloudbrain", nil)
- return
- }
- }
-
- func NotebookIndex(ctx *context.Context) {
- MustEnableModelArts(ctx)
- repo := ctx.Repo.Repository
- page := ctx.QueryInt("page")
- if page <= 0 {
- page = 1
- }
-
- ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
- ListOptions: models.ListOptions{
- Page: page,
- PageSize: setting.UI.IssuePagingNum,
- },
- RepoID: repo.ID,
- Type: models.TypeCloudBrainTwo,
- JobType: string(models.JobTypeDebug),
- })
- if err != nil {
- ctx.ServerError("Cloudbrain", err)
- return
- }
-
- for i, task := range ciTasks {
- if task.Status == string(models.JobRunning) {
- ciTasks[i].CanDebug = true
- } else {
- ciTasks[i].CanDebug = false
- }
- }
-
- pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
- pager.SetDefaultParams(ctx)
- ctx.Data["Page"] = pager
-
- ctx.Data["PageIsCloudBrain"] = true
- ctx.Data["Tasks"] = ciTasks
- ctx.HTML(200, tplModelArtsNotebookIndex)
- }
-
- func NotebookNew(ctx *context.Context) {
- ctx.Data["PageIsCloudBrain"] = true
-
- t := time.Now()
- var jobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
- ctx.Data["job_name"] = jobName
-
- attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
- if err != nil {
- ctx.ServerError("GetAllUserAttachments failed:", err)
- return
- }
-
- ctx.Data["attachments"] = attachs
- ctx.Data["dataset_path"] = modelarts.DataSetMountPath
- ctx.Data["env"] = modelarts.NotebookEnv
- ctx.Data["notebook_type"] = modelarts.NotebookType
- if modelarts.FlavorInfos == nil {
- json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos)
- }
- ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo
-
- ctx.HTML(200, tplModelArtsNotebookNew)
- }
-
- func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
- ctx.Data["PageIsNotebook"] = true
- jobName := form.JobName
- uuid := form.Attachment
- description := form.Description
- flavor := form.Flavor
-
- err := modelarts.GenerateTask(ctx, jobName, uuid, description, flavor)
- if err != nil {
- ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
- return
- }
-
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/notebook")
- }
-
- func NotebookShow(ctx *context.Context) {
- ctx.Data["PageIsCloudBrain"] = true
-
- var jobID = ctx.Params(":jobid")
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- ctx.Data["error"] = err.Error()
- ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
- return
- }
-
- result, err := modelarts.GetJob(jobID)
- if err != nil {
- ctx.Data["error"] = err.Error()
- ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
- return
- }
-
- if result != nil {
- task.Status = result.Status
- err = models.UpdateJob(task)
- if err != nil {
- ctx.Data["error"] = err.Error()
- ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
- return
- }
-
- createTime, _ := com.StrTo(result.CreationTimestamp).Int64()
- result.CreateTime = time.Unix(int64(createTime/1000), 0).Format("2006-01-02 15:04:05")
- endTime, _ := com.StrTo(result.LatestUpdateTimestamp).Int64()
- result.LatestUpdateTime = time.Unix(int64(endTime/1000), 0).Format("2006-01-02 15:04:05")
- result.QueuingInfo.BeginTime = time.Unix(int64(result.QueuingInfo.BeginTimestamp/1000), 0).Format("2006-01-02 15:04:05")
- result.QueuingInfo.EndTime = time.Unix(int64(result.QueuingInfo.EndTimestamp/1000), 0).Format("2006-01-02 15:04:05")
- }
-
- ctx.Data["task"] = task
- ctx.Data["jobID"] = jobID
- ctx.Data["result"] = result
- ctx.HTML(200, tplModelArtsNotebookShow)
- }
-
- func NotebookDebug(ctx *context.Context) {
- var jobID = ctx.Params(":jobid")
- _, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- ctx.ServerError("GetCloudbrainByJobID failed", err)
- return
- }
-
- result, err := modelarts.GetJob(jobID)
- if err != nil {
- ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
- return
- }
-
- res, err := modelarts.GetJobToken(jobID)
- if err != nil {
- ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
- return
- }
-
- urls := strings.Split(result.Spec.Annotations.Url, "/")
- urlPrefix := result.Spec.Annotations.TargetDomain
- for i, url := range urls {
- if i > 2 {
- urlPrefix += "/" + url
- }
- }
-
- debugUrl := urlPrefix + "?token=" + res.Token
- ctx.Redirect(debugUrl)
- }
-
- func NotebookStop(ctx *context.Context) {
- var jobID = ctx.Params(":jobid")
- log.Info(jobID)
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- ctx.ServerError("GetCloudbrainByJobID failed", err)
- return
- }
-
- if task.Status != string(models.JobRunning) {
- log.Error("the job(%s) is not running", task.JobName)
- ctx.ServerError("the job is not running", errors.New("the job is not running"))
- return
- }
-
- param := models.NotebookAction{
- Action: models.ActionStop,
- }
- res, err := modelarts.StopJob(jobID, param)
- if err != nil {
- log.Error("StopJob(%s) failed:%v", task.JobName, err.Error())
- ctx.ServerError("StopJob failed", err)
- return
- }
-
- task.Status = res.CurrentStatus
- err = models.UpdateJob(task)
- if err != nil {
- ctx.ServerError("UpdateJob failed", err)
- return
- }
-
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/notebook")
- }
-
- func NotebookDel(ctx *context.Context) {
- var jobID = ctx.Params(":jobid")
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- ctx.ServerError("GetCloudbrainByJobID failed", err)
- return
- }
-
- if task.Status != string(models.JobStopped) {
- log.Error("the job(%s) has not been stopped", task.JobName)
- ctx.ServerError("the job has not been stopped", errors.New("the job has not been stopped"))
- return
- }
-
- _, err = modelarts.DelNotebook(jobID)
- if err != nil {
- log.Error("DelJob(%s) failed:%v", task.JobName, err.Error())
- ctx.ServerError("DelJob failed", err)
- return
- }
-
- err = models.DeleteJob(task)
- if err != nil {
- ctx.ServerError("DeleteJob failed", err)
- return
- }
-
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/notebook")
- }
-
- func TrainJobIndex(ctx *context.Context) {
- MustEnableModelArts(ctx)
-
- repo := ctx.Repo.Repository
- page := ctx.QueryInt("page")
- if page <= 0 {
- page = 1
- }
-
- tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
- ListOptions: models.ListOptions{
- Page: page,
- PageSize: setting.UI.IssuePagingNum,
- },
- RepoID: repo.ID,
- Type: models.TypeCloudBrainTwo,
- JobType: string(models.JobTypeTrain),
- IsLatestVersion: modelarts.IsLatestVersion,
- })
- if err != nil {
- ctx.ServerError("Cloudbrain", err)
- return
- }
-
- pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
- pager.SetDefaultParams(ctx)
- ctx.Data["Page"] = pager
-
- ctx.Data["PageIsCloudBrain"] = true
- ctx.Data["Tasks"] = tasks
- ctx.HTML(200, tplModelArtsTrainJobIndex)
- }
-
- func TrainJobNew(ctx *context.Context) {
- err := trainJobNewDataPrepare(ctx)
- if err != nil {
- ctx.ServerError("get new train-job info failed", err)
- return
- }
- ctx.HTML(200, tplModelArtsTrainJobNew)
- }
-
- func trainJobNewDataPrepare(ctx *context.Context) error {
- ctx.Data["PageIsCloudBrain"] = true
-
- //can, err := canUserCreateTrainJob(ctx.User.ID)
- //if err != nil {
- // ctx.ServerError("canUserCreateTrainJob", err)
- // return
- //}
- //
- //if !can {
- // log.Error("the user can not create train-job")
- // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
- // return
- //}
-
- t := time.Now()
- var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
- ctx.Data["job_name"] = jobName
-
- attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
- if err != nil {
- ctx.ServerError("GetAllUserAttachments failed:", err)
- return err
- }
- ctx.Data["attachments"] = attachs
-
- var resourcePools modelarts.ResourcePool
- if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["resource_pools"] = resourcePools.Info
-
- var engines modelarts.Engine
- if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["engines"] = engines.Info
-
- var versionInfos modelarts.VersionInfo
- if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["engine_versions"] = versionInfos.Version
-
- var flavorInfos modelarts.Flavor
- if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["flavor_infos"] = flavorInfos.Info
-
- outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
- ctx.Data["train_url"] = outputObsPath
-
- Branches, err := ctx.Repo.GitRepo.GetBranches()
- if err != nil {
- ctx.ServerError("GetBranches error:", err)
- return err
- }
- ctx.Data["Branches"] = Branches
- ctx.Data["BranchesCount"] = len(Branches)
-
- configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
- if err != nil {
- ctx.ServerError("getConfigList failed:", err)
- return err
- }
- ctx.Data["config_list"] = configList.ParaConfigs
-
- return nil
- }
-
- func TrainJobNewVersion(ctx *context.Context) {
- err := trainJobNewVersionDataPrepare(ctx)
- if err != nil {
- ctx.ServerError("get new train-job info failed", err)
- return
- }
- ctx.HTML(200, tplModelArtsTrainJobVersionNew)
- }
-
- func trainJobNewVersionDataPrepare(ctx *context.Context) error {
- ctx.Data["PageIsCloudBrain"] = true
- var jobID = ctx.Params(":jobid")
- // var versionName = ctx.Params(":version-name")
- var versionName = ctx.Query("version_name")
-
- task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
- if err != nil {
- log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
- return err
- }
-
- t := time.Now()
- var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
- ctx.Data["job_name"] = task.JobName
-
- attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
- if err != nil {
- ctx.ServerError("GetAllUserAttachments failed:", err)
- return err
- }
- ctx.Data["attachments"] = attachs
-
- var resourcePools modelarts.ResourcePool
- if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["resource_pools"] = resourcePools.Info
-
- var engines modelarts.Engine
- if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["engines"] = engines.Info
-
- var versionInfos modelarts.VersionInfo
- if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["engine_versions"] = versionInfos.Version
-
- var flavorInfos modelarts.Flavor
- if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["flavor_infos"] = flavorInfos.Info
-
- var Parameters modelarts.Parameters
- if err = json.Unmarshal([]byte(task.Parameters), &Parameters); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["params"] = Parameters.Parameter
-
- outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
- ctx.Data["train_url"] = outputObsPath
-
- Branches, err := ctx.Repo.GitRepo.GetBranches()
- if err != nil {
- ctx.ServerError("GetBranches error:", err)
- return err
- }
- ctx.Data["branches"] = Branches
- ctx.Data["branch_name"] = task.BranchName
- ctx.Data["description"] = task.Description
- ctx.Data["boot_file"] = task.BootFile
- ctx.Data["dataset_name"] = task.DatasetName
- ctx.Data["work_server_number"] = task.WorkServerNumber
- ctx.Data["flavor_name"] = task.FlavorName
- ctx.Data["engine_name"] = task.EngineName
- ctx.Data["uuid"] = task.Uuid
- ctx.Data["flavor_code"] = task.FlavorCode
- ctx.Data["engine_id"] = task.EngineID
-
- configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
- if err != nil {
- ctx.ServerError("getConfigList failed:", err)
- return err
- }
- ctx.Data["config_list"] = configList.ParaConfigs
-
- return nil
- }
-
- func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
- ctx.Data["PageIsTrainJob"] = true
- VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(modelarts.TotalVersionCount)
- jobName := form.JobName
- uuid := form.Attachment
- description := form.Description
- workServerNumber := form.WorkServerNumber
- engineID := form.EngineID
- bootFile := form.BootFile
- flavorCode := form.Flavor
- params := form.Params
- poolID := form.PoolID
- isSaveParam := form.IsSaveParam
- repo := ctx.Repo.Repository
- codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
- codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
- outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
- logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
- dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
- branch_name := form.BranchName
- isLatestVersion := modelarts.IsLatestVersion
- FlavorName := form.FlavorName
- VersionCount := modelarts.VersionCount
- EngineName := form.EngineName
-
- if err := paramCheckCreateTrainJob(form); err != nil {
- log.Error("paramCheckCreateTrainJob failed:(%v)", err)
- trainJobNewDataPrepare(ctx)
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
- return
- }
-
- attach, err := models.GetAttachmentByUUID(uuid)
- if err != nil {
- log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
- return
- }
-
- //todo: del the codeLocalPath
- _, err = ioutil.ReadDir(codeLocalPath)
- if err == nil {
- os.RemoveAll(codeLocalPath)
- }
-
- gitRepo, _ := git.OpenRepository(repo.RepoPath())
- commitID, _ := gitRepo.GetBranchCommitID(branch_name)
-
- if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
- Branch: branch_name,
- }); err != nil {
- log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
- trainJobNewDataPrepare(ctx)
-
- ctx.Data["bootFile"] = form.BootFile
- ctx.Data["uuid"] = form.Attachment
- ctx.Data["datasetName"] = attach.Name
- ctx.Data["params"] = form.Params
- ctx.Data["branch_name"] = branch_name
- trainJobNewDataPrepare(ctx)
- ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobNew, &form)
- return
- }
-
- //todo: upload code (send to file_server todo this work?)
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
- trainJobNewDataPrepare(ctx)
- ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
- return
- }
-
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
- trainJobNewDataPrepare(ctx)
- ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
- return
- }
-
- if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
- log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
- trainJobNewDataPrepare(ctx)
- ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
- return
- }
-
- //todo: del local code?
-
- var parameters models.Parameters
- param := make([]models.Parameter, 0)
- param = append(param, models.Parameter{
- Label: modelarts.TrainUrl,
- Value: outputObsPath,
- }, models.Parameter{
- Label: modelarts.DataUrl,
- Value: dataPath,
- })
- if len(params) != 0 {
- err := json.Unmarshal([]byte(params), ¶meters)
- if err != nil {
- log.Error("Failed to Unmarshal params: %s (%v)", params, err)
- trainJobNewDataPrepare(ctx)
- ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
- return
- }
-
- for _, parameter := range parameters.Parameter {
- if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
- param = append(param, models.Parameter{
- Label: parameter.Label,
- Value: parameter.Value,
- })
- }
- }
- }
-
- //save param config
- if isSaveParam == "on" {
- if form.ParameterTemplateName == "" {
- log.Error("ParameterTemplateName is empty")
- trainJobNewDataPrepare(ctx)
- ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
- return
- }
-
- _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
- ConfigName: form.ParameterTemplateName,
- Description: form.PrameterDescription,
- DataUrl: dataPath,
- AppUrl: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- TrainUrl: outputObsPath,
- Flavor: models.Flavor{
- Code: flavorCode,
- },
- WorkServerNum: workServerNumber,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Parameter: param,
- })
-
- if err != nil {
- log.Error("Failed to CreateTrainJobConfig: %v", err)
- trainJobNewDataPrepare(ctx)
- ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
- return
- }
- }
-
- req := &modelarts.GenerateTrainJobReq{
- JobName: jobName,
- DataUrl: dataPath,
- Description: description,
- CodeObsPath: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- BootFile: bootFile,
- TrainUrl: outputObsPath,
- FlavorCode: flavorCode,
- WorkServerNumber: workServerNumber,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Uuid: uuid,
- Parameters: parameters.Parameter,
- CommitID: commitID,
- IsLatestVersion: isLatestVersion,
- BranchName: branch_name,
- Params: form.Params,
- FlavorName: FlavorName,
- EngineName: EngineName,
- VersionCount: VersionCount,
- TotalVersionCount: modelarts.TotalVersionCount,
- }
-
- err = modelarts.GenerateTrainJob(ctx, req)
- if err != nil {
- log.Error("GenerateTrainJob failed:%v", err.Error())
- trainJobNewDataPrepare(ctx)
- ctx.Data["bootFile"] = form.BootFile
- ctx.Data["uuid"] = form.Attachment
- ctx.Data["datasetName"] = attach.Name
- ctx.Data["params"] = form.Params
- ctx.Data["branch_name"] = branch_name
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
- return
- }
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
- }
-
- func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
- ctx.Data["PageIsTrainJob"] = true
- var jobID = ctx.Params(":jobid")
-
- latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(jobID, modelarts.IsLatestVersion)
- if err != nil {
- ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
- return
- }
- VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(latestTask.TotalVersionCount + 1)
-
- jobName := form.JobName
- uuid := form.Attachment
- description := form.Description
- workServerNumber := form.WorkServerNumber
- engineID := form.EngineID
- bootFile := form.BootFile
- flavorCode := form.Flavor
- params := form.Params
- poolID := form.PoolID
- isSaveParam := form.IsSaveParam
- repo := ctx.Repo.Repository
- codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
- codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
- outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
- logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
- dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
- branch_name := form.BranchName
- PreVersionName := form.VersionName
- FlavorName := form.FlavorName
- EngineName := form.EngineName
- isLatestVersion := modelarts.IsLatestVersion
-
- if err := paramCheckCreateTrainJob(form); err != nil {
- log.Error("paramCheckCreateTrainJob failed:(%v)", err)
- trainJobNewVersionDataPrepare(ctx)
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- attach, err := models.GetAttachmentByUUID(uuid)
- if err != nil {
- log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
- return
- }
-
- //todo: del the codeLocalPath
- _, err = ioutil.ReadDir(codeLocalPath)
- if err == nil {
- os.RemoveAll(codeLocalPath)
- }
-
- gitRepo, _ := git.OpenRepository(repo.RepoPath())
- commitID, _ := gitRepo.GetBranchCommitID(branch_name)
- if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
- Branch: branch_name,
- }); err != nil {
- log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
- trainJobNewVersionDataPrepare(ctx)
-
- ctx.Data["bootFile"] = form.BootFile
- ctx.Data["uuid"] = form.Attachment
- ctx.Data["datasetName"] = attach.Name
- ctx.Data["params"] = form.Params
- ctx.Data["branch_name"] = branch_name
- ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- //todo: upload code (send to file_server todo this work?)
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
- trainJobNewVersionDataPrepare(ctx)
- ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
- trainJobNewVersionDataPrepare(ctx)
- ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
- log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
- trainJobNewVersionDataPrepare(ctx)
- ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- //todo: del local code?
-
- var parameters models.Parameters
- param := make([]models.Parameter, 0)
- param = append(param, models.Parameter{
- Label: modelarts.TrainUrl,
- Value: outputObsPath,
- }, models.Parameter{
- Label: modelarts.DataUrl,
- Value: dataPath,
- })
- if len(params) != 0 {
- err := json.Unmarshal([]byte(params), ¶meters)
- if err != nil {
- log.Error("Failed to Unmarshal params: %s (%v)", params, err)
- trainJobNewVersionDataPrepare(ctx)
- ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- for _, parameter := range parameters.Parameter {
- if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
- param = append(param, models.Parameter{
- Label: parameter.Label,
- Value: parameter.Value,
- })
- }
- }
- }
-
- //save param config
- if isSaveParam == "on" {
- if form.ParameterTemplateName == "" {
- log.Error("ParameterTemplateName is empty")
- trainJobNewVersionDataPrepare(ctx)
- ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
- ConfigName: form.ParameterTemplateName,
- Description: form.PrameterDescription,
- DataUrl: dataPath,
- AppUrl: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- TrainUrl: outputObsPath,
- Flavor: models.Flavor{
- Code: flavorCode,
- },
- WorkServerNum: workServerNumber,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Parameter: parameters.Parameter,
- })
-
- if err != nil {
- log.Error("Failed to CreateTrainJobConfig: %v", err)
- trainJobNewVersionDataPrepare(ctx)
- ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
- return
- }
- }
-
- if err != nil {
- log.Error("getFlavorNameByEngineID(%s) failed:%v", engineID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, PreVersionName)
- if err != nil {
- log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
- return
- }
- req := &modelarts.GenerateTrainJobReq{
- JobName: task.JobName,
- DataUrl: dataPath,
- Description: description,
- CodeObsPath: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- BootFile: bootFile,
- TrainUrl: outputObsPath,
- FlavorCode: flavorCode,
- WorkServerNumber: workServerNumber,
- IsLatestVersion: isLatestVersion,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Uuid: uuid,
- Params: form.Params,
- Parameters: parameters.Parameter,
- PreVersionId: task.VersionID,
- CommitID: commitID,
- BranchName: branch_name,
- FlavorName: FlavorName,
- EngineName: EngineName,
- PreVersionName: PreVersionName,
- TotalVersionCount: latestTask.TotalVersionCount + 1,
- }
-
- err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
- if err != nil {
- log.Error("GenerateTrainJob failed:%v", err.Error())
- trainJobNewVersionDataPrepare(ctx)
- ctx.Data["bootFile"] = form.BootFile
- ctx.Data["uuid"] = form.Attachment
- ctx.Data["datasetName"] = attach.Name
- ctx.Data["params"] = form.Params
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
- return
- }
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
- // ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
- }
-
// readDir returns every entry of the directory named by dirname, in the
// order the operating system reports them (the result is NOT sorted).
//
// An empty directory yields a nil slice and a nil error. Any error from
// opening or reading the directory is returned unchanged.
func readDir(dirname string) ([]os.FileInfo, error) {
	f, err := os.Open(dirname)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// A non-positive count makes Readdir return all entries in one slice; a
	// fixed limit would silently drop the rest of a larger directory.
	list, err := f.Readdir(-1)
	if err != nil {
		// Defensive: end-of-directory means "no entries", not a failure.
		if err == io.EOF {
			return nil, nil
		}
		return nil, err
	}
	if len(list) == 0 {
		return nil, nil
	}

	return list, nil
}
-
- func uploadCodeToObs(codePath, jobName, parentDir string) error {
- files, err := readDir(codePath)
- if err != nil {
- log.Error("readDir(%s) failed: %s", codePath, err.Error())
- return err
- }
-
- for _, file := range files {
- if file.IsDir() {
- input := &obs.PutObjectInput{}
- input.Bucket = setting.Bucket
- input.Key = parentDir + file.Name() + "/"
- _, err = storage.ObsCli.PutObject(input)
- if err != nil {
- log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
- return err
- }
-
- if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
- log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
- return err
- }
- } else {
- input := &obs.PutFileInput{}
- input.Bucket = setting.Bucket
- input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
- input.SourceFile = codePath + file.Name()
- _, err = storage.ObsCli.PutFile(input)
- if err != nil {
- log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
- return err
- }
- }
- }
-
- return nil
- }
-
- func obsMkdir(dir string) error {
- input := &obs.PutObjectInput{}
- input.Bucket = setting.Bucket
- input.Key = dir
- _, err := storage.ObsCli.PutObject(input)
- if err != nil {
- log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
- return err
- }
-
- return nil
- }
-
- func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
- if !strings.HasSuffix(form.BootFile, ".py") {
- log.Error("the boot file(%s) must be a python file", form.BootFile)
- return errors.New("启动文件必须是python文件")
- }
-
- if form.WorkServerNumber > 25 || form.WorkServerNumber < 1 {
- log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber)
- return errors.New("计算节点数必须在1-25之间")
- }
-
- return nil
- }
-
- func TrainJobShow(ctx *context.Context) {
- ctx.Data["PageIsCloudBrain"] = true
-
- var jobID = ctx.Params(":jobid")
-
- repo := ctx.Repo.Repository
- page := ctx.QueryInt("page")
- if page <= 0 {
- page = 1
- }
- VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
- ListOptions: models.ListOptions{
- Page: page,
- PageSize: setting.UI.IssuePagingNum,
- },
- RepoID: repo.ID,
- Type: models.TypeCloudBrainTwo,
- JobType: string(models.JobTypeTrain),
- JobID: jobID,
- })
-
- if err != nil {
- log.Error("GetVersionListTasks(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
- return
- }
- //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
- for i, _ := range VersionListTasks {
-
- var parameters models.Parameters
-
- err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), ¶meters)
- if err != nil {
- log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err)
- trainJobNewDataPrepare(ctx)
- return
- }
-
- if len(parameters.Parameter) > 0 {
- paramTemp := ""
- for _, Parameter := range parameters.Parameter {
- param := Parameter.Label + " = " + Parameter.Value + ", "
- paramTemp = paramTemp + param
- }
- VersionListTasks[i].Parameters = paramTemp[:len(paramTemp)-2]
- } else {
- VersionListTasks[i].Parameters = ""
- }
- }
-
- ctx.Data["jobID"] = jobID
- ctx.Data["jobName"] = VersionListTasks[0].JobName
- ctx.Data["version_list_task"] = VersionListTasks
- ctx.Data["version_list_count"] = VersionListCount
- ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
- }
-
- func TrainJobGetLog(ctx *context.Context) {
- ctx.Data["PageIsTrainJob"] = true
-
- var jobID = ctx.Params(":jobid")
- var logFileName = ctx.Query("file_name")
- var baseLine = ctx.Query("base_line")
- var order = ctx.Query("order")
-
- if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
- log.Error("order(%s) check failed", order)
- ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
- return
- }
-
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
- return
- }
-
- result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
- if err != nil {
- log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
- return
- }
-
- ctx.Data["log"] = result
- //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
- }
-
- func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
- return nil, nil, err
- }
-
- resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
- if err != nil {
- log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
- return nil, nil, err
- }
-
- result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, modelarts.Lines)
- if err != nil {
- log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
- return nil, nil, err
- }
-
- return resultLogFile, result, err
- }
-
- func TrainJobDel(ctx *context.Context) {
- var jobID = ctx.Params(":jobid")
- repo := ctx.Repo.Repository
-
- VersionListTasks, _, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
- RepoID: repo.ID,
- Type: models.TypeCloudBrainTwo,
- JobType: string(models.JobTypeTrain),
- JobID: jobID,
- })
- if err != nil {
- ctx.ServerError("get VersionListTasks failed", err)
- return
- }
- //删除数据库Cloudbrain表的记录
- for _, task := range VersionListTasks {
- err = models.DeleteJobVersion(&task.Cloudbrain)
- if err != nil {
- ctx.ServerError("DeleteJobVersion failed", err)
- return
- }
- }
- //删除modelarts上的任务记录
- _, err = modelarts.DelTrainJob(jobID)
- if err != nil {
- log.Error("DelTrainJob(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
- return
- }
-
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
- }
-
- func TrainJobStop(ctx *context.Context) {
- var jobID = ctx.Params(":jobid")
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
- return
- }
-
- _, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
- if err != nil {
- log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
- return
- }
-
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
- }
-
- func canUserCreateTrainJob(uid int64) (bool, error) {
- org, err := models.GetOrgByName(setting.AllowedOrg)
- if err != nil {
- log.Error("get allowed org failed: ", setting.AllowedOrg)
- return false, err
- }
-
- return org.IsOrgMember(uid)
- }
-
- func TrainJobGetConfigList(ctx *context.Context) {
- ctx.Data["PageIsTrainJob"] = true
-
- var jobID = ctx.Params(":jobid")
- var logFileName = ctx.Query("file_name")
- var baseLine = ctx.Query("base_line")
- var order = ctx.Query("order")
-
- if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
- log.Error("order(%s) check failed", order)
- ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
- return
- }
-
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
- return
- }
-
- result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
- if err != nil {
- log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
- return
- }
-
- ctx.Data["log"] = result
- //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
- }
-
- func getConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
- var result models.GetConfigListResult
-
- list, err := modelarts.GetConfigList(perPage, page, sortBy, order, searchContent, configType)
- if err != nil {
- log.Error("GetConfigList failed:", err)
- return &result, err
- }
-
- for _, config := range list.ParaConfigs {
- paraConfig, err := modelarts.GetParaConfig(config.ConfigName, configType)
- if err != nil {
- log.Error("GetParaConfig failed:", err)
- return &result, err
- }
-
- config.Result = paraConfig
- }
-
- return list, nil
- }
|