diff --git a/models/action.go b/models/action.go index 4b6f1dbad..869acb762 100755 --- a/models/action.go +++ b/models/action.go @@ -65,6 +65,8 @@ const ( ActionCreateImage //36 ActionImageRecommend //37 ActionChangeUserAvatar //38 + ActionCreateGrampusNPUDebugTask //39 + ActionCreateGrampusGPUDebugTask //40 ) // Action represents user operation type and other information to @@ -375,6 +377,8 @@ func (a *Action) IsCloudbrainAction() bool { ActionCreateInferenceTask, ActionCreateBenchMarkTask, ActionCreateGPUTrainTask, + ActionCreateGrampusGPUDebugTask, + ActionCreateGrampusNPUDebugTask, ActionCreateGrampusNPUTrainTask, ActionCreateGrampusGPUTrainTask: return true diff --git a/models/ai_model_manage.go b/models/ai_model_manage.go index 702cf0937..d55370ea1 100644 --- a/models/ai_model_manage.go +++ b/models/ai_model_manage.go @@ -403,6 +403,18 @@ func QueryModelByName(name string, repoId int64) []*AiModelManage { return aiModelManageList } +func QueryModelByPath(path string) (*AiModelManage, error) { + modelManage := new(AiModelManage) + has, err := x.Where("path=?", path).Get(modelManage) + if err != nil { + return nil, err + } + if !has { + return nil, ErrNotExist{} + } + return modelManage, nil +} + func QueryModel(opts *AiModelQueryOptions) ([]*AiModelManage, int64, error) { sess := x.NewSession() defer sess.Close() @@ -473,6 +485,12 @@ func QueryModel(opts *AiModelQueryOptions) ([]*AiModelManage, int64, error) { return aiModelManages, count, nil } +func QueryModelConvertCountByRepoID(repoId int64) int64 { + convert := new(AiModelConvert) + total, _ := x.Where("repo_id =?", repoId).Count(convert) + return total +} + func QueryModelConvertByRepoID(repoId int64) ([]*AiModelConvert, error) { sess := x.NewSession() defer sess.Close() diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 76463df40..37965c73a 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -114,6 +114,7 @@ const ( GrampusStatusFailed = "FAILED" GrampusStatusSucceeded = "SUCCEEDED" GrampusStatusStopped = "STOPPED" + GrampusStatusStopping = "STOPPING" GrampusStatusUnknown = "UNKNOWN" GrampusStatusWaiting = "WAITING" @@ -181,7 +182,7 @@ type Cloudbrain struct { BranchName string //分支名称 Parameters string //传给modelarts的param参数 BootFile string //启动文件 - DataUrl string //数据集的obs路径 + DataUrl string `xorm:"varchar(3500)"` //数据集的obs路径 LogUrl string //日志输出的obs路径 PreVersionId int64 //父版本的版本id FlavorCode string //modelarts上的规格id @@ -298,6 +299,12 @@ func (task *Cloudbrain) IsUserHasRight(user *User) bool { } return user.IsAdmin || user.ID == task.UserID } +func (task *Cloudbrain) IsGPUTask() bool { + return task.ComputeResource == GPUResource +} +func (task *Cloudbrain) IsNPUTask() bool { + return task.ComputeResource == NPUResource +} func ConvertDurationToStr(duration int64) string { if duration <= 0 { @@ -1215,6 +1222,13 @@ type DatasetDownload struct { IsDelete bool `json:"is_delete"` } +type ModelDownload struct { + Name string `json:"name"` + DownloadLink string `json:"download_link"` + RepositoryLink string `json:"repository_link"` + IsDelete bool `json:"is_delete"` +} + type DataSource struct { DatasetID string `json:"dataset_id"` DatasetVersion string `json:"dataset_version"` @@ -1458,6 +1472,20 @@ type GrampusJobInfo struct { UserID string `json:"userId"` Tasks []GrampusTasks `json:"tasks"` } + +type GrampusNotebookInfo struct { + StartedAt int64 `json:"startedAt"` + RunSec int64 `json:"runSec"` + CompletedAt int64 `json:"completedAt"` + CreatedAt int64 `json:"createdAt"` + UpdatedAt int64 
`json:"updatedAt"` + Desc string `json:"desc"` + JobID string `json:"id"` + Name string `json:"name"` + Status string `json:"status"` + UserID string `json:"userId"` + Tasks []GrampusNotebookTask `json:"tasks"` +} type Center struct { ID string `json:"id"` Name string `json:"name"` @@ -1534,9 +1562,22 @@ type GetGrampusJobResponse struct { JobInfo GrampusJobInfo `json:"otJob"` } +type GrampusNotebookResponse struct { + GrampusResult + JobInfo GrampusNotebookInfo `json:"otJob"` +} + +type GrampusNotebookRestartResponse struct { + GrampusResult + NewId string `json:"newId"` + Status string `json:"status"` +} + type GrampusStopJobResponse struct { GrampusResult - StoppedAt int64 `json:"stoppedAt"` + StoppedAt int64 `json:"stoppedAt"` + ID string `json:"id"` + Status string `json:"status"` } type GrampusTasks struct { @@ -1553,12 +1594,32 @@ type GrampusTasks struct { Code GrampusDataset `json:"code"` BootFile string `json:"bootFile"` } +type GrampusNotebookTask struct { + AutoStopDuration int `json:"autoStopDuration"` + Name string `json:"name"` + Capacity int `json:"capacity"` + CenterID []string `json:"centerID"` + CenterName []string `json:"centerName"` + Code GrampusDataset `json:"code"` + Datasets []GrampusDataset `json:"datasets"` + CodeUrl string `json:"codeUrl"` + DataUrl string `json:"dataUrl"` + ImageId string `json:"imageId"` + ImageUrl string `json:"imageUrl"` + ResourceSpecId string `json:"resourceSpecId"` + Token string `json:"token"` + Url string `json:"url"` + Status string `json:"status"` + Command string `json:"command"` +} type GrampusDataset struct { - Name string `json:"name"` - Bucket string `json:"bucket"` - EndPoint string `json:"endPoint"` - ObjectKey string `json:"objectKey"` + Name string `json:"name"` + Bucket string `json:"bucket"` + EndPoint string `json:"endPoint"` + ObjectKey string `json:"objectKey"` + ContainerPath string `json:"containerPath"` + ReadOnly bool `json:"readOnly"` } type CreateGrampusJobRequest struct { @@ -1566,6 +1627,11 @@ type CreateGrampusJobRequest struct { Tasks []GrampusTasks `json:"tasks"` } +type CreateGrampusNotebookRequest struct { + Name string `json:"name"` + Tasks []GrampusNotebookTask `json:"tasks"` +} + type GetTrainJobMetricStatisticResult struct { TrainJobResult Interval int `json:"interval"` //查询的时间间隔,单位为分钟 diff --git a/models/cloudbrain_static.go b/models/cloudbrain_static.go index 40d7a2a2e..beb1ceee5 100644 --- a/models/cloudbrain_static.go +++ b/models/cloudbrain_static.go @@ -92,6 +92,17 @@ type HourTimeStatistic struct { HourTimeTotalDuration map[string]int `json:"hourTimeTotalDuration"` HourTimeUsageRate map[string]float64 `json:"hourTimeUsageRate"` } +type CloudbrainTypeDuration []struct { + Type int `xorm:"type"` + DurationSum int `xorm:"durationSum"` + CardDurationSum int `xorm:"cardDurationSum"` + Count int `xorm:"count"` +} +type CloudbrainAllDuration struct { + DurationSum int `xorm:"durationSum"` + CardDurationSum int `xorm:"cardDurationSum"` + Count int `xorm:"count"` +} func GetTodayCreatorCount(beginTime time.Time, endTime time.Time) (int64, error) { countSql := "SELECT count(distinct user_id) FROM " + @@ -303,7 +314,7 @@ func GetCloudbrainByTime(beginTime int64, endTime int64) ([]*CloudbrainInfo, err builder.And(builder.Gte{"cloudbrain.start_time": beginTime}, builder.Lte{"cloudbrain.start_time": endTime}, builder.Gt{"cloudbrain.start_time": 0}), ) cond = cond.Or( - builder.And(builder.Eq{"cloudbrain.status": string(JobRunning)}), + builder.And(builder.Eq{"cloudbrain.status": string(JobRunning)}, 
builder.Lte{"cloudbrain.start_time": beginTime}), ) sess.OrderBy("cloudbrain.id ASC") cloudbrains := make([]*CloudbrainInfo, 0, 10) @@ -425,3 +436,55 @@ func DeleteCloudbrainDurationStatistic(beginTime timeutil.TimeStamp, endTime tim } return nil } + +func GetCloudbrainTypeCardDuration() (CloudbrainTypeDuration, error) { + query := ` + SELECT + cloudbrain.type, + SUM(cloudbrain.duration) as durationSum, + SUM( + COALESCE(cloudbrain.duration * + CASE + WHEN cloudbrain.work_server_number = 0 THEN 1 + ELSE COALESCE(cloudbrain.work_server_number, 1) + END * + COALESCE(cloudbrain_spec.acc_cards_num, 1), 0) + ) as cardDurationSum, + COUNT(*) as count + FROM cloudbrain + LEFT JOIN cloudbrain_spec + ON cloudbrain.id = cloudbrain_spec.cloudbrain_id + GROUP BY cloudbrain.type + ` + // 执行查询 + var results CloudbrainTypeDuration + if err := x.SQL(query).Find(&results); err != nil { + panic(err) + } + return results, nil +} + +func GetCloudbrainAllCardDuration() (CloudbrainAllDuration, error) { + query := ` + SELECT + SUM(cloudbrain.duration) as durationSum, + SUM( + COALESCE(cloudbrain.duration * + CASE + WHEN cloudbrain.work_server_number = 0 THEN 1 + ELSE COALESCE(cloudbrain.work_server_number, 1) + END * + COALESCE(cloudbrain_spec.acc_cards_num, 1), 0) + ) as cardDurationSum, + COUNT(*) as count + FROM cloudbrain + LEFT JOIN cloudbrain_spec + ON cloudbrain.id = cloudbrain_spec.cloudbrain_id + ` + // 执行查询 + var result CloudbrainAllDuration + if _, err := x.SQL(query).Get(&result); err != nil { + panic(err) + } + return result, nil +} diff --git a/models/repo_statistic.go b/models/repo_statistic.go index ecdd77e57..b99b7c259 100755 --- a/models/repo_statistic.go +++ b/models/repo_statistic.go @@ -36,7 +36,7 @@ type RepoStatistic struct { NumDevMonths int64 `xorm:"NOT NULL DEFAULT 0" json:"-"` RepoSize int64 `xorm:"NOT NULL DEFAULT 0" json:"-"` DatasetSize int64 `xorm:"NOT NULL DEFAULT 0" json:"-"` - NumModels int64 `xorm:"NOT NULL DEFAULT 0" json:"-"` + NumModels int64 `xorm:"NOT NULL DEFAULT 0" json:"model"` NumWikiViews int64 `xorm:"NOT NULL DEFAULT 0" json:"-"` NumCommits int64 `xorm:"NOT NULL DEFAULT 0" json:"commit"` NumCommitsAdded int64 `xorm:"NOT NULL DEFAULT 0" json:"-"` @@ -55,6 +55,15 @@ type RepoStatistic struct { NumIssuesGrowth int64 `xorm:"NOT NULL DEFAULT 0" json:"-"` NumCommentsGrowth int64 `xorm:"NOT NULL DEFAULT 0" json:"-"` + NumDatasetFile int64 `xorm:"NOT NULL DEFAULT 0" json:"datasetFiles"` + NumCloudbrain int64 `xorm:"NOT NULL DEFAULT 0" json:"cloudbrains"` + NumModelConvert int64 `xorm:"NOT NULL DEFAULT 0" json:"modelConverts"` + + NumDatasetFileAdded int64 `xorm:"NOT NULL DEFAULT 0" json:"-"` + NumCloudbrainAdded int64 `xorm:"NOT NULL DEFAULT 0" json:"-"` + NumModelConvertAdded int64 `xorm:"NOT NULL DEFAULT 0" json:"- "` + NumModelsAdded int64 `xorm:"NOT NULL DEFAULT 0" json:"- "` + Impact float64 `xorm:"NOT NULL DEFAULT 0" json:"impact"` Completeness float64 `xorm:"NOT NULL DEFAULT 0" json:"completeness"` Liveness float64 `xorm:"NOT NULL DEFAULT 0" json:"liveness"` diff --git a/models/task_config.go b/models/task_config.go index 0d9d21187..f86032fc9 100644 --- a/models/task_config.go +++ b/models/task_config.go @@ -36,6 +36,8 @@ func GetTaskTypeFromAction(a ActionType) TaskType { ActionCreateInferenceTask, ActionCreateBenchMarkTask, ActionCreateGPUTrainTask, + ActionCreateGrampusGPUDebugTask, + ActionCreateGrampusNPUDebugTask, ActionCreateGrampusNPUTrainTask, ActionCreateGrampusGPUTrainTask: return TaskCreateCloudbrainTask diff --git a/modules/auth/grampus.go 
b/modules/auth/grampus.go index 414a7c25d..f8a238124 100755 --- a/modules/auth/grampus.go +++ b/modules/auth/grampus.go @@ -29,3 +29,24 @@ type CreateGrampusTrainJobForm struct { func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { return validate(errs, ctx.Data, f, ctx.Locale) } + +type CreateGrampusNotebookForm struct { + Type int `form:"type"` + DisplayJobName string `form:"display_job_name" binding:"Required"` + Attachment string `form:"attachment"` + ImageID string `form:"image_id" binding:"Required"` + Description string `form:"description"` + BranchName string `form:"branch_name" binding:"Required"` + Image string `form:"image" binding:"Required"` + DatasetName string `form:"dataset_name"` + ModelName string `form:"model_name"` + ModelVersion string `form:"model_version"` + CkptName string `form:"ckpt_name"` + LabelName string `form:"label_names"` + PreTrainModelUrl string `form:"pre_train_model_url"` + SpecId int64 `form:"spec_id" binding:"Required"` +} + +func (f *CreateGrampusNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { + return validate(errs, ctx.Data, f, ctx.Locale) +} diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index 0221c51d8..0061648ce 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -16,13 +16,18 @@ func (f *CreateModelArtsForm) Validate(ctx *macaron.Context, errs binding.Errors } type CreateModelArtsNotebookForm struct { - DisplayJobName string `form:"display_job_name" binding:"Required"` - JobName string `form:"job_name" binding:"Required"` - Attachment string `form:"attachment"` - Description string `form:"description"` - Flavor string `form:"flavor" binding:"Required"` - ImageId string `form:"image_id" binding:"Required"` - SpecId int64 `form:"spec_id" binding:"Required"` + DisplayJobName string `form:"display_job_name" binding:"Required"` + JobName string `form:"job_name" binding:"Required"` + Attachment string `form:"attachment"` + Description string `form:"description"` + Flavor string `form:"flavor" binding:"Required"` + ImageId string `form:"image_id" binding:"Required"` + ModelName string `form:"model_name"` + ModelVersion string `form:"model_version"` + CkptName string `form:"ckpt_name"` + LabelName string `form:"label_names"` + PreTrainModelUrl string `form:"pre_train_model_url"` + SpecId int64 `form:"spec_id" binding:"Required"` } func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 6111cf460..c85f4b8cd 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -5,6 +5,7 @@ import ( "errors" "os" "strconv" + "strings" "code.gitea.io/gitea/modules/timeutil" @@ -145,7 +146,7 @@ func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) { var id = ctx.Params(":id") - job, err := GetCloudBrainByIdOrJobId(id) + job, err := GetCloudBrainByIdOrJobId(id, "id") if err != nil { log.Error("GetCloudbrainByID failed:%v", err.Error()) ctx.NotFound(ctx.Req.URL.RequestURI(), nil) @@ -161,7 +162,7 @@ func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) { func AdminOrJobCreaterRight(ctx *context.Context) { var id = ctx.Params(":id") - job, err := GetCloudBrainByIdOrJobId(id) + job, err := GetCloudBrainByIdOrJobId(id, "id") if err != nil { log.Error("GetCloudbrainByID failed:%v", err.Error()) 
ctx.NotFound(ctx.Req.URL.RequestURI(), nil) @@ -177,7 +178,7 @@ func AdminOrJobCreaterRight(ctx *context.Context) { func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) { var jobID = ctx.Params(":jobid") - job, err := GetCloudBrainByIdOrJobId(jobID) + job, err := GetCloudBrainByIdOrJobId(jobID, "jobid") if err != nil { log.Error("GetCloudbrainByJobID failed:%v", err.Error()) ctx.NotFound(ctx.Req.URL.RequestURI(), nil) @@ -193,7 +194,7 @@ func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) { func AdminOrJobCreaterRightForTrain(ctx *context.Context) { var jobID = ctx.Params(":jobid") - job, err := GetCloudBrainByIdOrJobId(jobID) + job, err := GetCloudBrainByIdOrJobId(jobID, "jobid") if err != nil { log.Error("GetCloudbrainByJobID failed:%v", err.Error()) ctx.NotFound(ctx.Req.URL.RequestURI(), nil) @@ -490,6 +491,21 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e } } + if task.PreTrainModelUrl != "" { //预训练 + _, err := models.QueryModelByPath(task.PreTrainModelUrl) + if err != nil { + log.Warn("The model may be deleted", err) + } else { + volumes = append(volumes, models.Volume{ + HostPath: models.StHostPath{ + Path: setting.Attachment.Minio.RealPath + task.PreTrainModelUrl, + MountPath: PretrainModelMountPath, + ReadOnly: true, + }, + }) + } + } + createTime := timeutil.TimeStampNow() jobResult, err := CreateJob(jobName, models.CreateJobParams{ JobName: jobName, @@ -540,10 +556,16 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e GpuQueue: task.GpuQueue, ResourceSpecId: task.ResourceSpecId, ComputeResource: task.ComputeResource, - CreatedUnix: createTime, - UpdatedUnix: createTime, - BranchName: task.BranchName, - Spec: spec, + + CreatedUnix: createTime, + UpdatedUnix: createTime, + BranchName: task.BranchName, + Spec: spec, + ModelName: task.ModelName, + ModelVersion: task.ModelVersion, + LabelName: task.LabelName, + PreTrainModelUrl: task.PreTrainModelUrl, + CkptName: task.CkptName, } err = models.RestartCloudbrain(task, newTask) @@ -653,18 +675,45 @@ func IsElementExist(s []string, str string) bool { return false } -func GetCloudBrainByIdOrJobId(id string) (*models.Cloudbrain,error) { +func GetCloudBrainByIdOrJobId(id string, initialQuery string) (*models.Cloudbrain, error) { _, err := strconv.ParseInt(id, 10, 64) var job *models.Cloudbrain if err != nil { job, err = models.GetCloudbrainByJobID(id) } else { - job, err = models.GetCloudbrainByID(id) - if err!=nil{ + + if strings.EqualFold(initialQuery, "id") { + job, err = models.GetCloudbrainByID(id) + if err != nil { + job, err = models.GetCloudbrainByJobID(id) + } + } else { job, err = models.GetCloudbrainByJobID(id) + if err != nil { + job, err = models.GetCloudbrainByID(id) + } } } - return job,err + return job, err +} + +type GenerateModelArtsNotebookReq struct { + JobName string + DisplayJobName string + Uuid string + Description string + + BootFile string + + ImageId string + AutoStopDurationMs int64 + + Spec *models.Specification + ModelName string + LabelName string + CkptName string + ModelVersion string + PreTrainModelUrl string } diff --git a/modules/cron/tasks_basic.go b/modules/cron/tasks_basic.go index 6a1fc6e39..5907a3418 100755 --- a/modules/cron/tasks_basic.go +++ b/modules/cron/tasks_basic.go @@ -5,10 +5,11 @@ package cron import ( - "code.gitea.io/gitea/modules/setting" "context" "time" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/urfs_client/urchin" cloudbrainService 
"code.gitea.io/gitea/services/cloudbrain" @@ -296,7 +297,7 @@ func registerHandleCloudbrainDurationStatistic() { RegisterTaskFatal("handle_cloudbrain_duration_statistic", &BaseConfig{ Enabled: true, RunAtStart: false, - Schedule: "1 0 * * * ?", + Schedule: "1 1 * * * ?", }, func(ctx context.Context, _ *models.User, _ Config) error { repo.CloudbrainDurationStatisticHour() return nil diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 34d7d3fe0..37e6fc1bf 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -1,7 +1,8 @@ package grampus import ( - "encoding/json" + "fmt" + "strconv" "strings" "code.gitea.io/gitea/models" @@ -26,8 +27,10 @@ const ( CodeArchiveName = "master.zip" - BucketRemote = "grampus" - RemoteModelPath = "/output/" + models.ModelSuffix + BucketRemote = "grampus" + RemoteModelPath = "/output/" + models.ModelSuffix + autoStopDurationMs = 4 * 60 * 60 * 1000 + CommandGpuDebug = "mkdir -p /dataset;%s! [ -x \"$(command -v jupyter)\" ] && pip install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;jupyter lab --ServerApp.shutdown_no_activity_timeout=%s --TerminalManager.cull_inactive_timeout=%s --TerminalManager.cull_interval=%s --MappingKernelManager.cull_idle_timeout=%s --MappingKernelManager.cull_interval=%s --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir='/code' --port=$OCTOPUS_NOTEBOOK_PORT --LabApp.token='' --LabApp.allow_origin='*' --LabApp.base_url=$OCTOPUS_NOTEBOOK_BASE_URL;" ) var ( @@ -37,7 +40,7 @@ var ( SpecialPools *models.SpecialPools - CommandPrepareScriptGpu = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget -q https://openi.pcl.ac.cn/OpenIOSSG/%s/archive/master.zip;" + + CommandPrepareScriptGpu = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/%s/archive/master.zip;" + "echo \"finish loading script\";unzip -q master.zip;cd %s;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;" ) @@ -81,6 +84,32 @@ type GenerateTrainJobReq struct { CodeName string } +type GenerateNotebookJobReq struct { + JobName string + Command string + ImageUrl string + ImageId string + DisplayJobName string + Uuid string + Description string + CodeStoragePath string + CommitID string + BranchName string + ComputeResource string + ProcessType string + DatasetNames string + DatasetInfos map[string]models.DatasetInfo + ModelName string + LabelName string + CkptName string + ModelVersion string + PreTrainModelPath string + PreTrainModelUrl string + Spec *models.Specification + CodeName string + ModelPath string //参考启智GPU调试, 挂载/model目录用户的模型可以输出到这个目录 +} + func getEndPoint() string { index := strings.Index(setting.Endpoint, "//") endpoint := setting.Endpoint[index+2:] @@ -101,6 +130,151 @@ func getDatasetGrampus(datasetInfos map[string]models.DatasetInfo) []models.Gram } return datasetGrampus } +func getDatasetGPUGrampus(datasetInfos map[string]models.DatasetInfo) ([]models.GrampusDataset, string) { + var datasetGrampus []models.GrampusDataset + var command = "" + for uuid, datasetInfo := range datasetInfos { + datasetGrampus = append(datasetGrampus, models.GrampusDataset{ + Name: datasetInfo.FullName, + Bucket: setting.Attachment.Minio.Bucket, + EndPoint: setting.Attachment.Minio.Endpoint, + ObjectKey: datasetInfo.DataLocalPath, + ReadOnly: true, + ContainerPath: 
"/dataset1/" + datasetInfo.Name, + }) + + command += "cp /dataset1/'" + datasetInfo.Name + "'/" + uuid + " /dataset/'" + datasetInfo.FullName + "';" + + } + return datasetGrampus, command +} + +func GenerateNotebookJob(ctx *context.Context, req *GenerateNotebookJobReq) (jobId string, err error) { + createTime := timeutil.TimeStampNow() + + var datasetGrampus []models.GrampusDataset + var codeGrampus models.GrampusDataset + var cpCommand string + imageUrl := req.ImageUrl + if ProcessorTypeNPU == req.ProcessType { + datasetGrampus = getDatasetGrampus(req.DatasetInfos) + if len(req.ModelName) != 0 { + datasetGrampus = append(datasetGrampus, models.GrampusDataset{ + Name: req.ModelName, + Bucket: setting.Bucket, + EndPoint: getEndPoint(), + ReadOnly: true, + ObjectKey: req.PreTrainModelPath, + }) + } + + codeGrampus = models.GrampusDataset{ + Name: req.CodeName, + Bucket: setting.Bucket, + EndPoint: getEndPoint(), + ObjectKey: req.CodeStoragePath + cloudbrain.DefaultBranchName + ".zip", + ReadOnly: false, + } + imageUrl = "" + req.Command = "" + } else { + datasetGrampus, cpCommand = getDatasetGPUGrampus(req.DatasetInfos) + if len(req.ModelName) != 0 { + datasetGrampus = append(datasetGrampus, models.GrampusDataset{ + Name: req.ModelName, + Bucket: setting.Attachment.Minio.Bucket, + EndPoint: setting.Attachment.Minio.Endpoint, + ObjectKey: req.PreTrainModelPath, + ReadOnly: true, + ContainerPath: cloudbrain.PretrainModelMountPath, + }) + } + codeGrampus = models.GrampusDataset{ + Name: req.CodeName, + Bucket: setting.Attachment.Minio.Bucket, + EndPoint: setting.Attachment.Minio.Endpoint, + ObjectKey: req.CodeStoragePath + cloudbrain.DefaultBranchName + ".zip", + ReadOnly: false, + ContainerPath: cloudbrain.CodeMountPath, + } + req.Command = fmt.Sprintf(CommandGpuDebug, cpCommand, setting.CullIdleTimeout, setting.CullIdleTimeout, setting.CullInterval, setting.CullIdleTimeout, setting.CullInterval) + log.Info("debug command:" + req.Command) + + } + + jobResult, err := createNotebookJob(models.CreateGrampusNotebookRequest{ + Name: req.JobName, + Tasks: []models.GrampusNotebookTask{ + { + Name: req.JobName, + ResourceSpecId: req.Spec.SourceSpecId, + ImageId: req.ImageId, + ImageUrl: imageUrl, + Datasets: datasetGrampus, + Code: codeGrampus, + AutoStopDuration: autoStopDurationMs, + Capacity: setting.Capacity, + Command: req.Command, + }, + }, + }) + if err != nil { + log.Error("createNotebookJob failed: %v", err.Error()) + return "", err + } + + jobID := jobResult.JobInfo.JobID + err = models.CreateCloudbrain(&models.Cloudbrain{ + Status: TransTrainJobStatus(jobResult.JobInfo.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobID, + JobName: req.JobName, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeDebug), + Type: models.TypeC2Net, + Uuid: req.Uuid, + DatasetName: req.DatasetNames, + CommitID: req.CommitID, + IsLatestVersion: "1", + ComputeResource: req.ComputeResource, + ImageID: req.ImageId, + BranchName: req.BranchName, + Description: req.Description, + WorkServerNumber: 1, + EngineName: req.ImageUrl, + CreatedUnix: createTime, + UpdatedUnix: createTime, + Spec: req.Spec, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainModelUrl: req.PreTrainModelUrl, + CkptName: req.CkptName, + }) + + if err != nil { + log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) + return "", err + } + + var actionType models.ActionType + if req.ComputeResource == models.NPUResource { + 
actionType = models.ActionCreateGrampusNPUDebugTask + } else if req.ComputeResource == models.GPUResource { + actionType = models.ActionCreateGrampusGPUDebugTask + } + task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID failed: %v", err.Error()) + return "", err + } + + stringId := strconv.FormatInt(task.ID, 10) + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, req.DisplayJobName, actionType) + + return jobID, nil +} func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) { createTime := timeutil.TimeStampNow() @@ -269,11 +443,6 @@ func TransTrainJobStatus(status string) string { return strings.ToUpper(status) } -func InitSpecialPool() { - if SpecialPools == nil && setting.Grampus.SpecialPools != "" { - json.Unmarshal([]byte(setting.Grampus.SpecialPools), &SpecialPools) - } -} func GetNpuModelRemoteObsUrl(jobName string) string { return "s3:///" + BucketRemote + "/" + GetNpuModelObjectKey(jobName) diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index a9e1aed5c..a0d5384e2 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -26,6 +26,7 @@ const ( urlGetResourceSpecs = urlOpenApiV1 + "resourcespec" urlGetAiCenter = urlOpenApiV1 + "sharescreen/aicenter" urlGetImages = urlOpenApiV1 + "image" + urlNotebookJob = urlOpenApiV1 + "notebook" errorIllegalToken = 1005 ) @@ -87,6 +88,39 @@ func getToken() error { return nil } +func createNotebookJob(req models.CreateGrampusNotebookRequest) (*models.GrampusNotebookResponse, error) { + checkSetting() + client := getRestyClient() + var result models.GrampusNotebookResponse + + retry := 0 + +sendjob: + _, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(req). + SetResult(&result). + Post(HOST + urlNotebookJob) + + if err != nil { + return nil, fmt.Errorf("resty CreateNotebookJob: %s", err) + } + + if result.ErrorCode == errorIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if result.ErrorCode != 0 { + log.Error("CreateNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("CreateNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + func createJob(req models.CreateGrampusJobRequest) (*models.CreateGrampusJobResponse, error) { checkSetting() client := getRestyClient() @@ -120,6 +154,38 @@ sendjob: return &result, nil } +func GetNotebookJob(jobID string) (*models.GrampusNotebookResponse, error) { + checkSetting() + client := getRestyClient() + var result models.GrampusNotebookResponse + + retry := 0 + +sendjob: + _, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Get(HOST + urlNotebookJob + "/" + jobID) + + if err != nil { + return nil, fmt.Errorf("resty GetNotebookJob: %v", err) + } + + if result.ErrorCode == errorIllegalToken && retry < 1 { + retry++ + log.Info("retry get token") + _ = getToken() + goto sendjob + } + + if result.ErrorCode != 0 { + log.Error("GetNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return nil, fmt.Errorf("GetNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + func GetJob(jobID string) (*models.GetGrampusJobResponse, error) { checkSetting() client := getRestyClient() @@ -184,18 +250,23 @@ sendjob: return &result, nil } -func GetImages(processorType string) (*models.GetGrampusImagesResult, error) { +func GetImages(processorType string, jobType string) (*models.GetGrampusImagesResult, error) { checkSetting() client := getRestyClient() var result models.GetGrampusImagesResult retry := 0 + queryType := "TrainJob" + if jobType == string(models.JobTypeDebug) { + queryType = "Notebook" + } + sendjob: _, err := client.R(). SetAuthToken(TOKEN). SetResult(&result). - Get(HOST + urlGetImages + "?processorType=" + processorType) + Get(HOST + urlGetImages + "?processorType=" + processorType + "&trainType=" + queryType) if err != nil { return nil, fmt.Errorf("resty GetImages: %v", err) @@ -271,19 +342,26 @@ func GetGrampusMetrics(jobID string) (models.GetTrainJobMetricStatisticResult, e return result, nil } -func StopJob(jobID string) (*models.GrampusStopJobResponse, error) { +func StopJob(jobID string, jobType ...string) (*models.GrampusStopJobResponse, error) { checkSetting() client := getRestyClient() var result models.GrampusStopJobResponse retry := 0 + url := urlTrainJob + if len(jobType) > 0 { + if jobType[0] == string(models.JobTypeDebug) { + url = urlNotebookJob + } + } + sendjob: _, err := client.R(). //SetHeader("Content-Type", "application/json"). SetAuthToken(TOKEN). SetResult(&result). - Post(HOST + urlTrainJob + "/" + jobID + "/stop") + Post(HOST + url + "/" + jobID + "/stop") if err != nil { return &result, fmt.Errorf("resty StopTrainJob: %v", err) @@ -335,3 +413,33 @@ sendjob: return &result, nil } + +func RestartNotebookJob(jobID string) (*models.GrampusNotebookRestartResponse, error) { + checkSetting() + client := getRestyClient() + var restartResponse *models.GrampusNotebookRestartResponse + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&restartResponse). 
+ Post(HOST + urlNotebookJob + "/" + jobID + "/start") + + if err != nil { + return nil, fmt.Errorf("resty grampus restart note book job: %v", err) + } + if restartResponse.ErrorCode == errorIllegalToken && retry < 1 { + retry++ + log.Info("retry get token") + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + log.Error("resty grampus restart note book job failed(%s): %v", res.String(), err.Error()) + return nil, fmt.Errorf("resty grampus restart note book job failed: %v", err) + } + + return restartResponse, nil +} diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 9cc9dc1ed..dcad1eb00 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -20,34 +20,16 @@ import ( "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/notification" "code.gitea.io/gitea/modules/setting" - "code.gitea.io/gitea/modules/storage" "code.gitea.io/gitea/modules/timeutil" ) const ( //notebook + storageTypeOBS = "obs" autoStopDuration = 4 * 60 * 60 AutoStopDurationMs = 4 * 60 * 60 * 1000 - MORDELART_USER_IMAGE_ENGINE_ID = -1 - DataSetMountPath = "/home/ma-user/work" - NotebookEnv = "Python3" - NotebookType = "Ascend" - FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" - - //train-job - // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" - // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" - // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + - // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + - // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + - // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + - // "]}" - // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + - // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + - // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + - // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + - // "]}" + CodePath = "/code/" OutputPath = "/output/" ResultPath = "/result/" @@ -190,14 +172,6 @@ type OrgMultiNode struct { Node []int `json:"node"` } -// type Parameter struct { -// Label string `json:"label"` -// Value string `json:"value"` -// } - -// type Parameters struct { -// Parameter []Parameter `json:"parameter"` -// } type Parameters struct { Parameter []struct { @@ -206,98 +180,23 @@ type Parameters struct { } `json:"parameter"` } -func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error { - var dataActualPath string - if uuid != "" { - dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" - } else { - userPath := setting.UserBasePath + ctx.User.Name + "/" - isExist, err := storage.ObsHasObject(userPath) - if err != nil { - log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"]) - return err - } - - if !isExist { - if err = storage.ObsCreateObject(userPath); err != nil { - log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"]) - return err - } - } - - dataActualPath = setting.Bucket + "/" + userPath - } - - if poolInfos == nil { - json.Unmarshal([]byte(setting.PoolInfos), 
&poolInfos) - } - createTime := timeutil.TimeStampNow() - jobResult, err := CreateJob(models.CreateNotebookParams{ - JobName: jobName, - Description: description, - ProfileID: setting.ProfileID, - Flavor: flavor, - Pool: models.Pool{ - ID: poolInfos.PoolInfo[0].PoolId, - Name: poolInfos.PoolInfo[0].PoolName, - Type: poolInfos.PoolInfo[0].PoolType, - }, - Spec: models.Spec{ - Storage: models.Storage{ - Type: storageTypeOBS, - Location: models.Location{ - Path: dataActualPath, - }, - }, - AutoStop: models.AutoStop{ - Enable: true, - Duration: autoStopDuration, - }, - }, - }) - if err != nil { - log.Error("CreateJob failed: %v", err.Error()) - return err - } - err = models.CreateCloudbrain(&models.Cloudbrain{ - - Status: string(models.JobWaiting), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobResult.ID, - JobName: jobName, - JobType: string(models.JobTypeDebug), - Type: models.TypeCloudBrainTwo, - Uuid: uuid, - ComputeResource: models.NPUResource, - CreatedUnix: createTime, - UpdatedUnix: createTime, - }) - - if err != nil { - return err - } - notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask) - return nil -} - -func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification, bootFile string, autoStopDurationInMs int64) (string, error) { +func GenerateNotebook2(ctx *context.Context, req cloudbrain.GenerateModelArtsNotebookReq) (string, error) { if poolInfos == nil { json.Unmarshal([]byte(setting.PoolInfos), &poolInfos) } - imageName, err := GetNotebookImageName(imageId) + imageName, err := GetNotebookImageName(req.ImageId) if err != nil { log.Error("GetNotebookImageName failed: %v", err.Error()) return "", err } createTime := timeutil.TimeStampNow() jobResult, err := createNotebook2(models.CreateNotebook2Params{ - JobName: jobName, - Description: description, - Flavor: spec.SourceSpecId, - Duration: autoStopDurationInMs, - ImageID: imageId, + JobName: req.JobName, + Description: req.Description, + Flavor: req.Spec.SourceSpecId, + Duration: req.AutoStopDurationMs, + ImageID: req.ImageId, PoolID: poolInfos.PoolInfo[0].PoolId, Feature: models.NotebookFeature, Volume: models.VolumeReq{ @@ -310,13 +209,13 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc if err != nil { log.Error("createNotebook2 failed: %v", err.Error()) if strings.HasPrefix(err.Error(), UnknownErrorPrefix) { - log.Info("(%s)unknown error, set temp status", displayJobName) + log.Info("(%s)unknown error, set temp status", req.DisplayJobName) errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{ JobID: models.TempJobId, VersionID: models.TempVersionId, Status: models.TempJobStatus, Type: models.TypeCloudBrainTwo, - JobName: jobName, + JobName: req.JobName, JobType: string(models.JobTypeDebug), }) if errTemp != nil { @@ -327,23 +226,28 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc return "", err } task := &models.Cloudbrain{ - Status: jobResult.Status, - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobResult.ID, - JobName: jobName, - FlavorCode: spec.SourceSpecId, - DisplayJobName: displayJobName, - JobType: string(models.JobTypeDebug), - Type: models.TypeCloudBrainTwo, - Uuid: uuid, - ComputeResource: models.NPUResource, - Image: imageName, - BootFile: bootFile, - Description: description, - CreatedUnix: createTime, - UpdatedUnix: createTime, - Spec: spec, + Status: 
jobResult.Status, + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobResult.ID, + JobName: req.JobName, + FlavorCode: req.Spec.SourceSpecId, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeDebug), + Type: models.TypeCloudBrainTwo, + Uuid: req.Uuid, + ComputeResource: models.NPUResource, + Image: imageName, + BootFile: req.BootFile, + Description: req.Description, + CreatedUnix: createTime, + UpdatedUnix: createTime, + Spec: req.Spec, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainModelUrl: req.PreTrainModelUrl, + CkptName: req.CkptName, } err = models.CreateCloudbrain(task) @@ -352,7 +256,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc } stringId := strconv.FormatInt(task.ID, 10) - notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask) + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugNPUTask) return jobResult.ID, nil } diff --git a/modules/modelarts_cd/modelarts.go b/modules/modelarts_cd/modelarts.go index 93032fa89..bdc42002a 100755 --- a/modules/modelarts_cd/modelarts.go +++ b/modules/modelarts_cd/modelarts.go @@ -5,6 +5,8 @@ import ( "strconv" "strings" + "code.gitea.io/gitea/modules/cloudbrain" + "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/context" "code.gitea.io/gitea/modules/log" @@ -88,19 +90,19 @@ type Parameters struct { } `json:"parameter"` } -func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification, bootFile string,autoStopDurationInMs int64) (string, error) { - imageName, err := GetNotebookImageName(imageId) +func GenerateNotebook(ctx *context.Context, req cloudbrain.GenerateModelArtsNotebookReq) (string, error) { + imageName, err := GetNotebookImageName(req.ImageId) if err != nil { log.Error("GetNotebookImageName failed: %v", err.Error()) return "", err } createTime := timeutil.TimeStampNow() jobResult, err := createNotebook(models.CreateNotebookWithoutPoolParams{ - JobName: jobName, - Description: description, - Flavor: spec.SourceSpecId, - Duration: autoStopDurationInMs, - ImageID: imageId, + JobName: req.JobName, + Description: req.Description, + Flavor: req.Spec.SourceSpecId, + Duration: req.AutoStopDurationMs, + ImageID: req.ImageId, Feature: models.NotebookFeature, Volume: models.VolumeReq{ Capacity: setting.Capacity, @@ -112,13 +114,13 @@ func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, descr if err != nil { log.Error("createNotebook failed: %v", err.Error()) if strings.HasPrefix(err.Error(), UnknownErrorPrefix) { - log.Info("(%s)unknown error, set temp status", displayJobName) + log.Info("(%s)unknown error, set temp status", req.DisplayJobName) errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{ JobID: models.TempJobId, VersionID: models.TempVersionId, Status: models.TempJobStatus, Type: models.TypeCDCenter, - JobName: jobName, + JobName: req.JobName, JobType: string(models.JobTypeDebug), }) if errTemp != nil { @@ -129,23 +131,28 @@ func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, descr return "", err } task := &models.Cloudbrain{ - Status: jobResult.Status, - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobResult.ID, - JobName: jobName, - FlavorCode: spec.SourceSpecId, - DisplayJobName: displayJobName, - JobType: string(models.JobTypeDebug), - 
Type: models.TypeCDCenter, - Uuid: uuid, - ComputeResource: models.NPUResource, - Image: imageName, - Description: description, - CreatedUnix: createTime, - UpdatedUnix: createTime, - Spec: spec, - BootFile: bootFile, + Status: jobResult.Status, + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobResult.ID, + JobName: req.JobName, + FlavorCode: req.Spec.SourceSpecId, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeDebug), + Type: models.TypeCDCenter, + Uuid: req.Uuid, + ComputeResource: models.NPUResource, + Image: imageName, + Description: req.Description, + CreatedUnix: createTime, + UpdatedUnix: createTime, + Spec: req.Spec, + BootFile: req.BootFile, + ModelName: req.ModelName, + ModelVersion: req.ModelVersion, + LabelName: req.LabelName, + PreTrainModelUrl: req.PreTrainModelUrl, + CkptName: req.CkptName, } err = models.CreateCloudbrain(task) @@ -154,7 +161,7 @@ func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, descr } stringId := strconv.FormatInt(task.ID, 10) - notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask) + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugNPUTask) return jobResult.ID, nil } diff --git a/modules/setting/setting.go b/modules/setting/setting.go index d4e776062..43a841f18 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -519,7 +519,6 @@ var ( CullIdleTimeout string CullInterval string - //benchmark config IsBenchmarkEnabled bool BenchmarkOwner string @@ -617,14 +616,14 @@ var ( UsageRateBeginTime string }{} - ClearStrategy= struct { - Enabled bool - ResultSaveDays int - BatchSize int - DebugJobSize int - TrashSaveDays int - Cron string - RunAtStart bool + ClearStrategy = struct { + Enabled bool + ResultSaveDays int + BatchSize int + DebugJobSize int + TrashSaveDays int + Cron string + RunAtStart bool }{} C2NetInfos *C2NetSqInfos @@ -711,6 +710,7 @@ var ( ProjectHealth float64 ProjectHealthIssueCompleteRatio float64 + ProjectHealth0IssueCloseRatio float64 TeamHealth float64 TeamHealthContributors float64 @@ -1705,16 +1705,16 @@ func getModelartsCDConfig() { getNotebookFlavorInfos() } -func getClearStrategy(){ +func getClearStrategy() { sec := Cfg.Section("clear_strategy") - ClearStrategy.Enabled=sec.Key("ENABLED").MustBool(false) - ClearStrategy.ResultSaveDays=sec.Key("RESULT_SAVE_DAYS").MustInt(30) - ClearStrategy.BatchSize=sec.Key("BATCH_SIZE").MustInt(500) - ClearStrategy.DebugJobSize=sec.Key("DEBUG_BATCH_SIZE").MustInt(100) - ClearStrategy.TrashSaveDays=sec.Key("TRASH_SAVE_DAYS").MustInt(90) - ClearStrategy.Cron=sec.Key("CRON").MustString("* 0,30 2-8 * * ?") - ClearStrategy.RunAtStart=sec.Key("RUN_AT_START").MustBool(false) + ClearStrategy.Enabled = sec.Key("ENABLED").MustBool(false) + ClearStrategy.ResultSaveDays = sec.Key("RESULT_SAVE_DAYS").MustInt(30) + ClearStrategy.BatchSize = sec.Key("BATCH_SIZE").MustInt(500) + ClearStrategy.DebugJobSize = sec.Key("DEBUG_BATCH_SIZE").MustInt(100) + ClearStrategy.TrashSaveDays = sec.Key("TRASH_SAVE_DAYS").MustInt(90) + ClearStrategy.Cron = sec.Key("CRON").MustString("* 0,30 2-8 * * ?") + ClearStrategy.RunAtStart = sec.Key("RUN_AT_START").MustBool(false) } func getGrampusConfig() { @@ -1781,6 +1781,7 @@ func SetRadarMapConfig() { RadarMap.LivenessRelease = sec.Key("liveness_release").MustFloat64(0.4) RadarMap.ProjectHealth = sec.Key("project_health").MustFloat64(0.1) 
RadarMap.ProjectHealthIssueCompleteRatio = sec.Key("project_health_issue_complete_ratio").MustFloat64(100) + RadarMap.ProjectHealth0IssueCloseRatio = sec.Key("project_health_0_issue_close_ratio").MustFloat64(0.0) RadarMap.TeamHealth = sec.Key("team_health").MustFloat64(0.1) RadarMap.TeamHealthContributors = sec.Key("team_health_contributors").MustFloat64(0.2) RadarMap.TeamHealthKeyContributors = sec.Key("team_health_key_contributors").MustFloat64(0.6) diff --git a/modules/templates/helper.go b/modules/templates/helper.go index c314127f1..9d31952d6 100755 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -47,6 +47,7 @@ const ( REF_TYPE_BRANCH = "branch" REF_TYPE_TAG = "tag" REF_TYPE_PATTERN = "(refs/heads/|refs/tags/)" + DURATION_STR_ZERO = "00:00:00" ) // Used from static.go && dynamic.go @@ -109,6 +110,7 @@ func NewFuncMap() []template.FuncMap { "AttachmentStatus": dataset.GetStatusText, "IsShowDataSetOfCurrentRepo": dataset.IsShowDataSetOfCurrentRepo, "TimeSinceUnixShort": timeutil.TimeSinceUnixShort, + "ConvertDurationToStr": ConvertDurationToStr, "RawTimeSince": timeutil.RawTimeSince, "FileSize": base.FileSize, "PrettyNumber": base.PrettyNumber, @@ -365,6 +367,7 @@ func NewTextFuncMap() []texttmpl.FuncMap { "TimeSinceUnix": timeutil.TimeSinceUnix, "TimeSinceUnix1": timeutil.TimeSinceUnix1, "TimeSinceUnixShort": timeutil.TimeSinceUnixShort, + "ConvertDurationToStr": ConvertDurationToStr, "RawTimeSince": timeutil.RawTimeSince, "AttachmentResourceType": dataset.GetResourceType, "AttachmentStatus": dataset.GetStatusText, @@ -804,3 +807,9 @@ func MB2GB(size int) string { } return s } +func ConvertDurationToStr(duration int64) string { + if duration <= 0 { + return DURATION_STR_ZERO + } + return util.AddZero(duration/3600) + ":" + util.AddZero(duration%3600/60) + ":" + util.AddZero(duration%60) +} diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index bd8226342..fcb20597c 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -854,6 +854,7 @@ description = Description description_format_err=Description's length can be up to %s characters long. create_dataset = Create Dataset download_url=Download Url +download_model_url=Download Url download_oper=Operation download_copy=Copy URL create_dataset_fail=Failed to create dataset. @@ -1062,6 +1063,7 @@ model_rename=Duplicate model name, please modify model name. notebook_file_not_exist=Notebook file does not exist. notebook_select_wrong=Please select a Notebook(.ipynb) file first. notebook_file_no_right=You have no right to access the Notebook(.ipynb) file. +debug_again_fail=Fail to restart debug task, please try again later. date=Date repo_add=Project Increment @@ -1344,9 +1346,12 @@ modelconvert.inputshapeerror=Format input error, please input such as: 1,1,32,32 modelconvert.manage.create_error1=A model transformation task with the same name already exists. modelconvert.manage.create_error2=Only one running model transformation task can be created. -modelconvert.manage.model_not_exist=The model does not exist. +modelconvert.manage.model_not_exist=The model in the task does not exist or has been deleted. modelconvert.manage.no_operate_right=You have no right to do the operation. +debug.manage.model_not_exist=The model in the task does not exist or has been deleted, please create a new debug job. +debug.manage.dataset_not_exist=The part of datasets in the task does not exist or has been deleted, please create a new debug job. 
+ grampus.train_job.ai_center = AI Center grampus.dataset_path_rule = The code is storaged in /cache/code;the dataset is storaged in /cache/dataset;and please put your model into /cache/output, then you can download it online。 grampus.gpu_dataset_path_rule = The code is storaged in /tmp/code;the dataset is storaged in /tmp/dataset;and please put your model into /tmp/output, then you can download it online。 @@ -2752,6 +2757,10 @@ repos.pr=PR repos.commit=Commit repos.closedIssues=Closed Issue repos.contributor=Contributor +repos.numDataset=Dataset File +repos.numCloudbrain=Cloudbrain Task +repos.numModel=Model +repos.numModelConvert=Model Convert Task repos.yes=Yes repos.no=No @@ -3122,6 +3131,8 @@ reject_pull_request = `suggested changes for %s#%[2]s` upload_dataset=`upload dataset %s` task_gpudebugjob=`created CPU/GPU type debugging task %s` task_npudebugjob=`created NPU type debugging task %s` +task_c2net_gpudebugjob=`created CPU/GPU type debugging task %s` +task_c2net_npudebugjob=`created NPU type debugging task %s` task_nputrainjob=`created NPU training task %s` task_inferencejob=`created reasoning task %s` task_benchmark=`created profiling task %s` @@ -3241,6 +3252,7 @@ dataset = Dataset resource_specification = Resource specification dataset_storage_path = Dataset storage path model_storage_path = Model storage path +output_storage_path = Output storage path code_storage_path = Code storage path benchmark_path = Benchmark script path snn4imagenet_path = Snn4imagenet script path @@ -3296,8 +3308,11 @@ load_code_failed=Fail to load code, please check if the right branch is selected error.dataset_select = dataset select error:the count exceed the limit or has same name new_train_gpu_tooltips = The code is storaged in %s, the dataset is storaged in %s, the pre-trained model is storaged in the run parameter %s, and please put your model into %s then you can download it online +new_debug_gpu_tooltips = The code is storaged in %s, the dataset is storaged in %s, the pre-trained model is storaged in the %s, and please put your model into %s then you can download it online +new_debug_gpu_tooltips1 = The code is storaged in %s, the dataset is storaged in %s, the pre-trained model is storaged in the %s. new_train_npu_tooltips = The code is storaged in %s, the pre-trained model is storaged in the run parameter %s, and please put your model into %s then you can download it online new_infer_gpu_tooltips = The dataset is stored in %s, the model file is stored in %s, please store the inference output in %s for subsequent downloads. 
+code_obs_address = Code OBS address [points] points = points diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index 891729d70..86d445871 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -865,6 +865,7 @@ reference_dataset_fail=关联数据集失败,请稍后再试。 cancel_reference_dataset_fail=取消关联数据集失败,请稍后再试。 download_url=数据集下载地址 +download_model_url=模型文件下载地址 download_copy=复制链接 download_oper=操作 show_dataset=数据集 @@ -1061,6 +1062,7 @@ model_rename=模型名称重复,请修改模型名称 notebook_file_not_exist=Notebook文件不存在。 notebook_select_wrong=请先选择Notebook(.ipynb)文件。 notebook_file_no_right=您没有这个Notebook文件的读权限。 +debug_again_fail=再次调试失败,请稍后再试。 date=日期 repo_add=新增项目 @@ -1358,9 +1360,13 @@ modelconvert.modelfileempty=请选择模型文件。 modelconvert.manage.create_error1=相同的名称模型转换任务已经存在。 modelconvert.manage.create_error2=只能创建一个正在运行的模型转换任务。 -modelconvert.manage.model_not_exist=选择的模型不存在。 +modelconvert.manage.model_not_exist=任务中选择的模型不存在或者已被删除。 modelconvert.manage.no_operate_right=您没有操作权限。 + +debug.manage.model_not_exist=任务中选择的模型不存在或者已被删除,请新建调试任务。 +debug.manage.dataset_not_exist=任务中选择的部分数据集不存在或者已被删除,请新建调试任务。 + grampus.train_job.ai_center=智算中心 grampus.dataset_path_rule = 训练脚本存储在/cache/code中,数据集存储在/cache/dataset中,训练输出请存储在/cache/output中以供后续下载。 grampus.gpu_dataset_path_rule = 训练脚本存储在/tmp/code中,数据集存储在/tmp/dataset中,训练输出请存储在/tmp/output中以供后续下载。 @@ -2769,6 +2775,11 @@ repos.pr=PR数 repos.commit=Commit数 repos.closedIssues=已解决任务数 repos.contributor=贡献者数 +repos.numDataset=数据集文件数 +repos.numCloudbrain=云脑任务数 +repos.numModel=模型数 +repos.numModelConvert=转换任务数 + repos.yes=是 repos.no=否 @@ -3139,6 +3150,8 @@ reject_pull_request=`建议变更 %s#%[2]s` upload_dataset=`上传了数据集文件 %s` task_gpudebugjob=`创建了CPU/GPU类型调试任务 %s` task_npudebugjob=`创建了NPU类型调试任务 %s` +task_c2net_gpudebugjob=`创建了CPU/GPU类型调试任务 %s` +task_c2net_npudebugjob=`创建了NPU类型调试任务 %s` task_nputrainjob=`创建了NPU类型训练任务 %s` task_inferencejob=`创建了推理任务 %s` task_benchmark=`创建了评测任务 %s` @@ -3259,6 +3272,7 @@ resource_specification = 资源规格 dataset_storage_path = 数据集存放路径 model_storage_path = 模型存放路径 code_storage_path = 代码存放路径 +output_storage_path = 输出存放路径 benchmark_path = benchmark脚本存放路径 snn4imagenet_path = snn4imagenet脚本存放路径 brainscore_path = brainscore脚本存放路径 @@ -3316,8 +3330,11 @@ load_code_failed=代码加载失败,请确认选择了正确的分支。 error.dataset_select = 数据集选择错误:数量超过限制或者有同名数据集 new_train_gpu_tooltips = 训练脚本存储在 %s 中,数据集存储在 %s 中,预训练模型存放在运行参数 %s 中,训练输出请存储在 %s 中以供后续下载。 +new_debug_gpu_tooltips = 项目代码存储在 %s 中,数据集存储在 %s 中,选择的模型存储在 %s 中,调试输出请存储在 %s 中以供后续下载。 +new_debug_gpu_tooltips1 = 项目代码存储在 %s 中,数据集存储在 %s 中,选择的模型存储在 %s 中。 new_train_npu_tooltips = 训练脚本存储在 %s 中,预训练模型存放在运行参数 %s 中,训练输出请存储在 %s 中以供后续下载。 new_infer_gpu_tooltips = 数据集存储在 %s 中,模型文件存储在 %s 中,推理输出请存储在 %s 中以供后续下载。 +code_obs_address = 代码obs地址 [points] points = 积分 diff --git a/public/home/home.js b/public/home/home.js index df18b7891..fe843161e 100755 --- a/public/home/home.js +++ b/public/home/home.js @@ -247,7 +247,7 @@ document.onreadystatechange = function () { html += recordPrefix + actionName; html += " " + record.RefName + "" } - else if(record.OpType == "25" || record.OpType == "29"){ + else if(record.OpType == "25" || record.OpType == "29" || record.OpType == "39" || record.OpType == "40"){ html += recordPrefix + actionName; html += " " + record.RefName + "" } @@ -294,7 +294,10 @@ function getTaskLink(record){ re = re + "/cloudbrain/train-job/" + record.Content; }else if(record.OpType == 32 || record.OpType == 33){ re = re + "/grampus/train-job/" + record.Content; + }else if(record.OpType == 39 || record.OpType == 40){ + re = re + 
"/grampus/notebook/" + record.Content; } + re = encodeURI(re); return re; } @@ -450,7 +453,9 @@ var actionNameZH={ "33":"创建了CPU/GPU类型训练任务", "35":"创建的数据集 {dataset} 被设置为推荐数据集", "36":"提交了镜像 {image}", - "37":"提交的镜像 {image} 被设置为推荐镜像", + "37": "提交的镜像 {image} 被设置为推荐镜像", + "39":"创建了CPU/GPU类型调试任务", + "40":"创建了NPU类型调试任务", }; var actionNameEN={ @@ -481,7 +486,9 @@ var actionNameEN={ "33":" created CPU/GPU type training task", "35":" created dataset {dataset} was set as recommended dataset", "36":"committed image {image}", - "37":"committed image {image} was set as recommended image", + "37": "committed image {image} was set as recommended image", + "39":" created CPU/GPU type debugging task ", + "40":" created NPU type debugging task ", }; var repoAndOrgZH={ diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go index 3e50b00fc..4936c2362 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -1062,6 +1062,9 @@ func RegisterRoutes(m *macaron.Macaron) { }) }, reqRepoReader(models.UnitTypeCloudBrain)) m.Group("/grampus", func() { + m.Group("/notebook", func() { + m.Get("/:id", repo_ext.GetGrampusNotebook) + }) m.Group("/train-job", func() { m.Group("/:jobid", func() { m.Get("", repo.GetModelArtsTrainJobVersion) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index 1c5a58b47..805443788 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -9,6 +9,7 @@ import ( "bufio" "encoding/json" "io" + "io/ioutil" "net/http" "os" "path" @@ -237,7 +238,7 @@ func GetCloudbrainTask(ctx *context.APIContext) { ID := ctx.Params(":id") - job, err := cloudbrain.GetCloudBrainByIdOrJobId(ID) + job, err := cloudbrain.GetCloudBrainByIdOrJobId(ID, "id") if err != nil { ctx.NotFound(err) @@ -647,6 +648,19 @@ func CloudbrainDownloadLogFile(ctx *context.Context) { } } + existStr := "" + if job.JobType == string(models.JobTypeTrain) || job.JobType == string(models.JobTypeInference) { + if job.Type == models.TypeCloudBrainOne { + result, err := cloudbrain.GetJob(job.JobID) + if err == nil && result != nil { + jobRes, _ := models.ConvertToJobResultPayload(result.Payload) + taskRoles := jobRes.TaskRoles + taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) + existStr = taskRes.TaskStatuses[0].ExitDiagnostics + } + } + } + logDir := "/model" if job.JobType == string(models.JobTypeInference) || job.JobType == string(models.JobTypeModelSafety) { logDir = cloudbrain.ResultPath @@ -664,17 +678,30 @@ func CloudbrainDownloadLogFile(ctx *context.Context) { } } if fileName != "" { - prefix := "/" + setting.CBCodePathPrefix + job.JobName + logDir - url, err := storage.Attachments.PresignedGetURL(prefix+"/"+fileName, fileName) + prefix := "/" + setting.CBCodePathPrefix + job.JobName + "/model" + filePath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + prefix + "/" + fileName + // Read the file contents into a byte slice + data, err := ioutil.ReadFile(filePath) if err != nil { - log.Error("Get minio get SignedUrl failed: %v", err.Error(), ctx.Data["msgID"]) + ctx.ServerError("ReadFile", err) + return + } + + // Set the appropriate response headers + ctx.Resp.Header().Set("Content-Type", "application/octet-stream") + ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+fileName) + + // Write the file contents to the response + if _, err := ctx.Resp.Write(data); err != nil { + ctx.ServerError("Write", err) + return + } + if _, err := ctx.Resp.Write([]byte(existStr)); err != nil 
{ + log.Error("Write failed: %v", err.Error(), ctx.Data["msgID"]) return } - log.Info("fileName=" + fileName) - http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusTemporaryRedirect) } else { log.Info("fileName is null.") - } } @@ -760,8 +787,28 @@ func CloudbrainGetLog(ctx *context.APIContext) { content = result["Content"].(string) } - if ctx.Data["existStr"] != nil && result["Lines"].(int) < 50 { - content = content + ctx.Data["existStr"].(string) + if (job.JobType == string(models.JobTypeTrain) || job.JobType == string(models.JobTypeInference)) && job.Type == models.TypeCloudBrainOne && job.Status == string(models.JobFailed) { + if ctx.Data["existStr"] != nil { + if baseLine == "" && order == "desc" && result["Lines"].(int) == 0 { + result["Lines"] = 1 + result["EndLine"] = 1 + content = content + ctx.Data["existStr"].(string) + } + + if result["Lines"].(int) == 0 && result["StartLine"] == result["EndLine"] && result["StartLine"].(int) != 0 { + content = content + ctx.Data["existStr"].(string) + result["Lines"] = 1 + result["StartLine"] = result["StartLine"].(int) - 1 + } + if result["Lines"].(int) == 1 && result["StartLine"] == result["EndLine"] { + result["Lines"] = 0 + result["StartLine"] = result["StartLine"].(int) + 1 + } + } + } else { + if ctx.Data["existStr"] != nil && result["Lines"].(int) < 50 { + content = content + ctx.Data["existStr"].(string) + } } logFileName := result["FileName"] diff --git a/routers/api/v1/repo/cloudbrain_dashboard.go b/routers/api/v1/repo/cloudbrain_dashboard.go index 0d68fff30..bb04038b9 100755 --- a/routers/api/v1/repo/cloudbrain_dashboard.go +++ b/routers/api/v1/repo/cloudbrain_dashboard.go @@ -103,86 +103,62 @@ func GetAllCloudbrainsOverview(ctx *context.Context) { }) } func GetOverviewDuration(ctx *context.Context) { - recordCloudbrain, err := models.GetRecordBeginTime() - if err != nil { - log.Error("Can not get recordCloudbrain", err) - ctx.Error(http.StatusBadRequest, ctx.Tr("repo.record_begintime_get_err")) - return - } - recordBeginTime := recordCloudbrain[0].Cloudbrain.CreatedUnix - now := time.Now() - endTime := now - var workServerNumber int64 - var cardNum int64 + durationSum := 0 + cardDurationSum := 0 - durationAllSum := int64(0) - cardDuSum := int64(0) + cloudBrainOneCardDuSum := 0 + cloudBrainTwoCardDuSum := 0 + c2NetCardDuSum := 0 + cDNetCardDuSum := 0 - cloudBrainOneCardDuSum := int64(0) - cloudBrainTwoCardDuSum := int64(0) - c2NetCardDuSum := int64(0) - cDNetCardDuSum := int64(0) + cloudBrainOneDuration := 0 + cloudBrainTwoDuration := 0 + c2NetDuration := 0 + cDCenterDuration := 0 - cloudBrainOneDuration := int64(0) - cloudBrainTwoDuration := int64(0) - c2NetDuration := int64(0) - cDCenterDuration := int64(0) - - cloudbrains, _, err := models.CloudbrainAllKanBan(&models.CloudbrainsOptions{ - Type: models.TypeCloudBrainAll, - BeginTimeUnix: int64(recordBeginTime), - EndTimeUnix: endTime.Unix(), - }) + cloudbrainTypeDuration, err := models.GetCloudbrainTypeCardDuration() if err != nil { - ctx.ServerError("Get cloudbrains failed:", err) + log.Error("GetCloudbrainTypeCardDuration err!", err) return } - models.LoadSpecs4CloudbrainInfo(cloudbrains) - - for _, cloudbrain := range cloudbrains { - cloudbrain = cloudbrainService.UpdateCloudbrainAiCenter(cloudbrain) - if cloudbrain.Cloudbrain.Spec != nil { - cardNum = int64(cloudbrain.Cloudbrain.Spec.AccCardsNum) - } else { - cardNum = 1 + for _, result := range cloudbrainTypeDuration { + if result.Type == models.TypeCloudBrainOne { + cloudBrainOneDuration = result.DurationSum + 
cloudBrainOneCardDuSum = result.CardDurationSum } - if cloudbrain.Cloudbrain.WorkServerNumber >= 1 { - workServerNumber = int64(cloudbrain.Cloudbrain.WorkServerNumber) - } else { - workServerNumber = 1 + if result.Type == models.TypeCloudBrainTwo { + cloudBrainTwoDuration = result.DurationSum + cloudBrainTwoCardDuSum = result.CardDurationSum } - duration := models.ConvertStrToDuration(cloudbrain.TrainJobDuration) - CardDuration := workServerNumber * int64(cardNum) * duration - - if cloudbrain.Cloudbrain.Type == models.TypeCloudBrainOne { - cloudBrainOneDuration += duration - cloudBrainOneCardDuSum += CardDuration - } else if cloudbrain.Cloudbrain.Type == models.TypeCloudBrainTwo { - cloudBrainTwoDuration += duration - cloudBrainTwoCardDuSum += CardDuration - } else if cloudbrain.Cloudbrain.Type == models.TypeC2Net { - c2NetDuration += duration - c2NetCardDuSum += CardDuration - } else if cloudbrain.Cloudbrain.Type == models.TypeCDCenter { - cDCenterDuration += duration - cDNetCardDuSum += CardDuration + if result.Type == models.TypeC2Net { + c2NetDuration = result.DurationSum + c2NetCardDuSum = result.CardDurationSum } - - durationAllSum += duration - cardDuSum += CardDuration + if result.Type == models.TypeCDCenter { + cDCenterDuration = result.DurationSum + cDNetCardDuSum = result.CardDurationSum + } + } + cloudbrainAllDuration, err := models.GetCloudbrainAllCardDuration() + if err != nil { + log.Error("GetCloudbrainAllCardDuration err!", err) + return } + durationSum = cloudbrainAllDuration.DurationSum + cardDurationSum = cloudbrainAllDuration.CardDurationSum + ctx.JSON(http.StatusOK, map[string]interface{}{ "cloudBrainOneCardDuSum": cloudBrainOneCardDuSum, "cloudBrainTwoCardDuSum": cloudBrainTwoCardDuSum, "c2NetCardDuSum": c2NetCardDuSum, "cDNetCardDuSum": cDNetCardDuSum, - "cardDuSum": cardDuSum, + "cardDuSum": cardDurationSum, "cloudBrainOneDuration": cloudBrainOneDuration, "cloudBrainTwoDuration": cloudBrainTwoDuration, "c2NetDuration": c2NetDuration, "cDCenterDuration": cDCenterDuration, - "durationSum": durationAllSum, + "durationSum": durationSum, }) } diff --git a/routers/api/v1/repo/images.go b/routers/api/v1/repo/images.go index f0cb62980..e09ca260a 100644 --- a/routers/api/v1/repo/images.go +++ b/routers/api/v1/repo/images.go @@ -88,7 +88,7 @@ func getModelArtsImages(ctx *context.APIContext) { } func getC2netNpuImages(ctx *context.APIContext) { - images, err := grampus.GetImages(grampus.ProcessorTypeNPU) + images, err := grampus.GetImages(grampus.ProcessorTypeNPU, string(models.JobTypeTrain)) var npuImageInfos []NPUImageINFO if err != nil { log.Error("GetImages failed:", err.Error()) diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index 127ddd835..a0abab38b 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -39,7 +39,7 @@ func GetModelArtsNotebook2(ctx *context.APIContext) { ID := ctx.Params(":id") - job,err := cloudbrain.GetCloudBrainByIdOrJobId(ID) + job, err := cloudbrain.GetCloudBrainByIdOrJobId(ID, "id") if err != nil { ctx.NotFound(err) diff --git a/routers/api/v1/repo/repo_dashbord.go b/routers/api/v1/repo/repo_dashbord.go index b3a01cff1..e3a54b4fa 100644 --- a/routers/api/v1/repo/repo_dashbord.go +++ b/routers/api/v1/repo/repo_dashbord.go @@ -601,7 +601,7 @@ func getSummaryFileName(ctx *context.Context, beginTime time.Time, endTime time. 
func allProjectsPeroidHeader(ctx *context.Context) map[string]string { return map[string]string{"A1": ctx.Tr("admin.repos.id"), "B1": ctx.Tr("admin.repos.projectName"), "C1": ctx.Tr("repo.owner"), "D1": ctx.Tr("admin.repos.isPrivate"), "E1": ctx.Tr("admin.repos.openi"), "F1": ctx.Tr("admin.repos.visit"), "G1": ctx.Tr("admin.repos.download"), "H1": ctx.Tr("admin.repos.pr"), "I1": ctx.Tr("admin.repos.commit"), - "J1": ctx.Tr("admin.repos.watches"), "K1": ctx.Tr("admin.repos.stars"), "L1": ctx.Tr("admin.repos.forks"), "M1": ctx.Tr("admin.repos.issues"), "N1": ctx.Tr("admin.repos.closedIssues"), "O1": ctx.Tr("admin.repos.contributor"), "P1": ctx.Tr("admin.repos.isFork"), "Q1": ctx.Tr("admin.repos.isMirror"), "R1": ctx.Tr("admin.repos.create")} + "J1": ctx.Tr("admin.repos.watches"), "K1": ctx.Tr("admin.repos.stars"), "L1": ctx.Tr("admin.repos.forks"), "M1": ctx.Tr("admin.repos.issues"), "N1": ctx.Tr("admin.repos.closedIssues"), "O1": ctx.Tr("admin.repos.contributor"), "P1": ctx.Tr("admin.repos.numDataset"), "Q1": ctx.Tr("admin.repos.numCloudbrain"), "R1": ctx.Tr("admin.repos.numModel"), "S1": ctx.Tr("admin.repos.numModelConvert"), "T1": ctx.Tr("admin.repos.isFork"), "U1": ctx.Tr("admin.repos.isMirror"), "V1": ctx.Tr("admin.repos.create")} } @@ -619,11 +619,13 @@ func allProjectsPeriodSummaryValues(row int, rs *ProjectSummaryBaseData, ctx *co } func allProjectsPeroidValues(row int, rs *models.RepoStatistic, ctx *context.Context) map[string]string { + return map[string]string{getCellName("A", row): strconv.FormatInt(rs.RepoID, 10), getCellName("B", row): rs.DisplayName(), getCellName("C", row): rs.OwnerName, getCellName("D", row): getBoolDisplay(rs.IsPrivate, ctx), getCellName("E", row): strconv.FormatFloat(rs.RadarTotal, 'f', 2, 64), getCellName("F", row): strconv.FormatInt(rs.NumVisits, 10), getCellName("G", row): strconv.FormatInt(rs.NumDownloads, 10), getCellName("H", row): strconv.FormatInt(rs.NumPulls, 10), getCellName("I", row): strconv.FormatInt(rs.NumCommits, 10), getCellName("J", row): strconv.FormatInt(rs.NumWatches, 10), getCellName("K", row): strconv.FormatInt(rs.NumStars, 10), getCellName("L", row): strconv.FormatInt(rs.NumForks, 10), getCellName("M", row): strconv.FormatInt(rs.NumIssues, 10), - getCellName("N", row): strconv.FormatInt(rs.NumClosedIssues, 10), getCellName("O", row): strconv.FormatInt(rs.NumContributor, 10), getCellName("P", row): getBoolDisplay(rs.IsFork, ctx), getCellName("Q", row): getBoolDisplay(rs.IsMirror, ctx), getCellName("R", row): time.Unix(int64(rs.RepoCreatedUnix), 0).Format(CREATE_TIME_FORMAT), + getCellName("N", row): strconv.FormatInt(rs.NumClosedIssues, 10), getCellName("O", row): strconv.FormatInt(rs.NumContributor, 10), getCellName("P", row): strconv.FormatInt(rs.NumDatasetFile, 10), getCellName("Q", row): strconv.FormatInt(rs.NumCloudbrain, 10), getCellName("R", row): strconv.FormatInt(rs.NumModels, 10), getCellName("S", row): strconv.FormatInt(rs.NumModelConvert, 10), getCellName("T", row): getBoolDisplay(rs.IsFork, ctx), getCellName("U", row): getBoolDisplay(rs.IsMirror, ctx), getCellName("V", row): time.Unix(int64(rs.RepoCreatedUnix), 0).Format(CREATE_TIME_FORMAT), } + } func allProjectsOpenIHeader() map[string]string { @@ -804,11 +806,11 @@ func generateOpenICountSql(latestDate string) string { } func generateTypeAllSql(beginTime time.Time, endTime time.Time, latestDate string, q string, orderBy string, page int, pageSize int) string { - sql := "SELECT 
A.repo_id,name,alias,owner_name,is_private,is_mirror,is_fork,repo_created_unix,radar_total,num_watches,num_visits,num_downloads,num_pulls,num_commits,num_stars,num_forks,num_issues,num_closed_issues,num_contributor FROM " + + sql := "SELECT A.repo_id,name,alias,owner_name,is_private,is_mirror,is_fork,repo_created_unix,radar_total,num_watches,num_visits,num_downloads,num_pulls,num_commits,num_stars,num_forks,num_issues,num_closed_issues,num_contributor,num_models,num_model_convert,num_cloudbrain,num_dataset_file FROM " + "(SELECT repo_id,sum(num_visits) as num_visits " + " FROM repo_statistic where created_unix >=" + strconv.FormatInt(beginTime.Unix(), 10) + " and created_unix<" + strconv.FormatInt(endTime.Unix(), 10) + " group by repo_id) A," + - "(SELECT repo_id,name,alias,owner_name,is_private,is_mirror,is_fork,repo_created_unix,radar_total,num_watches,num_downloads,num_pulls,num_commits,num_stars,num_forks,num_issues,num_closed_issues,num_contributor from public.repo_statistic where date='" + latestDate + "') B" + + "(SELECT repo_id,name,alias,owner_name,is_private,is_mirror,is_fork,repo_created_unix,radar_total,num_watches,num_downloads,num_pulls,num_commits,num_stars,num_forks,num_issues,num_closed_issues,num_contributor,num_models,num_model_convert,num_cloudbrain,num_dataset_file from public.repo_statistic where date='" + latestDate + "') B" + " where A.repo_id=B.repo_id" if q != "" { @@ -828,8 +830,8 @@ func generateTypeAllOpenISql(latestDate string, page int, pageSize int) string { func generatePageSql(beginTime time.Time, endTime time.Time, latestDate string, q string, orderBy string, page int, pageSize int) string { - sql := "SELECT A.repo_id,name,alias,owner_name,is_private,is_mirror,is_fork,repo_created_unix,radar_total,num_watches,num_visits,num_downloads,num_pulls,num_commits,num_stars,num_forks,num_issues,num_closed_issues,num_contributor FROM " + - "(SELECT repo_id,sum(num_watches_added) as num_watches,sum(num_visits) as num_visits, sum(num_downloads_added) as num_downloads,sum(num_pulls_added) as num_pulls,sum(num_commits_added) as num_commits,sum(num_stars_added) as num_stars,sum(num_forks_added) num_forks,sum(num_issues_added) as num_issues,sum(num_closed_issues_added) as num_closed_issues,sum(num_contributor_added) as num_contributor " + + sql := "SELECT A.repo_id,name,alias,owner_name,is_private,is_mirror,is_fork,repo_created_unix,radar_total,num_watches,num_visits,num_downloads,num_pulls,num_commits,num_stars,num_forks,num_issues,num_closed_issues,num_contributor,num_models,num_model_convert,num_cloudbrain,num_dataset_file FROM " + + "(SELECT repo_id,sum(num_watches_added) as num_watches,sum(num_visits) as num_visits, sum(num_downloads_added) as num_downloads,sum(num_pulls_added) as num_pulls,sum(num_commits_added) as num_commits,sum(num_stars_added) as num_stars,sum(num_forks_added) num_forks,sum(num_issues_added) as num_issues,sum(num_closed_issues_added) as num_closed_issues,sum(num_contributor_added) as num_contributor,sum(num_models_added) as num_models,sum(num_model_convert_added) as num_model_convert,sum(num_dataset_file_added) as num_dataset_file, sum(num_cloudbrain_added) as num_cloudbrain " + " FROM repo_statistic where created_unix >=" + strconv.FormatInt(beginTime.Unix(), 10) + " and created_unix<" + strconv.FormatInt(endTime.Unix(), 10) + " group by repo_id) A," + "(SELECT repo_id,name,alias,owner_name,is_private,is_mirror,is_fork,repo_created_unix,radar_total from public.repo_statistic where date='" + latestDate + "') B" + diff --git 
a/routers/repo/ai_model_manage.go b/routers/repo/ai_model_manage.go index fadcff051..592194371 100644 --- a/routers/repo/ai_model_manage.go +++ b/routers/repo/ai_model_manage.go @@ -1319,19 +1319,25 @@ func QueryModelFileForPredict(ctx *context.Context) { func QueryModelFileByID(id string) []storage.FileInfo { model, err := models.QueryModelById(id) - if err == nil { - if model.Type == models.TypeCloudBrainTwo { - prefix := model.Path[len(setting.Bucket)+1:] - fileinfos, _ := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, prefix) - return fileinfos - } else if model.Type == models.TypeCloudBrainOne { - prefix := model.Path[len(setting.Attachment.Minio.Bucket)+1:] - fileinfos, _ := storage.GetAllObjectByBucketAndPrefixMinio(setting.Attachment.Minio.Bucket, prefix) - return fileinfos - } - } else { + if err != nil { log.Error("no such model!", err.Error()) + return nil } + return QueryModelFileByModel(model) +} + +func QueryModelFileByModel(model *models.AiModelManage) []storage.FileInfo { + + if model.Type == models.TypeCloudBrainTwo { + prefix := model.Path[len(setting.Bucket)+1:] + fileinfos, _ := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, prefix) + return fileinfos + } else if model.Type == models.TypeCloudBrainOne { + prefix := model.Path[len(setting.Attachment.Minio.Bucket)+1:] + fileinfos, _ := storage.GetAllObjectByBucketAndPrefixMinio(setting.Attachment.Minio.Bucket, prefix) + return fileinfos + } + return nil } diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index a23cd5462..905c25a64 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -81,6 +81,7 @@ var ( const BENCHMARK_TYPE_CODE = "repo.cloudbrain.benchmark.types" const CLONE_FILE_PREFIX = "file:///" +const README = "README" var benchmarkTypesMap = make(map[string]*models.BenchmarkTypes, 0) @@ -373,6 +374,13 @@ func cloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { } if form.ModelName != "" { //使用预训练模型训练 + _, err := models.QueryModelByPath(form.PreTrainModelUrl) + if err != nil { + log.Error("Can not find model", err) + cloudBrainNewDataPrepare(ctx, jobType) + ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_not_exist"), tpl, &form) + return + } req.ModelName = form.ModelName req.LabelName = form.LabelName req.CkptName = form.CkptName @@ -411,8 +419,13 @@ func loadCodeAndMakeModelPath(repo *models.Repository, codePath string, branchNa return "cloudbrain.load_code_failed" } + return initModelPath(jobName, resultPath) + +} + +func initModelPath(jobName string, resultPath string) string { modelPath := setting.JobPath + jobName + resultPath + "/" - err = mkModelPath(modelPath) + err := mkModelPath(modelPath) if err != nil { return "cloudbrain.load_code_failed" } @@ -691,6 +704,17 @@ func CloudBrainRestart(ctx *context.Context) { break } } + if !HasModelFile(task) { + resultCode = "-1" + errorMsg = ctx.Tr("repo.debug.manage.model_not_exist") + break + } + + if hasDatasetDeleted(task) { + resultCode = "-1" + errorMsg = ctx.Tr("repo.debug.manage.dataset_not_exist") + break + } err = cloudbrain.RestartTask(ctx, task, &ID) if err != nil { @@ -711,6 +735,39 @@ func CloudBrainRestart(ctx *context.Context) { }) } +func hasDatasetDeleted(task *models.Cloudbrain) bool { + if task.Uuid == "" { + return false + } + uuids := strings.Split(task.Uuid, ";") + attachs, _ := models.GetAttachmentsByUUIDs(uuids) + return len(attachs) < len(uuids) +} + +func HasModelFile(task *models.Cloudbrain) bool { + if task.PreTrainModelUrl == "" { + return true + } + + 
model, err := models.QueryModelByPath(task.PreTrainModelUrl) + if err != nil { + log.Error("Can not find model", err) + return false + } + + fileInfos := QueryModelFileByModel(model) + isFind := false + if fileInfos != nil { + for _, fileInfo := range fileInfos { + if fileInfo.FileName == task.CkptName { + isFind = true + break + } + } + + } + return isFind +} func getOldJobPath(task *models.Cloudbrain) string { return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + task.JobName @@ -854,10 +911,10 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo func CloudBrainDebug(ctx *context.Context) { task := ctx.Cloudbrain debugUrl := setting.DebugServerHost + "jpylab_" + task.JobID + "_" + task.SubTaskName - if task.BootFile!=""{ - ctx.Redirect(getFileUrl(debugUrl,task.BootFile)) + if task.BootFile != "" { + ctx.Redirect(getFileUrl(debugUrl, task.BootFile)) - }else{ + } else { ctx.Redirect(debugUrl) } @@ -1758,7 +1815,7 @@ func mkPathAndReadMeFile(path string, text string) error { return err } - fileName := path + "README" + fileName := path + README f, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm) if err != nil { log.Error("OpenFile failed", err.Error()) @@ -1816,6 +1873,7 @@ func SyncCloudbrainStatus() { if task.JobType == string(models.JobTypeModelSafety) { continue } + if task.Type == models.TypeCloudBrainOne { task, err = cloudbrainTask.SyncCloudBrainOneStatus(task) @@ -1824,32 +1882,7 @@ func SyncCloudbrainStatus() { continue } - if task.Status != string(models.JobWaiting) { - if task.Duration >= setting.MaxDuration && task.JobType == string(models.JobTypeDebug) { - log.Info("begin to stop job(%s), because of the duration", task.DisplayJobName) - err = cloudbrain.StopJob(task.JobID) - if err != nil { - log.Error("StopJob(%s) failed:%v", task.DisplayJobName, err) - continue - } - oldStatus := task.Status - task.Status = string(models.JobStopped) - if task.EndTime == 0 { - task.EndTime = timeutil.TimeStampNow() - } - task.ComputeAndSetDuration() - if oldStatus != task.Status { - notification.NotifyChangeCloudbrainStatus(task, oldStatus) - } - err = models.UpdateJob(task) - if err != nil { - log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) - continue - } - } - - } - } else if task.Type == models.TypeCloudBrainTwo { + } else if task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeCDCenter { if task.JobType == string(models.JobTypeDebug) { err := modelarts.HandleNotebookInfo(task) if err != nil { @@ -1866,48 +1899,77 @@ func SyncCloudbrainStatus() { log.Error("task.JobType(%s) is error:%s", task.DisplayJobName, task.JobType) } } else if task.Type == models.TypeC2Net { - result, err := grampus.GetJob(task.JobID) - if err != nil { - log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err) - continue - } - - if result != nil { - if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { - task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] + if task.JobType == string(models.JobTypeDebug) { + cloudbrainTask.SyncGrampusNotebookStatus(task) + } else { + result, err := grampus.GetJob(task.JobID) + if err != nil { + log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err) + continue } - oldStatus := task.Status - task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) - task.Duration = result.JobInfo.RunSec - if task.Duration < 0 { - task.Duration = 0 - } 
- task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) + if result != nil { + if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { + task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] + } + oldStatus := task.Status + task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) + task.Duration = result.JobInfo.RunSec - if task.StartTime == 0 && result.JobInfo.StartedAt > 0 { - task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt) + if task.Duration < 0 { + task.Duration = 0 + } + task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) + + if task.StartTime == 0 && result.JobInfo.StartedAt > 0 { + task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt) + } + if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { + task.EndTime = task.StartTime.Add(task.Duration) + } + task.CorrectCreateUnix() + if oldStatus != task.Status { + notification.NotifyChangeCloudbrainStatus(task, oldStatus) + if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource { + if len(result.JobInfo.Tasks[0].CenterID) == 1 { + urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID)) + } + } + } + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err) + continue + } + } + } + } else { + log.Error("task.Type(%s) is error:%d", task.JobName, task.Type) + } + if task.Status != string(models.JobWaiting) { + if task.Duration >= setting.MaxDuration && task.JobType == string(models.JobTypeDebug) { + log.Info("begin to stop job(%s), because of the duration", task.DisplayJobName) + err = cloudbrainTask.StopDebugJob(task) + if err != nil { + log.Error("StopJob(%s) failed:%v", task.DisplayJobName, err) + continue } - if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { - task.EndTime = task.StartTime.Add(task.Duration) + oldStatus := task.Status + task.Status = string(models.JobStopped) + if task.EndTime == 0 { + task.EndTime = timeutil.TimeStampNow() } - task.CorrectCreateUnix() + task.ComputeAndSetDuration() if oldStatus != task.Status { notification.NotifyChangeCloudbrainStatus(task, oldStatus) - if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource { - if len(result.JobInfo.Tasks[0].CenterID) == 1 { - urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID)) - } - } } err = models.UpdateJob(task) if err != nil { - log.Error("UpdateJob(%s) failed:%v", task.JobName, err) + log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) continue } } - } else { - log.Error("task.Type(%s) is error:%d", task.JobName, task.Type) + } } diff --git a/routers/repo/cloudbrain_statistic.go b/routers/repo/cloudbrain_statistic.go index de95babe9..43c1ab1a4 100644 --- a/routers/repo/cloudbrain_statistic.go +++ b/routers/repo/cloudbrain_statistic.go @@ -14,13 +14,7 @@ import ( ) func CloudbrainDurationStatisticHour() { - defer func() { - err := recover() - if err == nil { - return - } - }() - if setting.IsCloudbrainTimingEnabled { + if setting.IsCloudbrainTimingEnabled { var statisticTime time.Time var count int64 
recordDurationUpdateTime, err := models.GetDurationRecordUpdateTime() @@ -35,17 +29,16 @@ func CloudbrainDurationStatisticHour() { statisticTime = currentTime } - err = models.DeleteCloudbrainDurationStatistic(timeutil.TimeStamp(statisticTime.Add(-1*time.Hour).Unix()), timeutil.TimeStamp(currentTime.Unix())) + err = models.DeleteCloudbrainDurationStatistic(timeutil.TimeStamp(statisticTime.Unix()), timeutil.TimeStamp(currentTime.Unix())) if err != nil { log.Error("DeleteCloudbrainDurationStatistic failed", err) } - + statisticTime = statisticTime.Add(+1 * time.Hour) for statisticTime.Before(currentTime) || statisticTime.Equal(currentTime) { countEach := summaryDurationStat(statisticTime) count += countEach statisticTime = statisticTime.Add(+1 * time.Hour) } - log.Info("summaryDurationStat count: %v", count) } } func UpdateDurationStatisticHistoryData(beginTime time.Time, endTime time.Time) int64 { @@ -71,15 +64,18 @@ func summaryDurationStat(statisticTime time.Time) int64 { ciTasks, err := models.GetCloudbrainByTime(beginTime, endTime) if err != nil { - log.Info("GetCloudbrainByTime err: %v", err) + log.Error("GetCloudbrainByTime err: %v", err) return 0 } - models.LoadSpecs4CloudbrainInfo(ciTasks) - cloudBrainCenterCodeAndCardTypeInfo, cloudbrainMap := getcloudBrainCenterCodeAndCardTypeInfo(ciTasks, beginTime, endTime) + err = models.LoadSpecs4CloudbrainInfo(ciTasks) + if err != nil { + log.Error("LoadSpecs4CloudbrainInfo err: %v", err) + } + cloudBrainCenterCodeAndCardTypeInfo := getcloudBrainCenterCodeAndCardTypeInfo(ciTasks, int(beginTime), int(endTime)) resourceQueues, err := models.GetCanUseCardInfo() if err != nil { - log.Info("GetCanUseCardInfo err: %v", err) + log.Error("GetCanUseCardInfo err: %v", err) return 0 } @@ -91,56 +87,45 @@ func summaryDurationStat(statisticTime time.Time) int64 { cardsTotalDurationMap[resourceQueue.Cluster+"/"+resourceQueue.AiCenterCode+"/"+resourceQueue.AccCardType] += resourceQueue.CardsTotalNum * 1 * 60 * 60 } } - - for centerCode, CardTypes := range cloudBrainCenterCodeAndCardTypeInfo { - for cardType, cardDuration := range CardTypes { - cloudbrainTable := cloudbrainMap[centerCode+"/"+cardType] - if cloudbrainTable != nil { - if _, ok := cardsTotalDurationMap[cloudbrainTable.Cluster+"/"+centerCode+"/"+cardType]; !ok { - cardsTotalDurationMap[cloudbrainTable.Cluster+"/"+centerCode+"/"+cardType] = 0 - } - cloudbrainDurationStat := models.CloudbrainDurationStatistic{ - DateTimeUnix: dateTimeUnix, - DayTime: dayTime, - HourTime: hourTime, - Cluster: cloudbrainTable.Cluster, - AiCenterName: GetAiCenterNameByCode(centerCode, "zh-CN"), - AiCenterCode: centerCode, - AccCardType: cardType, - CardsUseDuration: cardDuration, - CardsTotalDuration: cardsTotalDurationMap[cloudbrainTable.Cluster+"/"+centerCode+"/"+cardType], - CreatedUnix: timeutil.TimeStampNow(), - } - if _, err = models.InsertCloudbrainDurationStatistic(&cloudbrainDurationStat); err != nil { - log.Error("Insert cloudbrainDurationStat failed: %v", err.Error()) - } - count++ - delete(cardsTotalDurationMap, cloudbrainTable.Cluster+"/"+centerCode+"/"+cardType) - } - } - } - for key, cardsTotalDuration := range cardsTotalDurationMap { - cloudbrainDurationStat := models.CloudbrainDurationStatistic{ - DateTimeUnix: dateTimeUnix, - DayTime: dayTime, - HourTime: hourTime, - Cluster: strings.Split(key, "/")[0], - AiCenterName: GetAiCenterNameByCode(strings.Split(key, "/")[1], "zh-CN"), - AiCenterCode: strings.Split(key, "/")[1], - AccCardType: strings.Split(key, "/")[2], - CardsUseDuration: 0, - 
CardsTotalDuration: cardsTotalDuration, - CardsTotalNum: cardsTotalDuration / 1 / 60 / 60, - CreatedUnix: timeutil.TimeStampNow(), - } - if _, err = models.InsertCloudbrainDurationStatistic(&cloudbrainDurationStat); err != nil { - log.Error("Insert cloudbrainDurationStat failed: %v", err.Error()) + if _, ok := cloudBrainCenterCodeAndCardTypeInfo[strings.Split(key, "/")[0]+"/"+strings.Split(key, "/")[1]][strings.Split(key, "/")[2]]; ok { + cloudbrainDurationStat := models.CloudbrainDurationStatistic{ + DateTimeUnix: dateTimeUnix, + DayTime: dayTime, + HourTime: hourTime, + Cluster: strings.Split(key, "/")[0], + AiCenterName: GetAiCenterNameByCode(strings.Split(key, "/")[1], "zh-CN"), + AiCenterCode: strings.Split(key, "/")[1], + AccCardType: strings.Split(key, "/")[2], + CardsUseDuration: cloudBrainCenterCodeAndCardTypeInfo[strings.Split(key, "/")[0]+"/"+strings.Split(key, "/")[1]][strings.Split(key, "/")[2]], + CardsTotalDuration: cardsTotalDuration, + CardsTotalNum: cardsTotalDuration / 1 / 60 / 60, + CreatedUnix: timeutil.TimeStampNow(), + } + if _, err = models.InsertCloudbrainDurationStatistic(&cloudbrainDurationStat); err != nil { + log.Error("Insert cloudbrainDurationStat failed: %v", err.Error()) + } + count++ + } else { + cloudbrainDurationStat := models.CloudbrainDurationStatistic{ + DateTimeUnix: dateTimeUnix, + DayTime: dayTime, + HourTime: hourTime, + Cluster: strings.Split(key, "/")[0], + AiCenterName: GetAiCenterNameByCode(strings.Split(key, "/")[1], "zh-CN"), + AiCenterCode: strings.Split(key, "/")[1], + AccCardType: strings.Split(key, "/")[2], + CardsUseDuration: 0, + CardsTotalDuration: cardsTotalDuration, + CardsTotalNum: cardsTotalDuration / 1 / 60 / 60, + CreatedUnix: timeutil.TimeStampNow(), + } + if _, err = models.InsertCloudbrainDurationStatistic(&cloudbrainDurationStat); err != nil { + log.Error("Insert cloudbrainDurationStat failed: %v", err.Error()) + } + count++ } - count++ } - - log.Info("finish summary cloudbrainDurationStat") return count } @@ -159,33 +144,21 @@ func GetAiCenterNameByCode(centerCode string, language string) string { return aiCenterName } -func getcloudBrainCenterCodeAndCardTypeInfo(ciTasks []*models.CloudbrainInfo, beginTime int64, endTime int64) (map[string]map[string]int, map[string]*models.Cloudbrain) { +func getcloudBrainCenterCodeAndCardTypeInfo(ciTasks []*models.CloudbrainInfo, hourBeginTime int, hourEndTime int) map[string]map[string]int { var WorkServerNumber int var AccCardsNum int - cloudbrainMap := make(map[string]*models.Cloudbrain) cloudBrainCenterCodeAndCardType := make(map[string]map[string]int) for _, cloudbrain := range ciTasks { - if cloudbrain.Cloudbrain.StartTime == 0 { - cloudbrain.Cloudbrain.StartTime = cloudbrain.Cloudbrain.CreatedUnix - } - if cloudbrain.Cloudbrain.EndTime == 0 { - cloudbrain.Cloudbrain.EndTime = timeutil.TimeStamp(time.Now().Unix()) - } - cloudbrain = cloudbrainService.UpdateCloudbrainAiCenter(cloudbrain) - if cloudbrain.Cloudbrain.Spec != nil { - if _, ok := cloudbrainMap[cloudbrain.Cloudbrain.AiCenter+"/"+cloudbrain.Cloudbrain.Spec.AccCardType]; !ok { - if cloudbrain.Cloudbrain.Spec != nil { - cloudbrainMap[cloudbrain.Cloudbrain.AiCenter+"/"+cloudbrain.Cloudbrain.Spec.AccCardType] = &cloudbrain.Cloudbrain - } - } - } - cloudbrain = cloudbrainService.UpdateCloudbrainAiCenter(cloudbrain) if cloudbrain.Cloudbrain.StartTime == 0 { cloudbrain.Cloudbrain.StartTime = cloudbrain.Cloudbrain.CreatedUnix } if cloudbrain.Cloudbrain.EndTime == 0 { - cloudbrain.Cloudbrain.EndTime = 
cloudbrain.Cloudbrain.UpdatedUnix + if cloudbrain.Cloudbrain.Status == string(models.JobRunning) { + cloudbrain.Cloudbrain.EndTime = timeutil.TimeStamp(time.Now().Unix()) + } else { + cloudbrain.Cloudbrain.EndTime = cloudbrain.Cloudbrain.StartTime + timeutil.TimeStamp(cloudbrain.Cloudbrain.Duration) + } } if cloudbrain.Cloudbrain.WorkServerNumber >= 1 { WorkServerNumber = cloudbrain.Cloudbrain.WorkServerNumber @@ -197,55 +170,36 @@ func getcloudBrainCenterCodeAndCardTypeInfo(ciTasks []*models.CloudbrainInfo, be } else { AccCardsNum = cloudbrain.Cloudbrain.Spec.AccCardsNum } - if _, ok := cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter]; !ok { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter] = make(map[string]int) + if _, ok := cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter]; !ok { + cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter] = make(map[string]int) } + taskStartTime := int(cloudbrain.Cloudbrain.StartTime) + taskEndTime := int(cloudbrain.Cloudbrain.EndTime) if cloudbrain.Cloudbrain.Spec != nil { - if cloudbrain.Cloudbrain.Status == string(models.ModelArtsRunning) && cloudbrain.Cloudbrain.DeletedAt.IsZero() { - if _, ok := cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType]; !ok { - if int64(cloudbrain.Cloudbrain.StartTime) < beginTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = AccCardsNum * WorkServerNumber * (int(endTime) - int(beginTime)) - } else if beginTime <= int64(cloudbrain.Cloudbrain.StartTime) && int64(cloudbrain.Cloudbrain.StartTime) < endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = AccCardsNum * WorkServerNumber * (int(endTime) - int(cloudbrain.Cloudbrain.StartTime)) - } else if int64(cloudbrain.Cloudbrain.StartTime) >= endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = 0 - } - } else { - if int64(cloudbrain.Cloudbrain.StartTime) < beginTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += AccCardsNum * WorkServerNumber * (int(endTime) - int(beginTime)) - } else if beginTime <= int64(cloudbrain.Cloudbrain.StartTime) && int64(cloudbrain.Cloudbrain.StartTime) < endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += AccCardsNum * WorkServerNumber * (int(endTime) - int(cloudbrain.Cloudbrain.StartTime)) - } else if int64(cloudbrain.Cloudbrain.StartTime) >= endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += 0 - } + if _, ok := cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType]; !ok { + if taskStartTime < hourBeginTime && taskEndTime >= hourBeginTime && taskEndTime <= hourEndTime { + cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = AccCardsNum * WorkServerNumber * (taskEndTime - hourBeginTime) + } else if taskStartTime < hourBeginTime && taskEndTime > hourEndTime { + cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = AccCardsNum * WorkServerNumber * 
(hourEndTime - hourBeginTime) + } else if taskStartTime >= hourBeginTime && taskStartTime <= hourEndTime && taskEndTime >= hourBeginTime && taskEndTime <= hourEndTime { + cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = AccCardsNum * WorkServerNumber * (taskEndTime - taskStartTime) + } else if taskStartTime >= hourBeginTime && taskStartTime <= hourEndTime && taskEndTime > hourEndTime { + cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = AccCardsNum * WorkServerNumber * (hourEndTime - taskStartTime) } } else { - if _, ok := cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType]; !ok { - if int64(cloudbrain.Cloudbrain.StartTime) <= beginTime && int64(cloudbrain.Cloudbrain.EndTime) <= endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = AccCardsNum * WorkServerNumber * (int(cloudbrain.Cloudbrain.EndTime) - int(beginTime)) - } else if int64(cloudbrain.Cloudbrain.StartTime) <= beginTime && int64(cloudbrain.Cloudbrain.EndTime) > endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = AccCardsNum * WorkServerNumber * (int(endTime) - int(beginTime)) - } else if beginTime <= int64(cloudbrain.Cloudbrain.StartTime) && int64(cloudbrain.Cloudbrain.StartTime) <= endTime && int64(cloudbrain.Cloudbrain.EndTime) <= endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = AccCardsNum * WorkServerNumber * (int(cloudbrain.Cloudbrain.EndTime) - int(cloudbrain.Cloudbrain.StartTime)) - } else if beginTime <= int64(cloudbrain.Cloudbrain.StartTime) && int64(cloudbrain.Cloudbrain.StartTime) <= endTime && int64(cloudbrain.Cloudbrain.EndTime) > endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] = AccCardsNum * WorkServerNumber * (int(endTime) - int(cloudbrain.Cloudbrain.StartTime)) - } - } else { - if int64(cloudbrain.Cloudbrain.StartTime) <= beginTime && int64(cloudbrain.Cloudbrain.EndTime) <= endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += AccCardsNum * WorkServerNumber * (int(cloudbrain.Cloudbrain.EndTime) - int(beginTime)) - } else if int64(cloudbrain.Cloudbrain.StartTime) <= beginTime && int64(cloudbrain.Cloudbrain.EndTime) > endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += AccCardsNum * WorkServerNumber * (int(endTime) - int(beginTime)) - } else if beginTime <= int64(cloudbrain.Cloudbrain.StartTime) && int64(cloudbrain.Cloudbrain.StartTime) <= endTime && int64(cloudbrain.Cloudbrain.EndTime) <= endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += AccCardsNum * WorkServerNumber * (int(cloudbrain.Cloudbrain.EndTime) - int(cloudbrain.Cloudbrain.StartTime)) - } else if beginTime <= int64(cloudbrain.Cloudbrain.StartTime) && int64(cloudbrain.Cloudbrain.StartTime) <= endTime && int64(cloudbrain.Cloudbrain.EndTime) > endTime { - cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += AccCardsNum * WorkServerNumber * (int(endTime) - int(cloudbrain.Cloudbrain.StartTime)) - } + if taskStartTime < 
hourBeginTime && taskEndTime >= hourBeginTime && taskEndTime <= hourEndTime { + cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += AccCardsNum * WorkServerNumber * (taskEndTime - hourBeginTime) + } else if taskStartTime < hourBeginTime && taskEndTime > hourEndTime { + cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += AccCardsNum * WorkServerNumber * (hourEndTime - hourBeginTime) + } else if taskStartTime >= hourBeginTime && taskStartTime <= hourEndTime && taskEndTime >= hourBeginTime && taskEndTime <= hourEndTime { + cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += AccCardsNum * WorkServerNumber * (taskEndTime - taskStartTime) + } else if taskStartTime >= hourBeginTime && taskStartTime <= hourEndTime && taskEndTime > hourEndTime { + cloudBrainCenterCodeAndCardType[cloudbrain.Cloudbrain.Cluster+"/"+cloudbrain.Cloudbrain.AiCenter][cloudbrain.Cloudbrain.Spec.AccCardType] += AccCardsNum * WorkServerNumber * (hourEndTime - taskStartTime) } } } } - - return cloudBrainCenterCodeAndCardType, cloudbrainMap + return cloudBrainCenterCodeAndCardType } func CloudbrainUpdateHistoryData(ctx *context.Context) { diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 8f3182758..14db1a50d 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -44,14 +44,37 @@ import ( const ( tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show" + tplGrampusNotebookShow base.TplName = "repo/grampus/notebook/show" //GPU + tplGrampusNotebookGPUNew base.TplName = "repo/grampus/notebook/gpu/new" tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new" //NPU + tplGrampusNotebookNPUNew base.TplName = "repo/grampus/notebook/npu/new" tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new" ) +func GrampusNotebookNew(ctx *context.Context) { + ctx.Data["IsCreate"] = true + notebookType := ctx.QueryInt("type") + processType := grampus.ProcessorTypeGPU + if notebookType == 1 { + processType = grampus.ProcessorTypeNPU + } + err := grampusNotebookNewDataPrepare(ctx, processType) + if err != nil { + ctx.ServerError("get new notebook-job info failed", err) + return + } + if processType == grampus.ProcessorTypeGPU { + ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew) + } else { + ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew) + } + +} + func GrampusTrainJobGPUNew(ctx *context.Context) { ctx.Data["IsCreate"] = true err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) @@ -72,57 +95,262 @@ func GrampusTrainJobNPUNew(ctx *context.Context) { } ctx.HTML(200, tplGrampusTrainJobNPUNew) } +func GrampusNotebookCreate(ctx *context.Context, form auth.CreateGrampusNotebookForm) { + ctx.Data["IsCreate"] = true + displayJobName := form.DisplayJobName + jobName := util.ConvertDisplayJobNameToJobName(displayJobName) + uuid := form.Attachment + description := form.Description + repo := ctx.Repo.Repository + branchName := form.BranchName + image := strings.TrimSpace(form.Image) -func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error { + codeStoragePath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/" + + tpl := tplGrampusNotebookGPUNew + processType := grampus.ProcessorTypeGPU + computeSource := models.GPUResource + computeSourceSimple := models.GPU + if 
form.Type == 1 { + tpl = tplGrampusNotebookNPUNew + processType = grampus.ProcessorTypeNPU + computeSource = models.NPUResource + computeSourceSimple = models.NPU + codeStoragePath = grampus.JobPath + jobName + modelarts.CodePath + } + + lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeDebug), displayJobName)) + defer lock.UnLock() + isOk, err := lock.Lock(models.CloudbrainKeyDuration) + if !isOk { + log.Error("lock processed failed:%v", err, ctx.Data["MsgID"]) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tpl, &form) + return + } + + if !jobNamePattern.MatchString(displayJobName) { + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form) + return + } + + //check count limit + count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeSource) + if err != nil { + log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr("system error", tpl, &form) + return + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form) + return + } + } + + //check whether the task name in the project is duplicated + tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName) + if err == nil { + if len(tasks) != 0 { + log.Error("the job name did already exist", ctx.Data["MsgID"]) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr("the job name did already exist", tpl, &form) + return + } + } else { + if !models.IsErrJobNotExist(err) { + log.Error("system error, %v", err, ctx.Data["MsgID"]) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr("system error", tpl, &form) + return + } + } + + //check specification + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeDebug, + ComputeResource: computeSourceSimple, + Cluster: models.C2NetCluster, + }) + if err != nil || spec == nil { + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr("Resource specification not available", tpl, &form) + return + } + + if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) { + log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form) + return + } + + var datasetInfos map[string]models.DatasetInfo + var datasetNames string + //var + if uuid != "" { + datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid, computeSourceSimple) + if err != nil { + log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"]) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form) + return + } + } + + //prepare code and out path + codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/" + _, err = ioutil.ReadDir(codeLocalPath) + if err == nil { + os.RemoveAll(codeLocalPath) + } + + if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil { + log.Error("downloadZipCode failed, server timed out: %s (%v)", 
repo.FullName(), err) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form) + return + } + + if processType == grampus.ProcessorTypeGPU { + if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil { + log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form) + return + } + + } else { + + if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { + log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form) + return + } + } + + commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) + + req := &grampus.GenerateNotebookJobReq{ + JobName: jobName, + DisplayJobName: displayJobName, + ComputeResource: computeSource, + ProcessType: processType, + ImageUrl: image, + ImageId: form.ImageID, + Description: description, + Uuid: uuid, + CommitID: commitID, + BranchName: branchName, + DatasetNames: datasetNames, + DatasetInfos: datasetInfos, + Spec: spec, + CodeStoragePath: codeStoragePath, + CodeName: strings.ToLower(repo.Name), + } + + if form.ModelName != "" { //使用预训练模型训练 + + _, err := models.QueryModelByPath(form.PreTrainModelUrl) + if err != nil { + log.Error("Can not find model", err) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_not_exist"), tpl, &form) + return + } + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainModelUrl = form.PreTrainModelUrl + req.PreTrainModelPath = getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName) + + } + + _, err = grampus.GenerateNotebookJob(ctx, req) + if err != nil { + log.Error("GenerateNotebookJob failed:%v", err.Error(), ctx.Data["MsgID"]) + grampusNotebookNewDataPrepare(ctx, processType) + ctx.RenderWithErr(err.Error(), tpl, &form) + return + } + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all") +} +func grampusNotebookNewDataPrepare(ctx *context.Context, processType string) error { ctx.Data["PageIsCloudBrain"] = true var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name) ctx.Data["display_job_name"] = displayJobName //get valid images - images, err := grampus.GetImages(processType) + if processType == grampus.ProcessorTypeNPU { + images, err := grampus.GetImages(processType, string(models.JobTypeDebug)) + if err != nil { + log.Error("GetImages failed:", err.Error()) + } else { + ctx.Data["images"] = images.Infos + } + } + //prepare available specs + computeResourceSimple := models.GPU + datasetType := models.TypeCloudBrainOne + computeResource := models.GPUResource + if processType == grampus.ProcessorTypeNPU { + computeResourceSimple = models.NPU + datasetType = models.TypeCloudBrainTwo + computeResource = models.NPUResource + } + + prepareGrampusSpecs(ctx, computeResourceSimple, models.JobTypeDebug) + + //get branches + branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0) if err != nil { - log.Error("GetImages failed:", err.Error()) + log.Error("GetBranches error:", err.Error()) } else { - ctx.Data["images"] = images.Infos + ctx.Data["branches"] = branches } - grampus.InitSpecialPool() + ctx.Data["branchName"] = ctx.Repo.BranchName - 
ctx.Data["GPUEnabled"] = true - ctx.Data["NPUEnabled"] = true - includeCenters := make(map[string]struct{}) - excludeCenters := make(map[string]struct{}) - if grampus.SpecialPools != nil { - for _, pool := range grampus.SpecialPools.Pools { - if pool.IsExclusive { - if !IsUserInOrgPool(ctx.User.ID, pool) { - ctx.Data[pool.Type+"Enabled"] = false - } - } else { - if strings.Contains(strings.ToLower(processType), strings.ToLower(pool.Type)) { - if IsUserInOrgPool(ctx.User.ID, pool) { - for _, center := range pool.Pool { - includeCenters[center.Queue] = struct{}{} - } - } else { - for _, center := range pool.Pool { - excludeCenters[center.Queue] = struct{}{} - } + ctx.Data["datasetType"] = datasetType + waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, computeResource, models.JobTypeDebug) + ctx.Data["WaitCount"] = waitCount + NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeResource) + ctx.Data["NotStopTaskCount"] = NotStopTaskCount - } + ctx.Data["code_path"] = cloudbrain.CodeMountPath + ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath + ctx.Data["model_path"] = cloudbrain.ModelMountPath - } + return nil +} - } +func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error { + ctx.Data["PageIsCloudBrain"] = true + + var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name) + ctx.Data["display_job_name"] = displayJobName + + //get valid images + if processType == grampus.ProcessorTypeNPU { + images, err := grampus.GetImages(processType, string(models.JobTypeTrain)) + if err != nil { + log.Error("GetImages failed:", err.Error()) + } else { + ctx.Data["images"] = images.Infos } } //prepare available specs if processType == grampus.ProcessorTypeNPU { - prepareGrampusTrainSpecs(ctx, models.NPU) + prepareGrampusSpecs(ctx, models.NPU) } else if processType == grampus.ProcessorTypeGPU { - prepareGrampusTrainSpecs(ctx, models.GPU) + prepareGrampusSpecs(ctx, models.GPU) } //get branches @@ -201,55 +429,19 @@ func GrampusTrainJobVersionNew(ctx *context.Context) { } } -func prepareGrampusTrainSpecs(ctx *context.Context, computeResource string) { +func prepareGrampusSpecs(ctx *context.Context, computeResource string, jobType ...models.JobType) { + tempJobType := models.JobTypeTrain + if len(jobType) > 0 { + tempJobType = jobType[0] + } noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ - JobType: models.JobTypeTrain, + JobType: tempJobType, ComputeResource: computeResource, Cluster: models.C2NetCluster, }) ctx.Data["Specs"] = noteBookSpecs } -func getFilterSpecBySpecialPool(specs *models.GetGrampusResourceSpecsResult, includeCenters map[string]struct{}, excludeCenters map[string]struct{}) []models.GrampusSpec { - if len(includeCenters) == 0 && len(excludeCenters) == 0 { - return specs.Infos - } - var grampusSpecs []models.GrampusSpec - for _, info := range specs.Infos { - if isInIncludeCenters(info, includeCenters) || (len(excludeCenters) != 0 && isNotAllInExcludeCenters(info, excludeCenters)) { - grampusSpecs = append(grampusSpecs, info) - } - - } - return grampusSpecs -} - -func isInIncludeCenters(grampusSpec models.GrampusSpec, centers map[string]struct{}) bool { - for _, center := range grampusSpec.Centers { - if _, ok := centers[center.ID]; ok { - return true - } - } - return false -} -func isNotAllInExcludeCenters(grampusSpec models.GrampusSpec, centers map[string]struct{}) bool { - for _, center := range 
grampusSpec.Centers { - if _, ok := centers[center.ID]; !ok { - return true - } - } - return false -} - -func IsUserInOrgPool(userId int64, pool *models.SpecialPool) bool { - org, _ := models.GetOrgByName(pool.Org) - if org != nil { - isOrgMember, _ := models.IsOrganizationMember(org.ID, userId) - return isOrgMember - } - return false -} - func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error { if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") { log.Error("the boot file(%s) must be a python file", form.BootFile) @@ -721,30 +913,64 @@ func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func GetGrampusNotebook(ctx *context.APIContext) { + var ( + err error + ) + + ID := ctx.Params(":id") + job, err := models.GetCloudbrainByID(ID) + if err != nil { + ctx.NotFound("", err) + log.Error("GetCloudbrainByID failed:", err) + return + } + + jobAfter, err := cloudbrainTask.SyncGrampusNotebookStatus(job) + + aiCenterName := cloudbrainService.GetAiCenterShow(jobAfter.AiCenter, ctx.Context) + + if err != nil { + ctx.NotFound(err) + log.Error("Sync cloud brain one status failed:", err) + return + } + + ctx.JSON(http.StatusOK, map[string]interface{}{ + "ID": ID, + "JobName": jobAfter.JobName, + "JobStatus": jobAfter.Status, + "AiCenter": aiCenterName, + "CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"), + "CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"), + "JobDuration": jobAfter.TrainJobDuration, + }) +} + func GrampusStopJob(ctx *context.Context) { - var ID = ctx.Params(":jobid") + var ID = ctx.Params(":id") var resultCode = "0" var errorMsg = "" var status = "" task := ctx.Cloudbrain for { - if task.Status == string(models.GrampusStatusStopped) || task.Status == string(models.GrampusStatusFailed) || task.Status == string(models.GrampusStatusSucceeded) { + if task.Status == models.GrampusStatusStopped || task.Status == models.GrampusStatusFailed || task.Status == models.GrampusStatusSucceeded { log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"]) resultCode = "-1" - errorMsg = "system error" + errorMsg = ctx.Tr("cloudbrain.Already_stopped") break } - res, err := grampus.StopJob(task.JobID) + res, err := grampus.StopJob(task.JobID, task.JobType) if err != nil { log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) resultCode = strconv.Itoa(res.ErrorCode) - errorMsg = res.ErrorMsg + errorMsg = ctx.Tr("cloudbrain.Stopped_failed") break } oldStatus := task.Status - task.Status = string(models.GrampusStatusStopped) + task.Status = getStopJobResponseStatus(res) if task.EndTime == 0 { task.EndTime = timeutil.TimeStampNow() } @@ -773,6 +999,33 @@ func GrampusStopJob(ctx *context.Context) { }) } +func getStopJobResponseStatus(res *models.GrampusStopJobResponse) string { + newStatus := models.GrampusStatusStopping + if res.Status != "" { + newStatus = grampus.TransTrainJobStatus(res.Status) + } + return newStatus +} + +func GrampusNotebookDel(ctx *context.Context) { + var listType = ctx.Query("listType") + if err := deleteGrampusJob(ctx); err != nil { + log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"]) + ctx.ServerError(err.Error(), err) + return + } + + var isAdminPage = ctx.Query("isadminpage") + var isHomePage = ctx.Query("ishomepage") + if ctx.IsUserSiteAdmin() && isAdminPage == "true" { + ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains") + } else if isHomePage == 
"true" { + ctx.Redirect(setting.AppSubURL + "/cloudbrains") + } else { + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType) + } +} + func GrampusTrainJobDel(ctx *context.Context) { var listType = ctx.Query("listType") if err := deleteGrampusJob(ctx); err != nil { @@ -795,9 +1048,9 @@ func GrampusTrainJobDel(ctx *context.Context) { func deleteGrampusJob(ctx *context.Context) error { task := ctx.Cloudbrain - if task.Status != string(models.GrampusStatusStopped) && task.Status != string(models.GrampusStatusSucceeded) && task.Status != string(models.GrampusStatusFailed) { + if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed { log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"]) - return errors.New("the job has not been stopped") + return errors.New(ctx.Tr("cloudbrain.Not_Stopped")) } err := models.DeleteJob(task) @@ -815,6 +1068,166 @@ func deleteGrampusJob(ctx *context.Context) error { return nil } +type NotebookDataset struct { + DatasetUrl string `json:"dataset_url"` +} + +func GrampusNotebookShow(ctx *context.Context) { + ctx.Data["PageIsCloudBrain"] = true + + var task *models.Cloudbrain + task, err := models.GetCloudbrainByIDWithDeleted(ctx.Params(":id")) + if err != nil { + log.Error("GetCloudbrainByID failed:" + err.Error()) + ctx.NotFound(ctx.Req.URL.RequestURI(), nil) + return + } + task.ContainerIp = "" + + if task.DeletedAt.IsZero() && cloudbrainTask.IsTaskNotStop(task) { //normal record + result, err := grampus.GetNotebookJob(task.JobID) + if err != nil { + log.Error("GetJob failed:" + err.Error()) + ctx.NotFound(ctx.Req.URL.RequestURI(), nil) + return + } + + if result != nil { + if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { + task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] + } + oldStatus := task.Status + task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) + if task.Status != oldStatus || task.Status == models.GrampusStatusRunning { + task.Duration = result.JobInfo.RunSec + if task.Duration < 0 { + task.Duration = 0 + } + task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) + + if task.StartTime == 0 && result.JobInfo.StartedAt > 0 { + task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt) + } + if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { + task.EndTime = task.StartTime.Add(task.Duration) + } + task.CorrectCreateUnix() + if oldStatus != task.Status { + notification.NotifyChangeCloudbrainStatus(task, oldStatus) + if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource { + if len(result.JobInfo.Tasks[0].CenterID) == 1 { + urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID)) + } + } + } + } + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob failed:" + err.Error()) + } + } + } + + if len(task.Parameters) > 0 { + var parameters models.Parameters + err := json.Unmarshal([]byte(task.Parameters), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err) + ctx.ServerError("system error", err) + return + } + + if len(parameters.Parameter) > 0 { + paramTemp := "" + for _, Parameter := range 
parameters.Parameter { + param := Parameter.Label + " = " + Parameter.Value + "; " + paramTemp = paramTemp + param + } + task.Parameters = paramTemp[:len(paramTemp)-2] + } else { + task.Parameters = "" + } + } + user, err := models.GetUserByID(task.UserID) + if err == nil { + task.User = user + } + + prepareSpec4Show(ctx, task) + + ctx.Data["task"] = task + ctx.Data["datasetDownload"] = getDatasetDownloadInfo(ctx, task) + ctx.Data["modelDownload"] = getModelDownloadInfo(ctx, task) + ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task) + ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx) + ctx.Data["code_path"] = cloudbrain.CodeMountPath + ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath + ctx.Data["model_path"] = cloudbrain.ModelMountPath + ctx.HTML(http.StatusOK, tplGrampusNotebookShow) +} + +func getDatasetDownloadInfo(ctx *context.Context, task *models.Cloudbrain) []*models.DatasetDownload { + datasetDownload := make([]*models.DatasetDownload, 0) + if ctx.IsSigned { + if task.Uuid != "" && task.UserID == ctx.User.ID { + if task.IsGPUTask() { + return GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false) + } else { + datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false) + datasetObsUrlList := make([]NotebookDataset, 0) + _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList) + + for _, datasetInfo := range datasetDownload { + + for _, datasetObs := range datasetObsUrlList { + log.Info("datasetObsUrl:" + datasetObs.DatasetUrl + "datasetName:" + datasetInfo.DatasetName) + if strings.Contains(datasetObs.DatasetUrl, datasetInfo.DatasetName) { + datasetInfo.DatasetDownloadLink = datasetObs.DatasetUrl + break + } + } + + } + + } + + } + } + + return datasetDownload +} + +func getModelDownloadInfo(ctx *context.Context, task *models.Cloudbrain) *models.ModelDownload { + var modelDownload models.ModelDownload + if ctx.IsSigned { + if task.ModelName != "" && task.UserID == ctx.User.ID { + if task.IsNPUTask() { + modelDownload = models.ModelDownload{ + Name: task.CkptName, + DownloadLink: "", + IsDelete: false, + } + if !HasModelFile(task) { + modelDownload.IsDelete = true + } + datasetObsUrlList := make([]NotebookDataset, 0) + _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList) + for _, datasetObs := range datasetObsUrlList { + if strings.Contains(datasetObs.DatasetUrl, task.CkptName) { + modelDownload.DownloadLink = datasetObs.DatasetUrl + break + } + } + + } + + } + + } + + return &modelDownload +} + func GrampusTrainJobShow(ctx *context.Context) { ctx.Data["PageIsCloudBrain"] = true @@ -1158,3 +1571,172 @@ func HandleTaskWithAiCenter(ctx *context.Context) { r["updateCounts"] = updateCounts ctx.JSON(http.StatusOK, response.SuccessWithData(r)) } + +func GrampusNotebookDebug(ctx *context.Context) { + + result, err := grampus.GetNotebookJob(ctx.Cloudbrain.JobID) + + if err != nil { + ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil) + return + } + if len(result.JobInfo.Tasks) > 0 { + + ctx.Redirect(result.JobInfo.Tasks[0].Url + "?token=" + result.JobInfo.Tasks[0].Token) + return + } + ctx.NotFound("Can not find the job.", nil) + +} + +func GrampusNotebookRestart(ctx *context.Context) { + var id = ctx.Params(":id") + var resultCode = "-1" + var errorMsg = "" + var status = "" + var spec *models.Specification + + task := ctx.Cloudbrain + if ctx.Written() { + return + } + + for { + + if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != 
models.GrampusStatusFailed { + log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"]) + errorMsg = "the job is not stopped" + break + } + + count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), task.ComputeResource) + + if err != nil { + log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"]) + errorMsg = "system error" + break + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + resultCode = "2" + errorMsg = ctx.Tr("repo.cloudbrain.morethanonejob") + break + } + } + + oldSpec, err := resource.GetCloudbrainSpec(task.ID) + if err != nil || oldSpec == nil { + log.Error("NotebookManage GetCloudbrainSpec error.%v", err) + errorMsg = "Resource specification not available" + break + } + + computeSourceSimple := models.GPU + action := models.ActionCreateGrampusGPUDebugTask + if task.ComputeResource == models.NPUResource { + computeSourceSimple = models.NPU + action = models.ActionCreateGrampusNPUDebugTask + } + spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{ + JobType: models.JobType(task.JobType), + ComputeResource: computeSourceSimple, + Cluster: models.C2NetCluster, + }) + if err != nil || spec == nil { + log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID) + errorMsg = "Resource specification not support any more" + break + } + if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) { + log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID) + errorMsg = ctx.Tr("points.insufficient_points_balance") + break + } + if task.IsGPUTask() { + if _, err := os.Stat(getOldJobPath(task)); err != nil { + log.Error("Can not find job minio path", err) + resultCode = "-1" + errorMsg = ctx.Tr("cloudbrain.result_cleared") + break + } + } + + if !HasModelFile(task) { //使用预训练模型训练 + errorMsg = ctx.Tr("repo.debug.manage.model_not_exist") + break + } + if hasDatasetDeleted(task) { + errorMsg = ctx.Tr("repo.debug.manage.dataset_not_exist") + break + } + + createTime := timeutil.TimeStampNow() + + res, err := grampus.RestartNotebookJob(task.JobID) + if err != nil { + log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"]) + errorMsg = ctx.Tr("repo.debug_again_fail") + break + } + + if res.GrampusResult.ErrorCode != 0 || res.NewId == "" { + log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg) + errorMsg = ctx.Tr("repo.debug_again_fail") + break + } + + newTask := &models.Cloudbrain{ + Status: res.Status, + UserID: task.UserID, + RepoID: task.RepoID, + JobID: res.NewId, + JobName: task.JobName, + DisplayJobName: task.DisplayJobName, + JobType: task.JobType, + Type: task.Type, + Uuid: task.Uuid, + Image: task.Image, + ImageID: task.ImageID, + EngineID: task.EngineID, + CommitID: task.CommitID, + EngineName: task.EngineName, + IsLatestVersion: "1", + BranchName: task.BranchName, + DatasetName: task.DatasetName, + ComputeResource: task.ComputeResource, + Description: task.Description, + CreatedUnix: createTime, + UpdatedUnix: createTime, + Spec: spec, + ModelName: task.ModelName, + ModelVersion: task.ModelVersion, + LabelName: task.LabelName, + PreTrainModelUrl: task.PreTrainModelUrl, + CkptName: task.CkptName, + WorkServerNumber: 1, + } + + err = models.RestartCloudbrain(task, newTask) + if err != nil { + log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) + errorMsg = "system 
error" + break + } + + id = strconv.FormatInt(newTask.ID, 10) + + status = res.Status + resultCode = "0" + + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, id, newTask.DisplayJobName, action) + + break + } + + ctx.JSON(200, map[string]string{ + "result_code": resultCode, + "error_msg": errorMsg, + "status": status, + "id": id, + }) +} diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 4e30e625d..3dbe101a8 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -239,10 +239,37 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm return } + req := cloudbrain.GenerateModelArtsNotebookReq{ + DisplayJobName: displayJobName, + JobName: jobName, + Description: description, + Uuid: uuid, + ImageId: imageId, + Spec: spec, + BootFile: "", + AutoStopDurationMs: modelarts.AutoStopDurationMs, + } + + if form.ModelName != "" { //使用预训练模型训练 + _, err := models.QueryModelByPath(form.PreTrainModelUrl) + if err != nil { + log.Error("Can not find model", err) + notebookNewDataPrepare(ctx) + ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_not_exist"), tplModelArtsNotebookNew, &form) + return + } + req.ModelName = form.ModelName + req.LabelName = form.LabelName + req.CkptName = form.CkptName + req.ModelVersion = form.ModelVersion + req.PreTrainModelUrl = form.PreTrainModelUrl + + } + if setting.ModelartsCD.Enabled { - _, err = modelarts_cd.GenerateNotebook(ctx, displayJobName, jobName, uuid, description, imageId, spec, "", modelarts.AutoStopDurationMs) + _, err = modelarts_cd.GenerateNotebook(ctx, req) } else { - _, err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, imageId, spec, "", modelarts.AutoStopDurationMs) + _, err = modelarts.GenerateNotebook2(ctx, req) } if err != nil { @@ -279,11 +306,17 @@ func NotebookShow(ctx *context.Context) { } - datasetDownload := make([]models.DatasetDownload, 0) + datasetDownload := make([]*models.DatasetDownload, 0) + var modelDownload models.ModelDownload if ctx.IsSigned { if task.Uuid != "" && task.UserID == ctx.User.ID { datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, true) } + if task.ModelName != "" && task.UserID == ctx.User.ID { + modelDownload = GetModelDownload(task) + + } + } user, err := models.GetUserByID(task.UserID) if err == nil { @@ -304,6 +337,7 @@ func NotebookShow(ctx *context.Context) { } ctx.Data["duration"] = task.TrainJobDuration ctx.Data["datasetDownload"] = datasetDownload + ctx.Data["modelDownload"] = modelDownload ctx.Data["task"] = task ctx.Data["ID"] = ID ctx.Data["jobName"] = task.JobName @@ -311,8 +345,25 @@ func NotebookShow(ctx *context.Context) { ctx.HTML(200, tplModelArtsNotebookShow) } -func GetCloudBrainDataSetInfo(uuid string, datasetname string, isNeedDown bool) []models.DatasetDownload { - datasetDownload := make([]models.DatasetDownload, 0) +func GetModelDownload(task *models.Cloudbrain) models.ModelDownload { + index := strings.Index(task.PreTrainModelUrl, "/") + key := task.PreTrainModelUrl[index+1:] + task.CkptName + url, _ := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, key) + modelDownload := models.ModelDownload{ + Name: task.CkptName, + DownloadLink: url, + IsDelete: false, + } + + if !HasModelFile(task) { + log.Warn("Can not get model by path:" + task.PreTrainModelUrl) + modelDownload.IsDelete = true + } + return modelDownload +} + +func GetCloudBrainDataSetInfo(uuid string, datasetname string, isNeedDown bool) []*models.DatasetDownload { + datasetDownload := 
make([]*models.DatasetDownload, 0) if len(uuid) == 0 { return datasetDownload } @@ -349,7 +400,7 @@ func GetCloudBrainDataSetInfo(uuid string, datasetname string, isNeedDown bool) } } - datasetDownload = append(datasetDownload, models.DatasetDownload{ + datasetDownload = append(datasetDownload, &models.DatasetDownload{ DatasetName: name, DatasetDownloadLink: url, RepositoryLink: link, @@ -476,6 +527,16 @@ func NotebookRestart(ctx *context.Context) { errorMsg = ctx.Tr("points.insufficient_points_balance") break } + if !HasModelFile(task) { //使用预训练模型训练 + errorMsg = ctx.Tr("repo.debug.manage.model_not_exist") + break + } + + if hasDatasetDeleted(task) { + errorMsg = ctx.Tr("repo.debug.manage.dataset_not_exist") + break + } + createTime := timeutil.TimeStampNow() param := models.NotebookAction{ Action: models.ActionStart, @@ -511,21 +572,26 @@ func NotebookRestart(ctx *context.Context) { } newTask := &models.Cloudbrain{ - Status: res.Status, - UserID: task.UserID, - RepoID: task.RepoID, - JobID: task.JobID, - JobName: task.JobName, - DisplayJobName: task.DisplayJobName, - JobType: task.JobType, - Type: task.Type, - Uuid: task.Uuid, - Image: task.Image, - ComputeResource: task.ComputeResource, - Description: task.Description, - CreatedUnix: createTime, - UpdatedUnix: createTime, - Spec: spec, + Status: res.Status, + UserID: task.UserID, + RepoID: task.RepoID, + JobID: task.JobID, + JobName: task.JobName, + DisplayJobName: task.DisplayJobName, + JobType: task.JobType, + Type: task.Type, + Uuid: task.Uuid, + Image: task.Image, + ComputeResource: task.ComputeResource, + Description: task.Description, + CreatedUnix: createTime, + UpdatedUnix: createTime, + Spec: spec, + ModelName: task.ModelName, + ModelVersion: task.ModelVersion, + LabelName: task.LabelName, + PreTrainModelUrl: task.PreTrainModelUrl, + CkptName: task.CkptName, } err = models.RestartCloudbrain(task, newTask) @@ -568,17 +634,7 @@ func NotebookStop(ctx *context.Context) { break } - param := models.NotebookAction{ - Action: models.ActionStop, - } - - var err error - var res *models.NotebookActionResult - if task.Type == models.TypeCloudBrainTwo { - res, err = modelarts.ManageNotebook2(task.JobID, param) - } else if task.Type == models.TypeCDCenter { - res, err = modelarts_cd.ManageNotebook(task.JobID, param) - } + err, res := StopModelArtsNotebook(task) if err != nil { log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) @@ -619,6 +675,21 @@ func NotebookStop(ctx *context.Context) { }) } +func StopModelArtsNotebook(task *models.Cloudbrain) (error, *models.NotebookActionResult) { + param := models.NotebookAction{ + Action: models.ActionStop, + } + + var err error + var res *models.NotebookActionResult + if task.Type == models.TypeCloudBrainTwo { + res, err = modelarts.ManageNotebook2(task.JobID, param) + } else if task.Type == models.TypeCDCenter { + res, err = modelarts_cd.ManageNotebook(task.JobID, param) + } + return err, res +} + func NotebookDel(ctx *context.Context) { var listType = ctx.Query("debugListType") task := ctx.Cloudbrain @@ -1791,7 +1862,7 @@ func TrainJobShow(ctx *context.Context) { return } ctx.Data["canNewJob"] = canNewJob - datasetList := make([][]models.DatasetDownload, 0) + datasetList := make([][]*models.DatasetDownload, 0) //将运行参数转化为epoch_size = 3, device_target = Ascend的格式 for i, task := range VersionListTasks { diff --git a/routers/repo/repo_statistic.go b/routers/repo/repo_statistic.go index c1a7954a7..c889046e3 100755 --- a/routers/repo/repo_statistic.go +++ 
b/routers/repo/repo_statistic.go @@ -75,7 +75,7 @@ func RepoStatisticDaily(date string) { if repo.NumIssues != 0 { issueFixedRate = float32(repo.NumClosedIssues) / float32(repo.NumIssues) } else { - issueFixedRate = 1.0 + issueFixedRate = float32(setting.RadarMap.ProjectHealth0IssueCloseRatio) } var numVersions int64 @@ -124,7 +124,7 @@ func RepoStatisticDaily(date string) { NumDevMonths: numDevMonths, RepoSize: repo.Size, DatasetSize: datasetSize, - NumModels: 0, + NumModels: repo.ModelCnt, NumWikiViews: numWikiViews, NumCommits: numCommits, NumIssues: int64(repo.NumIssues), @@ -135,6 +135,9 @@ func RepoStatisticDaily(date string) { NumCommitsGrowth: numCommitsGrowth, NumCommitLinesGrowth: numCommitLinesGrowth, NumContributorsGrowth: numContributorsGrowth, + NumCloudbrain: repo.AiTaskCnt, + NumDatasetFile: repo.DatasetCnt, + NumModelConvert: models.QueryModelConvertCountByRepoID(repo.ID), } dayBeforeDate := t.AddDate(0, 0, -1).Format("2006-01-02") @@ -155,6 +158,10 @@ func RepoStatisticDaily(date string) { repoStat.NumIssuesAdded = repoStat.NumIssues - repoStatisticBefore.NumIssues repoStat.NumPullsAdded = repoStat.NumPulls - repoStatisticBefore.NumPulls repoStat.NumContributorAdded = repoStat.NumContributor - repoStatisticBefore.NumContributor + repoStat.NumModelsAdded = repoStat.NumModels - repoStatisticBefore.NumModels + repoStat.NumCloudbrainAdded = repoStat.NumCloudbrain - repoStatisticBefore.NumCloudbrain + repoStat.NumModelConvertAdded = repoStat.NumModelConvert - repoStatisticBefore.NumModelConvert + repoStat.NumDatasetFileAdded = repoStat.NumDatasetFile - repoStatisticBefore.NumDatasetFile } } day4MonthsAgo := t.AddDate(0, -4, 0) diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 8b2b458db..063a20999 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1229,10 +1229,23 @@ func RegisterRoutes(m *macaron.Macaron) { }) }, context.RepoRef()) m.Group("/grampus", func() { + m.Group("/notebook", func() { + m.Group("/:id", func() { + m.Get("", reqRepoCloudBrainReader, repo.GrampusNotebookShow) + m.Get("/debug", reqWechatBind, cloudbrain.AdminOrJobCreaterRight, repo.GrampusNotebookDebug) + m.Post("/restart", reqWechatBind, cloudbrain.AdminOrJobCreaterRight, repo.GrampusNotebookRestart) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.GrampusStopJob) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.GrampusNotebookDel) + }) + + m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, context.PointAccount(), repo.GrampusNotebookNew) + m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusNotebookForm{}), repo.GrampusNotebookCreate) + }) + m.Group("/train-job", func() { m.Group("/:jobid", func() { m.Get("", reqRepoCloudBrainReader, repo.GrampusTrainJobShow) - m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.GrampusStopJob) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.GrampusStopJob) m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.GrampusTrainJobDel) m.Get("/model_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ModelDownload) m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.GrampusTrainJobVersionNew) @@ -1302,16 +1315,6 @@ func RegisterRoutes(m *macaron.Macaron) { m.Group("/modelarts", func() { m.Group("/notebook", func() { - /* v1.0 - m.Group("/:jobid", func() { - m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) - m.Get("/debug", cloudbrain.AdminOrJobCreaterRight, 
repo.NotebookDebug) - m.Post("/:action", reqRepoCloudBrainWriter, repo.NotebookManage) - m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookDel) - }) - m.Get("/create", reqRepoCloudBrainWriter, repo.NotebookNew) - m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsNotebookForm{}), repo.NotebookCreate) - */ m.Group("/:id", func() { m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) m.Get("/debug", cloudbrain.AdminOrJobCreaterRight, repo.NotebookDebug2) diff --git a/services/cloudbrain/cloudbrainTask/count.go b/services/cloudbrain/cloudbrainTask/count.go index 985706911..4ae742c3a 100644 --- a/services/cloudbrain/cloudbrainTask/count.go +++ b/services/cloudbrain/cloudbrainTask/count.go @@ -62,6 +62,16 @@ var StatusInfoDict = map[string]StatusInfo{string(models.JobTypeDebug) + "-" + s JobType: []models.JobType{models.JobTypeTrain}, NotFinalStatuses: GrampusNotFinalStatuses, ComputeResource: models.NPUResource, +}, string(models.JobTypeDebug) + "-" + strconv.Itoa(models.TypeC2Net) + "-" + models.GPUResource: { + CloudBrainTypes: []int{models.TypeC2Net}, + JobType: []models.JobType{models.JobTypeDebug}, + NotFinalStatuses: GrampusNotFinalStatuses, + ComputeResource: models.GPUResource, +}, string(models.JobTypeDebug) + "-" + strconv.Itoa(models.TypeC2Net) + "-" + models.NPUResource: { + CloudBrainTypes: []int{models.TypeC2Net}, + JobType: []models.JobType{models.JobTypeDebug}, + NotFinalStatuses: GrampusNotFinalStatuses, + ComputeResource: models.NPUResource, }} func GetNotFinalStatusTaskCount(uid int64, cloudbrainType int, jobType string, computeResource ...string) (int, error) { diff --git a/services/cloudbrain/cloudbrainTask/notebook.go b/services/cloudbrain/cloudbrainTask/notebook.go index 6b2fcf707..cc9563520 100644 --- a/services/cloudbrain/cloudbrainTask/notebook.go +++ b/services/cloudbrain/cloudbrainTask/notebook.go @@ -82,7 +82,7 @@ func FileNotebookCreate(ctx *context.Context, option api.CreateFileNotebookJobOp }) } if err != nil { - ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.failed_to_create_notebook_repo",setting.FileNoteBook.ProjectName))) + ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.failed_to_create_notebook_repo", setting.FileNoteBook.ProjectName))) return } if option.Type <= 1 { @@ -291,10 +291,21 @@ func modelartsFileNoteBookCreate(ctx *context.Context, option api.CreateFileNote } var jobId string + req := cloudbrain.GenerateModelArtsNotebookReq{ + DisplayJobName: displayJobName, + JobName: jobName, + Description: getDescription(option), + ImageId: setting.FileNoteBook.ImageIdNPU, + Spec: spec, + BootFile: "", + AutoStopDurationMs: modelarts.AutoStopDurationMs / 4, + } + if setting.ModelartsCD.Enabled { - jobId, err = modelarts_cd.GenerateNotebook(ctx, displayJobName, jobName, "", getDescription(option), setting.FileNoteBook.ImageIdNPUCD, spec, option.File,modelarts.AutoStopDurationMs/4) + req.ImageId = setting.FileNoteBook.ImageIdNPUCD + jobId, err = modelarts_cd.GenerateNotebook(ctx, req) } else { - jobId, err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, "", getDescription(option), setting.FileNoteBook.ImageIdNPU, spec, option.File,modelarts.AutoStopDurationMs/4) + jobId, err = modelarts.GenerateNotebook2(ctx, req) } if err != nil { diff --git a/services/cloudbrain/cloudbrainTask/sync_status.go b/services/cloudbrain/cloudbrainTask/sync_status.go index 973b9bbc2..3bc09071c 100644 --- a/services/cloudbrain/cloudbrainTask/sync_status.go +++ 
b/services/cloudbrain/cloudbrainTask/sync_status.go @@ -3,9 +3,13 @@ package cloudbrainTask import ( "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/cloudbrain" + "code.gitea.io/gitea/modules/grampus" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/modelarts" + "code.gitea.io/gitea/modules/modelarts_cd" "code.gitea.io/gitea/modules/notification" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/timeutil" "net/http" "strconv" ) @@ -58,6 +62,55 @@ func SyncCloudBrainOneStatus(task *models.Cloudbrain) (*models.Cloudbrain, error } +func SyncGrampusNotebookStatus(job *models.Cloudbrain) (*models.Cloudbrain, error) { + result, err := grampus.GetNotebookJob(job.JobID) + if err != nil { + + log.Error("GetJob(%s) failed:%v", job.JobName, err) + + return job, err + } + + if job.StartTime == 0 && result.JobInfo.StartedAt > 0 { + job.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt) + } + oldStatus := job.Status + job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) + job.Duration = result.JobInfo.RunSec + job.TrainJobDuration = models.ConvertDurationToStr(job.Duration) + + if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 { + job.EndTime = job.StartTime.Add(job.Duration) + } + job.CorrectCreateUnix() + + if len(job.AiCenter) == 0 { + if len(result.JobInfo.Tasks) > 0 { + if len(result.JobInfo.Tasks[0].CenterID) > 0 && len(result.JobInfo.Tasks[0].CenterName) > 0 { + job.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] + } + } + } + + if job.Status != models.GrampusStatusWaiting { + if oldStatus != job.Status { + notification.NotifyChangeCloudbrainStatus(job, oldStatus) + } + if job.ComputeResource == models.NPUResource { + job.TrainUrl = result.JobInfo.Tasks[0].CodeUrl + job.DataUrl = result.JobInfo.Tasks[0].DataUrl + } + err = models.UpdateJob(job) + if err != nil { + log.Error("UpdateJob failed:", err) + return nil, err + } + } + + return job, nil + +} + func isNoteBookReady(task *models.Cloudbrain) bool { if task.JobType != string(models.JobTypeDebug) { return true @@ -90,3 +143,28 @@ func isNoteBookReady(task *models.Cloudbrain) bool { return false } + +func StopDebugJob(task *models.Cloudbrain) error { + param := models.NotebookAction{ + Action: models.ActionStop, + } + var err error = nil + + if task.JobType == string(models.JobTypeDebug) { + if task.Type == models.TypeCloudBrainOne { + return cloudbrain.StopJob(task.JobID) + } else if task.Type == models.TypeCloudBrainTwo { + _, err = modelarts.ManageNotebook2(task.JobID, param) + + } else if task.Type == models.TypeCDCenter { + _, err = modelarts_cd.ManageNotebook(task.JobID, param) + + } else if task.Type == models.TypeC2Net { + _, err = grampus.StopJob(task.JobID, task.JobType) + + } + + } + return err + +} diff --git a/services/socketwrap/clientManager.go b/services/socketwrap/clientManager.go index 7470b1198..7bac92ab8 100755 --- a/services/socketwrap/clientManager.go +++ b/services/socketwrap/clientManager.go @@ -10,7 +10,7 @@ import ( "github.com/elliotchance/orderedmap" ) -var opTypes = []int{1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35} +var opTypes = []int{1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 39, 40} type ClientsManager struct { Clients *orderedmap.OrderedMap diff --git a/templates/admin/cloudbrain/list.tmpl b/templates/admin/cloudbrain/list.tmpl index 94f80c0fa..f6d20216a 100755 --- 
a/templates/admin/cloudbrain/list.tmpl +++ b/templates/admin/cloudbrain/list.tmpl @@ -98,7 +98,7 @@
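For reference, the NotebookDataset struct introduced in routers/repo/grampus.go treats Cloudbrain.DataUrl as a JSON array of {"dataset_url": ...} entries, and both getDatasetDownloadInfo and getModelDownloadInfo pick the first entry whose URL contains the dataset file name or the checkpoint name. A minimal, self-contained sketch of that lookup follows; the payload values are hypothetical, since the real URLs are filled in by the Grampus backend (SyncGrampusNotebookStatus copies Tasks[0].DataUrl into the record for NPU tasks).

package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// NotebookDataset mirrors the struct added in routers/repo/grampus.go:
// Cloudbrain.DataUrl is expected to hold a JSON array of {"dataset_url": "..."} entries.
type NotebookDataset struct {
	DatasetUrl string `json:"dataset_url"`
}

// firstURLContaining returns the first stored URL that contains name, which is the
// same matching rule getDatasetDownloadInfo and getModelDownloadInfo apply
// (strings.Contains against the dataset name or the checkpoint name).
func firstURLContaining(entries []NotebookDataset, name string) string {
	for _, e := range entries {
		if strings.Contains(e.DatasetUrl, name) {
			return e.DatasetUrl
		}
	}
	return ""
}

func main() {
	// Hypothetical DataUrl payload for illustration only.
	dataUrl := `[{"dataset_url":"obs://bucket/datasets/cifar10.zip"},{"dataset_url":"obs://bucket/models/resnet50.ckpt"}]`

	var entries []NotebookDataset
	if err := json.Unmarshal([]byte(dataUrl), &entries); err != nil {
		fmt.Println("unmarshal failed:", err)
		return
	}

	fmt.Println(firstURLContaining(entries, "cifar10.zip"))   // dataset download link
	fmt.Println(firstURLContaining(entries, "resnet50.ckpt")) // model checkpoint download link
}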
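The new repository routes mount the notebook lifecycle at /grampus/notebook/:id (show, debug, restart, stop, del). Below is a hedged client-side sketch of the restart call and of decoding the four-field JSON that GrampusNotebookRestart writes (result_code, error_msg, status, id); the owner/repo URL prefix, the job ID, and any session or CSRF handling are assumptions not shown in this change.

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"strings"
)

// restartResult mirrors the map written by GrampusNotebookRestart via ctx.JSON.
type restartResult struct {
	ResultCode string `json:"result_code"`
	ErrorMsg   string `json:"error_msg"`
	Status     string `json:"status"`
	ID         string `json:"id"`
}

// restartGrampusNotebook posts to /grampus/notebook/:id/restart under the repository
// route group. The owner/repo prefix and authentication are assumed, not part of this diff.
func restartGrampusNotebook(client *http.Client, base, owner, repo, id string) (*restartResult, error) {
	endpoint := fmt.Sprintf("%s/%s/%s/grampus/notebook/%s/restart",
		strings.TrimRight(base, "/"), owner, repo, id)
	resp, err := client.PostForm(endpoint, url.Values{})
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	var result restartResult
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, err
	}
	return &result, nil
}

func main() {
	// Hypothetical host, repository, and job ID for illustration only.
	res, err := restartGrampusNotebook(http.DefaultClient, "https://example.com", "someuser", "somerepo", "42")
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	// "0" signals success and id carries the new cloudbrain record ID; "2" means the user
	// already has a running or waiting debug task of this compute type.
	fmt.Println(res.ResultCode, res.Status, res.ID, res.ErrorMsg)
}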