From 406d2aa41b97a9d8e690be070fbfa79ba4874f46 Mon Sep 17 00:00:00 2001 From: liuzx Date: Thu, 15 Dec 2022 18:06:43 +0800 Subject: [PATCH 1/7] fix-3319 --- models/cloudbrain.go | 85 ++++++++++++++++++----------- routers/api/v1/repo/cloudbrain_dashboard.go | 32 +++++++---- 2 files changed, 74 insertions(+), 43 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index aeed8629c..1f5068ca6 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -204,7 +204,7 @@ type Cloudbrain struct { BenchmarkTypeRankLink string `xorm:"-"` StartTime timeutil.TimeStamp EndTime timeutil.TimeStamp - Cleared bool `xorm:"DEFAULT false"` + Cleared bool `xorm:"DEFAULT false"` Spec *Specification `xorm:"-"` } @@ -442,29 +442,32 @@ type GetImagesPayload struct { type CloudbrainsOptions struct { ListOptions - RepoID int64 // include all repos if empty - UserID int64 - JobID string - SortType string - CloudbrainIDs []int64 - JobStatus []string - JobStatusNot bool - Keyword string - Type int - JobTypes []string - VersionName string - IsLatestVersion string - JobTypeNot bool - NeedRepoInfo bool - RepoIDList []int64 - BeginTime time.Time - EndTime time.Time - ComputeResource string - BeginTimeUnix int64 - EndTimeUnix int64 - AiCenter string - NeedDeleteInfo string - Cluster string + RepoID int64 // include all repos if empty + UserID int64 + JobID string + SortType string + CloudbrainIDs []int64 + JobStatus []string + JobStatusNot bool + Keyword string + Type int + JobTypes []string + VersionName string + IsLatestVersion string + JobTypeNot bool + NeedRepoInfo bool + RepoIDList []int64 + BeginTime time.Time + EndTime time.Time + ComputeResource string + BeginTimeUnix int64 + EndTimeUnix int64 + AiCenter string + NeedDeleteInfo string + Cluster string + AccCardType string + AccCardsNum string + WorkServerNumber int } type TaskPod struct { @@ -1906,7 +1909,7 @@ func GetCloudbrainByID(id string) (*Cloudbrain, error) { return getRepoCloudBrain(cb) } -func IsCloudbrainExistByJobName(jobName string)(bool,error){ +func IsCloudbrainExistByJobName(jobName string) (bool, error) { return x.Unscoped().Exist(&Cloudbrain{ JobName: jobName, }) @@ -2070,25 +2073,25 @@ func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbra Limit(limit). Find(&cloudbrains) } + /** 本方法考虑了再次调试的情况,多次调试取最后一次的任务的结束时间 - */ +*/ func GetCloudBrainOneStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { cloudbrains := make([]*Cloudbrain, 0, 10) endTimeBefore := time.Now().Unix() - int64(days)*24*3600 missEndTimeBefore := endTimeBefore - 24*3600 - sql:=`SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name) + sql := `SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name) id, job_name, job_id,status,end_time,updated_unix,cleared FROM cloudbrain where type=0 and job_type='DEBUG' ORDER BY job_name, updated_unix DESC) a where status in ('STOPPED','SUCCEEDED','FAILED') and (((end_time is null or end_time=0) and updated_unix 0 { + cond = cond.And( + builder.Eq{"COALESCE(cloudbrain.work_server_number, 1)": opts.WorkServerNumber}, + ) + } + + // 如果opts中包含了cardtype,那么添加查询条件 + if opts.AccCardType != "" { + cond = cond.And(builder.Eq{"cloudbrain_spec.acc_card_type": opts.AccCardType}) + } + if opts.AccCardsNum != "" { + cond = cond.And(builder.Eq{"cloudbrain_spec.acc_cards_num": opts.AccCardsNum}) + } + var count int64 var err error condition := "cloudbrain.user_id = `user`.id" if len(opts.Keyword) == 0 { - count, err = sess.Unscoped().Where(cond).Count(new(Cloudbrain)) + count, err = sess.Table(&Cloudbrain{}).Unscoped().Where(cond). + Join("left", "`user`", condition). + Join("left", "cloudbrain_spec", "cloudbrain.id = cloudbrain_spec.cloudbrain_id"). + Count(new(CloudbrainInfo)) } else { lowerKeyWord := strings.ToLower(opts.Keyword) cond = cond.And(builder.Or(builder.Like{"LOWER(cloudbrain.job_name)", lowerKeyWord}, builder.Like{"LOWER(cloudbrain.display_job_name)", lowerKeyWord}, builder.Like{"`user`.lower_name", lowerKeyWord})) count, err = sess.Table(&Cloudbrain{}).Unscoped().Where(cond). - Join("left", "`user`", condition).Count(new(CloudbrainInfo)) + Join("left", "`user`", condition). + Join("left", "cloudbrain_spec", "cloudbrain.id = cloudbrain_spec.cloudbrain_id"). + Count(new(CloudbrainInfo)) } @@ -2354,6 +2376,7 @@ func CloudbrainAll(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum) if err := sess.Table(&Cloudbrain{}).Unscoped().Where(cond). Join("left", "`user`", condition). + Join("left", "cloudbrain_spec", "cloudbrain.id = cloudbrain_spec.cloudbrain_id"). Find(&cloudbrains); err != nil { return nil, 0, fmt.Errorf("Find: %v", err) } diff --git a/routers/api/v1/repo/cloudbrain_dashboard.go b/routers/api/v1/repo/cloudbrain_dashboard.go index 0d68fff30..e6b436a4b 100755 --- a/routers/api/v1/repo/cloudbrain_dashboard.go +++ b/routers/api/v1/repo/cloudbrain_dashboard.go @@ -719,6 +719,11 @@ func GetCloudbrainsDetailData(ctx *context.Context) { aiCenter := ctx.Query("aiCenter") needDeleteInfo := ctx.Query("needDeleteInfo") + accCardType := ctx.Query("accCardType") + accCardsNum := ctx.Query("accCardsNum") + workServerNumber := ctx.QueryInt("workServerNumber") + workServerNumber = 0 + if cloudBrainType == models.TypeCloudBrainOne && aiCenter == models.AICenterOfCloudBrainOne { aiCenter = "" } @@ -777,18 +782,21 @@ func GetCloudbrainsDetailData(ctx *context.Context) { Page: page, PageSize: pageSize, }, - Keyword: keyword, - Type: cloudBrainType, - ComputeResource: listType, - JobTypeNot: jobTypeNot, - JobStatusNot: jobStatusNot, - JobStatus: jobStatuses, - JobTypes: jobTypes, - NeedRepoInfo: true, - BeginTimeUnix: int64(recordBeginTime), - EndTimeUnix: endTime.Unix(), - AiCenter: aiCenter, - NeedDeleteInfo: needDeleteInfo, + Keyword: keyword, + Type: cloudBrainType, + ComputeResource: listType, + JobTypeNot: jobTypeNot, + JobStatusNot: jobStatusNot, + JobStatus: jobStatuses, + JobTypes: jobTypes, + NeedRepoInfo: true, + BeginTimeUnix: int64(recordBeginTime), + EndTimeUnix: endTime.Unix(), + AiCenter: aiCenter, + NeedDeleteInfo: needDeleteInfo, + AccCardType: accCardType, + AccCardsNum: accCardsNum, + WorkServerNumber: workServerNumber, }) if err != nil { ctx.ServerError("Get job failed:", err) From 5e65f0fc10072c57281fde36e44008f33bf54b0e Mon Sep 17 00:00:00 2001 From: liuzx Date: Fri, 16 Dec 2022 17:57:23 +0800 Subject: [PATCH 2/7] fix-3319 --- models/cloudbrain.go | 15 +++++++++++---- routers/api/v1/repo/cloudbrain_dashboard.go | 29 +++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 1f5068ca6..c75dca3bd 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -2325,12 +2325,19 @@ func CloudbrainAll(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { } if opts.WorkServerNumber > 0 { - cond = cond.And( - builder.Eq{"COALESCE(cloudbrain.work_server_number, 1)": opts.WorkServerNumber}, - ) + if opts.WorkServerNumber == 1 { + cond = cond.And(builder.Or( + builder.Eq{"cloudbrain.work_server_number": 0}, + builder.Eq{"cloudbrain.work_server_number": 1}, + builder.IsNull{"cloudbrain.work_server_number"}, + )) + } else { + cond = cond.And( + builder.Eq{"cloudbrain.work_server_number": opts.WorkServerNumber}, + ) + } } - // 如果opts中包含了cardtype,那么添加查询条件 if opts.AccCardType != "" { cond = cond.And(builder.Eq{"cloudbrain_spec.acc_card_type": opts.AccCardType}) } diff --git a/routers/api/v1/repo/cloudbrain_dashboard.go b/routers/api/v1/repo/cloudbrain_dashboard.go index e6b436a4b..8b13b38dc 100755 --- a/routers/api/v1/repo/cloudbrain_dashboard.go +++ b/routers/api/v1/repo/cloudbrain_dashboard.go @@ -711,7 +711,6 @@ func GetCloudbrainsDetailData(ctx *context.Context) { return } recordBeginTime := recordCloudbrain[0].Cloudbrain.CreatedUnix - endTime := time.Now() listType := ctx.Query("listType") jobType := ctx.Query("jobType") jobStatus := ctx.Query("jobStatus") @@ -722,7 +721,29 @@ func GetCloudbrainsDetailData(ctx *context.Context) { accCardType := ctx.Query("accCardType") accCardsNum := ctx.Query("accCardsNum") workServerNumber := ctx.QueryInt("workServerNumber") - workServerNumber = 0 + beginTimeStr := ctx.QueryTrim("beginTime") + endTimeStr := ctx.QueryTrim("endTime") + var beginTimeUnix int64 + var endTimeUnix int64 + if beginTimeStr == "" || endTimeStr == "" { + beginTimeUnix = int64(recordBeginTime) + endTimeUnix = time.Now().Unix() + } else { + beginTime, err := time.ParseInLocation("2006-01-02", beginTimeStr, time.Local) + if err != nil { + log.Error("Can not ParseInLocation.", err) + ctx.Error(http.StatusBadRequest, ctx.Tr("ParseInLocation_get_error")) + return + } + beginTimeUnix = beginTime.Unix() + endTime, err := time.ParseInLocation("2006-01-02", endTimeStr, time.Local) + if err != nil { + log.Error("Can not ParseInLocation.", err) + ctx.Error(http.StatusBadRequest, ctx.Tr("ParseInLocation_get_error")) + return + } + endTimeUnix = endTime.Unix() + } if cloudBrainType == models.TypeCloudBrainOne && aiCenter == models.AICenterOfCloudBrainOne { aiCenter = "" @@ -790,8 +811,8 @@ func GetCloudbrainsDetailData(ctx *context.Context) { JobStatus: jobStatuses, JobTypes: jobTypes, NeedRepoInfo: true, - BeginTimeUnix: int64(recordBeginTime), - EndTimeUnix: endTime.Unix(), + BeginTimeUnix: beginTimeUnix, + EndTimeUnix: endTimeUnix, AiCenter: aiCenter, NeedDeleteInfo: needDeleteInfo, AccCardType: accCardType, From dd426bcca5f6f228c9826a7dee564f1814157d50 Mon Sep 17 00:00:00 2001 From: liuzx Date: Mon, 19 Dec 2022 11:21:05 +0800 Subject: [PATCH 3/7] fix-3319 --- models/cloudbrain.go | 4 ++-- routers/api/v1/repo/cloudbrain_dashboard.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index c75dca3bd..366358638 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -466,7 +466,7 @@ type CloudbrainsOptions struct { NeedDeleteInfo string Cluster string AccCardType string - AccCardsNum string + AccCardsNum int WorkServerNumber int } @@ -2341,7 +2341,7 @@ func CloudbrainAll(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { if opts.AccCardType != "" { cond = cond.And(builder.Eq{"cloudbrain_spec.acc_card_type": opts.AccCardType}) } - if opts.AccCardsNum != "" { + if opts.AccCardsNum >= 0 { cond = cond.And(builder.Eq{"cloudbrain_spec.acc_cards_num": opts.AccCardsNum}) } diff --git a/routers/api/v1/repo/cloudbrain_dashboard.go b/routers/api/v1/repo/cloudbrain_dashboard.go index 8b13b38dc..7856f0b95 100755 --- a/routers/api/v1/repo/cloudbrain_dashboard.go +++ b/routers/api/v1/repo/cloudbrain_dashboard.go @@ -719,7 +719,7 @@ func GetCloudbrainsDetailData(ctx *context.Context) { needDeleteInfo := ctx.Query("needDeleteInfo") accCardType := ctx.Query("accCardType") - accCardsNum := ctx.Query("accCardsNum") + accCardsNum := ctx.QueryInt("accCardsNum") workServerNumber := ctx.QueryInt("workServerNumber") beginTimeStr := ctx.QueryTrim("beginTime") endTimeStr := ctx.QueryTrim("endTime") From 622e57da860b23e632fa012ae8ea0b581b4ed2e4 Mon Sep 17 00:00:00 2001 From: liuzx Date: Mon, 19 Dec 2022 17:01:39 +0800 Subject: [PATCH 4/7] fix-3319 --- routers/api/v1/repo/cloudbrain_dashboard.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/routers/api/v1/repo/cloudbrain_dashboard.go b/routers/api/v1/repo/cloudbrain_dashboard.go index 7856f0b95..b9c75e73f 100755 --- a/routers/api/v1/repo/cloudbrain_dashboard.go +++ b/routers/api/v1/repo/cloudbrain_dashboard.go @@ -729,14 +729,14 @@ func GetCloudbrainsDetailData(ctx *context.Context) { beginTimeUnix = int64(recordBeginTime) endTimeUnix = time.Now().Unix() } else { - beginTime, err := time.ParseInLocation("2006-01-02", beginTimeStr, time.Local) + beginTime, err := time.ParseInLocation("2006-01-02T15:04:05", beginTimeStr, time.Local) if err != nil { log.Error("Can not ParseInLocation.", err) ctx.Error(http.StatusBadRequest, ctx.Tr("ParseInLocation_get_error")) return } beginTimeUnix = beginTime.Unix() - endTime, err := time.ParseInLocation("2006-01-02", endTimeStr, time.Local) + endTime, err := time.ParseInLocation("2006-01-02T15:04:05", endTimeStr, time.Local) if err != nil { log.Error("Can not ParseInLocation.", err) ctx.Error(http.StatusBadRequest, ctx.Tr("ParseInLocation_get_error")) From 3e0f70aea1428a7f809901fbcaca74864a7cee6f Mon Sep 17 00:00:00 2001 From: liuzx Date: Tue, 20 Dec 2022 15:47:42 +0800 Subject: [PATCH 5/7] fix-3342 --- routers/api/v1/repo/cloudbrain_dashboard.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/routers/api/v1/repo/cloudbrain_dashboard.go b/routers/api/v1/repo/cloudbrain_dashboard.go index bb04038b9..d89cd1d30 100755 --- a/routers/api/v1/repo/cloudbrain_dashboard.go +++ b/routers/api/v1/repo/cloudbrain_dashboard.go @@ -645,7 +645,7 @@ func GetAllCloudbrainsPeriodDistribution(ctx *context.Context) { } } - ComputeResourceList := []string{"CPU/GPU", "NPU"} + ComputeResourceList := []string{"CPU/GPU", "NPU", "GCU"} for _, v := range ComputeResourceList { if _, ok := cloudBrainComputeResource[v]; !ok { cloudBrainComputeResource[v] = 0 @@ -1039,7 +1039,7 @@ func getCloudbrainCount(beginTime time.Time, endTime time.Time, cloudbrains []*m } } - ComputeResourceList := []string{"CPU/GPU", "NPU"} + ComputeResourceList := []string{"CPU/GPU", "NPU", "GCU"} for _, v := range ComputeResourceList { if _, ok := cloudBrainComputeResource[v]; !ok { cloudBrainComputeResource[v] = 0 From b30d23ff9b187795132d6dbe9862d79ce53bb393 Mon Sep 17 00:00:00 2001 From: liuzx Date: Fri, 6 Jan 2023 11:23:31 +0800 Subject: [PATCH 6/7] fix-3339 --- models/cloudbrain.go | 3 ++- modules/grampus/resty.go | 1 - modules/modelarts/modelarts.go | 7 +++---- routers/repo/grampus.go | 18 ++++++++++++++++++ 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 7e736e19d..2559f98ab 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -1559,7 +1559,8 @@ type CreateGrampusJobResponse struct { type GetGrampusJobResponse struct { GrampusResult - JobInfo GrampusJobInfo `json:"otJob"` + JobInfo GrampusJobInfo `json:"otJob"` + ExitDiagnostics string `json:"exitDiagnostics"` } type GrampusNotebookResponse struct { diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index a0d5384e2..3611240b9 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -198,7 +198,6 @@ sendjob: SetAuthToken(TOKEN). SetResult(&result). Get(HOST + urlTrainJob + "/" + jobID) - if err != nil { return nil, fmt.Errorf("resty GetJob: %v", err) } diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index dcad1eb00..e559470db 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -26,9 +26,9 @@ import ( const ( //notebook - storageTypeOBS = "obs" - autoStopDuration = 4 * 60 * 60 - AutoStopDurationMs = 4 * 60 * 60 * 1000 + storageTypeOBS = "obs" + autoStopDuration = 4 * 60 * 60 + AutoStopDurationMs = 4 * 60 * 60 * 1000 CodePath = "/code/" OutputPath = "/output/" @@ -172,7 +172,6 @@ type OrgMultiNode struct { Node []int `json:"node"` } - type Parameters struct { Parameter []struct { Label string `json:"label"` diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 14db1a50d..2c94281c9 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1249,6 +1249,7 @@ func GrampusTrainJobShow(ctx *context.Context) { } if result != nil { + log.Info("resultliuzx:", result.JobInfo) if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] } @@ -1358,6 +1359,23 @@ func GrampusGetLog(ctx *context.Context) { }) return } + result, err := grampus.GetJob(jobID) + if err != nil { + log.Error("GetJob(%s) failed:%v", job.JobName, err) + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobName": job.JobName, + "Content": content, + "CanLogDownload": false, + }) + return + } + if result != nil { + job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) + if job.Status == models.GrampusStatusFailed { + content = content + "\n" + result.ExitDiagnostics + } + } + canLogDownload := err == nil && job.IsUserHasRight(ctx.User) ctx.JSON(http.StatusOK, map[string]interface{}{ "JobName": job.JobName, From 463952c61b1cf2bf651b07b1afcd7dbed5b470b6 Mon Sep 17 00:00:00 2001 From: liuzx Date: Fri, 6 Jan 2023 17:55:25 +0800 Subject: [PATCH 7/7] fix-3339 --- routers/repo/grampus.go | 1 - 1 file changed, 1 deletion(-) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 2c94281c9..30812002d 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1249,7 +1249,6 @@ func GrampusTrainJobShow(ctx *context.Context) { } if result != nil { - log.Info("resultliuzx:", result.JobInfo) if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] }