Browse Source

提交代码

pull/3447/head
ychao_1983 2 years ago
parent
commit
a353739f15
5 changed files with 26 additions and 18 deletions
  1. +4
    -4
      models/cloudbrain.go
  2. +2
    -0
      options/locale/locale_en-US.ini
  3. +2
    -0
      options/locale/locale_zh-CN.ini
  4. +4
    -0
      routers/repo/grampus.go
  5. +14
    -14
      services/cloudbrain/clear.go

+ 4
- 4
models/cloudbrain.go View File

@@ -2172,7 +2172,7 @@ func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) {
Find(&cloudbrains) Find(&cloudbrains)
} }


func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
func GetGPUStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
cloudbrains := make([]*Cloudbrain, 0, 10) cloudbrains := make([]*Cloudbrain, 0, 10)
endTimeBefore := time.Now().Unix() - int64(days)*24*3600 endTimeBefore := time.Now().Unix() - int64(days)*24*3600
missEndTimeBefore := endTimeBefore - 24*3600 missEndTimeBefore := endTimeBefore - 24*3600
@@ -2181,7 +2181,7 @@ func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbra
JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted, JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted,
ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed, ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed,
ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed). ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed).
Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and type=0 and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore).
Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and (type=0 or (type =2 and compute_resource='CPU/GPU')) and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore).
Limit(limit). Limit(limit).
Find(&cloudbrains) Find(&cloudbrains)
} }
@@ -2189,14 +2189,14 @@ func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbra
/** /**
本方法考虑了再次调试的情况,多次调试取最后一次的任务的结束时间 本方法考虑了再次调试的情况,多次调试取最后一次的任务的结束时间
*/ */
func GetCloudBrainOneStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
func GetGPUStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
cloudbrains := make([]*Cloudbrain, 0, 10) cloudbrains := make([]*Cloudbrain, 0, 10)
endTimeBefore := time.Now().Unix() - int64(days)*24*3600 endTimeBefore := time.Now().Unix() - int64(days)*24*3600
missEndTimeBefore := endTimeBefore - 24*3600 missEndTimeBefore := endTimeBefore - 24*3600
sql := `SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name) sql := `SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name)
id, job_name, job_id,status,end_time,updated_unix,cleared id, job_name, job_id,status,end_time,updated_unix,cleared
FROM cloudbrain FROM cloudbrain
where type=0 and job_type='DEBUG'
where (type=0 or (type =2 and compute_resource='CPU/GPU')) and job_type='DEBUG'
ORDER BY job_name, updated_unix DESC) a ORDER BY job_name, updated_unix DESC) a
where status in ('STOPPED','SUCCEEDED','FAILED') and (((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false` where status in ('STOPPED','SUCCEEDED','FAILED') and (((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false`




+ 2
- 0
options/locale/locale_en-US.ini View File

@@ -1063,6 +1063,8 @@ notebook_file_not_exist=Notebook file does not exist.
notebook_select_wrong=Please select a Notebook(.ipynb) file first. notebook_select_wrong=Please select a Notebook(.ipynb) file first.
notebook_file_no_right=You have no right to access the Notebook(.ipynb) file. notebook_file_no_right=You have no right to access the Notebook(.ipynb) file.
notebook_repo_conflict=The files in different branches of the same repository can not run together. notebook_repo_conflict=The files in different branches of the same repository can not run together.
debug_again_fail=Fail to restart debug task, please try again later.
debug_again_fail_forever=The task was scheduled failed last time, can not restart.


date=Date date=Date
repo_add=Project Increment repo_add=Project Increment


+ 2
- 0
options/locale/locale_zh-CN.ini View File

@@ -1062,6 +1062,8 @@ notebook_file_not_exist=Notebook文件不存在。
notebook_select_wrong=请先选择Notebook(.ipynb)文件。 notebook_select_wrong=请先选择Notebook(.ipynb)文件。
notebook_file_no_right=您没有这个Notebook文件的读权限。 notebook_file_no_right=您没有这个Notebook文件的读权限。
notebook_repo_conflict=同一个仓库的不同分支文件不能同时运行。 notebook_repo_conflict=同一个仓库的不同分支文件不能同时运行。
debug_again_fail=再次调试失败,请稍后再试。
debug_again_fail_forever=这个任务之前没有调度成功,不能再次调试。


date=日期 date=日期
repo_add=新增项目 repo_add=新增项目


+ 4
- 0
routers/repo/grampus.go View File

@@ -1682,6 +1682,10 @@ func GrampusNotebookRestart(ctx *context.Context) {
if res.GrampusResult.ErrorCode != 0 || res.NewId == "" { if res.GrampusResult.ErrorCode != 0 || res.NewId == "" {
log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg) log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg)
errorMsg = ctx.Tr("repo.debug_again_fail") errorMsg = ctx.Tr("repo.debug_again_fail")
if res.GrampusResult.ErrorCode == 5005 {
errorMsg = ctx.Tr("repo.debug_again_fail_forever")
}

break break
} }




+ 14
- 14
services/cloudbrain/clear.go View File

@@ -14,21 +14,21 @@ import (


func ClearCloudbrainResultSpace() { func ClearCloudbrainResultSpace() {
log.Info("clear cloudbrain one result space begin.") log.Info("clear cloudbrain one result space begin.")
if !setting.ClearStrategy.Enabled{
if !setting.ClearStrategy.Enabled {
return return
} }


tasks, err := models.GetCloudBrainOneStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize)
tasks, err := models.GetGPUStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize)
if err != nil { if err != nil {
log.Warn("Failed to get cloudbrain, clear result failed.", err) log.Warn("Failed to get cloudbrain, clear result failed.", err)
return return
} }
debugTasks, err := models.GetCloudBrainOneStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize)
debugTasks, err := models.GetGPUStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize)
if err != nil { if err != nil {
log.Warn("Failed to get debug cloudbrain.", err) log.Warn("Failed to get debug cloudbrain.", err)


} }
tasks=append(tasks,debugTasks...)
tasks = append(tasks, debugTasks...)


if err != nil { if err != nil {
log.Warn("Failed to get cloudbrain, clear result failed.", err) log.Warn("Failed to get cloudbrain, clear result failed.", err)
@@ -38,7 +38,7 @@ func ClearCloudbrainResultSpace() {
for _, task := range tasks { for _, task := range tasks {
err := DeleteCloudbrainOneJobStorage(task.JobName) err := DeleteCloudbrainOneJobStorage(task.JobName)
if err == nil { if err == nil {
log.Info("clear job in cloudbrain table:"+task.JobName)
log.Info("clear job in cloudbrain table:" + task.JobName)
ids = append(ids, task.ID) ids = append(ids, task.ID)
} }
} }
@@ -69,10 +69,10 @@ func clearMinioHistoryTrashFile() {
SortModTimeAscend(miniofiles) SortModTimeAscend(miniofiles)
for _, file := range miniofiles { for _, file := range miniofiles {


if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {
if file.Name() != "" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {


has,err:=models.IsCloudbrainExistByJobName(file.Name())
if err==nil && !has {
has, err := models.IsCloudbrainExistByJobName(file.Name())
if err == nil && !has {
dirPath := setting.CBCodePathPrefix + file.Name() + "/" dirPath := setting.CBCodePathPrefix + file.Name() + "/"
log.Info("clear job in minio trash:" + file.Name()) log.Info("clear job in minio trash:" + file.Name())
storage.Attachments.DeleteDir(dirPath) storage.Attachments.DeleteDir(dirPath)
@@ -90,7 +90,7 @@ func clearMinioHistoryTrashFile() {
} }
} }


func clearLocalHistoryTrashFile() {
func clearLocalHistoryTrashFile() {
files, err := ioutil.ReadDir(setting.JobPath) files, err := ioutil.ReadDir(setting.JobPath)
processCount := 0 processCount := 0
if err != nil { if err != nil {
@@ -99,11 +99,11 @@ func clearLocalHistoryTrashFile() {
SortModTimeAscend(files) SortModTimeAscend(files)
for _, file := range files { for _, file := range files {
//清理n天前的历史垃圾数据,清理job目录 //清理n天前的历史垃圾数据,清理job目录
if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {
has,err:=models.IsCloudbrainExistByJobName(file.Name())
if err==nil && !has{
if file.Name() != "" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {
has, err := models.IsCloudbrainExistByJobName(file.Name())
if err == nil && !has {
os.RemoveAll(setting.JobPath + file.Name()) os.RemoveAll(setting.JobPath + file.Name())
log.Info("clear job in local trash:"+file.Name())
log.Info("clear job in local trash:" + file.Name())
processCount++ processCount++
} }
if processCount == setting.ClearStrategy.BatchSize { if processCount == setting.ClearStrategy.BatchSize {
@@ -127,7 +127,7 @@ func SortModTimeAscend(files []os.FileInfo) {


func DeleteCloudbrainOneJobStorage(jobName string) error { func DeleteCloudbrainOneJobStorage(jobName string) error {


if jobName==""{
if jobName == "" {
return nil return nil
} }
//delete local //delete local


Loading…
Cancel
Save