@@ -2172,7 +2172,7 @@ func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) { | |||||
Find(&cloudbrains) | Find(&cloudbrains) | ||||
} | } | ||||
func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||||
func GetGPUStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||||
cloudbrains := make([]*Cloudbrain, 0, 10) | cloudbrains := make([]*Cloudbrain, 0, 10) | ||||
endTimeBefore := time.Now().Unix() - int64(days)*24*3600 | endTimeBefore := time.Now().Unix() - int64(days)*24*3600 | ||||
missEndTimeBefore := endTimeBefore - 24*3600 | missEndTimeBefore := endTimeBefore - 24*3600 | ||||
@@ -2181,7 +2181,7 @@ func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbra | |||||
JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted, | JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted, | ||||
ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed, | ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed, | ||||
ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed). | ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed). | ||||
Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and type=0 and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore). | |||||
Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and (type=0 or (type =2 and compute_resource='CPU/GPU')) and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore). | |||||
Limit(limit). | Limit(limit). | ||||
Find(&cloudbrains) | Find(&cloudbrains) | ||||
} | } | ||||
@@ -2189,14 +2189,14 @@ func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbra | |||||
/** | /** | ||||
本方法考虑了再次调试的情况,多次调试取最后一次的任务的结束时间 | 本方法考虑了再次调试的情况,多次调试取最后一次的任务的结束时间 | ||||
*/ | */ | ||||
func GetCloudBrainOneStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||||
func GetGPUStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||||
cloudbrains := make([]*Cloudbrain, 0, 10) | cloudbrains := make([]*Cloudbrain, 0, 10) | ||||
endTimeBefore := time.Now().Unix() - int64(days)*24*3600 | endTimeBefore := time.Now().Unix() - int64(days)*24*3600 | ||||
missEndTimeBefore := endTimeBefore - 24*3600 | missEndTimeBefore := endTimeBefore - 24*3600 | ||||
sql := `SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name) | sql := `SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name) | ||||
id, job_name, job_id,status,end_time,updated_unix,cleared | id, job_name, job_id,status,end_time,updated_unix,cleared | ||||
FROM cloudbrain | FROM cloudbrain | ||||
where type=0 and job_type='DEBUG' | |||||
where (type=0 or (type =2 and compute_resource='CPU/GPU')) and job_type='DEBUG' | |||||
ORDER BY job_name, updated_unix DESC) a | ORDER BY job_name, updated_unix DESC) a | ||||
where status in ('STOPPED','SUCCEEDED','FAILED') and (((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false` | where status in ('STOPPED','SUCCEEDED','FAILED') and (((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false` | ||||
@@ -1063,6 +1063,8 @@ notebook_file_not_exist=Notebook file does not exist. | |||||
notebook_select_wrong=Please select a Notebook(.ipynb) file first. | notebook_select_wrong=Please select a Notebook(.ipynb) file first. | ||||
notebook_file_no_right=You have no right to access the Notebook(.ipynb) file. | notebook_file_no_right=You have no right to access the Notebook(.ipynb) file. | ||||
notebook_repo_conflict=The files in different branches of the same repository can not run together. | notebook_repo_conflict=The files in different branches of the same repository can not run together. | ||||
debug_again_fail=Failed to restart the debug task, please try again later. | |||||
debug_again_fail_forever=The task failed to be scheduled last time, so it cannot be restarted. | |||||
date=Date | date=Date | ||||
repo_add=Project Increment | repo_add=Project Increment | ||||
@@ -1062,6 +1062,8 @@ notebook_file_not_exist=Notebook文件不存在。 | |||||
notebook_select_wrong=请先选择Notebook(.ipynb)文件。 | notebook_select_wrong=请先选择Notebook(.ipynb)文件。 | ||||
notebook_file_no_right=您没有这个Notebook文件的读权限。 | notebook_file_no_right=您没有这个Notebook文件的读权限。 | ||||
notebook_repo_conflict=同一个仓库的不同分支文件不能同时运行。 | notebook_repo_conflict=同一个仓库的不同分支文件不能同时运行。 | ||||
debug_again_fail=再次调试失败,请稍后再试。 | |||||
debug_again_fail_forever=这个任务之前没有调度成功,不能再次调试。 | |||||
date=日期 | date=日期 | ||||
repo_add=新增项目 | repo_add=新增项目 | ||||
@@ -1682,6 +1682,10 @@ func GrampusNotebookRestart(ctx *context.Context) { | |||||
if res.GrampusResult.ErrorCode != 0 || res.NewId == "" { | if res.GrampusResult.ErrorCode != 0 || res.NewId == "" { | ||||
log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg) | log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg) | ||||
errorMsg = ctx.Tr("repo.debug_again_fail") | errorMsg = ctx.Tr("repo.debug_again_fail") | ||||
if res.GrampusResult.ErrorCode == 5005 { | |||||
errorMsg = ctx.Tr("repo.debug_again_fail_forever") | |||||
} | |||||
break | break | ||||
} | } | ||||
@@ -14,21 +14,21 @@ import ( | |||||
func ClearCloudbrainResultSpace() { | func ClearCloudbrainResultSpace() { | ||||
log.Info("clear cloudbrain one result space begin.") | log.Info("clear cloudbrain one result space begin.") | ||||
if !setting.ClearStrategy.Enabled{ | |||||
if !setting.ClearStrategy.Enabled { | |||||
return | return | ||||
} | } | ||||
tasks, err := models.GetCloudBrainOneStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize) | |||||
tasks, err := models.GetGPUStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize) | |||||
if err != nil { | if err != nil { | ||||
log.Warn("Failed to get cloudbrain, clear result failed.", err) | log.Warn("Failed to get cloudbrain, clear result failed.", err) | ||||
return | return | ||||
} | } | ||||
debugTasks, err := models.GetCloudBrainOneStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize) | |||||
debugTasks, err := models.GetGPUStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize) | |||||
if err != nil { | if err != nil { | ||||
log.Warn("Failed to get debug cloudbrain.", err) | log.Warn("Failed to get debug cloudbrain.", err) | ||||
} | } | ||||
tasks=append(tasks,debugTasks...) | |||||
tasks = append(tasks, debugTasks...) | |||||
if err != nil { | if err != nil { | ||||
log.Warn("Failed to get cloudbrain, clear result failed.", err) | log.Warn("Failed to get cloudbrain, clear result failed.", err) | ||||
@@ -38,7 +38,7 @@ func ClearCloudbrainResultSpace() { | |||||
for _, task := range tasks { | for _, task := range tasks { | ||||
err := DeleteCloudbrainOneJobStorage(task.JobName) | err := DeleteCloudbrainOneJobStorage(task.JobName) | ||||
if err == nil { | if err == nil { | ||||
log.Info("clear job in cloudbrain table:"+task.JobName) | |||||
log.Info("clear job in cloudbrain table:" + task.JobName) | |||||
ids = append(ids, task.ID) | ids = append(ids, task.ID) | ||||
} | } | ||||
} | } | ||||
@@ -69,10 +69,10 @@ func clearMinioHistoryTrashFile() { | |||||
SortModTimeAscend(miniofiles) | SortModTimeAscend(miniofiles) | ||||
for _, file := range miniofiles { | for _, file := range miniofiles { | ||||
if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||||
if file.Name() != "" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||||
has,err:=models.IsCloudbrainExistByJobName(file.Name()) | |||||
if err==nil && !has { | |||||
has, err := models.IsCloudbrainExistByJobName(file.Name()) | |||||
if err == nil && !has { | |||||
dirPath := setting.CBCodePathPrefix + file.Name() + "/" | dirPath := setting.CBCodePathPrefix + file.Name() + "/" | ||||
log.Info("clear job in minio trash:" + file.Name()) | log.Info("clear job in minio trash:" + file.Name()) | ||||
storage.Attachments.DeleteDir(dirPath) | storage.Attachments.DeleteDir(dirPath) | ||||
@@ -90,7 +90,7 @@ func clearMinioHistoryTrashFile() { | |||||
} | } | ||||
} | } | ||||
func clearLocalHistoryTrashFile() { | |||||
func clearLocalHistoryTrashFile() { | |||||
files, err := ioutil.ReadDir(setting.JobPath) | files, err := ioutil.ReadDir(setting.JobPath) | ||||
processCount := 0 | processCount := 0 | ||||
if err != nil { | if err != nil { | ||||
@@ -99,11 +99,11 @@ func clearLocalHistoryTrashFile() { | |||||
SortModTimeAscend(files) | SortModTimeAscend(files) | ||||
for _, file := range files { | for _, file := range files { | ||||
//清理n天前的历史垃圾数据,清理job目录 | //清理n天前的历史垃圾数据,清理job目录 | ||||
if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||||
has,err:=models.IsCloudbrainExistByJobName(file.Name()) | |||||
if err==nil && !has{ | |||||
if file.Name() != "" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||||
has, err := models.IsCloudbrainExistByJobName(file.Name()) | |||||
if err == nil && !has { | |||||
os.RemoveAll(setting.JobPath + file.Name()) | os.RemoveAll(setting.JobPath + file.Name()) | ||||
log.Info("clear job in local trash:"+file.Name()) | |||||
log.Info("clear job in local trash:" + file.Name()) | |||||
processCount++ | processCount++ | ||||
} | } | ||||
if processCount == setting.ClearStrategy.BatchSize { | if processCount == setting.ClearStrategy.BatchSize { | ||||
@@ -127,7 +127,7 @@ func SortModTimeAscend(files []os.FileInfo) { | |||||
func DeleteCloudbrainOneJobStorage(jobName string) error { | func DeleteCloudbrainOneJobStorage(jobName string) error { | ||||
if jobName==""{ | |||||
if jobName == "" { | |||||
return nil | return nil | ||||
} | } | ||||
//delete local | //delete local | ||||