Browse Source

提交代码

pull/3447/head
ychao_1983 2 years ago
parent
commit
a353739f15
5 changed files with 26 additions and 18 deletions
  1. +4
    -4
      models/cloudbrain.go
  2. +2
    -0
      options/locale/locale_en-US.ini
  3. +2
    -0
      options/locale/locale_zh-CN.ini
  4. +4
    -0
      routers/repo/grampus.go
  5. +14
    -14
      services/cloudbrain/clear.go

+ 4
- 4
models/cloudbrain.go View File

@@ -2172,7 +2172,7 @@ func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) {
Find(&cloudbrains)
}

func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
func GetGPUStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
cloudbrains := make([]*Cloudbrain, 0, 10)
endTimeBefore := time.Now().Unix() - int64(days)*24*3600
missEndTimeBefore := endTimeBefore - 24*3600
@@ -2181,7 +2181,7 @@ func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbra
JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted,
ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed,
ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed).
Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and type=0 and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore).
Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and (type=0 or (type =2 and compute_resource='CPU/GPU')) and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore).
Limit(limit).
Find(&cloudbrains)
}
@@ -2189,14 +2189,14 @@ func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbra
/**
本方法考虑了再次调试的情况,多次调试取最后一次的任务的结束时间
*/
func GetCloudBrainOneStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
func GetGPUStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
cloudbrains := make([]*Cloudbrain, 0, 10)
endTimeBefore := time.Now().Unix() - int64(days)*24*3600
missEndTimeBefore := endTimeBefore - 24*3600
sql := `SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name)
id, job_name, job_id,status,end_time,updated_unix,cleared
FROM cloudbrain
where type=0 and job_type='DEBUG'
where (type=0 or (type =2 and compute_resource='CPU/GPU')) and job_type='DEBUG'
ORDER BY job_name, updated_unix DESC) a
where status in ('STOPPED','SUCCEEDED','FAILED') and (((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false`



+ 2
- 0
options/locale/locale_en-US.ini View File

@@ -1063,6 +1063,8 @@ notebook_file_not_exist=Notebook file does not exist.
notebook_select_wrong=Please select a Notebook(.ipynb) file first.
notebook_file_no_right=You have no right to access the Notebook(.ipynb) file.
notebook_repo_conflict=The files in different branches of the same repository can not run together.
debug_again_fail=Failed to restart the debug task. Please try again later.
debug_again_fail_forever=This task was not scheduled successfully last time, so it cannot be restarted.

date=Date
repo_add=Project Increment


+ 2
- 0
options/locale/locale_zh-CN.ini View File

@@ -1062,6 +1062,8 @@ notebook_file_not_exist=Notebook文件不存在。
notebook_select_wrong=请先选择Notebook(.ipynb)文件。
notebook_file_no_right=您没有这个Notebook文件的读权限。
notebook_repo_conflict=同一个仓库的不同分支文件不能同时运行。
debug_again_fail=再次调试失败,请稍后再试。
debug_again_fail_forever=这个任务之前没有调度成功,不能再次调试。

date=日期
repo_add=新增项目


+ 4
- 0
routers/repo/grampus.go View File

@@ -1682,6 +1682,10 @@ func GrampusNotebookRestart(ctx *context.Context) {
if res.GrampusResult.ErrorCode != 0 || res.NewId == "" {
log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg)
errorMsg = ctx.Tr("repo.debug_again_fail")
if res.GrampusResult.ErrorCode == 5005 {
errorMsg = ctx.Tr("repo.debug_again_fail_forever")
}

break
}



+ 14
- 14
services/cloudbrain/clear.go View File

@@ -14,21 +14,21 @@ import (

func ClearCloudbrainResultSpace() {
log.Info("clear cloudbrain one result space begin.")
if !setting.ClearStrategy.Enabled{
if !setting.ClearStrategy.Enabled {
return
}

tasks, err := models.GetCloudBrainOneStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize)
tasks, err := models.GetGPUStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize)
if err != nil {
log.Warn("Failed to get cloudbrain, clear result failed.", err)
return
}
debugTasks, err := models.GetCloudBrainOneStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize)
debugTasks, err := models.GetGPUStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize)
if err != nil {
log.Warn("Failed to get debug cloudbrain.", err)

}
tasks=append(tasks,debugTasks...)
tasks = append(tasks, debugTasks...)

if err != nil {
log.Warn("Failed to get cloudbrain, clear result failed.", err)
@@ -38,7 +38,7 @@ func ClearCloudbrainResultSpace() {
for _, task := range tasks {
err := DeleteCloudbrainOneJobStorage(task.JobName)
if err == nil {
log.Info("clear job in cloudbrain table:"+task.JobName)
log.Info("clear job in cloudbrain table:" + task.JobName)
ids = append(ids, task.ID)
}
}
@@ -69,10 +69,10 @@ func clearMinioHistoryTrashFile() {
SortModTimeAscend(miniofiles)
for _, file := range miniofiles {

if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {
if file.Name() != "" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {

has,err:=models.IsCloudbrainExistByJobName(file.Name())
if err==nil && !has {
has, err := models.IsCloudbrainExistByJobName(file.Name())
if err == nil && !has {
dirPath := setting.CBCodePathPrefix + file.Name() + "/"
log.Info("clear job in minio trash:" + file.Name())
storage.Attachments.DeleteDir(dirPath)
@@ -90,7 +90,7 @@ func clearMinioHistoryTrashFile() {
}
}

func clearLocalHistoryTrashFile() {
func clearLocalHistoryTrashFile() {
files, err := ioutil.ReadDir(setting.JobPath)
processCount := 0
if err != nil {
@@ -99,11 +99,11 @@ func clearLocalHistoryTrashFile() {
SortModTimeAscend(files)
for _, file := range files {
//清理n天前的历史垃圾数据,清理job目录
if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {
has,err:=models.IsCloudbrainExistByJobName(file.Name())
if err==nil && !has{
if file.Name() != "" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {
has, err := models.IsCloudbrainExistByJobName(file.Name())
if err == nil && !has {
os.RemoveAll(setting.JobPath + file.Name())
log.Info("clear job in local trash:"+file.Name())
log.Info("clear job in local trash:" + file.Name())
processCount++
}
if processCount == setting.ClearStrategy.BatchSize {
@@ -127,7 +127,7 @@ func SortModTimeAscend(files []os.FileInfo) {

func DeleteCloudbrainOneJobStorage(jobName string) error {

if jobName==""{
if jobName == "" {
return nil
}
//delete local


Loading…
Cancel
Save