Browse Source

Merge remote-tracking branch 'origin/V20211213' into liuzx_trainjob

pull/1071/head
zhoupzh 3 years ago
parent
commit
d0d5c7999a
5 changed files with 28 additions and 20 deletions
  1. +2
    -1
      models/cloudbrain.go
  2. +3
    -3
      modules/cron/tasks_basic.go
  3. +5
    -3
      modules/setting/setting.go
  4. +16
    -8
      routers/repo/cloudbrain.go
  5. +2
    -5
      routers/repo/modelarts.go

+ 2
- 1
models/cloudbrain.go View File

@@ -1110,7 +1110,8 @@ func UpdateJob(job *Cloudbrain) error {
func updateJob(e Engine, job *Cloudbrain) error { func updateJob(e Engine, job *Cloudbrain) error {
var sess *xorm.Session var sess *xorm.Session
sess = e.Where("job_id = ?", job.JobID) sess = e.Where("job_id = ?", job.JobID)
_, err := sess.Cols("status", "container_id", "container_ip").Update(job)
//_, err := sess.Cols("status", "container_id", "container_ip").Update(job)
_, err := sess.Update(job)
return err return err
} }




+ 3
- 3
modules/cron/tasks_basic.go View File

@@ -134,7 +134,7 @@ func registerHandleBlockChainUnSuccessRepos() {
RegisterTaskFatal("handle_blockchain_unsuccess_repos", &BaseConfig{ RegisterTaskFatal("handle_blockchain_unsuccess_repos", &BaseConfig{
Enabled: true, Enabled: true,
RunAtStart: true, RunAtStart: true,
Schedule: "@every 1m",
Schedule: "@every 10m",
}, func(ctx context.Context, _ *models.User, _ Config) error { }, func(ctx context.Context, _ *models.User, _ Config) error {
repo.HandleBlockChainUnSuccessRepos() repo.HandleBlockChainUnSuccessRepos()
return nil return nil
@@ -145,7 +145,7 @@ func registerHandleBlockChainMergedPulls() {
RegisterTaskFatal("handle_blockchain_merged_pull", &BaseConfig{ RegisterTaskFatal("handle_blockchain_merged_pull", &BaseConfig{
Enabled: true, Enabled: true,
RunAtStart: true, RunAtStart: true,
Schedule: "@every 1m",
Schedule: "@every 10m",
}, func(ctx context.Context, _ *models.User, _ Config) error { }, func(ctx context.Context, _ *models.User, _ Config) error {
repo.HandleBlockChainMergedPulls() repo.HandleBlockChainMergedPulls()
return nil return nil
@@ -156,7 +156,7 @@ func registerHandleBlockChainUnSuccessCommits() {
RegisterTaskFatal("handle_blockchain_unsuccess_commits", &BaseConfig{ RegisterTaskFatal("handle_blockchain_unsuccess_commits", &BaseConfig{
Enabled: true, Enabled: true,
RunAtStart: true, RunAtStart: true,
Schedule: "@every 3m",
Schedule: "@every 10m",
}, func(ctx context.Context, _ *models.User, _ Config) error { }, func(ctx context.Context, _ *models.User, _ Config) error {
repo.HandleBlockChainUnSuccessCommits() repo.HandleBlockChainUnSuccessCommits()
return nil return nil


+ 5
- 3
modules/setting/setting.go View File

@@ -448,23 +448,24 @@ var (
GpuTypes string GpuTypes string
DebugServerHost string DebugServerHost string
ResourceSpecs string ResourceSpecs string
MaxDuration int64


//benchmark config //benchmark config
IsBenchmarkEnabled bool IsBenchmarkEnabled bool
BenchmarkOwner string
BenchmarkOwner string
BenchmarkName string BenchmarkName string
BenchmarkServerHost string BenchmarkServerHost string
BenchmarkCategory string BenchmarkCategory string


//snn4imagenet config //snn4imagenet config
IsSnn4imagenetEnabled bool IsSnn4imagenetEnabled bool
Snn4imagenetOwner string
Snn4imagenetOwner string
Snn4imagenetName string Snn4imagenetName string
Snn4imagenetServerHost string Snn4imagenetServerHost string


//brainscore config //brainscore config
IsBrainScoreEnabled bool IsBrainScoreEnabled bool
BrainScoreOwner string
BrainScoreOwner string
BrainScoreName string BrainScoreName string
BrainScoreServerHost string BrainScoreServerHost string


@@ -1238,6 +1239,7 @@ func NewContext() {
JobType = sec.Key("GPU_TYPE_DEFAULT").MustString("openidebug") JobType = sec.Key("GPU_TYPE_DEFAULT").MustString("openidebug")
GpuTypes = sec.Key("GPU_TYPES").MustString("") GpuTypes = sec.Key("GPU_TYPES").MustString("")
ResourceSpecs = sec.Key("RESOURCE_SPECS").MustString("") ResourceSpecs = sec.Key("RESOURCE_SPECS").MustString("")
MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400)


sec = Cfg.Section("benchmark") sec = Cfg.Section("benchmark")
IsBenchmarkEnabled = sec.Key("ENABLED").MustBool(false) IsBenchmarkEnabled = sec.Key("ENABLED").MustBool(false)


+ 16
- 8
routers/repo/cloudbrain.go View File

@@ -70,14 +70,8 @@ func CloudBrainIndex(ctx *context.Context) {
return return
} }


timestamp := time.Now().Unix()
for i, task := range ciTasks { for i, task := range ciTasks {
if task.Status == string(models.JobRunning) && (timestamp-int64(task.Cloudbrain.CreatedUnix) > 10) {
ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
} else {
ciTasks[i].CanDebug = false
}

ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)


} }
@@ -831,10 +825,24 @@ func SyncCloudbrainStatus() {
taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
task.Status = taskRes.TaskStatuses[0].State task.Status = taskRes.TaskStatuses[0].State
if task.Status != string(models.JobWaiting) { if task.Status != string(models.JobWaiting) {
task.Duration = time.Now().Unix() - taskRes.TaskStatuses[0].StartAt.Unix()
err = models.UpdateJob(task) err = models.UpdateJob(task)
if err != nil { if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err) log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
continue
}

if task.Duration >= setting.MaxDuration {
log.Info("begin to stop job(%s), because of the duration", task.JobName)
err = cloudbrain.StopJob(task.JobID)
if err != nil {
log.Error("StopJob(%s) failed:%v", task.JobName, err)
continue
}
task.Status = string(models.JobStopped)
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
}
} }
} }
} }


+ 2
- 5
routers/repo/modelarts.go View File

@@ -126,11 +126,8 @@ func NotebookIndex(ctx *context.Context) {
} }


for i, task := range ciTasks { for i, task := range ciTasks {
if task.Status == string(models.JobRunning) {
ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
} else {
ciTasks[i].CanDebug = false
}

ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
} }




Loading…
Cancel
Save