|
@@ -44,6 +44,31 @@ var ( |
|
|
TrainResourceSpecs *models.ResourceSpecs |
|
|
TrainResourceSpecs *models.ResourceSpecs |
|
|
) |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
// GenerateCloudBrainTaskReq bundles the arguments of GenerateTask into a
// single request value. Most fields correspond one-to-one to the positional
// parameters of the earlier GenerateTask signature.
type GenerateCloudBrainTaskReq struct { |
|
|
|
|
|
// Ctx is the request context; GenerateTask reads the user, the repository
// and Data["MsgID"] from it.
Ctx *context.Context |
|
|
|
|
|
// DisplayJobName is stored on the created task record and passed to the
// task-creation notification.
DisplayJobName string |
|
|
|
|
|
// JobName is the name passed to CreateJob and stored on the task record.
JobName string |
|
|
|
|
|
// Image is the image submitted with the job and stored on the task record.
Image string |
|
|
|
|
|
// Command is set as the command of the job's task role.
Command string |
|
|
|
|
|
// Uuids is the attachment UUID used to build the dataset path and to look
// up the attachment (dataset) name.
// NOTE(review): the name is plural, but GenerateTask passes it as a single
// UUID to GetAttachmentByUUID — confirm.
Uuids string |
|
|
|
|
|
// CodePath is the host path mounted at CodeMountPath (not read-only).
CodePath string |
|
|
|
|
|
// ModelPath is the host path mounted at ModelMountPath (not read-only).
ModelPath string |
|
|
|
|
|
// BenchmarkPath is the host path mounted read-only at BenchMarkMountPath.
BenchmarkPath string |
|
|
|
|
|
// Snn4ImageNetPath is the host path mounted read-only at
// Snn4imagenetMountPath.
Snn4ImageNetPath string |
|
|
|
|
|
// BrainScorePath is the host path mounted read-only at BrainScoreMountPath.
BrainScorePath string |
|
|
|
|
|
// JobType is compared against models.JobTypeTrain (train resource specs,
// version count 1) and checked with IsBenchmarkJob to pick the
// notification action; it is also stored on the task record.
JobType string |
|
|
|
|
|
// GpuQueue is passed to CreateJob as GpuType and stored on the task record.
GpuQueue string |
|
|
|
|
|
// Description is stored on the task record.
Description string |
|
|
|
|
|
// BranchName is stored on the task record.
BranchName string |
|
|
|
|
|
// BootFile is stored on the task record.
BootFile string |
|
|
|
|
|
// Params is stored on the task record as Parameters.
Params string |
|
|
|
|
|
// CommitID is stored on the task record.
CommitID string |
|
|
|
|
|
// DataLocalPath is not referenced in the visible part of GenerateTask —
// TODO confirm its intended use.
DataLocalPath string |
|
|
|
|
|
// BenchmarkTypeID is stored on the task record.
BenchmarkTypeID int |
|
|
|
|
|
// BenchmarkChildTypeID is stored on the task record.
BenchmarkChildTypeID int |
|
|
|
|
|
// ResourceSpecId is matched against spec.Id to select the resource spec;
// GenerateTask returns "no such resourceSpec" when nothing matches.
ResourceSpecId int |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool { |
|
|
func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool { |
|
|
if !ctx.IsSigned { |
|
|
if !ctx.IsSigned { |
|
|
return false |
|
|
return false |
|
@@ -187,23 +212,23 @@ func AdminOrImageCreaterRight(ctx *context.Context) { |
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid, codePath, modelPath, benchmarkPath, snn4imagenetPath, brainScorePath, jobType, gpuQueue, description, branchName, bootFile, params, commitID string, benchmarkTypeID, benchmarkChildTypeID, resourceSpecId int) error { |
|
|
|
|
|
|
|
|
func GenerateTask(req GenerateCloudBrainTaskReq) error { |
|
|
|
|
|
|
|
|
dataActualPath := setting.Attachment.Minio.RealPath + |
|
|
dataActualPath := setting.Attachment.Minio.RealPath + |
|
|
setting.Attachment.Minio.Bucket + "/" + |
|
|
setting.Attachment.Minio.Bucket + "/" + |
|
|
setting.Attachment.Minio.BasePath + |
|
|
setting.Attachment.Minio.BasePath + |
|
|
models.AttachmentRelativePath(uuid) + |
|
|
|
|
|
uuid |
|
|
|
|
|
|
|
|
models.AttachmentRelativePath(req.Uuids) + |
|
|
|
|
|
req.Uuids |
|
|
|
|
|
|
|
|
var resourceSpec *models.ResourceSpec |
|
|
var resourceSpec *models.ResourceSpec |
|
|
var versionCount int |
|
|
var versionCount int |
|
|
if jobType == string(models.JobTypeTrain) { |
|
|
|
|
|
|
|
|
if req.JobType == string(models.JobTypeTrain) { |
|
|
versionCount = 1 |
|
|
versionCount = 1 |
|
|
if TrainResourceSpecs == nil { |
|
|
if TrainResourceSpecs == nil { |
|
|
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) |
|
|
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) |
|
|
} |
|
|
} |
|
|
for _, spec := range TrainResourceSpecs.ResourceSpec { |
|
|
for _, spec := range TrainResourceSpecs.ResourceSpec { |
|
|
if resourceSpecId == spec.Id { |
|
|
|
|
|
|
|
|
if req.ResourceSpecId == spec.Id { |
|
|
resourceSpec = spec |
|
|
resourceSpec = spec |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
@@ -212,7 +237,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, |
|
|
json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) |
|
|
json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) |
|
|
} |
|
|
} |
|
|
for _, spec := range ResourceSpecs.ResourceSpec { |
|
|
for _, spec := range ResourceSpecs.ResourceSpec { |
|
|
if resourceSpecId == spec.Id { |
|
|
|
|
|
|
|
|
if req.ResourceSpecId == spec.Id { |
|
|
resourceSpec = spec |
|
|
resourceSpec = spec |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
@@ -220,25 +245,25 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
if resourceSpec == nil { |
|
|
if resourceSpec == nil { |
|
|
log.Error("no such resourceSpecId(%d)", resourceSpecId, ctx.Data["MsgID"]) |
|
|
|
|
|
|
|
|
log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"]) |
|
|
return errors.New("no such resourceSpec") |
|
|
return errors.New("no such resourceSpec") |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
var datasetName string |
|
|
var datasetName string |
|
|
attach, err := models.GetAttachmentByUUID(uuid) |
|
|
|
|
|
|
|
|
attach, err := models.GetAttachmentByUUID(req.Uuids) |
|
|
if err != nil { |
|
|
if err != nil { |
|
|
//for benchmark, do not return error |
|
|
//for benchmark, do not return error |
|
|
log.Error("GetAttachmentByUUID failed:%v", err) |
|
|
|
|
|
|
|
|
log.Error("GetAttachmentByUUID failed:%v", err, req.Ctx.Data["MsgID"]) |
|
|
} else { |
|
|
} else { |
|
|
datasetName = attach.Name |
|
|
datasetName = attach.Name |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
createTime := timeutil.TimeStampNow() |
|
|
createTime := timeutil.TimeStampNow() |
|
|
jobResult, err := CreateJob(jobName, models.CreateJobParams{ |
|
|
|
|
|
JobName: jobName, |
|
|
|
|
|
|
|
|
jobResult, err := CreateJob(req.JobName, models.CreateJobParams{ |
|
|
|
|
|
JobName: req.JobName, |
|
|
RetryCount: 1, |
|
|
RetryCount: 1, |
|
|
GpuType: gpuQueue, |
|
|
|
|
|
Image: image, |
|
|
|
|
|
|
|
|
GpuType: req.GpuQueue, |
|
|
|
|
|
Image: req.Image, |
|
|
TaskRoles: []models.TaskRole{ |
|
|
TaskRoles: []models.TaskRole{ |
|
|
{ |
|
|
{ |
|
|
Name: SubTaskName, |
|
|
Name: SubTaskName, |
|
@@ -249,7 +274,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, |
|
|
GPUNumber: resourceSpec.GpuNum, |
|
|
GPUNumber: resourceSpec.GpuNum, |
|
|
MemoryMB: resourceSpec.MemMiB, |
|
|
MemoryMB: resourceSpec.MemMiB, |
|
|
ShmMB: resourceSpec.ShareMemMiB, |
|
|
ShmMB: resourceSpec.ShareMemMiB, |
|
|
Command: command, |
|
|
|
|
|
|
|
|
Command: req.Command, |
|
|
NeedIBDevice: false, |
|
|
NeedIBDevice: false, |
|
|
IsMainRole: false, |
|
|
IsMainRole: false, |
|
|
UseNNI: false, |
|
|
UseNNI: false, |
|
@@ -258,7 +283,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, |
|
|
Volumes: []models.Volume{ |
|
|
Volumes: []models.Volume{ |
|
|
{ |
|
|
{ |
|
|
HostPath: models.StHostPath{ |
|
|
HostPath: models.StHostPath{ |
|
|
Path: codePath, |
|
|
|
|
|
|
|
|
Path: req.CodePath, |
|
|
MountPath: CodeMountPath, |
|
|
MountPath: CodeMountPath, |
|
|
ReadOnly: false, |
|
|
ReadOnly: false, |
|
|
}, |
|
|
}, |
|
@@ -272,28 +297,28 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
HostPath: models.StHostPath{ |
|
|
HostPath: models.StHostPath{ |
|
|
Path: modelPath, |
|
|
|
|
|
|
|
|
Path: req.ModelPath, |
|
|
MountPath: ModelMountPath, |
|
|
MountPath: ModelMountPath, |
|
|
ReadOnly: false, |
|
|
ReadOnly: false, |
|
|
}, |
|
|
}, |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
HostPath: models.StHostPath{ |
|
|
HostPath: models.StHostPath{ |
|
|
Path: benchmarkPath, |
|
|
|
|
|
|
|
|
Path: req.BenchmarkPath, |
|
|
MountPath: BenchMarkMountPath, |
|
|
MountPath: BenchMarkMountPath, |
|
|
ReadOnly: true, |
|
|
ReadOnly: true, |
|
|
}, |
|
|
}, |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
HostPath: models.StHostPath{ |
|
|
HostPath: models.StHostPath{ |
|
|
Path: snn4imagenetPath, |
|
|
|
|
|
|
|
|
Path: req.Snn4ImageNetPath, |
|
|
MountPath: Snn4imagenetMountPath, |
|
|
MountPath: Snn4imagenetMountPath, |
|
|
ReadOnly: true, |
|
|
ReadOnly: true, |
|
|
}, |
|
|
}, |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
HostPath: models.StHostPath{ |
|
|
HostPath: models.StHostPath{ |
|
|
Path: brainScorePath, |
|
|
|
|
|
|
|
|
Path: req.BrainScorePath, |
|
|
MountPath: BrainScoreMountPath, |
|
|
MountPath: BrainScoreMountPath, |
|
|
ReadOnly: true, |
|
|
ReadOnly: true, |
|
|
}, |
|
|
}, |
|
@@ -301,42 +326,42 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, |
|
|
}, |
|
|
}, |
|
|
}) |
|
|
}) |
|
|
if err != nil { |
|
|
if err != nil { |
|
|
log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) |
|
|
|
|
|
|
|
|
log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"]) |
|
|
return err |
|
|
return err |
|
|
} |
|
|
} |
|
|
if jobResult.Code != Success { |
|
|
if jobResult.Code != Success { |
|
|
log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"]) |
|
|
|
|
|
|
|
|
log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"]) |
|
|
return errors.New(jobResult.Msg) |
|
|
return errors.New(jobResult.Msg) |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
var jobID = jobResult.Payload["jobId"].(string) |
|
|
var jobID = jobResult.Payload["jobId"].(string) |
|
|
err = models.CreateCloudbrain(&models.Cloudbrain{ |
|
|
err = models.CreateCloudbrain(&models.Cloudbrain{ |
|
|
Status: string(models.JobWaiting), |
|
|
Status: string(models.JobWaiting), |
|
|
UserID: ctx.User.ID, |
|
|
|
|
|
RepoID: ctx.Repo.Repository.ID, |
|
|
|
|
|
|
|
|
UserID: req.Ctx.User.ID, |
|
|
|
|
|
RepoID: req.Ctx.Repo.Repository.ID, |
|
|
JobID: jobID, |
|
|
JobID: jobID, |
|
|
JobName: jobName, |
|
|
|
|
|
DisplayJobName: displayJobName, |
|
|
|
|
|
|
|
|
JobName: req.JobName, |
|
|
|
|
|
DisplayJobName: req.DisplayJobName, |
|
|
SubTaskName: SubTaskName, |
|
|
SubTaskName: SubTaskName, |
|
|
JobType: jobType, |
|
|
|
|
|
|
|
|
JobType: req.JobType, |
|
|
Type: models.TypeCloudBrainOne, |
|
|
Type: models.TypeCloudBrainOne, |
|
|
Uuid: uuid, |
|
|
|
|
|
Image: image, |
|
|
|
|
|
GpuQueue: gpuQueue, |
|
|
|
|
|
ResourceSpecId: resourceSpecId, |
|
|
|
|
|
|
|
|
Uuid: req.Uuids, |
|
|
|
|
|
Image: req.Image, |
|
|
|
|
|
GpuQueue: req.GpuQueue, |
|
|
|
|
|
ResourceSpecId: req.ResourceSpecId, |
|
|
ComputeResource: models.GPUResource, |
|
|
ComputeResource: models.GPUResource, |
|
|
BenchmarkTypeID: benchmarkTypeID, |
|
|
|
|
|
BenchmarkChildTypeID: benchmarkChildTypeID, |
|
|
|
|
|
Description: description, |
|
|
|
|
|
|
|
|
BenchmarkTypeID: req.BenchmarkTypeID, |
|
|
|
|
|
BenchmarkChildTypeID: req.BenchmarkChildTypeID, |
|
|
|
|
|
Description: req.Description, |
|
|
IsLatestVersion: "1", |
|
|
IsLatestVersion: "1", |
|
|
VersionCount: versionCount, |
|
|
VersionCount: versionCount, |
|
|
BranchName: branchName, |
|
|
|
|
|
BootFile: bootFile, |
|
|
|
|
|
|
|
|
BranchName: req.BranchName, |
|
|
|
|
|
BootFile: req.BootFile, |
|
|
DatasetName: datasetName, |
|
|
DatasetName: datasetName, |
|
|
Parameters: params, |
|
|
|
|
|
|
|
|
Parameters: req.Params, |
|
|
CreatedUnix: createTime, |
|
|
CreatedUnix: createTime, |
|
|
UpdatedUnix: createTime, |
|
|
UpdatedUnix: createTime, |
|
|
CommitID: commitID, |
|
|
|
|
|
|
|
|
CommitID: req.CommitID, |
|
|
}) |
|
|
}) |
|
|
|
|
|
|
|
|
if err != nil { |
|
|
if err != nil { |
|
@@ -345,17 +370,17 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, |
|
|
|
|
|
|
|
|
task, err := models.GetCloudbrainByJobID(jobID) |
|
|
task, err := models.GetCloudbrainByJobID(jobID) |
|
|
if err != nil { |
|
|
if err != nil { |
|
|
log.Error("GetCloudbrainByName failed: %v", err.Error()) |
|
|
|
|
|
|
|
|
log.Error("GetCloudbrainByJobID failed: %v", err.Error()) |
|
|
return err |
|
|
return err |
|
|
} |
|
|
} |
|
|
stringId := strconv.FormatInt(task.ID, 10) |
|
|
stringId := strconv.FormatInt(task.ID, 10) |
|
|
|
|
|
|
|
|
if IsBenchmarkJob(jobType) { |
|
|
|
|
|
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateBenchMarkTask) |
|
|
|
|
|
} else if string(models.JobTypeTrain) == jobType { |
|
|
|
|
|
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, displayJobName, models.ActionCreateGPUTrainTask) |
|
|
|
|
|
|
|
|
if IsBenchmarkJob(req.JobType) { |
|
|
|
|
|
notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask) |
|
|
|
|
|
} else if string(models.JobTypeTrain) == req.JobType { |
|
|
|
|
|
notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask) |
|
|
} else { |
|
|
} else { |
|
|
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugGPUTask) |
|
|
|
|
|
|
|
|
notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask) |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
return nil |
|
|
return nil |
|
|