Browse Source

Merge branch 'multi-dataset' of https://git.openi.org.cn/OpenI/aiforge into multi-dataset

pull/2384/head
zhoupzh 3 years ago
parent
commit
7f2c568742
2 changed files with 189 additions and 59 deletions
  1. +68
    -43
      modules/cloudbrain/cloudbrain.go
  2. +121
    -16
      routers/repo/cloudbrain.go

+ 68
- 43
modules/cloudbrain/cloudbrain.go View File

@@ -44,6 +44,31 @@ var (
TrainResourceSpecs *models.ResourceSpecs
)

type GenerateCloudBrainTaskReq struct {
Ctx *context.Context
DisplayJobName string
JobName string
Image string
Command string
Uuids string
CodePath string
ModelPath string
BenchmarkPath string
Snn4ImageNetPath string
BrainScorePath string
JobType string
GpuQueue string
Description string
BranchName string
BootFile string
Params string
CommitID string
DataLocalPath string
BenchmarkTypeID int
BenchmarkChildTypeID int
ResourceSpecId int
}

func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
if !ctx.IsSigned {
return false
@@ -187,23 +212,23 @@ func AdminOrImageCreaterRight(ctx *context.Context) {

}

func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid, codePath, modelPath, benchmarkPath, snn4imagenetPath, brainScorePath, jobType, gpuQueue, description, branchName, bootFile, params, commitID string, benchmarkTypeID, benchmarkChildTypeID, resourceSpecId int) error {
func GenerateTask(req GenerateCloudBrainTaskReq) error {

dataActualPath := setting.Attachment.Minio.RealPath +
setting.Attachment.Minio.Bucket + "/" +
setting.Attachment.Minio.BasePath +
models.AttachmentRelativePath(uuid) +
uuid
models.AttachmentRelativePath(req.Uuids) +
req.Uuids

var resourceSpec *models.ResourceSpec
var versionCount int
if jobType == string(models.JobTypeTrain) {
if req.JobType == string(models.JobTypeTrain) {
versionCount = 1
if TrainResourceSpecs == nil {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
}
for _, spec := range TrainResourceSpecs.ResourceSpec {
if resourceSpecId == spec.Id {
if req.ResourceSpecId == spec.Id {
resourceSpec = spec
}
}
@@ -212,7 +237,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command,
json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
}
for _, spec := range ResourceSpecs.ResourceSpec {
if resourceSpecId == spec.Id {
if req.ResourceSpecId == spec.Id {
resourceSpec = spec
}
}
@@ -220,25 +245,25 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command,
}

if resourceSpec == nil {
log.Error("no such resourceSpecId(%d)", resourceSpecId, ctx.Data["MsgID"])
log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"])
return errors.New("no such resourceSpec")
}

var datasetName string
attach, err := models.GetAttachmentByUUID(uuid)
attach, err := models.GetAttachmentByUUID(req.Uuids)
if err != nil {
//for benchmark, do not return error
log.Error("GetAttachmentByUUID failed:%v", err)
log.Error("GetAttachmentByUUID failed:%v", err, req.Ctx.Data["MsgID"])
} else {
datasetName = attach.Name
}

createTime := timeutil.TimeStampNow()
jobResult, err := CreateJob(jobName, models.CreateJobParams{
JobName: jobName,
jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
JobName: req.JobName,
RetryCount: 1,
GpuType: gpuQueue,
Image: image,
GpuType: req.GpuQueue,
Image: req.Image,
TaskRoles: []models.TaskRole{
{
Name: SubTaskName,
@@ -249,7 +274,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command,
GPUNumber: resourceSpec.GpuNum,
MemoryMB: resourceSpec.MemMiB,
ShmMB: resourceSpec.ShareMemMiB,
Command: command,
Command: req.Command,
NeedIBDevice: false,
IsMainRole: false,
UseNNI: false,
@@ -258,7 +283,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command,
Volumes: []models.Volume{
{
HostPath: models.StHostPath{
Path: codePath,
Path: req.CodePath,
MountPath: CodeMountPath,
ReadOnly: false,
},
@@ -272,28 +297,28 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command,
},
{
HostPath: models.StHostPath{
Path: modelPath,
Path: req.ModelPath,
MountPath: ModelMountPath,
ReadOnly: false,
},
},
{
HostPath: models.StHostPath{
Path: benchmarkPath,
Path: req.BenchmarkPath,
MountPath: BenchMarkMountPath,
ReadOnly: true,
},
},
{
HostPath: models.StHostPath{
Path: snn4imagenetPath,
Path: req.Snn4ImageNetPath,
MountPath: Snn4imagenetMountPath,
ReadOnly: true,
},
},
{
HostPath: models.StHostPath{
Path: brainScorePath,
Path: req.BrainScorePath,
MountPath: BrainScoreMountPath,
ReadOnly: true,
},
@@ -301,42 +326,42 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command,
},
})
if err != nil {
log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
return err
}
if jobResult.Code != Success {
log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
return errors.New(jobResult.Msg)
}

var jobID = jobResult.Payload["jobId"].(string)
err = models.CreateCloudbrain(&models.Cloudbrain{
Status: string(models.JobWaiting),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
UserID: req.Ctx.User.ID,
RepoID: req.Ctx.Repo.Repository.ID,
JobID: jobID,
JobName: jobName,
DisplayJobName: displayJobName,
JobName: req.JobName,
DisplayJobName: req.DisplayJobName,
SubTaskName: SubTaskName,
JobType: jobType,
JobType: req.JobType,
Type: models.TypeCloudBrainOne,
Uuid: uuid,
Image: image,
GpuQueue: gpuQueue,
ResourceSpecId: resourceSpecId,
Uuid: req.Uuids,
Image: req.Image,
GpuQueue: req.GpuQueue,
ResourceSpecId: req.ResourceSpecId,
ComputeResource: models.GPUResource,
BenchmarkTypeID: benchmarkTypeID,
BenchmarkChildTypeID: benchmarkChildTypeID,
Description: description,
BenchmarkTypeID: req.BenchmarkTypeID,
BenchmarkChildTypeID: req.BenchmarkChildTypeID,
Description: req.Description,
IsLatestVersion: "1",
VersionCount: versionCount,
BranchName: branchName,
BootFile: bootFile,
BranchName: req.BranchName,
BootFile: req.BootFile,
DatasetName: datasetName,
Parameters: params,
Parameters: req.Params,
CreatedUnix: createTime,
UpdatedUnix: createTime,
CommitID: commitID,
CommitID: req.CommitID,
})

if err != nil {
@@ -345,17 +370,17 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command,

task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByName failed: %v", err.Error())
log.Error("GetCloudbrainByJobID failed: %v", err.Error())
return err
}
stringId := strconv.FormatInt(task.ID, 10)

if IsBenchmarkJob(jobType) {
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateBenchMarkTask)
} else if string(models.JobTypeTrain) == jobType {
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, displayJobName, models.ActionCreateGPUTrainTask)
if IsBenchmarkJob(req.JobType) {
notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
} else if string(models.JobTypeTrain) == req.JobType {
notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
} else {
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugGPUTask)
notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
}

return nil


+ 121
- 16
routers/repo/cloudbrain.go View File

@@ -207,7 +207,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
displayJobName := form.DisplayJobName
jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
image := strings.TrimSpace(form.Image)
uuid := form.Attachment
uuids := form.Attachment
jobType := form.JobType
gpuQueue := form.GpuType
codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
@@ -273,6 +273,13 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
}
}

if err = checkDatasetLimit(uuids); err != nil {
log.Error("checkDatasetLimit failed: %v", err, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("checkDatasetLimit failed", tpl, &form)
return
}

if branchName == "" {
branchName = cloudbrain.DefaultBranchName
}
@@ -285,11 +292,31 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {

commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)

err = cloudbrain.GenerateTask(ctx, displayJobName, jobName, image, command, uuid, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, form.Description, branchName, form.BootFile, form.Params,
commitID, 0, 0, resourceSpecId)
req := cloudbrain.GenerateCloudBrainTaskReq{
Ctx: ctx,
DisplayJobName: displayJobName,
JobName: jobName,
Image: image,
Command: command,
Uuids: uuids,
CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
ModelPath: storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
JobType: jobType,
GpuQueue: gpuQueue,
Description: form.Description,
BranchName: branchName,
BootFile: form.BootFile,
Params: form.Params,
CommitID: commitID,
BenchmarkTypeID: 0,
BenchmarkChildTypeID: 0,
ResourceSpecId: resourceSpecId,
}

err = cloudbrain.GenerateTask(req)
if err != nil {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tpl, &form)
@@ -1982,11 +2009,38 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo
//return
}

err = cloudbrain.GenerateTask(ctx, displayJobName, jobName, image, command, childInfo.Attachment, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), string(models.JobTypeBenchmark), gpuQueue, form.Description, cloudbrain.DefaultBranchName, "", "",
"", benchmarkTypeID, benchmarkChildTypeID, resourceSpecId)
dataActualPath := setting.Attachment.Minio.RealPath +
setting.Attachment.Minio.Bucket + "/" +
setting.Attachment.Minio.BasePath +
models.AttachmentRelativePath(childInfo.Attachment) +
childInfo.Attachment

req := cloudbrain.GenerateCloudBrainTaskReq{
Ctx: ctx,
DisplayJobName: displayJobName,
JobName: jobName,
Image: image,
Command: command,
Uuids: childInfo.Attachment,
CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
ModelPath: storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
JobType: string(models.JobTypeBenchmark),
GpuQueue: gpuQueue,
Description: form.Description,
BranchName: cloudbrain.DefaultBranchName,
BootFile: "",
Params: "",
CommitID: "",
BenchmarkTypeID: benchmarkTypeID,
BenchmarkChildTypeID: benchmarkChildTypeID,
ResourceSpecId: resourceSpecId,
DataLocalPath: dataActualPath,
}

err = cloudbrain.GenerateTask(req)
if err != nil {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tplCloudBrainBenchmarkNew, &form)
@@ -2080,11 +2134,38 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
command = fmt.Sprintf(cloudbrain.BrainScoreCommand, getBrainRegion(benchmarkChildTypeID), displayJobName, trimSpaceNewlineInString(form.Description))
}

err = cloudbrain.GenerateTask(ctx, displayJobName, jobName, image, command, uuid, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, form.Description, branchName, form.BootFile, form.Params,
"", 0, benchmarkChildTypeID, resourceSpecId)
dataActualPath := setting.Attachment.Minio.RealPath +
setting.Attachment.Minio.Bucket + "/" +
setting.Attachment.Minio.BasePath +
models.AttachmentRelativePath(uuid) +
uuid

req := cloudbrain.GenerateCloudBrainTaskReq{
Ctx: ctx,
DisplayJobName: displayJobName,
JobName: jobName,
Image: image,
Command: command,
Uuids: uuid,
CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
ModelPath: storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
JobType: jobType,
GpuQueue: gpuQueue,
Description: form.Description,
BranchName: branchName,
BootFile: form.BootFile,
Params: form.Params,
CommitID: "",
BenchmarkTypeID: 0,
BenchmarkChildTypeID: benchmarkChildTypeID,
ResourceSpecId: resourceSpecId,
DataLocalPath: dataActualPath,
}

err = cloudbrain.GenerateTask(req)
if err != nil {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tpl, &form)
@@ -2195,3 +2276,27 @@ func GetBenchmarkTypes(ctx *context.Context) *models.BenchmarkTypes {
}
return benchmarkTypesMap[lang]
}

func checkDatasetLimit(uuidStr string) error {
uuids := strings.Split(uuidStr, ";")
if len(uuids) > 5 {
log.Error("the dataset count(%d) exceed the limit", len(uuids))
return errors.New("the dataset count exceed the limit")
}

attachNames := make(map[string]string)
for _, uuid := range uuids {
attach, err := models.GetAttachmentByUUID(uuid)
if err != nil {
log.Error("GetAttachmentByUUID failed: %v", err)
return err
}

if _, ok := attachNames[attach.Name]; ok {
log.Error("the dataset name is same: %v", attach.Name)
return errors.New("the dataset name is same")
}
attachNames[attach.Name] = attach.Name
}
return nil
}

Loading…
Cancel
Save