Browse Source

Merge pull request 'fix-2524' (#3084) from fix-2524 into V20221102

Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/3084
Reviewed-by: zouap <zouap@pcl.ac.cn>
pull/3162/head
liuzx 2 years ago
parent
commit
741b2f48cd
4 changed files with 162 additions and 20 deletions
  1. +22
    -1
      models/cloudbrain.go
  2. +46
    -19
      modules/modelarts/modelarts.go
  3. +60
    -0
      modules/modelarts/resty.go
  4. +34
    -0
      routers/repo/modelarts.go

+ 22
- 1
models/cloudbrain.go View File

@@ -1070,6 +1070,12 @@ type CreateInferenceJobParams struct {
InfConfig InfConfig `json:"config"` InfConfig InfConfig `json:"config"`
WorkspaceID string `json:"workspace_id"` WorkspaceID string `json:"workspace_id"`
} }
// CreateInfUserImageParams is the JSON payload for creating an inference job
// whose configuration (Config) carries a user-supplied image and command.
type CreateInfUserImageParams struct {
JobName string `json:"job_name"`
// Description is serialized under the key "job_desc".
Description string `json:"job_desc"`
// Config holds the job settings, including the user image URL and command.
Config InfUserImageConfig `json:"config"`
WorkspaceID string `json:"workspace_id"`
}


type InfConfig struct { type InfConfig struct {
WorkServerNum int `json:"worker_server_num"` WorkServerNum int `json:"worker_server_num"`
@@ -1084,6 +1090,21 @@ type InfConfig struct {
PoolID string `json:"pool_id"` PoolID string `json:"pool_id"`
} }


// InfUserImageConfig is the "config" section of CreateInfUserImageParams: the
// settings of an inference job that runs on a user-supplied image.
type InfUserImageConfig struct {
WorkServerNum int `json:"worker_server_num"`
AppUrl string `json:"app_url"` // code directory of the training job
BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be located under the code directory
Parameter []Parameter `json:"parameter"`
DataUrl string `json:"data_url"` // OBS path URL of the dataset required by the training job
EngineID int64 `json:"engine_id"`
LogUrl string `json:"log_url"`
CreateVersion bool `json:"create_version"`
Flavor Flavor `json:"flavor"`
PoolID string `json:"pool_id"`
// UserImageUrl and UserCommand name the custom image and the command to run in it.
UserImageUrl string `json:"user_image_url"`
UserCommand string `json:"user_command"`
}

type CreateTrainJobVersionParams struct { type CreateTrainJobVersionParams struct {
Description string `json:"job_desc"` Description string `json:"job_desc"`
Config TrainJobVersionConfig `json:"config"` Config TrainJobVersionConfig `json:"config"`
@@ -2024,7 +2045,7 @@ func GetCloudbrainRunCountByRepoID(repoID int64) (int, error) {
} }


func GetModelSafetyCountByUserID(userID int64) (int, error) { func GetModelSafetyCountByUserID(userID int64) (int, error) {
count, err := x.In("status", JobWaiting, JobRunning,ModelArtsTrainJobInit,ModelArtsTrainJobImageCreating,ModelArtsTrainJobSubmitTrying,ModelArtsTrainJobScaling,ModelArtsTrainJobCheckInit,ModelArtsTrainJobCheckRunning,ModelArtsTrainJobCheckRunningCompleted).And("job_type = ? and user_id = ?", string(JobTypeModelSafety), userID).Count(new(Cloudbrain))
count, err := x.In("status", JobWaiting, JobRunning, ModelArtsTrainJobInit, ModelArtsTrainJobImageCreating, ModelArtsTrainJobSubmitTrying, ModelArtsTrainJobScaling, ModelArtsTrainJobCheckInit, ModelArtsTrainJobCheckRunning, ModelArtsTrainJobCheckRunningCompleted).And("job_type = ? and user_id = ?", string(JobTypeModelSafety), userID).Count(new(Cloudbrain))
return int(count), err return int(count), err
} }




+ 46
- 19
modules/modelarts/modelarts.go View File

@@ -143,6 +143,8 @@ type GenerateInferenceJobReq struct {
Spec *models.Specification Spec *models.Specification
DatasetName string DatasetName string
JobType string JobType string
UserImageUrl string
UserCommand string
} }


type VersionInfo struct { type VersionInfo struct {
@@ -682,26 +684,51 @@ func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {


func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) { func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
createTime := timeutil.TimeStampNow() createTime := timeutil.TimeStampNow()
jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
JobName: req.JobName,
Description: req.Description,
InfConfig: models.InfConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
// TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.Spec.SourceSpecId,
var jobResult *models.CreateTrainJobResult
var createErr error
if req.EngineID < 0 {
jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
JobName: req.JobName,
Description: req.Description,
Config: models.InfUserImageConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
// TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.Spec.SourceSpecId,
},
Parameter: req.Parameters,
UserImageUrl: req.UserImageUrl,
UserCommand: req.UserCommand,
}, },
Parameter: req.Parameters,
},
})
if err != nil {
})
} else {
jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
JobName: req.JobName,
Description: req.Description,
InfConfig: models.InfConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
// TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.Spec.SourceSpecId,
},
Parameter: req.Parameters,
},
})
}
if createErr != nil {
log.Error("createInferenceJob failed: %v", err.Error()) log.Error("createInferenceJob failed: %v", err.Error())
if strings.HasPrefix(err.Error(), UnknownErrorPrefix) { if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
log.Info("(%s)unknown error, set temp status", req.DisplayJobName) log.Info("(%s)unknown error, set temp status", req.DisplayJobName)


+ 60
- 0
modules/modelarts/resty.go View File

@@ -1197,6 +1197,66 @@ sendjob:
return &result, nil return &result, nil
} }


// createInferenceJobUserImage posts a user-image inference job to the
// train-job endpoint and decodes the reply into a CreateTrainJobResult.
//
// A 401 reply triggers one token refresh followed by a single resend. Any
// other non-200 reply is parsed as models.ErrorResult and converted into an
// error; a 502 is additionally tagged with UnknownErrorPrefix. A transport
// failure returns a nil result; every other error path returns the (possibly
// partially filled) result together with the error.
func createInferenceJobUserImage(createJobParams models.CreateInfUserImageParams) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	// attempt counts resends; only the first 401 is allowed to loop again.
	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetHeader("Content-Type", "application/json").
			SetAuthToken(TOKEN).
			SetBody(createJobParams).
			SetResult(&result).
			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)
		if err != nil {
			return nil, fmt.Errorf("resty create train-job: %s", err)
		}

		// Log the request payload for every attempt that reached the server.
		payload, _ := json.Marshal(createJobParams)
		log.Info("%s", payload)

		status := res.StatusCode()
		if status == http.StatusUnauthorized && attempt < 1 {
			// Token presumably expired: refresh it and send the job once more.
			_ = getToken()
			continue
		}

		if status != http.StatusOK {
			body := res.String()
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(body), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", body, err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", body, err.Error())
			}
			log.Error("createInferenceJobUserImage failed(%d):%s(%s)", status, apiErr.ErrorCode, apiErr.ErrorMsg)

			// Map the two known "invalid path" messages to dedicated errors;
			// the boot file is checked first, as before.
			switch apiErr.ErrorMsg {
			case "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'.":
				log.Error("启动文件错误!createInferenceJobUserImage failed(%d):%s(%s)", status, apiErr.ErrorCode, apiErr.ErrorMsg)
				return &result, fmt.Errorf("启动文件错误!")
			case "Invalid OBS path '" + createJobParams.Config.DataUrl + "'.":
				log.Error("数据集错误!createInferenceJobUserImage failed(%d):%s(%s)", status, apiErr.ErrorCode, apiErr.ErrorMsg)
				return &result, fmt.Errorf("数据集错误!")
			}

			if status == http.StatusBadGateway {
				return &result, fmt.Errorf(UnknownErrorPrefix+"createInferenceJobUserImage failed(%d):%s(%s)", status, apiErr.ErrorCode, apiErr.ErrorMsg)
			}
			return &result, fmt.Errorf("createInferenceJobUserImage failed(%d):%s(%s)", status, apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("createInferenceJobUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
			return &result, fmt.Errorf("createInferenceJobUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		}

		return &result, nil
	}
}

func createNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) { func createNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) {
checkSetting() checkSetting()
client := getRestyClient() client := getRestyClient()


+ 34
- 0
routers/repo/modelarts.go View File

@@ -1312,6 +1312,36 @@ func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, s
return userCommand, userImageUrl return userCommand, userImageUrl
} }


// getInfJobUserCommand builds the start command and looks up the image URL for
// an inference job that runs on a user image.
//
// Only negative engine ids take this path; for any other id both results are
// empty strings. The image URL is taken from the entry in
// setting.EngineVersions whose ID equals engineId and stays empty when that
// setting cannot be parsed or has no matching entry.
//
// Returns (userCommand, userImageUrl).
func getInfJobUserCommand(engineId int, req *modelarts.GenerateInferenceJobReq) (string, string) {
	if engineId >= 0 {
		return "", ""
	}

	// Last directory of the code path. strings.Split always yields at least
	// one element, so a length check can never trigger the fallback; test the
	// segment itself so an empty or "/"-only path falls back to "code".
	lastCodeDir := "code"
	segments := strings.Split(strings.Trim(req.CodeObsPath, "/"), "/")
	if last := segments[len(segments)-1]; last != "" {
		lastCodeDir = last
	}

	// NOTE(review): values are wrapped in single quotes without escaping;
	// confirm upstream validation if parameters may contain quote characters.
	var cmd strings.Builder
	cmd.WriteString("/bin/bash /home/work/run_train.sh 's3://" + req.CodeObsPath + "' '" + lastCodeDir + "/" + req.BootFile + "' '/tmp/log/train.log' --'data_url'='s3://" + req.DataUrl + "' --'train_url'='s3://" + req.TrainUrl + "'")
	for _, param := range req.Parameters {
		cmd.WriteString(" --'" + param.Label + "'='" + param.Value + "'")
	}

	userImageUrl := ""
	var versionInfos modelarts.VersionInfo
	if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
		// Pass the error as an argument so '%' in its text is not read as a format verb.
		log.Info("json parse err.%s", err.Error())
	} else {
		for _, engine := range versionInfos.Version {
			if engine.ID == engineId {
				userImageUrl = engine.Url
				break
			}
		}
	}

	return cmd.String(), userImageUrl
}

func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
ctx.Data["PageIsTrainJob"] = true ctx.Data["PageIsTrainJob"] = true
var jobID = ctx.Params(":jobid") var jobID = ctx.Params(":jobid")
@@ -2171,6 +2201,10 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
JobType: string(models.JobTypeInference), JobType: string(models.JobTypeInference),
} }


userCommand, userImageUrl := getInfJobUserCommand(engineID, req)
req.UserCommand = userCommand
req.UserImageUrl = userImageUrl

err = modelarts.GenerateInferenceJob(ctx, req) err = modelarts.GenerateInferenceJob(ctx, req)
if err != nil { if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error()) log.Error("GenerateTrainJob failed:%v", err.Error())


Loading…
Cancel
Save