@@ -499,8 +499,32 @@ type Config struct { | |||||
LogUrl string `json:"log_url"` | LogUrl string `json:"log_url"` | ||||
//UserImageUrl string `json:"user_image_url"` | //UserImageUrl string `json:"user_image_url"` | ||||
//UserCommand string `json:"user_command"` | //UserCommand string `json:"user_command"` | ||||
CreateVersion bool `json:"create_version"` | |||||
Volumes []Volumes `json:"volumes"` | |||||
//CreateVersion bool `json:"create_version"` | |||||
//Volumes []Volumes `json:"volumes"` | |||||
Flavor Flavor `json:"flavor"` | |||||
PoolID string `json:"pool_id"` | |||||
} | |||||
type CreateConfigParams struct { | |||||
ConfigName string `json:"config_name"` | |||||
Description string `json:"config_desc"` | |||||
WorkServerNum int `json:"worker_server_num"` | |||||
AppUrl string `json:"app_url"` //训练作业的代码目录 | |||||
BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 | |||||
Parameter []Parameter `json:"parameter"` | |||||
DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL | |||||
//DatasetID string `json:"dataset_id"` | |||||
//DataVersionID string `json:"dataset_version_id"` | |||||
//DataSource []DataSource `json:"data_source"` | |||||
//SpecID int64 `json:"spec_id"` | |||||
EngineID int64 `json:"engine_id"` | |||||
//ModelID int64 `json:"model_id"` | |||||
TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL | |||||
LogUrl string `json:"log_url"` | |||||
//UserImageUrl string `json:"user_image_url"` | |||||
//UserCommand string `json:"user_command"` | |||||
//CreateVersion bool `json:"create_version"` | |||||
//Volumes []Volumes `json:"volumes"` | |||||
Flavor Flavor `json:"flavor"` | Flavor Flavor `json:"flavor"` | ||||
PoolID string `json:"pool_id"` | PoolID string `json:"pool_id"` | ||||
} | } | ||||
@@ -552,6 +576,12 @@ type CreateTrainJobResult struct { | |||||
VersionName string `json:"version_name"` | VersionName string `json:"version_name"` | ||||
} | } | ||||
type CreateTrainJobConfigResult struct { | |||||
ErrorCode string `json:"error_code"` | |||||
ErrorMsg string `json:"error_msg"` | |||||
IsSuccess bool `json:"is_success"` | |||||
} | |||||
type GetResourceSpecsResult struct { | type GetResourceSpecsResult struct { | ||||
ErrorCode string `json:"error_code"` | ErrorCode string `json:"error_code"` | ||||
ErrorMsg string `json:"error_msg"` | ErrorMsg string `json:"error_msg"` | ||||
@@ -574,6 +604,12 @@ type Specs struct { | |||||
InterfaceType int `json:"interface_type"` | InterfaceType int `json:"interface_type"` | ||||
} | } | ||||
type ErrorResult struct { | |||||
ErrorCode string `json:"error_code"` | |||||
ErrorMsg string `json:"error_message"` | |||||
IsSuccess bool `json:"is_success"` | |||||
} | |||||
func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | ||||
sess := x.NewSession() | sess := x.NewSession() | ||||
defer sess.Close() | defer sess.Close() | ||||
@@ -25,6 +25,7 @@ const ( | |||||
urlNotebook = "/demanager/instances" | urlNotebook = "/demanager/instances" | ||||
urlTrainJob = "/training-jobs" | urlTrainJob = "/training-jobs" | ||||
urlResourceSpecs = "/job/resource-specs" | urlResourceSpecs = "/job/resource-specs" | ||||
urlTrainJobConfig = "/training-job-configs" | |||||
errorCodeExceedLimit = "ModelArts.0118" | errorCodeExceedLimit = "ModelArts.0118" | ||||
) | ) | ||||
@@ -86,7 +87,6 @@ func getToken() error { | |||||
} | } | ||||
TOKEN = res.Header().Get("X-Subject-Token") | TOKEN = res.Header().Get("X-Subject-Token") | ||||
log.Info(TOKEN) | |||||
return nil | return nil | ||||
} | } | ||||
@@ -296,8 +296,6 @@ func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.Create | |||||
client := getRestyClient() | client := getRestyClient() | ||||
var result models.CreateTrainJobResult | var result models.CreateTrainJobResult | ||||
log.Info("%+v",createJobParams) | |||||
retry := 0 | retry := 0 | ||||
sendjob: | sendjob: | ||||
@@ -323,8 +321,13 @@ sendjob: | |||||
} | } | ||||
if res.StatusCode() != http.StatusOK { | if res.StatusCode() != http.StatusOK { | ||||
log.Error("createTrainJob failed", res.StatusCode(), res.RawResponse.Body, result.ErrorCode, result.ErrorMsg) | |||||
return &result, fmt.Errorf("createTrainJob failed(%d)", res.StatusCode()) | |||||
var temp models.ErrorResult | |||||
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
} | |||||
log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
} | } | ||||
if !result.IsSuccess { | if !result.IsSuccess { | ||||
@@ -360,8 +363,13 @@ sendjob: | |||||
} | } | ||||
if res.StatusCode() != http.StatusOK { | if res.StatusCode() != http.StatusOK { | ||||
log.Error("GetResourceSpecs failed(%d)", res.StatusCode()) | |||||
return &result, fmt.Errorf("GetResourceSpecs failed(%d)", res.StatusCode()) | |||||
var temp models.ErrorResult | |||||
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
} | |||||
log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
} | } | ||||
if !result.IsSuccess { | if !result.IsSuccess { | ||||
@@ -371,3 +379,49 @@ sendjob: | |||||
return &result, nil | return &result, nil | ||||
} | } | ||||
func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) { | |||||
checkSetting() | |||||
client := getRestyClient() | |||||
var result models.CreateTrainJobConfigResult | |||||
retry := 0 | |||||
sendjob: | |||||
res, err := client.R(). | |||||
SetHeader("Content-Type", "application/json"). | |||||
SetAuthToken(TOKEN). | |||||
SetBody(req). | |||||
SetResult(&result). | |||||
Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) | |||||
if err != nil { | |||||
return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err) | |||||
} | |||||
if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
retry++ | |||||
_ = getToken() | |||||
goto sendjob | |||||
} | |||||
temp, _ := json.Marshal(req) | |||||
log.Info("%s", temp) | |||||
if res.StatusCode() != http.StatusOK { | |||||
var temp models.ErrorResult | |||||
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
} | |||||
log.Error("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
} | |||||
if !result.IsSuccess { | |||||
log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
} | |||||
return &result, nil | |||||
} |
@@ -354,10 +354,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
return | return | ||||
} | } | ||||
if isSaveParam == "on" { | |||||
//todo: save param | |||||
} | |||||
if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil { | if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil { | ||||
log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) | log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) | ||||
ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) | ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) | ||||
@@ -383,6 +379,39 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
return | return | ||||
} | } | ||||
if isSaveParam == "on" { | |||||
if form.ParameterTemplateName == "" { | |||||
log.Error("ParameterTemplateName is empty") | |||||
ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form) | |||||
return | |||||
} | |||||
_, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ | |||||
ConfigName: form.ParameterTemplateName, | |||||
Description: form.PrameterDescription, | |||||
DataUrl: dataPath, | |||||
AppUrl: codeObsPath, | |||||
BootFileUrl: codeObsPath + bootFile, | |||||
TrainUrl: outputObsPath, | |||||
Flavor: models.Flavor{ | |||||
Code: flavorCode, | |||||
}, | |||||
WorkServerNum: workServerNumber, | |||||
EngineID: int64(engineID), | |||||
LogUrl: logObsPath, | |||||
PoolID: poolID, | |||||
Parameter: []models.Parameter{ | |||||
}, | |||||
}) | |||||
if err != nil { | |||||
log.Error("Failed to CreateTrainJobConfig: %v", err) | |||||
ctx.RenderWithErr("保存作业参数失败:" + err.Error(), tplModelArtsTrainJobNew, &form) | |||||
return | |||||
} | |||||
} | |||||
req := &modelarts.GenerateTrainJobReq{ | req := &modelarts.GenerateTrainJobReq{ | ||||
JobName: jobName, | JobName: jobName, | ||||
DataUrl: dataPath, | DataUrl: dataPath, | ||||