From cdae759a091324f1acc109153d6e2e9135a3a47c Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Fri, 19 Aug 2022 17:53:33 +0800 Subject: [PATCH 01/26] init --- modules/modelarts_cd/modelarts.go | 227 ++++++++++++++++++++++++++++ modules/modelarts_cd/resty.go | 246 +++++++++++++++++++++++++++++++ modules/modelarts_gateway/core/escape.go | 42 ++++++ modules/modelarts_gateway/core/signer.go | 208 ++++++++++++++++++++++++++ modules/setting/setting.go | 16 ++ 5 files changed, 739 insertions(+) create mode 100755 modules/modelarts_cd/modelarts.go create mode 100755 modules/modelarts_cd/resty.go create mode 100755 modules/modelarts_gateway/core/escape.go create mode 100755 modules/modelarts_gateway/core/signer.go diff --git a/modules/modelarts_cd/modelarts.go b/modules/modelarts_cd/modelarts.go new file mode 100755 index 000000000..959907c08 --- /dev/null +++ b/modules/modelarts_cd/modelarts.go @@ -0,0 +1,227 @@ +package modelarts_cd + +import ( + "encoding/json" + "errors" + "strconv" + "strings" + + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/context" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/notification" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/timeutil" +) + +const ( + //notebook + storageTypeOBS = "obs" + autoStopDuration = 4 * 60 * 60 + autoStopDurationMs = 4 * 60 * 60 * 1000 + MORDELART_USER_IMAGE_ENGINE_ID = -1 + DataSetMountPath = "/home/ma-user/work" + NotebookEnv = "Python3" + NotebookType = "Ascend" + FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" + + //train-job + CodePath = "/code/" + OutputPath = "/output/" + ResultPath = "/result/" + LogPath = "/log/" + JobPath = "/job/" + OrderDesc = "desc" //向下查询 + OrderAsc = "asc" //向上查询 + Lines = 500 + TrainUrl = "train_url" + DataUrl = "data_url" + MultiDataUrl = "multi_data_url" + ResultUrl = "result_url" + CkptUrl = "ckpt_url" + DeviceTarget = "device_target" + Ascend = "Ascend" + PerPage = 10 + IsLatestVersion = "1" + NotLatestVersion = "0" + VersionCountOne = 1 + + SortByCreateTime = "create_time" + ConfigTypeCustom = "custom" + TotalVersionCount = 1 +) + +var ( + poolInfos *models.PoolInfos + FlavorInfos *models.FlavorInfos + ImageInfos *models.ImageInfosModelArts + TrainFlavorInfos *Flavor + SpecialPools *models.SpecialPools +) + +type VersionInfo struct { + Version []struct { + ID int `json:"id"` + Value string `json:"value"` + Url string `json:"url"` + } `json:"version"` +} + +type Flavor struct { + Info []struct { + Code string `json:"code"` + Value string `json:"value"` + } `json:"flavor"` +} + +type Engine struct { + Info []struct { + ID int `json:"id"` + Value string `json:"value"` + } `json:"engine"` +} + +type ResourcePool struct { + Info []struct { + ID string `json:"id"` + Value string `json:"value"` + } `json:"resource_pool"` +} + +type Parameters struct { + Parameter []struct { + Label string `json:"label"` + Value string `json:"value"` + } `json:"parameter"` +} + +func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error { + if poolInfos == nil { + json.Unmarshal([]byte(setting.PoolInfos), &poolInfos) + } + + imageName, err := GetNotebookImageName(imageId) + if err != nil { + log.Error("GetNotebookImageName failed: %v", err.Error()) + return err + } + createTime := timeutil.TimeStampNow() + jobResult, err := createNotebook(models.CreateNotebook2Params{ + JobName: jobName, + Description: description, + Flavor: flavor, + Duration: autoStopDurationMs, + ImageID: imageId, + PoolID: poolInfos.PoolInfo[0].PoolId, + Feature: models.NotebookFeature, + Volume: models.VolumeReq{ + Capacity: setting.Capacity, + Category: models.EVSCategory, + Ownership: models.ManagedOwnership, + }, + WorkspaceID: "0", + }) + if err != nil { + log.Error("createNotebook2 failed: %v", err.Error()) + if strings.HasPrefix(err.Error(), UnknownErrorPrefix) { + log.Info("(%s)unknown error, set temp status", displayJobName) + errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{ + JobID: models.TempJobId, + VersionID: models.TempVersionId, + Status: models.TempJobStatus, + Type: models.TypeCloudBrainTwo, + JobName: jobName, + JobType: string(models.JobTypeDebug), + }) + if errTemp != nil { + log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error()) + return errTemp + } + } + return err + } + task := &models.Cloudbrain{ + Status: jobResult.Status, + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobResult.ID, + JobName: jobName, + FlavorCode: flavor, + DisplayJobName: displayJobName, + JobType: string(models.JobTypeDebug), + Type: models.TypeCloudBrainTwo, + Uuid: uuid, + ComputeResource: models.NPUResource, + Image: imageName, + Description: description, + CreatedUnix: createTime, + UpdatedUnix: createTime, + } + + err = models.CreateCloudbrain(task) + if err != nil { + return err + } + + stringId := strconv.FormatInt(task.ID, 10) + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask) + return nil +} + +func GetNotebookImageName(imageId string) (string, error) { + var validImage = false + var imageName = "" + + if ImageInfos == nil { + json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos) + } + + for _, imageInfo := range ImageInfos.ImageInfo { + if imageInfo.Id == imageId { + validImage = true + imageName = imageInfo.Value + } + } + + if !validImage { + log.Error("the image id(%s) is invalid", imageId) + return imageName, errors.New("the image id is invalid") + } + + return imageName, nil +} + +func HandleNotebookInfo(task *models.Cloudbrain) error { + + result, err := GetNotebook(task.JobID) + if err != nil { + log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err) + return err + } + + if result != nil { + oldStatus := task.Status + task.Status = result.Status + if task.StartTime == 0 && result.Lease.UpdateTime > 0 { + task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000) + } + if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { + task.EndTime = timeutil.TimeStampNow() + } + task.CorrectCreateUnix() + task.ComputeAndSetDuration() + if oldStatus != task.Status { + notification.NotifyChangeCloudbrainStatus(task, oldStatus) + } + if task.FlavorCode == "" { + task.FlavorCode = result.Flavor + } + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) + return err + } + } + + return nil +} diff --git a/modules/modelarts_cd/resty.go b/modules/modelarts_cd/resty.go new file mode 100755 index 000000000..941262be3 --- /dev/null +++ b/modules/modelarts_cd/resty.go @@ -0,0 +1,246 @@ +package modelarts_cd + +import ( + "bytes" + "code.gitea.io/gitea/modules/modelarts_gateway/core" + "crypto/tls" + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "strconv" + "time" + + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" +) + +var ( + httpClient *http.Client + HOST string + TOKEN string +) + +const ( + methodPassword = "password" + + urlGetToken = "/v3/auth/tokens" + errorCodeExceedLimit = "ModelArts.0118" + + //notebook 2.0 + urlNotebook2 = "/notebooks" + + //error code + modelartsIllegalToken = "ModelArts.6401" + NotebookNotFound = "ModelArts.6404" + NotebookNoPermission = "ModelArts.6407" + NotebookInvalid = "ModelArts.6400" + UnknownErrorPrefix = "UNKNOWN:" +) + +func getHttpClient() *http.Client { + if httpClient == nil { + httpClient = &http.Client{ + Timeout: 30 * time.Second, + Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}, + } + } + return httpClient +} + +func GetNotebook(jobID string) (*models.GetNotebook2Result, error) { + client := getHttpClient() + var result models.GetNotebook2Result + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) + + if err != nil { + return nil, fmt.Errorf("resty GetJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { + client := getHttpClient() + var result models.NotebookActionResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(autoStopDurationMs)) + + if err != nil { + return &result, fmt.Errorf("resty ManageNotebook2: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if res.StatusCode() == http.StatusBadGateway { + return &result, fmt.Errorf(UnknownErrorPrefix+"createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + if len(response.ErrorCode) != 0 { + log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func DelNotebook(jobID string) (*models.NotebookDelResult, error) { + client := getHttpClient() + var result models.NotebookDelResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func createNotebook(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) { + client := getHttpClient() + var result models.CreateNotebookResult + + retry := 0 + + s := core.Signer{ + Key: "", + Secret: "", + } + + r, _ := http.NewRequest(http.MethodPost, "", ioutil.NopCloser(bytes.NewBuffer([]byte("")))) + + s.Sign(r) + + resp, err := http.DefaultClient.Do(r) + body, err := ioutil.ReadAll(resp.Body) + +sendjob: + res, err := client. + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2) + + if err != nil { + return nil, fmt.Errorf("resty create notebook2: %s", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error()) + } + + if res.StatusCode() == http.StatusBadGateway { + return &result, fmt.Errorf(UnknownErrorPrefix+"createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + if len(response.ErrorCode) != 0 { + log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == errorCodeExceedLimit { + response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" + } + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} diff --git a/modules/modelarts_gateway/core/escape.go b/modules/modelarts_gateway/core/escape.go new file mode 100755 index 000000000..e8c76b8ae --- /dev/null +++ b/modules/modelarts_gateway/core/escape.go @@ -0,0 +1,42 @@ +// based on https://github.com/golang/go/blob/master/src/net/url/url.go +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package core + +func shouldEscape(c byte) bool { + if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || c == '_' || c == '-' || c == '~' || c == '.' { + return false + } + return true +} +func escape(s string) string { + hexCount := 0 + for i := 0; i < len(s); i++ { + c := s[i] + if shouldEscape(c) { + hexCount++ + } + } + + if hexCount == 0 { + return s + } + + t := make([]byte, len(s)+2*hexCount) + j := 0 + for i := 0; i < len(s); i++ { + switch c := s[i]; { + case shouldEscape(c): + t[j] = '%' + t[j+1] = "0123456789ABCDEF"[c>>4] + t[j+2] = "0123456789ABCDEF"[c&15] + j += 3 + default: + t[j] = s[i] + j++ + } + } + return string(t) +} diff --git a/modules/modelarts_gateway/core/signer.go b/modules/modelarts_gateway/core/signer.go new file mode 100755 index 000000000..7992713b3 --- /dev/null +++ b/modules/modelarts_gateway/core/signer.go @@ -0,0 +1,208 @@ +// HWS API Gateway Signature +// based on https://github.com/datastream/aws/blob/master/signv4.go +// Copyright (c) 2014, Xianjie + +package core + +import ( + "bytes" + "crypto/hmac" + "crypto/sha256" + "fmt" + "io/ioutil" + "net/http" + "sort" + "strings" + "time" +) + +const ( + BasicDateFormat = "20060102T150405Z" + Algorithm = "SDK-HMAC-SHA256" + HeaderXDate = "X-Sdk-Date" + HeaderHost = "host" + HeaderAuthorization = "Authorization" + HeaderContentSha256 = "X-Sdk-Content-Sha256" +) + +func hmacsha256(key []byte, data string) ([]byte, error) { + h := hmac.New(sha256.New, []byte(key)) + if _, err := h.Write([]byte(data)); err != nil { + return nil, err + } + return h.Sum(nil), nil +} + +// Build a CanonicalRequest from a regular request string +// +// CanonicalRequest = +// HTTPRequestMethod + '\n' + +// CanonicalURI + '\n' + +// CanonicalQueryString + '\n' + +// CanonicalHeaders + '\n' + +// SignedHeaders + '\n' + +// HexEncode(Hash(RequestPayload)) +func CanonicalRequest(r *http.Request, signedHeaders []string) (string, error) { + var hexencode string + var err error + if hex := r.Header.Get(HeaderContentSha256); hex != "" { + hexencode = hex + } else { + data, err := RequestPayload(r) + if err != nil { + return "", err + } + hexencode, err = HexEncodeSHA256Hash(data) + if err != nil { + return "", err + } + } + return fmt.Sprintf("%s\n%s\n%s\n%s\n%s\n%s", r.Method, CanonicalURI(r), CanonicalQueryString(r), CanonicalHeaders(r, signedHeaders), strings.Join(signedHeaders, ";"), hexencode), err +} + +// CanonicalURI returns request uri +func CanonicalURI(r *http.Request) string { + pattens := strings.Split(r.URL.Path, "/") + var uri []string + for _, v := range pattens { + uri = append(uri, escape(v)) + } + urlpath := strings.Join(uri, "/") + if len(urlpath) == 0 || urlpath[len(urlpath)-1] != '/' { + urlpath = urlpath + "/" + } + return urlpath +} + +// CanonicalQueryString +func CanonicalQueryString(r *http.Request) string { + var keys []string + query := r.URL.Query() + for key := range query { + keys = append(keys, key) + } + sort.Strings(keys) + var a []string + for _, key := range keys { + k := escape(key) + sort.Strings(query[key]) + for _, v := range query[key] { + kv := fmt.Sprintf("%s=%s", k, escape(v)) + a = append(a, kv) + } + } + queryStr := strings.Join(a, "&") + r.URL.RawQuery = queryStr + return queryStr +} + +// CanonicalHeaders +func CanonicalHeaders(r *http.Request, signerHeaders []string) string { + var a []string + header := make(map[string][]string) + for k, v := range r.Header { + header[strings.ToLower(k)] = v + } + for _, key := range signerHeaders { + value := header[key] + if strings.EqualFold(key, HeaderHost) { + value = []string{r.Host} + } + sort.Strings(value) + for _, v := range value { + a = append(a, key+":"+strings.TrimSpace(v)) + } + } + return fmt.Sprintf("%s\n", strings.Join(a, "\n")) +} + +// SignedHeaders +func SignedHeaders(r *http.Request) []string { + var a []string + for key := range r.Header { + a = append(a, strings.ToLower(key)) + } + sort.Strings(a) + return a +} + +// RequestPayload +func RequestPayload(r *http.Request) ([]byte, error) { + if r.Body == nil { + return []byte(""), nil + } + b, err := ioutil.ReadAll(r.Body) + if err != nil { + return []byte(""), err + } + r.Body = ioutil.NopCloser(bytes.NewBuffer(b)) + return b, err +} + +// Create a "String to Sign". +func StringToSign(canonicalRequest string, t time.Time) (string, error) { + hash := sha256.New() + _, err := hash.Write([]byte(canonicalRequest)) + if err != nil { + return "", err + } + return fmt.Sprintf("%s\n%s\n%x", + Algorithm, t.UTC().Format(BasicDateFormat), hash.Sum(nil)), nil +} + +// Create the HWS Signature. +func SignStringToSign(stringToSign string, signingKey []byte) (string, error) { + hm, err := hmacsha256(signingKey, stringToSign) + return fmt.Sprintf("%x", hm), err +} + +// HexEncodeSHA256Hash returns hexcode of sha256 +func HexEncodeSHA256Hash(body []byte) (string, error) { + hash := sha256.New() + if body == nil { + body = []byte("") + } + _, err := hash.Write(body) + return fmt.Sprintf("%x", hash.Sum(nil)), err +} + +// Get the finalized value for the "Authorization" header. The signature parameter is the output from SignStringToSign +func AuthHeaderValue(signature, accessKey string, signedHeaders []string) string { + return fmt.Sprintf("%s Access=%s, SignedHeaders=%s, Signature=%s", Algorithm, accessKey, strings.Join(signedHeaders, ";"), signature) +} + +// Signature HWS meta +type Signer struct { + Key string + Secret string +} + +// SignRequest set Authorization header +func (s *Signer) Sign(r *http.Request) error { + var t time.Time + var err error + var dt string + if dt = r.Header.Get(HeaderXDate); dt != "" { + t, err = time.Parse(BasicDateFormat, dt) + } + if err != nil || dt == "" { + t = time.Now() + r.Header.Set(HeaderXDate, t.UTC().Format(BasicDateFormat)) + } + signedHeaders := SignedHeaders(r) + canonicalRequest, err := CanonicalRequest(r, signedHeaders) + if err != nil { + return err + } + stringToSign, err := StringToSign(canonicalRequest, t) + if err != nil { + return err + } + signature, err := SignStringToSign(stringToSign, []byte(s.Secret)) + if err != nil { + return err + } + authValue := AuthHeaderValue(signature, s.Key, signedHeaders) + r.Header.Set(HeaderAuthorization, authValue) + return nil +} diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 1e96ff9da..549889f1f 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -548,6 +548,22 @@ var ( TrainJobFLAVORINFOS string ModelArtsSpecialPools string + // modelarts-cd config + ModelartsCD = struct { + ModelArtsHost string + IamHost string + ProjectID string + ProjectName string + ModelArtsUsername string + ModelArtsPassword string + ModelArtsDomain string + AllowedOrg string + ProfileID string + PoolInfos string + Flavor string + DebugHost string + }{} + //grampus config Grampus = struct { Env string From e92d948822efbfcaca6f5160bdd0adac65f5b16e Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Mon, 22 Aug 2022 19:53:52 +0800 Subject: [PATCH 02/26] cd notebook --- go.sum | 2 + models/cloudbrain.go | 65 +++++------- modules/grampus/grampus.go | 4 +- modules/modelarts/modelarts.go | 18 ++-- modules/modelarts_cd/modelarts.go | 31 ++---- modules/modelarts_cd/resty.go | 206 +++++++++++++++++--------------------- modules/setting/setting.go | 88 ++++++++++++---- routers/repo/modelarts.go | 169 +++++++++++-------------------- 8 files changed, 266 insertions(+), 317 deletions(-) diff --git a/go.sum b/go.sum index 6735a1938..e0c11f261 100755 --- a/go.sum +++ b/go.sum @@ -713,12 +713,14 @@ github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1 github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/assertions v1.0.1 h1:voD4ITNjPL5jjBfgR/r8fPIIBrliWrWHeiJApdr3r4w= github.com/smartystreets/assertions v1.0.1/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM= +github.com/smartystreets/assertions v1.1.0 h1:MkTeG1DMwsrdH7QtLXy5W+fUxWq+vmb6cLmyJ7aRtF0= github.com/smartystreets/assertions v1.1.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo= github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM= github.com/smartystreets/goconvey v0.0.0-20181108003508-044398e4856c/go.mod h1:XDJAKZRPZ1CvBcN2aX5YOUTYGHki24fSF0Iv48Ibg0s= github.com/smartystreets/goconvey v0.0.0-20190330032615-68dc04aab96a/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337 h1:WN9BUFbdyOsSH/XohnWpXOlq9NBD5sGAB2FciQMUEe8= github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= github.com/sourcegraph/annotate v0.0.0-20160123013949-f4cad6c6324d/go.mod h1:UdhH50NIW0fCiwBSr0co2m7BnFLdv4fQTgdqdJTHFeE= diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 62bae29e2..bf3915eaa 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -24,15 +24,16 @@ type ModelArtsJobStatus string const ( TypeCloudBrainOne int = iota TypeCloudBrainTwo - TypeC2Net //智算网络 + TypeC2Net //智算网络 + TypeCDCenter //成都智算中心 TypeCloudBrainAll = -1 ) const ( - NPUResource = "NPU" - GPUResource = "CPU/GPU" - AllResource = "all" + NPUResource = "NPU" + GPUResource = "CPU/GPU" + AllResource = "all" //notebook storage category EVSCategory = "EVS" @@ -584,37 +585,17 @@ type ResourceSpec struct { ShareMemMiB int `json:"shareMemMiB"` } -type FlavorInfos struct { - FlavorInfo []*FlavorInfo `json:"flavor_info"` -} - -type FlavorInfo struct { - Id int `json:"id"` - Value string `json:"value"` - Desc string `json:"desc"` -} - type SpecialPools struct { Pools []*SpecialPool `json:"pools"` } type SpecialPool struct { - Org string `json:"org"` - Type string `json:"type"` - IsExclusive bool `json:"isExclusive"` - Pool []*GpuInfo `json:"pool"` - JobType []string `json:"jobType"` - ResourceSpec []*ResourceSpec `json:"resourceSpecs"` - Flavor []*FlavorInfo `json:"flavor"` -} - -type ImageInfosModelArts struct { - ImageInfo []*ImageInfoModelArts `json:"image_info"` -} - -type ImageInfoModelArts struct { - Id string `json:"id"` - Value string `json:"value"` - Desc string `json:"desc"` + Org string `json:"org"` + Type string `json:"type"` + IsExclusive bool `json:"isExclusive"` + Pool []*GpuInfo `json:"pool"` + JobType []string `json:"jobType"` + ResourceSpec []*ResourceSpec `json:"resourceSpecs"` + Flavor []*setting.FlavorInfo `json:"flavor"` } type PoolInfos struct { @@ -720,6 +701,17 @@ type CreateNotebook2Params struct { Volume VolumeReq `json:"volume"` } +type CreateNotebookWithoutPoolParams struct { + JobName string `json:"name"` + Description string `json:"description"` + Duration int64 `json:"duration"` //ms + Feature string `json:"feature"` + Flavor string `json:"flavor"` + ImageID string `json:"image_id"` + WorkspaceID string `json:"workspace_id"` + Volume VolumeReq `json:"volume"` +} + type VolumeReq struct { Capacity int `json:"capacity"` Category string `json:"category"` @@ -943,6 +935,7 @@ type NotebookGetJobTokenResult struct { } type NotebookDelResult struct { + NotebookResult InstanceID string `json:"instance_id"` } @@ -1434,12 +1427,6 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } - if len(opts.ComputeResource) > 0 { - cond = cond.And( - builder.Eq{"cloudbrain.compute_resource": opts.ComputeResource}, - ) - } - if len(opts.JobTypes) > 0 { if opts.JobTypeNot { cond = cond.And( @@ -1904,9 +1891,9 @@ func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTy return sess.Count(new(Cloudbrain)) } -func GetCloudbrainNotebookCountByUserID(userID int64) (int, error) { +func GetCloudbrainNotebookCountByUserID(userID int64, typeCloudbrain int) (int, error) { count, err := x.In("status", ModelArtsCreateQueue, ModelArtsCreating, ModelArtsStarting, ModelArtsReadyToStart, ModelArtsResizing, ModelArtsStartQueuing, ModelArtsRunning, ModelArtsRestarting). - And("job_type = ? and user_id = ? and type = ?", JobTypeDebug, userID, TypeCloudBrainTwo).Count(new(Cloudbrain)) + And("job_type = ? and user_id = ? and type = ?", JobTypeDebug, userID, typeCloudbrain).Count(new(Cloudbrain)) return int(count), err } diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 0d84d7aa7..ea4f7afbb 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -30,8 +30,8 @@ const ( var ( poolInfos *models.PoolInfos - FlavorInfos *models.FlavorInfos - ImageInfos *models.ImageInfosModelArts + FlavorInfos *setting.StFlavorInfos + ImageInfos *setting.StImageInfosModelArts SpecialPools *models.SpecialPools ) diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 9a6ea0574..206db1473 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -1,6 +1,7 @@ package modelarts import ( + "code.gitea.io/gitea/modules/modelarts_cd" "encoding/json" "errors" "fmt" @@ -68,8 +69,6 @@ const ( var ( poolInfos *models.PoolInfos - FlavorInfos *models.FlavorInfos - ImageInfos *models.ImageInfosModelArts TrainFlavorInfos *Flavor SpecialPools *models.SpecialPools ) @@ -747,11 +746,7 @@ func GetNotebookImageName(imageId string) (string, error) { var validImage = false var imageName = "" - if ImageInfos == nil { - json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos) - } - - for _, imageInfo := range ImageInfos.ImageInfo { + for _, imageInfo := range setting.StImageInfos.ImageInfo { if imageInfo.Id == imageId { validImage = true imageName = imageInfo.Value @@ -808,8 +803,13 @@ func HandleTrainJobInfo(task *models.Cloudbrain) error { } func HandleNotebookInfo(task *models.Cloudbrain) error { - - result, err := GetNotebook2(task.JobID) + var result *models.GetNotebook2Result + var err error + if task.Type == models.TypeCloudBrainTwo { + result, err = GetNotebook2(task.JobID) + } else if task.Type == models.TypeCDCenter { + result, err = modelarts_cd.GetNotebook(task.JobID) + } if err != nil { log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err) return err diff --git a/modules/modelarts_cd/modelarts.go b/modules/modelarts_cd/modelarts.go index 959907c08..25324dbd5 100755 --- a/modules/modelarts_cd/modelarts.go +++ b/modules/modelarts_cd/modelarts.go @@ -1,7 +1,6 @@ package modelarts_cd import ( - "encoding/json" "errors" "strconv" "strings" @@ -51,13 +50,7 @@ const ( TotalVersionCount = 1 ) -var ( - poolInfos *models.PoolInfos - FlavorInfos *models.FlavorInfos - ImageInfos *models.ImageInfosModelArts - TrainFlavorInfos *Flavor - SpecialPools *models.SpecialPools -) +var () type VersionInfo struct { Version []struct { @@ -96,23 +89,18 @@ type Parameters struct { } func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error { - if poolInfos == nil { - json.Unmarshal([]byte(setting.PoolInfos), &poolInfos) - } - imageName, err := GetNotebookImageName(imageId) if err != nil { log.Error("GetNotebookImageName failed: %v", err.Error()) return err } createTime := timeutil.TimeStampNow() - jobResult, err := createNotebook(models.CreateNotebook2Params{ + jobResult, err := createNotebook(models.CreateNotebookWithoutPoolParams{ JobName: jobName, Description: description, Flavor: flavor, Duration: autoStopDurationMs, ImageID: imageId, - PoolID: poolInfos.PoolInfo[0].PoolId, Feature: models.NotebookFeature, Volume: models.VolumeReq{ Capacity: setting.Capacity, @@ -122,14 +110,14 @@ func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, descr WorkspaceID: "0", }) if err != nil { - log.Error("createNotebook2 failed: %v", err.Error()) + log.Error("createNotebook failed: %v", err.Error()) if strings.HasPrefix(err.Error(), UnknownErrorPrefix) { log.Info("(%s)unknown error, set temp status", displayJobName) errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{ JobID: models.TempJobId, VersionID: models.TempVersionId, Status: models.TempJobStatus, - Type: models.TypeCloudBrainTwo, + Type: models.TypeCDCenter, JobName: jobName, JobType: string(models.JobTypeDebug), }) @@ -149,7 +137,7 @@ func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, descr FlavorCode: flavor, DisplayJobName: displayJobName, JobType: string(models.JobTypeDebug), - Type: models.TypeCloudBrainTwo, + Type: models.TypeCDCenter, Uuid: uuid, ComputeResource: models.NPUResource, Image: imageName, @@ -172,11 +160,7 @@ func GetNotebookImageName(imageId string) (string, error) { var validImage = false var imageName = "" - if ImageInfos == nil { - json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos) - } - - for _, imageInfo := range ImageInfos.ImageInfo { + for _, imageInfo := range setting.StImageInfos.ImageInfo { if imageInfo.Id == imageId { validImage = true imageName = imageInfo.Value @@ -191,6 +175,7 @@ func GetNotebookImageName(imageId string) (string, error) { return imageName, nil } +/* func HandleNotebookInfo(task *models.Cloudbrain) error { result, err := GetNotebook(task.JobID) @@ -225,3 +210,5 @@ func HandleNotebookInfo(task *models.Cloudbrain) error { return nil } + +*/ diff --git a/modules/modelarts_cd/resty.go b/modules/modelarts_cd/resty.go index 941262be3..6feb78967 100755 --- a/modules/modelarts_cd/resty.go +++ b/modules/modelarts_cd/resty.go @@ -23,9 +23,6 @@ var ( ) const ( - methodPassword = "password" - - urlGetToken = "/v3/auth/tokens" errorCodeExceedLimit = "ModelArts.0118" //notebook 2.0 @@ -50,196 +47,173 @@ func getHttpClient() *http.Client { } func GetNotebook(jobID string) (*models.GetNotebook2Result, error) { - client := getHttpClient() var result models.GetNotebook2Result - retry := 0 + client := getHttpClient() + s := core.Signer{ + Key: setting.ModelartsCD.AccessKey, + Secret: setting.ModelartsCD.SecretKey, + } + r, _ := http.NewRequest(http.MethodGet, + setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID, + nil) -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) + r.Header.Add("content-type", "application/json") + s.Sign(r) + resp, err := client.Do(r) if err != nil { - return nil, fmt.Errorf("resty GetJob: %v", err) + log.Error("client.Do failed: %s", err.Error()) + return &result, fmt.Errorf("client.Do failed: %s", err.Error()) } - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Error("ioutil.ReadAll failed: %s", err.Error()) + return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error()) } - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) + err = json.Unmarshal(body, &result) if err != nil { log.Error("json.Unmarshal failed: %s", err.Error()) return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) } - if len(response.ErrorCode) != 0 { - log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) - if response.ErrorCode == modelartsIllegalToken && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if len(result.ErrorCode) != 0 { + log.Error("GetNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg) } return &result, nil } func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { - client := getHttpClient() var result models.NotebookActionResult - retry := 0 + client := getHttpClient() + s := core.Signer{ + Key: setting.ModelartsCD.AccessKey, + Secret: setting.ModelartsCD.SecretKey, + } + r, _ := http.NewRequest(http.MethodPost, + setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID+"/"+param.Action+"?duration="+strconv.Itoa(autoStopDurationMs), + nil) -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(autoStopDurationMs)) + r.Header.Add("content-type", "application/json") + s.Sign(r) + resp, err := client.Do(r) if err != nil { - return &result, fmt.Errorf("resty ManageNotebook2: %v", err) + log.Error("client.Do failed: %s", err.Error()) + return &result, fmt.Errorf("client.Do failed: %s", err.Error()) } - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Error("ioutil.ReadAll failed: %s", err.Error()) + return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error()) } - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) + err = json.Unmarshal(body, &result) if err != nil { log.Error("json.Unmarshal failed: %s", err.Error()) return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) } - if res.StatusCode() == http.StatusBadGateway { - return &result, fmt.Errorf(UnknownErrorPrefix+"createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - } - - if len(response.ErrorCode) != 0 { - log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - if response.ErrorCode == modelartsIllegalToken && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if len(result.ErrorCode) != 0 { + log.Error("ManageNotebook2 failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("ManageNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg) } return &result, nil } func DelNotebook(jobID string) (*models.NotebookDelResult, error) { - client := getHttpClient() var result models.NotebookDelResult - retry := 0 + client := getHttpClient() + s := core.Signer{ + Key: setting.ModelartsCD.AccessKey, + Secret: setting.ModelartsCD.SecretKey, + } -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetResult(&result). - Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) + r, _ := http.NewRequest(http.MethodDelete, + setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID, + nil) + + r.Header.Add("content-type", "application/json") + s.Sign(r) + resp, err := client.Do(r) if err != nil { - return &result, fmt.Errorf("resty DelJob: %v", err) + log.Error("client.Do failed: %s", err.Error()) + return &result, fmt.Errorf("client.Do failed: %s", err.Error()) } - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Error("ioutil.ReadAll failed: %s", err.Error()) + return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error()) } - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) + err = json.Unmarshal(body, &result) if err != nil { log.Error("json.Unmarshal failed: %s", err.Error()) return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) } - if len(response.ErrorCode) != 0 { - log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - if response.ErrorCode == modelartsIllegalToken && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if len(result.ErrorCode) != 0 { + log.Error("DelNotebook2 failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", result.ErrorCode, result.ErrorMsg) } return &result, nil } -func createNotebook(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) { - client := getHttpClient() +func createNotebook(createJobParams models.CreateNotebookWithoutPoolParams) (*models.CreateNotebookResult, error) { var result models.CreateNotebookResult - - retry := 0 - + client := getHttpClient() s := core.Signer{ - Key: "", - Secret: "", + Key: setting.ModelartsCD.AccessKey, + Secret: setting.ModelartsCD.SecretKey, } - r, _ := http.NewRequest(http.MethodPost, "", ioutil.NopCloser(bytes.NewBuffer([]byte("")))) + req, _ := json.Marshal(createJobParams) + r, _ := http.NewRequest(http.MethodPost, + setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2, + ioutil.NopCloser(bytes.NewBuffer(req))) + r.Header.Add("content-type", "application/json") s.Sign(r) - resp, err := http.DefaultClient.Do(r) - body, err := ioutil.ReadAll(resp.Body) - -sendjob: - res, err := client. - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetBody(createJobParams). - SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2) - + resp, err := client.Do(r) if err != nil { - return nil, fmt.Errorf("resty create notebook2: %s", err) + log.Error("client.Do failed: %s", err.Error()) + return &result, fmt.Errorf("client.Do failed: %s", err.Error()) } - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Error("ioutil.ReadAll failed: %s", err.Error()) + return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error()) } - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) + err = json.Unmarshal(body, &result) if err != nil { log.Error("json.Unmarshal failed: %s", err.Error()) return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error()) } - if res.StatusCode() == http.StatusBadGateway { - return &result, fmt.Errorf(UnknownErrorPrefix+"createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - } - - if len(response.ErrorCode) != 0 { - log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - if response.ErrorCode == errorCodeExceedLimit { - response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" - } - if response.ErrorCode == modelartsIllegalToken && retry < 1 { - retry++ - _ = getToken() - goto sendjob + if len(result.ErrorCode) != 0 { + log.Error("createNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg) + if result.ErrorCode == errorCodeExceedLimit { + result.ErrorMsg = "所选规格使用数量已超过最大配额限制。" } - return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("createNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg) } return &result, nil diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 549889f1f..0bc47aa70 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -75,6 +75,26 @@ type C2NetSqInfos struct { C2NetSqInfo []*C2NetSequenceInfo `json:"sequence"` } +type StFlavorInfos struct { + FlavorInfo []*FlavorInfo `json:"flavor_info"` +} + +type FlavorInfo struct { + Id int `json:"id"` + Value string `json:"value"` + Desc string `json:"desc"` +} + +type StImageInfosModelArts struct { + ImageInfo []*ImageInfoModelArts `json:"image_info"` +} + +type ImageInfoModelArts struct { + Id string `json:"id"` + Value string `json:"value"` + Desc string `json:"desc"` +} + var ( // AppVer settings AppVer string @@ -535,33 +555,29 @@ var ( AllowedOrg string ProfileID string PoolInfos string - Flavor string + FlavorInfos string DebugHost string ImageInfos string Capacity int MaxTempQueryTimes int + StFlavorInfo *StFlavorInfos + StImageInfos *StImageInfosModelArts //train-job ResourcePools string Engines string EngineVersions string - FlavorInfos string TrainJobFLAVORINFOS string ModelArtsSpecialPools string // modelarts-cd config ModelartsCD = struct { - ModelArtsHost string - IamHost string - ProjectID string - ProjectName string - ModelArtsUsername string - ModelArtsPassword string - ModelArtsDomain string - AllowedOrg string - ProfileID string - PoolInfos string - Flavor string - DebugHost string + Enabled bool + EndPoint string + ProjectID string + AccessKey string + SecretKey string + ImageInfos string + FlavorInfos string }{} //grampus config @@ -1438,9 +1454,8 @@ func NewContext() { AllowedOrg = sec.Key("ORGANIZATION").MustString("") ProfileID = sec.Key("PROFILE_ID").MustString("") PoolInfos = sec.Key("POOL_INFOS").MustString("") - Flavor = sec.Key("FLAVOR").MustString("") ImageInfos = sec.Key("IMAGE_INFOS").MustString("") - Capacity = sec.Key("IMAGE_INFOS").MustInt(100) + Capacity = sec.Key("CAPACITY").MustInt(100) MaxTempQueryTimes = sec.Key("MAX_TEMP_QUERY_TIMES").MustInt(30) ResourcePools = sec.Key("Resource_Pools").MustString("") Engines = sec.Key("Engines").MustString("") @@ -1488,8 +1503,8 @@ func NewContext() { Course.OrgName = sec.Key("org_name").MustString("") Course.TeamName = sec.Key("team_name").MustString("") - GetGrampusConfig() - + getGrampusConfig() + getModelartsCDConfig() getModelConvertConfig() } @@ -1512,7 +1527,22 @@ func getModelConvertConfig() { ModelConvert.NPU_TENSORFLOW_IMAGE_ID = sec.Key("NPU_TENSORFLOW_IMAGE_ID").MustInt(35) } -func GetGrampusConfig() { +func getModelartsCDConfig() { + sec := Cfg.Section("modelarts-cd") + + ModelartsCD.Enabled = sec.Key("ENABLED").MustBool(false) + ModelartsCD.EndPoint = sec.Key("ENDPOINT").MustString("https://modelarts.cn-southwest-228.cdzs.cn") + ModelartsCD.ProjectID = sec.Key("PROJECT_ID").MustString("") + ModelartsCD.AccessKey = sec.Key("ACCESS_KEY").MustString("") + ModelartsCD.SecretKey = sec.Key("SECRET_KEY").MustString("") + ModelartsCD.ImageInfos = sec.Key("IMAGE_INFOS").MustString("") + ModelartsCD.FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("") + + getNotebookImageInfos() + getNotebookFlavorInfos() +} + +func getGrampusConfig() { sec := Cfg.Section("grampus") Grampus.Env = sec.Key("ENV").MustString("TEST") @@ -1646,6 +1676,26 @@ func ensureLFSDirectory() { } } +func getNotebookImageInfos() { + if StImageInfos == nil { + if ModelartsCD.Enabled { + json.Unmarshal([]byte(ModelartsCD.ImageInfos), &StImageInfos) + } else { + json.Unmarshal([]byte(ImageInfos), &StImageInfos) + } + } +} + +func getNotebookFlavorInfos() { + if StFlavorInfo == nil { + if ModelartsCD.Enabled { + json.Unmarshal([]byte(ModelartsCD.FlavorInfos), &StFlavorInfo) + } else { + json.Unmarshal([]byte(FlavorInfos), &StFlavorInfo) + } + } +} + // NewServices initializes the services func NewServices() { InitDBConfig() diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 763308930..46d891e86 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -2,6 +2,7 @@ package repo import ( "archive/zip" + "code.gitea.io/gitea/modules/modelarts_cd" "encoding/json" "errors" "fmt" @@ -60,18 +61,11 @@ func DebugJobIndex(ctx *context.Context) { if page <= 0 { page = 1 } - typeCloudBrain := models.TypeCloudBrainAll + jobTypeNot := false - if listType == models.GPUResource { - typeCloudBrain = models.TypeCloudBrainOne - } else if listType == models.NPUResource { - typeCloudBrain = models.TypeCloudBrainTwo - } else if listType == models.AllResource { - typeCloudBrain = models.TypeCloudBrainAll - } else { - log.Error("listType(%s) error", listType) - ctx.ServerError("listType error", errors.New("listType error")) - return + var computeResource string + if listType != models.AllResource { + computeResource = listType } var jobTypes []string @@ -81,10 +75,11 @@ func DebugJobIndex(ctx *context.Context) { Page: page, PageSize: setting.UI.IssuePagingNum, }, - RepoID: repo.ID, - Type: typeCloudBrain, - JobTypeNot: jobTypeNot, - JobTypes: jobTypes, + RepoID: repo.ID, + ComputeResource: computeResource, + Type: models.TypeCloudBrainAll, + JobTypeNot: jobTypeNot, + JobTypes: jobTypes, }) if err != nil { ctx.ServerError("Get debugjob faild:", err) @@ -134,16 +129,8 @@ func notebookNewDataPrepare(ctx *context.Context) error { return err } ctx.Data["attachments"] = attachs - - if modelarts.ImageInfos == nil { - json.Unmarshal([]byte(setting.ImageInfos), &modelarts.ImageInfos) - } - ctx.Data["images"] = modelarts.ImageInfos.ImageInfo - - if modelarts.FlavorInfos == nil { - json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos) - } - ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo + ctx.Data["images"] = setting.StImageInfos.ImageInfo + ctx.Data["flavors"] = setting.StFlavorInfo.FlavorInfo setSpecBySpecialPoolConfig(ctx, string(models.JobTypeDebug)) ctx.Data["datasetType"] = models.TypeCloudBrainTwo @@ -154,50 +141,6 @@ func notebookNewDataPrepare(ctx *context.Context) error { return nil } -func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) { - ctx.Data["PageIsNotebook"] = true - jobName := form.JobName - uuid := form.Attachment - description := form.Description - flavor := form.Flavor - - count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID) - if err != nil { - log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"]) - cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form) - return - } else { - if count >= 1 { - log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) - cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form) - return - } - } - _, err = models.GetCloudbrainByName(jobName) - if err == nil { - log.Error("the job name did already exist", ctx.Data["MsgID"]) - cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("the job name did already exist", tplModelArtsNotebookNew, &form) - return - } else { - if !models.IsErrJobNotExist(err) { - log.Error("system error, %v", err, ctx.Data["MsgID"]) - cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form) - return - } - } - - err = modelarts.GenerateTask(ctx, jobName, uuid, description, flavor) - if err != nil { - ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form) - return - } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all") -} - func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm) { ctx.Data["PageIsNotebook"] = true displayJobName := form.DisplayJobName @@ -208,7 +151,12 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm imageId := form.ImageId repo := ctx.Repo.Repository - count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID) + typeCloudbrain := models.TypeCloudBrainTwo + if setting.ModelartsCD.Enabled { + typeCloudbrain = models.TypeCDCenter + } + + count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID, typeCloudbrain) if err != nil { log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"]) notebookNewDataPrepare(ctx) @@ -247,7 +195,12 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm return } - err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, flavor, imageId) + if setting.ModelartsCD.Enabled { + err = modelarts_cd.GenerateNotebook(ctx, displayJobName, jobName, uuid, description, flavor, imageId) + } else { + err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, flavor, imageId) + } + if err != nil { log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"]) notebookNewDataPrepare(ctx) @@ -292,14 +245,11 @@ func NotebookShow(ctx *context.Context) { if err == nil { task.User = user } - if modelarts.FlavorInfos == nil { - json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos) - } findSpec := false - if modelarts.FlavorInfos != nil { - ctx.Data["resource_spec"] = modelarts.FlavorInfos.FlavorInfo[0].Desc - for _, f := range modelarts.FlavorInfos.FlavorInfo { + if setting.StFlavorInfo != nil { + ctx.Data["resource_spec"] = setting.StFlavorInfo.FlavorInfo[0].Desc + for _, f := range setting.StFlavorInfo.FlavorInfo { if fmt.Sprint(f.Value) == task.FlavorCode { ctx.Data["resource_spec"] = f.Desc findSpec = true @@ -378,36 +328,16 @@ func setShowSpecBySpecialPoolConfig(ctx *context.Context, findSpec bool, task *m } } -func NotebookDebug(ctx *context.Context) { - var jobID = ctx.Params(":jobid") - - result, err := modelarts.GetJob(jobID) - if err != nil { - ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) - return - } - - res, err := modelarts.GetJobToken(jobID) - if err != nil { - ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) - return - } - - urls := strings.Split(result.Spec.Annotations.Url, "/") - urlPrefix := result.Spec.Annotations.TargetDomain - for i, url := range urls { - if i > 2 { - urlPrefix += "/" + url - } - } - - debugUrl := urlPrefix + "?token=" + res.Token - ctx.Redirect(debugUrl) -} - func NotebookDebug2(ctx *context.Context) { + var err error + var result *models.GetNotebook2Result task := ctx.Cloudbrain - result, err := modelarts.GetNotebook2(task.JobID) + if task.Type == models.TypeCloudBrainTwo { + result, err = modelarts.GetNotebook2(task.JobID) + } else if task.Type == models.TypeCDCenter { + result, err = modelarts_cd.GetNotebook(task.JobID) + } + if err != nil { ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil) return @@ -435,7 +365,7 @@ func NotebookRestart(ctx *context.Context) { break } - count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID) + count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID, task.Type) if err != nil { log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"]) errorMsg = "system error" @@ -453,7 +383,13 @@ func NotebookRestart(ctx *context.Context) { Action: models.ActionStart, } - res, err := modelarts.ManageNotebook2(task.JobID, param) + var res *models.NotebookActionResult + if task.Type == models.TypeCloudBrainTwo { + res, err = modelarts.ManageNotebook2(task.JobID, param) + } else if task.Type == models.TypeCDCenter { + res, err = modelarts_cd.ManageNotebook(task.JobID, param) + } + if err != nil { log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"]) /* 暂不处理再次调试502的场景,详情见方案 @@ -537,7 +473,14 @@ func NotebookStop(ctx *context.Context) { Action: models.ActionStop, } - res, err := modelarts.ManageNotebook2(task.JobID, param) + var err error + var res *models.NotebookActionResult + if task.Type == models.TypeCloudBrainTwo { + res, err = modelarts.ManageNotebook2(task.JobID, param) + } else if task.Type == models.TypeCDCenter { + res, err = modelarts_cd.ManageNotebook(task.JobID, param) + } + if err != nil { log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) resultCode = "-1" @@ -587,7 +530,13 @@ func NotebookDel(ctx *context.Context) { return } - _, err := modelarts.DelNotebook2(task.JobID) + var err error + if task.Type == models.TypeCloudBrainTwo { + _, err = modelarts.DelNotebook2(task.JobID) + } else if task.Type == models.TypeCDCenter { + _, err = modelarts_cd.DelNotebook(task.JobID) + } + if err != nil { log.Error("DelNotebook2(%s) failed:%v", task.JobName, err.Error()) if strings.Contains(err.Error(), modelarts.NotebookNotFound) || strings.Contains(err.Error(), modelarts.NotebookNoPermission) || strings.Contains(err.Error(), modelarts.NotebookInvalid) { @@ -2187,7 +2136,7 @@ func checkModelArtsSpecialPool(ctx *context.Context, flavorCode string, jobType if !isMatchPool { isMatchSpec := false if jobType == string(models.JobTypeDebug) { - for _, flavor := range modelarts.FlavorInfos.FlavorInfo { + for _, flavor := range setting.StFlavorInfo.FlavorInfo { if flavor.Value == flavorCode { isMatchSpec = true break From 2cb2c3c887e94dd2658773fd38e9c01939705cab Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Wed, 31 Aug 2022 15:19:55 +0800 Subject: [PATCH 03/26] fix issue --- templates/repo/cloudbrain/inference/show.tmpl | 42 +++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/templates/repo/cloudbrain/inference/show.tmpl b/templates/repo/cloudbrain/inference/show.tmpl index 3ec01417e..70d02f512 100644 --- a/templates/repo/cloudbrain/inference/show.tmpl +++ b/templates/repo/cloudbrain/inference/show.tmpl @@ -264,7 +264,8 @@ data-tab="first">{{$.i18n.Tr "repo.modelarts.train_job.config"}} {{$.i18n.Tr "repo.cloudbrain.runinfo"}} - + {{$.i18n.Tr "repo.modelarts.log"}} {{$.i18n.Tr "repo.model_download"}} @@ -537,7 +538,44 @@ - +
+
+ + + {{$.i18n.Tr "repo.modelarts.download_log"}} + + +
+
+ + + + + + + +
+
+
+
+ + +

+                             
+ +
+ + +
From 5ebab0a2df0833a62b756e0d3780d9c352b10f7e Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Wed, 31 Aug 2022 15:25:24 +0800 Subject: [PATCH 04/26] fix issue --- templates/repo/cloudbrain/benchmark/show.tmpl | 54 +++++++++++++++++++-------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/templates/repo/cloudbrain/benchmark/show.tmpl b/templates/repo/cloudbrain/benchmark/show.tmpl index add7d34d4..d657ac2ff 100755 --- a/templates/repo/cloudbrain/benchmark/show.tmpl +++ b/templates/repo/cloudbrain/benchmark/show.tmpl @@ -256,8 +256,9 @@
@@ -528,19 +529,42 @@
- -
- - -

-                            
- -
- -
+ + + {{$.i18n.Tr "repo.modelarts.download_log"}} + + +
+
+ + + + + + + +
+
+
+
+ + +

+                             
+ +
+ + +
From a2ed940896f0676e340af63acb98adcb4204ba06 Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Wed, 31 Aug 2022 15:27:22 +0800 Subject: [PATCH 05/26] fix issue --- templates/repo/cloudbrain/inference/show.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/repo/cloudbrain/inference/show.tmpl b/templates/repo/cloudbrain/inference/show.tmpl index 70d02f512..012baa7f7 100644 --- a/templates/repo/cloudbrain/inference/show.tmpl +++ b/templates/repo/cloudbrain/inference/show.tmpl @@ -228,7 +228,7 @@ {{with .task}}
+ data-repopath="{{$.RepoRelPath}}/cloudbrain/inference-job" data-jobid="{{.ID}}" data-version="{{.VersionName}}">
From 6898b5df57bd3246f8d452b41eaa7d04fbd368eb Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Wed, 31 Aug 2022 15:31:54 +0800 Subject: [PATCH 06/26] fix issue --- templates/repo/cloudbrain/inference/show.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/repo/cloudbrain/inference/show.tmpl b/templates/repo/cloudbrain/inference/show.tmpl index 012baa7f7..4ab65ca84 100644 --- a/templates/repo/cloudbrain/inference/show.tmpl +++ b/templates/repo/cloudbrain/inference/show.tmpl @@ -228,7 +228,7 @@ {{with .task}}
+ data-repopath="{{$.RepoRelPath}}/cloudbrain" data-jobid="{{.ID}}" data-version="{{.VersionName}}">
From 1d64c9bcff26773874a2714d01f190e1865315f1 Mon Sep 17 00:00:00 2001 From: zouap Date: Wed, 31 Aug 2022 15:35:47 +0800 Subject: [PATCH 07/26] =?UTF-8?q?#2833=20=E9=9C=80=E6=B1=82=E5=BC=80?= =?UTF-8?q?=E5=8F=91=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- modules/cloudbrain/cloudbrain.go | 6 +++--- routers/repo/cloudbrain.go | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 30f080335..25c556278 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -20,7 +20,7 @@ import ( const ( //Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` - CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` + CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh | tee /model/benchmark-log.txt;echo "end benchmark"` CodeMountPath = "/code" DataSetMountPath = "/dataset" ModelMountPath = "/model" @@ -30,8 +30,8 @@ const ( Snn4imagenetMountPath = "/snn4imagenet" BrainScoreMountPath = "/brainscore" TaskInfoName = "/taskInfo" - Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s'` - BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s'` + Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s' | tee /model/benchmark-log.txt` + BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s' | tee /model/benchmark-log.txt` SubTaskName = "task1" diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 2d8bebf4b..db52f41f0 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -2431,7 +2431,8 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, &form) return } - + log.Info("Command=" + command) + log.Info("ModelPath=" + storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/")) req := cloudbrain.GenerateCloudBrainTaskReq{ Ctx: ctx, DisplayJobName: displayJobName, @@ -2560,7 +2561,8 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form) return } - + log.Info("Command=" + command) + log.Info("ModelPath=" + storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/")) req := cloudbrain.GenerateCloudBrainTaskReq{ Ctx: ctx, DisplayJobName: displayJobName, @@ -2689,7 +2691,7 @@ func getInferenceJobCommand(form auth.CreateCloudBrainInferencForm) (string, err param += " --modelname" + "=" + form.CkptName - command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile + command += "python /code/" + bootFile + param + " | tee " + cloudbrain.ResultPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile return command, nil } From b76af052a474e5729b6d1fbea9b2c07dad4f86cb Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 1 Sep 2022 10:24:12 +0800 Subject: [PATCH 08/26] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E6=98=BE=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- routers/api/v1/repo/cloudbrain.go | 104 +++++++++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 7 deletions(-) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index b450b2e26..d127f27ed 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -405,8 +405,21 @@ func CloudbrainDownloadLogFile(ctx *context.Context) { func CloudbrainGetLog(ctx *context.Context) { ID := ctx.Params(":id") - startLine := ctx.QueryInt("base_line") + job, err := models.GetCloudbrainByID(ID) + if err != nil { + log.Error("GetCloudbrainByJobName failed: %v", err, ctx.Data["MsgID"]) + ctx.ServerError(err.Error(), err) + return + } lines := ctx.QueryInt("lines") + baseLine := ctx.Query("base_line") + if baseLine == "" { + re := getLastLogFromModelDir(job.JobName, lines) + ctx.JSON(http.StatusOK, re) + return + } + + startLine := ctx.QueryInt("base_line") endLine := startLine + lines order := ctx.Query("order") if order == "asc" { @@ -416,12 +429,7 @@ func CloudbrainGetLog(ctx *context.Context) { startLine = 0 } } - job, err := models.GetCloudbrainByID(ID) - if err != nil { - log.Error("GetCloudbrainByJobName failed: %v", err, ctx.Data["MsgID"]) - ctx.ServerError(err.Error(), err) - return - } + result := getLogFromModelDir(job.JobName, startLine, endLine) if result == nil { log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) @@ -443,6 +451,88 @@ func CloudbrainGetLog(ctx *context.Context) { ctx.JSON(http.StatusOK, re) } +func getAllLineFromFile(path string) int { + count := 0 + reader, err := os.Open(path) + defer reader.Close() + if err == nil { + r := bufio.NewReader(reader) + for { + _, error := r.ReadString('\n') + if error == io.EOF { + log.Info("read file completed.") + break + } + if error != nil { + log.Info("read file error." + error.Error()) + break + } + count = count + 1 + } + } else { + log.Info("error:" + err.Error()) + } + return count +} + +func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { + prefix := "/" + setting.CBCodePathPrefix + jobName + "/model" + files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "") + if err != nil { + log.Error("query cloudbrain model failed: %v", err) + return nil + } + + re := "" + fileName := "" + count := 0 + allLines := 0 + for _, file := range files { + if strings.HasSuffix(file.FileName, "log.txt") { + fileName = file.FileName + path := storage.GetMinioPath(jobName+"/model/", file.FileName) + allLines = getAllLineFromFile(path) + start := allLines - 50 + if start < 0 { + start = 0 + } + count = allLines - start + log.Info("path=" + path) + reader, err := os.Open(path) + defer reader.Close() + if err == nil { + r := bufio.NewReader(reader) + for i := start; i < allLines; i++ { + line, error := r.ReadString('\n') + log.Info("line=" + line) + if error == io.EOF { + log.Info("read file completed.") + break + } + if error != nil { + log.Info("read file error." + error.Error()) + break + } + if error == nil { + re = re + line + } + } + } else { + log.Info("error:" + err.Error()) + } + break + } + } + + return map[string]interface{}{ + "JobName": jobName, + "Content": re, + "FileName": fileName, + "lines": count, + "endLine": allLines, + } +} + func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]interface{} { prefix := "/" + setting.CBCodePathPrefix + jobName + "/model" files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "") From a6161bfddbc7e95db6860b0db96b61165610101c Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 1 Sep 2022 10:35:55 +0800 Subject: [PATCH 09/26] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- routers/api/v1/repo/cloudbrain.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index d127f27ed..33d8d89c0 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -492,17 +492,17 @@ func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { fileName = file.FileName path := storage.GetMinioPath(jobName+"/model/", file.FileName) allLines = getAllLineFromFile(path) - start := allLines - 50 - if start < 0 { - start = 0 + startLine := allLines - 50 + if startLine < 0 { + startLine = 0 } - count = allLines - start + count = allLines - startLine log.Info("path=" + path) reader, err := os.Open(path) defer reader.Close() if err == nil { r := bufio.NewReader(reader) - for i := start; i < allLines; i++ { + for i := 0; i < allLines; i++ { line, error := r.ReadString('\n') log.Info("line=" + line) if error == io.EOF { @@ -514,7 +514,9 @@ func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { break } if error == nil { - re = re + line + if i >= startLine { + re = re + line + } } } } else { From e9bfc092cd287ab813e80b11c7db2fde05b0ba2a Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 1 Sep 2022 10:50:02 +0800 Subject: [PATCH 10/26] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- routers/api/v1/repo/cloudbrain.go | 74 +++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index 33d8d89c0..fb20b6c7b 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -8,6 +8,7 @@ package repo import ( "bufio" "encoding/json" + "fmt" "io" "net/http" "os" @@ -413,41 +414,37 @@ func CloudbrainGetLog(ctx *context.Context) { } lines := ctx.QueryInt("lines") baseLine := ctx.Query("base_line") + var result map[string]interface{} if baseLine == "" { - re := getLastLogFromModelDir(job.JobName, lines) - ctx.JSON(http.StatusOK, re) - return - } - - startLine := ctx.QueryInt("base_line") - endLine := startLine + lines - order := ctx.Query("order") - if order == "asc" { - endLine = startLine - startLine = endLine - lines - if startLine < 0 { - startLine = 0 + result = getLastLogFromModelDir(job.JobName, lines) + } else { + startLine := ctx.QueryInt("base_line") + endLine := startLine + lines + order := ctx.Query("order") + if order == "asc" { + endLine = startLine + startLine = endLine - lines + if startLine < 0 { + startLine = 0 + } + } + result = getLogFromModelDir(job.JobName, startLine, endLine) + if result == nil { + log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) + ctx.ServerError(err.Error(), err) + return } } - - result := getLogFromModelDir(job.JobName, startLine, endLine) - if result == nil { - log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) - ctx.ServerError(err.Error(), err) - return - } - re := map[string]interface{}{ "JobID": ID, "LogFileName": result["FileName"], - "StartLine": startLine, - "EndLine": result["endLine"], + "StartLine": result["StartLine"], + "EndLine": result["EndLine"], "Content": result["Content"], - "Lines": result["lines"], + "Lines": result["Lines"], "CanLogDownload": result["FileName"] != "", } //result := CloudbrainGetLogByJobId(job.JobID, job.JobName) - ctx.JSON(http.StatusOK, re) } @@ -487,12 +484,13 @@ func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { fileName := "" count := 0 allLines := 0 + startLine := 0 for _, file := range files { if strings.HasSuffix(file.FileName, "log.txt") { fileName = file.FileName path := storage.GetMinioPath(jobName+"/model/", file.FileName) allLines = getAllLineFromFile(path) - startLine := allLines - 50 + startLine = allLines - lines if startLine < 0 { startLine = 0 } @@ -504,7 +502,6 @@ func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { r := bufio.NewReader(reader) for i := 0; i < allLines; i++ { line, error := r.ReadString('\n') - log.Info("line=" + line) if error == io.EOF { log.Info("read file completed.") break @@ -515,6 +512,7 @@ func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { } if error == nil { if i >= startLine { + log.Info("i=" + fmt.Sprint(i)) re = re + line } } @@ -527,11 +525,12 @@ func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { } return map[string]interface{}{ - "JobName": jobName, - "Content": re, - "FileName": fileName, - "lines": count, - "endLine": allLines, + "JobName": jobName, + "Content": re, + "FileName": fileName, + "Lines": count, + "EndLine": allLines, + "StartLine": startLine, } } @@ -583,11 +582,12 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i } return map[string]interface{}{ - "JobName": jobName, - "Content": re, - "FileName": fileName, - "lines": count, - "endLine": fileEndLine, + "JobName": jobName, + "Content": re, + "FileName": fileName, + "Lines": count, + "EndLine": fileEndLine, + "StartLine": startLine, } } From 16b23ff5cb3ae9c794c3c4b098101148afb381d6 Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 1 Sep 2022 10:57:56 +0800 Subject: [PATCH 11/26] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- routers/api/v1/repo/cloudbrain.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index fb20b6c7b..20c3cff01 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -414,13 +414,14 @@ func CloudbrainGetLog(ctx *context.Context) { } lines := ctx.QueryInt("lines") baseLine := ctx.Query("base_line") + order := ctx.Query("order") var result map[string]interface{} - if baseLine == "" { + if baseLine == "" && order == "desc" { result = getLastLogFromModelDir(job.JobName, lines) } else { startLine := ctx.QueryInt("base_line") endLine := startLine + lines - order := ctx.Query("order") + if order == "asc" { endLine = startLine startLine = endLine - lines From 8e1df7854543e6510694d63f3753f9616bf8fc18 Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 1 Sep 2022 11:08:17 +0800 Subject: [PATCH 12/26] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- routers/api/v1/repo/cloudbrain.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index 20c3cff01..5fbaaa03b 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -559,7 +559,6 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i for i := 0; i < endLine; i++ { line, error := r.ReadString('\n') log.Info("line=" + line) - fileEndLine = i if error == io.EOF { log.Info("read file completed.") break @@ -570,11 +569,13 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i } if error == nil { if i >= startLine { + fileEndLine = i re = re + line count++ } } } + fileEndLine = fileEndLine + 1 } else { log.Info("error:" + err.Error()) } From 989b0d8f8a7b64694f4c7e2cee445d20ac72b007 Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 1 Sep 2022 11:16:57 +0800 Subject: [PATCH 13/26] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- routers/api/v1/repo/cloudbrain.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index 5fbaaa03b..8c638d6b5 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -8,7 +8,6 @@ package repo import ( "bufio" "encoding/json" - "fmt" "io" "net/http" "os" @@ -422,7 +421,7 @@ func CloudbrainGetLog(ctx *context.Context) { startLine := ctx.QueryInt("base_line") endLine := startLine + lines - if order == "asc" { + if order == "asc" && (startLine-lines) > 0 { endLine = startLine startLine = endLine - lines if startLine < 0 { @@ -513,7 +512,6 @@ func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { } if error == nil { if i >= startLine { - log.Info("i=" + fmt.Sprint(i)) re = re + line } } From da0124ca660eb1e2aa2b62dd2fdfd2c6157890e2 Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 1 Sep 2022 11:20:27 +0800 Subject: [PATCH 14/26] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- routers/api/v1/repo/cloudbrain.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index 8c638d6b5..75ddaf616 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -421,7 +421,7 @@ func CloudbrainGetLog(ctx *context.Context) { startLine := ctx.QueryInt("base_line") endLine := startLine + lines - if order == "asc" && (startLine-lines) > 0 { + if order == "asc" && (startLine-lines) >= 0 { endLine = startLine startLine = endLine - lines if startLine < 0 { From 99a5298217e3ca4826841b62a1ffc4d4b538566e Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 1 Sep 2022 11:30:23 +0800 Subject: [PATCH 15/26] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- routers/api/v1/repo/cloudbrain.go | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index 75ddaf616..9b222cc6a 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -421,11 +421,15 @@ func CloudbrainGetLog(ctx *context.Context) { startLine := ctx.QueryInt("base_line") endLine := startLine + lines - if order == "asc" && (startLine-lines) >= 0 { - endLine = startLine - startLine = endLine - lines - if startLine < 0 { - startLine = 0 + if order == "asc" { + if (startLine - lines) >= 0 { + endLine = startLine + startLine = endLine - lines + if startLine < 0 { + startLine = 0 + } + } else { + endLine = startLine } } result = getLogFromModelDir(job.JobName, startLine, endLine) @@ -540,7 +544,16 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i log.Error("query cloudbrain model failed: %v", err) return nil } - + if startLine == endLine { + return map[string]interface{}{ + "JobName": jobName, + "Content": "", + "FileName": "", + "Lines": 0, + "EndLine": startLine, + "StartLine": startLine, + } + } re := "" fileName := "" count := 0 From 24799e1842890dfebc0d109fab25941448c3ef70 Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 1 Sep 2022 11:41:25 +0800 Subject: [PATCH 16/26] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- routers/api/v1/repo/cloudbrain.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index 9b222cc6a..f3d1acfad 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -420,16 +420,16 @@ func CloudbrainGetLog(ctx *context.Context) { } else { startLine := ctx.QueryInt("base_line") endLine := startLine + lines - if order == "asc" { - if (startLine - lines) >= 0 { + if baseLine == "" { + startLine = 0 + endLine = lines + } else { endLine = startLine startLine = endLine - lines if startLine < 0 { startLine = 0 } - } else { - endLine = startLine } } result = getLogFromModelDir(job.JobName, startLine, endLine) From beb7b8e3a90b4f8396b4f3ec1a91063874bb2cb7 Mon Sep 17 00:00:00 2001 From: zouap Date: Thu, 1 Sep 2022 15:04:38 +0800 Subject: [PATCH 17/26] =?UTF-8?q?=E6=8E=A8=E7=90=86=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E8=B7=AF=E5=BE=84=E4=BF=AE=E6=94=B9=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zouap --- routers/api/v1/repo/cloudbrain.go | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index f3d1acfad..d6b7bb076 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -415,8 +415,12 @@ func CloudbrainGetLog(ctx *context.Context) { baseLine := ctx.Query("base_line") order := ctx.Query("order") var result map[string]interface{} + resultPath := "/model" + if job.JobType == string(models.JobTypeInference) { + resultPath = "/result" + } if baseLine == "" && order == "desc" { - result = getLastLogFromModelDir(job.JobName, lines) + result = getLastLogFromModelDir(job.JobName, lines, resultPath) } else { startLine := ctx.QueryInt("base_line") endLine := startLine + lines @@ -432,7 +436,7 @@ func CloudbrainGetLog(ctx *context.Context) { } } } - result = getLogFromModelDir(job.JobName, startLine, endLine) + result = getLogFromModelDir(job.JobName, startLine, endLine, resultPath) if result == nil { log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) ctx.ServerError(err.Error(), err) @@ -476,8 +480,8 @@ func getAllLineFromFile(path string) int { return count } -func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { - prefix := "/" + setting.CBCodePathPrefix + jobName + "/model" +func getLastLogFromModelDir(jobName string, lines int, resultPath string) map[string]interface{} { + prefix := "/" + setting.CBCodePathPrefix + jobName + resultPath files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "") if err != nil { log.Error("query cloudbrain model failed: %v", err) @@ -492,7 +496,7 @@ func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { for _, file := range files { if strings.HasSuffix(file.FileName, "log.txt") { fileName = file.FileName - path := storage.GetMinioPath(jobName+"/model/", file.FileName) + path := storage.GetMinioPath(jobName+resultPath+"/", file.FileName) allLines = getAllLineFromFile(path) startLine = allLines - lines if startLine < 0 { @@ -537,8 +541,8 @@ func getLastLogFromModelDir(jobName string, lines int) map[string]interface{} { } } -func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]interface{} { - prefix := "/" + setting.CBCodePathPrefix + jobName + "/model" +func getLogFromModelDir(jobName string, startLine int, endLine int, resultPath string) map[string]interface{} { + prefix := "/" + setting.CBCodePathPrefix + jobName + resultPath files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "") if err != nil { log.Error("query cloudbrain model failed: %v", err) @@ -561,7 +565,7 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i for _, file := range files { if strings.HasSuffix(file.FileName, "log.txt") { fileName = file.FileName - path := storage.GetMinioPath(jobName+"/model/", file.FileName) + path := storage.GetMinioPath(jobName+resultPath+"/", file.FileName) log.Info("path=" + path) reader, err := os.Open(path) defer reader.Close() From 543ab98fe9645ba4bd2f18496cb638c60f19550d Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Thu, 1 Sep 2022 15:21:09 +0800 Subject: [PATCH 18/26] =?UTF-8?q?=E6=94=AF=E6=8C=81=E7=BB=99=E6=8C=87?= =?UTF-8?q?=E5=AE=9A=E7=BB=84=E7=BB=87=E5=AE=9A=E5=88=B6=E8=8A=82=E7=82=B9?= =?UTF-8?q?=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/modelarts/modelarts.go | 18 +++++++++++++++++- modules/setting/setting.go | 2 ++ routers/repo/modelarts.go | 15 +++++++++++++++ templates/repo/modelarts/trainjob/new.tmpl | 9 ++++++++- 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 9e8447978..97791e25a 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -71,7 +71,8 @@ var ( FlavorInfos *models.FlavorInfos ImageInfos *models.ImageInfosModelArts TrainFlavorInfos *Flavor - SpecialPools *models.SpecialPools + SpecialPools *models.SpecialPools + MultiNodeConfig *MultiNodes ) type GenerateTrainJobReq struct { @@ -166,6 +167,14 @@ type ResourcePool struct { } `json:"resource_pool"` } +type MultiNodes struct{ + Info []OrgMultiNode `json:"multinode"` +} +type OrgMultiNode struct{ + Org string `json:"org"` + Node []int `json:"node"` +} + // type Parameter struct { // Label string `json:"label"` // Value string `json:"value"` @@ -773,6 +782,13 @@ func InitSpecialPool() { } } +func InitMultiNode(){ + if MultiNodeConfig ==nil && setting.ModelArtsMultiNode!=""{ + json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig) + } + +} + func HandleTrainJobInfo(task *models.Cloudbrain) error { result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10)) diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 1e96ff9da..3b8a1d8cf 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -547,6 +547,7 @@ var ( FlavorInfos string TrainJobFLAVORINFOS string ModelArtsSpecialPools string + ModelArtsMultiNode string //grampus config Grampus = struct { @@ -1432,6 +1433,7 @@ func NewContext() { FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("") TrainJobFLAVORINFOS = sec.Key("TrainJob_FLAVOR_INFOS").MustString("") ModelArtsSpecialPools = sec.Key("SPECIAL_POOL").MustString("") + ModelArtsMultiNode=sec.Key("MULTI_NODE").MustString("") sec = Cfg.Section("elk") ElkUrl = sec.Key("ELKURL").MustString("") diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 847e831f6..10843e683 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -763,9 +763,23 @@ func trainJobNewDataPrepare(ctx *context.Context) error { waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount + setMultiNodeIfConfigureMatch(ctx) + return nil } +func setMultiNodeIfConfigureMatch(ctx *context.Context) { + modelarts.InitMultiNode() + if modelarts.MultiNodeConfig != nil { + for _, info := range modelarts.MultiNodeConfig.Info { + if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, ctx.User.ID); isInOrg { + ctx.Data["WorkNode"] = info.Node + break + } + } + } +} + func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) { modelarts.InitSpecialPool() @@ -880,6 +894,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts ctx.Data["datasetType"] = models.TypeCloudBrainTwo waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount + setMultiNodeIfConfigureMatch(ctx) return nil } diff --git a/templates/repo/modelarts/trainjob/new.tmpl b/templates/repo/modelarts/trainjob/new.tmpl index 7818938d3..2b6ea923b 100755 --- a/templates/repo/modelarts/trainjob/new.tmpl +++ b/templates/repo/modelarts/trainjob/new.tmpl @@ -287,8 +287,15 @@ id="trainjob_work_server_num" tabindex="3" autofocus required maxlength="255" value="1" readonly>
- + {{if .WorkNode}} + {{range .WorkNode}} + + {{end}} + + {{else}} + {{end}}
From de61282452933665739155d18d75803a4439c311 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Thu, 1 Sep 2022 15:23:53 +0800 Subject: [PATCH 19/26] show cd --- routers/repo/cloudbrain.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 2d8bebf4b..2e35ac3cc 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -2763,6 +2763,8 @@ func GetCloudbrainAiCenter(task models.Cloudbrain, ctx *context.Context) string return ctx.Tr("repo.cloudbrain1") } else if task.Type == models.TypeCloudBrainTwo { return ctx.Tr("repo.cloudbrain2") + } else if task.Type == models.TypeCDCenter { + return ctx.Tr("repo.cdCenter") } else if task.Type == models.TypeC2Net { return getCutStringAiCenterByAiCenter(task.AiCenter) } @@ -2777,7 +2779,7 @@ func getCutStringAiCenterByAiCenter(aiCenter string) string { } func GetCloudbrainCluster(task models.Cloudbrain, ctx *context.Context) string { - if task.Type == models.TypeCloudBrainOne || task.Type == models.TypeCloudBrainTwo { + if task.Type == models.TypeCloudBrainOne || task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeCDCenter { return ctx.Tr("cloudbrain.resource_cluster_openi") } else if task.Type == models.TypeC2Net { return ctx.Tr("cloudbrain.resource_cluster_c2net") @@ -2864,10 +2866,10 @@ func GetCloudbrainFlavorName(task models.Cloudbrain) (string, error) { return CloudbrainOneFlavorName, nil } } - } else if (task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeC2Net) && task.FlavorName != "" { + } else if (task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeC2Net || task.Type == models.TypeCDCenter) && task.FlavorName != "" { replaceFlavorName := strings.ReplaceAll(task.FlavorName, ":", ":") return replaceFlavorName, nil - } else if task.Type == models.TypeCloudBrainTwo && task.FlavorName == "" && task.FlavorCode != "" { + } else if (task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeCDCenter) && task.FlavorName == "" && task.FlavorCode != "" { cloudbrainTwoFlavorName := getFlavorNameByFlavorCode(task.FlavorCode) return cloudbrainTwoFlavorName, nil } else if task.Type == models.TypeCloudBrainTwo && task.JobType == string(models.JobTypeDebug) && task.FlavorName == "" && task.FlavorCode == "" { From 8bc5b8b06386ddb7d9efb7ac5463f8a48893d946 Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Thu, 1 Sep 2022 15:35:56 +0800 Subject: [PATCH 20/26] fix issue --- templates/repo/cloudbrain/inference/show.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/repo/cloudbrain/inference/show.tmpl b/templates/repo/cloudbrain/inference/show.tmpl index 4ab65ca84..6d0cee642 100644 --- a/templates/repo/cloudbrain/inference/show.tmpl +++ b/templates/repo/cloudbrain/inference/show.tmpl @@ -525,7 +525,7 @@ -
From 50ced14cf74733721928e7956a17b475ccda828c Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Thu, 1 Sep 2022 15:38:25 +0800 Subject: [PATCH 21/26] show cluster --- options/locale/locale_en-US.ini | 1 + options/locale/locale_zh-CN.ini | 1 + 2 files changed, 2 insertions(+) diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index 5eac4cf2e..adb5eebf5 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -1079,6 +1079,7 @@ balance.total_view = Total Balance balance.available = Available Balance: cloudbrain1 = cloudbrain1 cloudbrain2 = cloudbrain2 +cdCenter = cd_ai_center cloudbrain_selection = select cloudbrain cloudbrain_platform_selection = Select the cloudbrain platform you want to use: confirm_choice = Confirm diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index 2fbd3ab52..c8daeb1be 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -1080,6 +1080,7 @@ balance.total_view=余额总览 balance.available=可用余额: cloudbrain1=云脑1 cloudbrain2=云脑2 +cdCenter=成都智算中心 intelligent_net=智算网络 cloudbrain_selection=云脑选择 cloudbrain_platform_selection=选择您准备使用的云脑平台: From af0bada5d33d62100f549d94f14f0b5ebf24eec7 Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Thu, 1 Sep 2022 17:18:42 +0800 Subject: [PATCH 22/26] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- options/locale/locale_en-US.ini | 1 + options/locale/locale_zh-CN.ini | 1 + routers/repo/modelarts.go | 69 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 3 deletions(-) diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index 5eac4cf2e..3453344f7 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -1213,6 +1213,7 @@ modelarts.infer_job.select_model = Select Model modelarts.infer_job.boot_file_helper=The startup file is the entry file for your program execution and must end in.py.Such as inference.py, main.py, example/inference.py, case/main.py. modelarts.infer_job.tooltip = The model has been deleted and cannot be viewed. modelarts.download_log=Download log file +modelarts.no_node_right = The value of 'Amount of Compute Node' is wrong, you have no right to use the current value of 'Amount of Compute Node'. debug_task_not_created = Debug task has not been created diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index 2fbd3ab52..d527218d3 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -1226,6 +1226,7 @@ modelarts.infer_job.select_model = 选择模型 modelarts.infer_job.boot_file_helper=启动文件是您程序执行的入口文件,必须是以.py结尾的文件。比如inference.py、main.py、example/inference.py、case/main.py。 modelarts.infer_job.tooltip = 该模型已删除,无法查看。 modelarts.download_log=下载日志文件 +modelarts.no_node_right = 计算节点数的值配置错误,您没有权限使用当前配置的计算节点数。 debug_task_not_created = 未创建过调试任务 diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 10843e683..424a7fe23 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -1130,6 +1130,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) VersionCount := modelarts.VersionCountOne EngineName := form.EngineName + errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) + return + } + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -1160,7 +1167,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) return } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) if errStr != "" { trainJobErrorNewDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) @@ -1364,6 +1371,48 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func checkMultiNode(userId int64, serverNum int) string{ + if serverNum==1{ + return "" + } + modelarts.InitMultiNode() + var isServerNumValid=false + if modelarts.MultiNodeConfig != nil { + for _, info := range modelarts.MultiNodeConfig.Info { + if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, userId); isInOrg { + if isInNodes(info.Node,serverNum){ + isServerNumValid=true + break + } + + } + } + } + if isServerNumValid{ + return "" + }else{ + return "repo.modelarts.no_node_right" + } +} +func checkInferenceJobMultiNode(userId int64, serverNum int) string{ + if serverNum==1{ + return "" + } + + return "repo.modelarts.no_node_right" + +} + +func isInNodes(nodes []int, num int) bool { + for _, node:=range nodes{ + if node==num{ + return true + } + } + return false + +} + func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, string) { userImageUrl := "" userCommand := "" @@ -1398,6 +1447,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") + errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + versionErrorDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) + return + } + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -1465,7 +1521,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ return } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) if errStr != "" { versionErrorDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) @@ -2036,6 +2092,13 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference ckptUrl := "/" + form.TrainUrl + form.CkptName log.Info("ckpt url:" + ckptUrl) + errStr:=checkInferenceJobMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + inferenceJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) + return + } + count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -2084,7 +2147,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference } } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) if errStr != "" { inferenceJobErrorNewDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) From 2355c18ab50b5002f270876da800a5d005f38cf4 Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Thu, 1 Sep 2022 18:10:49 +0800 Subject: [PATCH 23/26] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- templates/repo/modelarts/trainjob/new.tmpl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/templates/repo/modelarts/trainjob/new.tmpl b/templates/repo/modelarts/trainjob/new.tmpl index 2b6ea923b..cc1c0d7f1 100755 --- a/templates/repo/modelarts/trainjob/new.tmpl +++ b/templates/repo/modelarts/trainjob/new.tmpl @@ -290,7 +290,16 @@