|
- package modelarts
-
import (
	"errors"
	"path"
	"strconv"

	"code.gitea.io/gitea/models"
	"code.gitea.io/gitea/modules/context"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/setting"
)
-
const (
	// Notebook settings passed to createNotebook by GenerateTask.

	// storageTypeOBS is the storage type set on the notebook spec.
	storageTypeOBS = "obs"
	// autoStopDuration is the notebook auto-stop duration
	// (presumably seconds, i.e. 4 hours — TODO confirm the unit).
	autoStopDuration = 4 * 60 * 60
	// flavor is the notebook flavor code.
	flavor = "modelarts.kat1.xlarge"
	// profileID identifies the notebook profile. An earlier value,
	// "Python3-ascend910-arm", was left commented out in the original source.
	profileID = "efa847c0-7359-11eb-b34f-0255ac100057"
	// poolID, poolName and poolType describe the resource pool used for notebooks.
	poolID   = "pool1328035d"
	poolName = "train-private-1"
	poolType = "USER_DEFINED"

	// DataSetMountPath is an absolute path string
	// (presumably where the dataset is mounted inside the notebook — verify against callers).
	DataSetMountPath = "/home/ma-user/work"
	// NotebookEnv is the notebook environment label.
	NotebookEnv = "Python3"
	// NotebookType is the notebook hardware type label.
	NotebookType = "Ascend"
	// FlavorInfo is the human-readable description of the notebook flavor.
	// The trailing flavor code is taken from flavor so the two cannot drift apart.
	FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (" + flavor + ")"

	// Train-job settings. The following values are JSON documents kept as strings.

	// ResourcePools is the JSON list of selectable resource pools. The pool id
	// is taken from poolID so the two cannot drift apart.
	ResourcePools = "{\"resource_pool\":[{\"id\":\"" + poolID + "\", \"value\":\"专属资源池\"}]}"
	// Engines is the JSON list of selectable engines.
	Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
	// EngineVersions is the JSON list of selectable engine versions.
	EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
		"{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
		"{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
		"{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
		"]}"
	// FlavorInfos is the JSON list of selectable train-job flavors.
	FlavorInfos = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
		"{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
		"{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
		"{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
		"]}"
	// CodePath, OutputPath, LogPath and JobPath are slash-delimited path segments
	// (presumably sub-directories of a train job's storage location — verify against callers).
	CodePath   = "/code/"
	OutputPath = "/output/"
	LogPath    = "/log/"
	JobPath    = "/job/"
	// OrderDesc and OrderAsc are sort-order keywords.
	OrderDesc = "desc"
	OrderAsc  = "asc"
)
-
// GenerateTrainJobReq carries the parameters GenerateTrainJob needs to create
// a train job and record it.
type GenerateTrainJobReq struct {
	// JobName is sent as the job name and stored on the Cloudbrain record.
	JobName string
	// Uuid is not read by GenerateTrainJob itself
	// (presumably the dataset uuid — verify against callers).
	Uuid string
	// Description is sent as the job description.
	Description string
	// CodeObsPath is sent as the config's AppUrl.
	CodeObsPath string
	// BootFile is sent as the config's BootFileUrl.
	BootFile string
	// DataUrl is sent as the config's DataUrl.
	DataUrl string
	// TrainUrl is sent as the config's TrainUrl.
	TrainUrl string
	// FlavorCode is sent as the config's Flavor.Code.
	FlavorCode string
	// LogUrl is sent as the config's LogUrl.
	LogUrl string
	// PoolID is sent as the config's PoolID.
	PoolID string
	// WorkServerNumber is sent as the config's WorkServerNum.
	WorkServerNumber int
	// EngineID is sent as the config's EngineID.
	EngineID int64
}
-
// VersionInfo is a JSON document of the form {"version":[{"id":..,"value":..}]}.
// The shape matches the EngineVersions constant (presumably it is used to
// decode that value — verify against callers).
type VersionInfo struct {
	// Version lists the entries, each with a numeric id and a display value.
	Version []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"version"`
}
-
// Flavor is a JSON document of the form {"flavor":[{"code":..,"value":..}]}.
// The shape matches the FlavorInfos constant (presumably it is used to decode
// that value — verify against callers).
type Flavor struct {
	// Info lists the entries, each with a flavor code and a display value.
	Info []struct {
		Code  string `json:"code"`
		Value string `json:"value"`
	} `json:"flavor"`
}
-
// Engine is a JSON document of the form {"engine":[{"id":..,"value":..}]}.
// The shape matches the Engines constant (presumably it is used to decode
// that value — verify against callers).
type Engine struct {
	// Info lists the entries, each with a numeric id and a display value.
	Info []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}
-
// ResourcePool is a JSON document of the form
// {"resource_pool":[{"id":..,"value":..}]}. The shape matches the
// ResourcePools constant (presumably it is used to decode that value — verify
// against callers).
type ResourcePool struct {
	// Info lists the entries, each with a string pool id and a display value.
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
-
- func GenerateTask(ctx *context.Context, jobName, uuid, description string) error {
- dataActualPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
- jobResult, err := createNotebook(models.CreateNotebookParams{
- JobName: jobName,
- Description:description,
- ProfileID: profileID,
- Flavor: flavor,
- Pool: models.Pool{
- ID: poolID,
- Name: poolName,
- Type: poolType,
- },
- Spec: models.Spec{
- Storage: models.Storage{
- Type: storageTypeOBS,
- Location:models.Location{
- Path: dataActualPath,
- },
- },
- AutoStop: models.AutoStop{
- Enable: true,
- Duration: autoStopDuration,
- },
- },
-
- })
- if err != nil {
- log.Error("CreateJob failed: %v", err.Error())
- return err
- }
-
- err = models.CreateCloudbrain(&models.Cloudbrain{
- Status: string(models.JobWaiting),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: jobResult.ID,
- JobName: jobName,
- JobType: string(models.JobTypeDebug),
- Type: models.TypeCloudBrainNotebook,
- })
-
- if err != nil {
- return err
- }
-
- return nil
- }
-
- func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
- jobResult, err := createTrainJob(models.CreateTrainJobParams{
- JobName: req.JobName,
- Description: req.Description,
- Config: models.Config{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFile,
- DataUrl: req.DataUrl,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.FlavorCode,
- },
- },
-
- })
- if err != nil {
- log.Error("CreateJob failed: %v", err.Error())
- return err
- }
-
- err = models.CreateCloudbrain(&models.Cloudbrain{
- Status: transTrainJobStatus(jobResult.Status),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: strconv.FormatInt(jobResult.JobID, 10),
- JobName: req.JobName,
- JobType: string(models.JobTypeDebug),
- Type: models.TypeCloudBrainTrainJob,
- VersionID: jobResult.VersionID,
- VersionName: jobResult.VersionName,
- })
-
- if err != nil {
- log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
- return err
- }
-
- return nil
- }
-
// trainJobStatusNames maps a numeric train-job status code, used as the index,
// to its textual name.
var trainJobStatusNames = [...]string{
	0:  "UNKNOWN",
	1:  "INIT",
	2:  "IMAGE_CREATING",
	3:  "IMAGE_FAILED",
	4:  "SUBMIT_TRYING",
	5:  "SUBMIT_FAILED",
	6:  "DELETE_FAILED",
	7:  "WAITING",
	8:  "RUNNING",
	9:  "KILLING",
	10: "COMPLETED",
	11: "FAILED",
	12: "KILLED",
	13: "CANCELED",
	14: "LOST",
	15: "SCALING",
	16: "SUBMIT_MODEL_FAILED",
	17: "DEPLOY_SERVICE_FAILED",
	18: "CHECK_INIT",
	19: "CHECK_RUNNING",
	20: "CHECK_RUNNING_COMPLETED",
	21: "CHECK_FAILED",
}

// transTrainJobStatus converts a numeric train-job status code into its
// textual name. Codes outside the known range 0-21, including negative ones,
// are returned as their decimal representation.
func transTrainJobStatus(status int) string {
	if status < 0 || status >= len(trainJobStatusNames) {
		return strconv.Itoa(status)
	}
	return trainJobStatusNames[status]
}
|