You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

modelarts.go 6.9 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. package modelarts
  import (
  	"encoding/json"
  	"errors"
  	"path"
  	"strconv"

  	"code.gitea.io/gitea/models"
  	"code.gitea.io/gitea/modules/context"
  	"code.gitea.io/gitea/modules/log"
  	"code.gitea.io/gitea/modules/setting"
  	"code.gitea.io/gitea/modules/storage"
  )
  // Constants used when creating notebook (debug) jobs.
  const (
  	// storageTypeOBS is the storage type GenerateTask sets on the notebook spec.
  	storageTypeOBS = "obs"
  	// autoStopDuration is the auto-stop duration GenerateTask passes for a
  	// notebook (presumably seconds, i.e. four hours — TODO confirm the unit).
  	autoStopDuration = 4 * 60 * 60

  	// NOTE(review): the four values below are not referenced in this part of
  	// the file; their names suggest notebook display/mount defaults — confirm
  	// against the callers.
  	DataSetMountPath = "/home/ma-user/work"
  	NotebookEnv      = "Python3"
  	NotebookType     = "Ascend"
  	FlavorInfo       = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  )

  // The commented-out definitions below are disabled hard-coded JSON lists for
  // train jobs, kept here for reference.
  //
  // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  // "]}"
  // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  // "]}"

  // Constants used for train jobs.
  const (
  	// Path fragments for a train job's code, output, log and job locations.
  	CodePath   = "/code/"
  	OutputPath = "/output/"
  	LogPath    = "/log/"
  	JobPath    = "/job/"

  	// Ordering values for queries.
  	OrderDesc = "desc" // query downward
  	OrderAsc  = "asc"  // query upward

  	// Lines is a line count (presumably the number of log lines fetched per
  	// request — TODO confirm against the caller).
  	Lines = 20

  	// Parameter names "train_url" and "data_url".
  	TrainUrl = "train_url"
  	DataUrl  = "data_url"

  	// PerPage is a page size; SortByCreateTime is a sort key.
  	PerPage          = 10
  	SortByCreateTime = "create_time"

  	ConfigTypeCustom = "custom"
  )
  46. var (
  47. poolInfos *models.PoolInfos
  48. FlavorInfos *models.FlavorInfos
  49. )
  50. type GenerateTrainJobReq struct {
  51. JobName string
  52. Uuid string
  53. Description string
  54. CodeObsPath string
  55. BootFile string
  56. DataUrl string
  57. TrainUrl string
  58. FlavorCode string
  59. LogUrl string
  60. PoolID string
  61. WorkServerNumber int
  62. EngineID int64
  63. Parameters []models.Parameter
  64. }
  // VersionInfo is the decoded form of a JSON document shaped like
  // {"version":[{"id":118,"value":"..."}]}.
  type VersionInfo struct {
  	// Version lists the entries, each an integer id with its display value.
  	Version []struct {
  		ID    int    `json:"id"`
  		Value string `json:"value"`
  	} `json:"version"`
  }
  // Flavor is the decoded form of a JSON document shaped like
  // {"flavor":[{"code":"...","value":"..."}]}.
  type Flavor struct {
  	// Info lists the entries, each a flavor code with its display value.
  	Info []struct {
  		Code  string `json:"code"`
  		Value string `json:"value"`
  	} `json:"flavor"`
  }
  // Engine is the decoded form of a JSON document shaped like
  // {"engine":[{"id":1,"value":"..."}]}.
  type Engine struct {
  	// Info lists the entries, each an integer id with its display value.
  	Info []struct {
  		ID    int    `json:"id"`
  		Value string `json:"value"`
  	} `json:"engine"`
  }
  // ResourcePool is the decoded form of a JSON document shaped like
  // {"resource_pool":[{"id":"...","value":"..."}]}.
  type ResourcePool struct {
  	// Info lists the entries, each a string pool id with its display value.
  	Info []struct {
  		ID    string `json:"id"`
  		Value string `json:"value"`
  	} `json:"resource_pool"`
  }
  89. func GenerateTask(ctx *context.Context, jobName, uuid, description string) error {
  90. var dataActualPath string
  91. if uuid != "" {
  92. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  93. } else {
  94. userPath := setting.UserBasePath + ctx.User.Name + "/"
  95. isExist, err := storage.ObsHasObject(userPath)
  96. if err != nil {
  97. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  98. return err
  99. }
  100. if !isExist {
  101. if err = storage.ObsCreateObject(userPath); err != nil {
  102. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  103. return err
  104. }
  105. }
  106. dataActualPath = setting.Bucket + "/" + userPath
  107. }
  108. if poolInfos == nil {
  109. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  110. }
  111. jobResult, err := CreateJob(models.CreateNotebookParams{
  112. JobName: jobName,
  113. Description: description,
  114. ProfileID: setting.ProfileID,
  115. Flavor: setting.Flavor,
  116. Pool: models.Pool{
  117. ID: poolInfos.PoolInfo[0].PoolId,
  118. Name: poolInfos.PoolInfo[0].PoolName,
  119. Type: poolInfos.PoolInfo[0].PoolType,
  120. },
  121. Spec: models.Spec{
  122. Storage: models.Storage{
  123. Type: storageTypeOBS,
  124. Location: models.Location{
  125. Path: dataActualPath,
  126. },
  127. },
  128. AutoStop: models.AutoStop{
  129. Enable: true,
  130. Duration: autoStopDuration,
  131. },
  132. },
  133. })
  134. if err != nil {
  135. log.Error("CreateJob failed: %v", err.Error())
  136. return err
  137. }
  138. err = models.CreateCloudbrain(&models.Cloudbrain{
  139. Status: string(models.JobWaiting),
  140. UserID: ctx.User.ID,
  141. RepoID: ctx.Repo.Repository.ID,
  142. JobID: jobResult.ID,
  143. JobName: jobName,
  144. JobType: string(models.JobTypeDebug),
  145. Type: models.TypeCloudBrainNotebook,
  146. Uuid: uuid,
  147. })
  148. if err != nil {
  149. return err
  150. }
  151. return nil
  152. }
  153. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
  154. jobResult, err := createTrainJob(models.CreateTrainJobParams{
  155. JobName: req.JobName,
  156. Description: req.Description,
  157. Config: models.Config{
  158. WorkServerNum: req.WorkServerNumber,
  159. AppUrl: req.CodeObsPath,
  160. BootFileUrl: req.BootFile,
  161. DataUrl: req.DataUrl,
  162. EngineID: req.EngineID,
  163. TrainUrl: req.TrainUrl,
  164. LogUrl: req.LogUrl,
  165. PoolID: req.PoolID,
  166. CreateVersion: true,
  167. Flavor: models.Flavor{
  168. Code: req.FlavorCode,
  169. },
  170. Parameter: req.Parameters,
  171. },
  172. })
  173. if err != nil {
  174. log.Error("CreateJob failed: %v", err.Error())
  175. return err
  176. }
  177. attach, err := models.GetAttachmentByUUID(req.Uuid)
  178. if err != nil {
  179. log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  180. return nil
  181. }
  182. err = models.CreateCloudbrain(&models.Cloudbrain{
  183. Status: TransTrainJobStatus(jobResult.Status),
  184. UserID: ctx.User.ID,
  185. RepoID: ctx.Repo.Repository.ID,
  186. JobID: strconv.FormatInt(jobResult.JobID, 10),
  187. JobName: req.JobName,
  188. JobType: string(models.JobTypeDebug),
  189. Type: models.TypeCloudBrainTrainJob,
  190. VersionID: jobResult.VersionID,
  191. VersionName: jobResult.VersionName,
  192. Uuid: req.Uuid,
  193. DatasetName: attach.Name,
  194. })
  195. if err != nil {
  196. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  197. return err
  198. }
  199. return nil
  200. }
  // trainJobStatusNames maps a numeric train job status code (the slice index)
  // to its status name.
  var trainJobStatusNames = [...]string{
  	0:  "UNKNOWN",
  	1:  "INIT",
  	2:  "IMAGE_CREATING",
  	3:  "IMAGE_FAILED",
  	4:  "SUBMIT_TRYING",
  	5:  "SUBMIT_FAILED",
  	6:  "DELETE_FAILED",
  	7:  "WAITING",
  	8:  "RUNNING",
  	9:  "KILLING",
  	10: "COMPLETED",
  	11: "FAILED",
  	12: "KILLED",
  	13: "CANCELED",
  	14: "LOST",
  	15: "SCALING",
  	16: "SUBMIT_MODEL_FAILED",
  	17: "DEPLOY_SERVICE_FAILED",
  	18: "CHECK_INIT",
  	19: "CHECK_RUNNING",
  	20: "CHECK_RUNNING_COMPLETED",
  	21: "CHECK_FAILED",
  }

  // TransTrainJobStatus translates a numeric train job status code into its
  // status name. Codes outside 0..21 (including negative ones) are returned as
  // their decimal string form.
  func TransTrainJobStatus(status int) string {
  	if status >= 0 && status < len(trainJobStatusNames) {
  		return trainJobStatusNames[status]
  	}
  	return strconv.Itoa(status)
  }