You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 5.8 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. package modelarts
import (
	"fmt"
	"path"
	"strconv"

	"code.gitea.io/gitea/models"
	"code.gitea.io/gitea/modules/context"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/setting"
)
  10. const (
  11. //notebook
  12. storageTypeOBS = "obs"
  13. autoStopDuration = 4 * 60 * 60
  14. flavor = "modelarts.kat1.xlarge"
  15. //profileID = "Python3-ascend910-arm"
  16. profileID = "efa847c0-7359-11eb-b34f-0255ac100057"
  17. poolID = "pool1328035d"
  18. poolName = "train-private-1"
  19. poolType = "USER_DEFINED"
  20. DataSetMountPath = "/home/ma-user/work"
  21. NotebookEnv = "Python3"
  22. NotebookType = "Ascend"
  23. FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  24. //train-job
  25. ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  26. Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  27. EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  28. "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  29. "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  30. "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  31. "]}"
  32. FlavorInfos = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  33. "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  34. "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  35. "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  36. "]}"
  37. CodePath = "/code/"
  38. OutputPath = "/output/"
  39. LogPath = "/log/"
  40. JobPath = "/job/"
  41. OrderDesc = "desc"
  42. OrderAsc = "asc"
  43. TrainUrl = "train_url"
  44. DataUrl = "data_url"
  45. )
// GenerateTrainJobReq carries the inputs GenerateTrainJob needs to create a
// train job and to record it as a Cloudbrain entry.
type GenerateTrainJobReq struct {
	// JobName is sent as the train job's name and stored on the Cloudbrain record.
	JobName string
	// Uuid is stored on the Cloudbrain record.
	Uuid string
	// Description is sent as the train job's description.
	Description string
	// CodeObsPath is sent as the job config's AppUrl.
	CodeObsPath string
	// BootFile is sent as the job config's BootFileUrl.
	BootFile string
	// DataUrl is sent as the job config's DataUrl.
	DataUrl string
	// TrainUrl is sent as the job config's TrainUrl.
	TrainUrl string
	// FlavorCode is sent as the job config's Flavor.Code.
	FlavorCode string
	// LogUrl is sent as the job config's LogUrl.
	LogUrl string
	// PoolID is sent as the job config's PoolID.
	PoolID string
	// WorkServerNumber is sent as the job config's WorkServerNum.
	WorkServerNumber int
	// EngineID is sent as the job config's EngineID.
	EngineID int64
	// Parameters is sent as the job config's Parameter list.
	Parameters []models.Parameter
}
// VersionInfo is the JSON shape {"version":[{"id":<int>,"value":<string>}, ...]},
// which matches the layout of the EngineVersions constant in this file.
type VersionInfo struct {
	// Version holds one id/value entry per listed version.
	Version []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"version"`
}
// Flavor is the JSON shape {"flavor":[{"code":<string>,"value":<string>}, ...]},
// which matches the layout of the FlavorInfos constant in this file.
type Flavor struct {
	// Info holds one code/value entry per listed flavor.
	Info []struct {
		Code  string `json:"code"`
		Value string `json:"value"`
	} `json:"flavor"`
}
// Engine is the JSON shape {"engine":[{"id":<int>,"value":<string>}, ...]},
// which matches the layout of the Engines constant in this file.
type Engine struct {
	// Info holds one id/value entry per listed engine.
	Info []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}
// ResourcePool is the JSON shape
// {"resource_pool":[{"id":<string>,"value":<string>}, ...]}, which matches
// the layout of the ResourcePools constant in this file.
type ResourcePool struct {
	// Info holds one id/value entry per listed resource pool.
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
  85. func GenerateTask(ctx *context.Context, jobName, uuid, description string) error {
  86. dataActualPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  87. jobResult, err := createNotebook(models.CreateNotebookParams{
  88. JobName: jobName,
  89. Description:description,
  90. ProfileID: profileID,
  91. Flavor: flavor,
  92. Pool: models.Pool{
  93. ID: poolID,
  94. Name: poolName,
  95. Type: poolType,
  96. },
  97. Spec: models.Spec{
  98. Storage: models.Storage{
  99. Type: storageTypeOBS,
  100. Location:models.Location{
  101. Path: dataActualPath,
  102. },
  103. },
  104. AutoStop: models.AutoStop{
  105. Enable: true,
  106. Duration: autoStopDuration,
  107. },
  108. },
  109. })
  110. if err != nil {
  111. log.Error("CreateJob failed: %v", err.Error())
  112. return err
  113. }
  114. err = models.CreateCloudbrain(&models.Cloudbrain{
  115. Status: string(models.JobWaiting),
  116. UserID: ctx.User.ID,
  117. RepoID: ctx.Repo.Repository.ID,
  118. JobID: jobResult.ID,
  119. JobName: jobName,
  120. JobType: string(models.JobTypeDebug),
  121. Type: models.TypeCloudBrainNotebook,
  122. Uuid: uuid,
  123. })
  124. if err != nil {
  125. return err
  126. }
  127. return nil
  128. }
  129. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
  130. jobResult, err := createTrainJob(models.CreateTrainJobParams{
  131. JobName: req.JobName,
  132. Description: req.Description,
  133. Config: models.Config{
  134. WorkServerNum: req.WorkServerNumber,
  135. AppUrl: req.CodeObsPath,
  136. BootFileUrl: req.BootFile,
  137. DataUrl: req.DataUrl,
  138. EngineID: req.EngineID,
  139. TrainUrl: req.TrainUrl,
  140. LogUrl: req.LogUrl,
  141. PoolID: req.PoolID,
  142. CreateVersion: true,
  143. Flavor: models.Flavor{
  144. Code: req.FlavorCode,
  145. },
  146. Parameter: req.Parameters,
  147. },
  148. })
  149. if err != nil {
  150. log.Error("CreateJob failed: %v", err.Error())
  151. return err
  152. }
  153. err = models.CreateCloudbrain(&models.Cloudbrain{
  154. Status: TransTrainJobStatus(jobResult.Status),
  155. UserID: ctx.User.ID,
  156. RepoID: ctx.Repo.Repository.ID,
  157. JobID: strconv.FormatInt(jobResult.JobID, 10),
  158. JobName: req.JobName,
  159. JobType: string(models.JobTypeDebug),
  160. Type: models.TypeCloudBrainTrainJob,
  161. VersionID: jobResult.VersionID,
  162. VersionName: jobResult.VersionName,
  163. Uuid: req.Uuid,
  164. })
  165. if err != nil {
  166. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  167. return err
  168. }
  169. return nil
  170. }
  171. func TransTrainJobStatus(status int) string{
  172. switch status {
  173. case 0:
  174. return "UNKNOWN"
  175. case 1:
  176. return "INIT"
  177. case 2:
  178. return "IMAGE_CREATING"
  179. case 3:
  180. return "IMAGE_FAILED"
  181. case 4:
  182. return "SUBMIT_TRYING"
  183. case 5:
  184. return "SUBMIT_FAILED"
  185. case 6:
  186. return "DELETE_FAILED"
  187. case 7:
  188. return "WAITING"
  189. case 8:
  190. return "RUNNING"
  191. case 9:
  192. return "KILLING"
  193. case 10:
  194. return "COMPLETED"
  195. case 11:
  196. return "FAILED"
  197. case 12:
  198. return "KILLED"
  199. case 13:
  200. return "CANCELED"
  201. case 14:
  202. return "LOST"
  203. case 15:
  204. return "SCALING"
  205. case 16:
  206. return "SUBMIT_MODEL_FAILED"
  207. case 17:
  208. return "DEPLOY_SERVICE_FAILED"
  209. case 18:
  210. return "CHECK_INIT"
  211. case 19:
  212. return "CHECK_RUNNING"
  213. case 20:
  214. return "CHECK_RUNNING_COMPLETED"
  215. case 21:
  216. return "CHECK_FAILED"
  217. default:
  218. return strconv.Itoa(status)
  219. }
  220. return ""
  221. }