You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 35 kB

4 years ago
3 years ago
3 years ago
4 years ago
2 years ago
4 years ago
2 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago

  1. package modelarts
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "path"
  7. "strconv"
  8. "strings"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. "code.gitea.io/gitea/modules/storage"
  15. "code.gitea.io/gitea/modules/timeutil"
  16. )
  // Constants shared by the notebook, train-job and inference-job helpers of
  // this package.
  const (
  	// --- notebook ---

  	storageTypeOBS = "obs"
  	// autoStopDuration is passed as AutoStop.Duration by GenerateTask.
  	autoStopDuration = 4 * 60 * 60
  	// autoStopDurationMs is the same span multiplied by 1000; it is passed as
  	// Duration by GenerateNotebook2.
  	autoStopDurationMs = autoStopDuration * 1000
  	// MORDELART_USER_IMAGE_ENGINE_ID is negative; in this file a negative
  	// EngineID routes a train job through the user-image API.
  	MORDELART_USER_IMAGE_ENGINE_ID = -1
  	DataSetMountPath               = "/home/ma-user/work"
  	NotebookEnv                    = "Python3"
  	NotebookType                   = "Ascend"
  	FlavorInfo                     = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"

  	// --- train-job ---

  	// Disabled hard-coded defaults, kept verbatim for reference:
  	// ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  	// Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  	// EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  	// "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  	// "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  	// "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  	// "]}"
  	// TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  	// "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  	// "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  	// "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  	// "]}"

  	// Path segments.
  	CodePath   = "/code/"
  	OutputPath = "/output/"
  	ResultPath = "/result/"
  	LogPath    = "/log/"
  	JobPath    = "/job/"

  	// Log paging.
  	OrderDesc = "desc" // query downward
  	OrderAsc  = "asc"  // query upward
  	Lines     = 500

  	// Parameter labels.
  	TrainUrl     = "train_url"
  	DataUrl      = "data_url"
  	MultiDataUrl = "multi_data_url"
  	ResultUrl    = "result_url"
  	CkptUrl      = "ckpt_url"
  	DeviceTarget = "device_target"
  	Ascend       = "Ascend"

  	// Listing and versioning.
  	PerPage           = 10
  	IsLatestVersion   = "1"
  	NotLatestVersion  = "0"
  	VersionCountOne   = 1
  	SortByCreateTime  = "create_time"
  	ConfigTypeCustom  = "custom"
  	TotalVersionCount = 1
  )
  63. var (
  64. poolInfos *models.PoolInfos
  65. FlavorInfos *models.FlavorInfos
  66. ImageInfos *models.ImageInfosModelArts
  67. TrainFlavorInfos *Flavor
  68. SpecialPools *models.SpecialPools
  69. )
  70. type GenerateTrainJobReq struct {
  71. JobName string
  72. DisplayJobName string
  73. Uuid string
  74. Description string
  75. CodeObsPath string
  76. BootFile string
  77. BootFileUrl string
  78. DataUrl string
  79. TrainUrl string
  80. FlavorCode string
  81. LogUrl string
  82. PoolID string
  83. WorkServerNumber int
  84. EngineID int64
  85. Parameters []models.Parameter
  86. CommitID string
  87. IsLatestVersion string
  88. Params string
  89. BranchName string
  90. PreVersionId int64
  91. PreVersionName string
  92. FlavorName string
  93. VersionCount int
  94. EngineName string
  95. TotalVersionCount int
  96. UserImageUrl string
  97. UserCommand string
  98. DatasetName string
  99. }
  100. type GenerateInferenceJobReq struct {
  101. JobName string
  102. DisplayJobName string
  103. Uuid string
  104. Description string
  105. CodeObsPath string
  106. BootFile string
  107. BootFileUrl string
  108. DataUrl string
  109. TrainUrl string
  110. FlavorCode string
  111. LogUrl string
  112. PoolID string
  113. WorkServerNumber int
  114. EngineID int64
  115. Parameters []models.Parameter
  116. CommitID string
  117. Params string
  118. BranchName string
  119. FlavorName string
  120. EngineName string
  121. LabelName string
  122. IsLatestVersion string
  123. VersionCount int
  124. TotalVersionCount int
  125. ModelName string
  126. ModelVersion string
  127. CkptName string
  128. ResultUrl string
  129. DatasetName string
  130. }
  // VersionInfo mirrors a JSON document of the form
  // {"version":[{"id":...,"value":...,"url":...}]}.
  type VersionInfo struct {
  	// Version holds one entry per element of the "version" array.
  	Version []struct {
  		ID    int    `json:"id"`
  		Value string `json:"value"`
  		Url   string `json:"url"`
  	} `json:"version"`
  }
  // Flavor mirrors a JSON document of the form
  // {"flavor":[{"code":...,"value":...}]}.
  type Flavor struct {
  	// Info holds one entry per element of the "flavor" array.
  	Info []struct {
  		Code  string `json:"code"`
  		Value string `json:"value"`
  	} `json:"flavor"`
  }
  // Engine mirrors a JSON document of the form
  // {"engine":[{"id":...,"value":...}]}.
  type Engine struct {
  	// Info holds one entry per element of the "engine" array.
  	Info []struct {
  		ID    int    `json:"id"`
  		Value string `json:"value"`
  	} `json:"engine"`
  }
  // ResourcePool mirrors a JSON document of the form
  // {"resource_pool":[{"id":...,"value":...}]}.
  type ResourcePool struct {
  	// Info holds one entry per element of the "resource_pool" array.
  	Info []struct {
  		ID    string `json:"id"`
  		Value string `json:"value"`
  	} `json:"resource_pool"`
  }
  156. // type Parameter struct {
  157. // Label string `json:"label"`
  158. // Value string `json:"value"`
  159. // }
  160. // type Parameters struct {
  161. // Parameter []Parameter `json:"parameter"`
  162. // }
  // Parameters mirrors a JSON document of the form
  // {"parameter":[{"label":...,"value":...}]}.
  type Parameters struct {
  	// Parameter holds one label/value pair per element of the "parameter" array.
  	Parameter []struct {
  		Label string `json:"label"`
  		Value string `json:"value"`
  	} `json:"parameter"`
  }
  169. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  170. var dataActualPath string
  171. if uuid != "" {
  172. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  173. } else {
  174. userPath := setting.UserBasePath + ctx.User.Name + "/"
  175. isExist, err := storage.ObsHasObject(userPath)
  176. if err != nil {
  177. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  178. return err
  179. }
  180. if !isExist {
  181. if err = storage.ObsCreateObject(userPath); err != nil {
  182. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  183. return err
  184. }
  185. }
  186. dataActualPath = setting.Bucket + "/" + userPath
  187. }
  188. if poolInfos == nil {
  189. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  190. }
  191. createTime := timeutil.TimeStampNow()
  192. jobResult, err := CreateJob(models.CreateNotebookParams{
  193. JobName: jobName,
  194. Description: description,
  195. ProfileID: setting.ProfileID,
  196. Flavor: flavor,
  197. Pool: models.Pool{
  198. ID: poolInfos.PoolInfo[0].PoolId,
  199. Name: poolInfos.PoolInfo[0].PoolName,
  200. Type: poolInfos.PoolInfo[0].PoolType,
  201. },
  202. Spec: models.Spec{
  203. Storage: models.Storage{
  204. Type: storageTypeOBS,
  205. Location: models.Location{
  206. Path: dataActualPath,
  207. },
  208. },
  209. AutoStop: models.AutoStop{
  210. Enable: true,
  211. Duration: autoStopDuration,
  212. },
  213. },
  214. })
  215. if err != nil {
  216. log.Error("CreateJob failed: %v", err.Error())
  217. return err
  218. }
  219. err = models.CreateCloudbrain(&models.Cloudbrain{
  220. Status: string(models.JobWaiting),
  221. UserID: ctx.User.ID,
  222. RepoID: ctx.Repo.Repository.ID,
  223. JobID: jobResult.ID,
  224. JobName: jobName,
  225. JobType: string(models.JobTypeDebug),
  226. Type: models.TypeCloudBrainTwo,
  227. Uuid: uuid,
  228. ComputeResource: models.NPUResource,
  229. CreatedUnix: createTime,
  230. UpdatedUnix: createTime,
  231. })
  232. if err != nil {
  233. return err
  234. }
  235. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  236. return nil
  237. }
  238. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error {
  239. if poolInfos == nil {
  240. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  241. }
  242. imageName, err := GetNotebookImageName(imageId)
  243. if err != nil {
  244. log.Error("GetNotebookImageName failed: %v", err.Error())
  245. return err
  246. }
  247. createTime := timeutil.TimeStampNow()
  248. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  249. JobName: jobName,
  250. Description: description,
  251. Flavor: flavor,
  252. Duration: autoStopDurationMs,
  253. ImageID: imageId,
  254. PoolID: poolInfos.PoolInfo[0].PoolId,
  255. Feature: models.NotebookFeature,
  256. Volume: models.VolumeReq{
  257. Capacity: setting.Capacity,
  258. Category: models.EVSCategory,
  259. Ownership: models.ManagedOwnership,
  260. },
  261. WorkspaceID: "0",
  262. })
  263. if err != nil {
  264. log.Error("createNotebook2 failed: %v", err.Error())
  265. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  266. log.Info("(%s)unknown error, set temp status", displayJobName)
  267. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  268. JobID: models.TempJobId,
  269. VersionID: models.TempVersionId,
  270. Status: models.TempJobStatus,
  271. Type: models.TypeCloudBrainTwo,
  272. JobName: jobName,
  273. JobType: string(models.JobTypeDebug),
  274. })
  275. if errTemp != nil {
  276. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  277. return errTemp
  278. }
  279. }
  280. return err
  281. }
  282. task := &models.Cloudbrain{
  283. Status: jobResult.Status,
  284. UserID: ctx.User.ID,
  285. RepoID: ctx.Repo.Repository.ID,
  286. JobID: jobResult.ID,
  287. JobName: jobName,
  288. FlavorCode: flavor,
  289. DisplayJobName: displayJobName,
  290. JobType: string(models.JobTypeDebug),
  291. Type: models.TypeCloudBrainTwo,
  292. Uuid: uuid,
  293. ComputeResource: models.NPUResource,
  294. Image: imageName,
  295. Description: description,
  296. CreatedUnix: createTime,
  297. UpdatedUnix: createTime,
  298. }
  299. err = models.CreateCloudbrain(task)
  300. if err != nil {
  301. return err
  302. }
  303. stringId := strconv.FormatInt(task.ID, 10)
  304. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  305. return nil
  306. }
  307. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  308. createTime := timeutil.TimeStampNow()
  309. var jobResult *models.CreateTrainJobResult
  310. var createErr error
  311. if req.EngineID < 0 {
  312. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  313. JobName: req.JobName,
  314. Description: req.Description,
  315. Config: models.UserImageConfig{
  316. WorkServerNum: req.WorkServerNumber,
  317. AppUrl: req.CodeObsPath,
  318. BootFileUrl: req.BootFileUrl,
  319. DataUrl: req.DataUrl,
  320. TrainUrl: req.TrainUrl,
  321. LogUrl: req.LogUrl,
  322. PoolID: req.PoolID,
  323. CreateVersion: true,
  324. Flavor: models.Flavor{
  325. Code: req.FlavorCode,
  326. },
  327. Parameter: req.Parameters,
  328. UserImageUrl: req.UserImageUrl,
  329. UserCommand: req.UserCommand,
  330. },
  331. })
  332. } else {
  333. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  334. JobName: req.JobName,
  335. Description: req.Description,
  336. Config: models.Config{
  337. WorkServerNum: req.WorkServerNumber,
  338. AppUrl: req.CodeObsPath,
  339. BootFileUrl: req.BootFileUrl,
  340. DataUrl: req.DataUrl,
  341. EngineID: req.EngineID,
  342. TrainUrl: req.TrainUrl,
  343. LogUrl: req.LogUrl,
  344. PoolID: req.PoolID,
  345. CreateVersion: true,
  346. Flavor: models.Flavor{
  347. Code: req.FlavorCode,
  348. },
  349. Parameter: req.Parameters,
  350. },
  351. })
  352. }
  353. if createErr != nil {
  354. log.Error("createTrainJob failed: %v", createErr.Error())
  355. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  356. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  357. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  358. JobID: models.TempJobId,
  359. VersionID: models.TempVersionId,
  360. Status: models.TempJobStatus,
  361. Type: models.TypeCloudBrainTwo,
  362. JobName: req.JobName,
  363. JobType: string(models.JobTypeTrain),
  364. })
  365. if errTemp != nil {
  366. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  367. return errTemp
  368. }
  369. }
  370. return createErr
  371. }
  372. jobId := strconv.FormatInt(jobResult.JobID, 10)
  373. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  374. Status: TransTrainJobStatus(jobResult.Status),
  375. UserID: ctx.User.ID,
  376. RepoID: ctx.Repo.Repository.ID,
  377. JobID: jobId,
  378. JobName: req.JobName,
  379. DisplayJobName: req.DisplayJobName,
  380. JobType: string(models.JobTypeTrain),
  381. Type: models.TypeCloudBrainTwo,
  382. VersionID: jobResult.VersionID,
  383. VersionName: jobResult.VersionName,
  384. Uuid: req.Uuid,
  385. DatasetName: req.DatasetName,
  386. CommitID: req.CommitID,
  387. IsLatestVersion: req.IsLatestVersion,
  388. ComputeResource: models.NPUResource,
  389. EngineID: req.EngineID,
  390. TrainUrl: req.TrainUrl,
  391. BranchName: req.BranchName,
  392. Parameters: req.Params,
  393. BootFile: req.BootFile,
  394. DataUrl: req.DataUrl,
  395. LogUrl: req.LogUrl,
  396. FlavorCode: req.FlavorCode,
  397. Description: req.Description,
  398. WorkServerNumber: req.WorkServerNumber,
  399. FlavorName: req.FlavorName,
  400. EngineName: req.EngineName,
  401. VersionCount: req.VersionCount,
  402. TotalVersionCount: req.TotalVersionCount,
  403. CreatedUnix: createTime,
  404. UpdatedUnix: createTime,
  405. })
  406. if createErr != nil {
  407. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  408. return createErr
  409. }
  410. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
  411. return nil
  412. }
  413. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  414. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  415. JobName: req.JobName,
  416. Description: req.Description,
  417. Config: models.UserImageConfig{
  418. WorkServerNum: req.WorkServerNumber,
  419. AppUrl: req.CodeObsPath,
  420. BootFileUrl: req.BootFileUrl,
  421. DataUrl: req.DataUrl,
  422. TrainUrl: req.TrainUrl,
  423. LogUrl: req.LogUrl,
  424. PoolID: req.PoolID,
  425. CreateVersion: true,
  426. Flavor: models.Flavor{
  427. Code: req.FlavorCode,
  428. },
  429. Parameter: req.Parameters,
  430. UserImageUrl: req.UserImageUrl,
  431. UserCommand: req.UserCommand,
  432. },
  433. })
  434. }
  435. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  436. createTime := timeutil.TimeStampNow()
  437. var jobResult *models.CreateTrainJobResult
  438. var createErr error
  439. if req.EngineID < 0 {
  440. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  441. Description: req.Description,
  442. Config: models.TrainJobVersionUserImageConfig{
  443. WorkServerNum: req.WorkServerNumber,
  444. AppUrl: req.CodeObsPath,
  445. BootFileUrl: req.BootFileUrl,
  446. DataUrl: req.DataUrl,
  447. TrainUrl: req.TrainUrl,
  448. LogUrl: req.LogUrl,
  449. PoolID: req.PoolID,
  450. Flavor: models.Flavor{
  451. Code: req.FlavorCode,
  452. },
  453. Parameter: req.Parameters,
  454. PreVersionId: req.PreVersionId,
  455. UserImageUrl: req.UserImageUrl,
  456. UserCommand: req.UserCommand,
  457. },
  458. }, jobId)
  459. } else {
  460. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  461. Description: req.Description,
  462. Config: models.TrainJobVersionConfig{
  463. WorkServerNum: req.WorkServerNumber,
  464. AppUrl: req.CodeObsPath,
  465. BootFileUrl: req.BootFileUrl,
  466. DataUrl: req.DataUrl,
  467. EngineID: req.EngineID,
  468. TrainUrl: req.TrainUrl,
  469. LogUrl: req.LogUrl,
  470. PoolID: req.PoolID,
  471. Flavor: models.Flavor{
  472. Code: req.FlavorCode,
  473. },
  474. Parameter: req.Parameters,
  475. PreVersionId: req.PreVersionId,
  476. },
  477. }, jobId)
  478. }
  479. if createErr != nil {
  480. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  481. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  482. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  483. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  484. JobID: jobId,
  485. VersionID: models.TempVersionId,
  486. Status: models.TempJobStatus,
  487. Type: models.TypeCloudBrainTwo,
  488. JobName: req.JobName,
  489. JobType: string(models.JobTypeTrain),
  490. })
  491. if errTemp != nil {
  492. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  493. return errTemp
  494. }
  495. }
  496. return createErr
  497. }
  498. var jobTypes []string
  499. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  500. repo := ctx.Repo.Repository
  501. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  502. RepoID: repo.ID,
  503. Type: models.TypeCloudBrainTwo,
  504. JobTypes: jobTypes,
  505. JobID: strconv.FormatInt(jobResult.JobID, 10),
  506. })
  507. if createErr != nil {
  508. ctx.ServerError("Cloudbrain", createErr)
  509. return createErr
  510. }
  511. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  512. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  513. Status: TransTrainJobStatus(jobResult.Status),
  514. UserID: ctx.User.ID,
  515. RepoID: ctx.Repo.Repository.ID,
  516. JobID: strconv.FormatInt(jobResult.JobID, 10),
  517. JobName: req.JobName,
  518. DisplayJobName: req.DisplayJobName,
  519. JobType: string(models.JobTypeTrain),
  520. Type: models.TypeCloudBrainTwo,
  521. VersionID: jobResult.VersionID,
  522. VersionName: jobResult.VersionName,
  523. Uuid: req.Uuid,
  524. DatasetName: req.DatasetName,
  525. CommitID: req.CommitID,
  526. IsLatestVersion: req.IsLatestVersion,
  527. PreVersionName: req.PreVersionName,
  528. ComputeResource: models.NPUResource,
  529. EngineID: req.EngineID,
  530. TrainUrl: req.TrainUrl,
  531. BranchName: req.BranchName,
  532. Parameters: req.Params,
  533. BootFile: req.BootFile,
  534. DataUrl: req.DataUrl,
  535. LogUrl: req.LogUrl,
  536. PreVersionId: req.PreVersionId,
  537. FlavorCode: req.FlavorCode,
  538. Description: req.Description,
  539. WorkServerNumber: req.WorkServerNumber,
  540. FlavorName: req.FlavorName,
  541. EngineName: req.EngineName,
  542. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  543. VersionCount: VersionListCount + 1,
  544. CreatedUnix: createTime,
  545. UpdatedUnix: createTime,
  546. })
  547. if createErr != nil {
  548. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  549. return createErr
  550. }
  551. //将训练任务的上一版本的isLatestVersion设置为"0"
  552. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  553. if createErr != nil {
  554. ctx.ServerError("Update IsLatestVersion failed", createErr)
  555. return createErr
  556. }
  557. return createErr
  558. }
  // trainJobStatusNames maps a numeric train-job status code (the index) to
  // its textual name.
  var trainJobStatusNames = [...]string{
  	0:  "UNKNOWN",
  	1:  "INIT",
  	2:  "IMAGE_CREATING",
  	3:  "IMAGE_FAILED",
  	4:  "SUBMIT_TRYING",
  	5:  "SUBMIT_FAILED",
  	6:  "DELETE_FAILED",
  	7:  "WAITING",
  	8:  "RUNNING",
  	9:  "KILLING",
  	10: "COMPLETED",
  	11: "FAILED",
  	12: "KILLED",
  	13: "CANCELED",
  	14: "LOST",
  	15: "SCALING",
  	16: "SUBMIT_MODEL_FAILED",
  	17: "DEPLOY_SERVICE_FAILED",
  	18: "CHECK_INIT",
  	19: "CHECK_RUNNING",
  	20: "CHECK_RUNNING_COMPLETED",
  	21: "CHECK_FAILED",
  }

  // TransTrainJobStatus converts a numeric train-job status into its textual
  // name. Codes outside 0..21 are returned as their decimal representation.
  func TransTrainJobStatus(status int) string {
  	if status >= 0 && status < len(trainJobStatusNames) {
  		return trainJobStatusNames[status]
  	}
  	return strconv.Itoa(status)
  }
  // GetOutputPathByCount formats a version counter as "V" followed by the
  // number zero-padded to at least four characters, e.g. 1 -> "V0001".
  func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  	return fmt.Sprintf("V%04d", TotalVersionCount)
  }
  614. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
  615. createTime := timeutil.TimeStampNow()
  616. jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
  617. JobName: req.JobName,
  618. Description: req.Description,
  619. InfConfig: models.InfConfig{
  620. WorkServerNum: req.WorkServerNumber,
  621. AppUrl: req.CodeObsPath,
  622. BootFileUrl: req.BootFileUrl,
  623. DataUrl: req.DataUrl,
  624. EngineID: req.EngineID,
  625. // TrainUrl: req.TrainUrl,
  626. LogUrl: req.LogUrl,
  627. PoolID: req.PoolID,
  628. CreateVersion: true,
  629. Flavor: models.Flavor{
  630. Code: req.FlavorCode,
  631. },
  632. Parameter: req.Parameters,
  633. },
  634. })
  635. if err != nil {
  636. log.Error("createInferenceJob failed: %v", err.Error())
  637. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  638. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  639. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  640. JobID: models.TempJobId,
  641. VersionID: models.TempVersionId,
  642. Status: models.TempJobStatus,
  643. Type: models.TypeCloudBrainTwo,
  644. JobName: req.JobName,
  645. JobType: string(models.JobTypeInference),
  646. })
  647. if err != nil {
  648. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  649. return err
  650. }
  651. }
  652. return err
  653. }
  654. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  655. // if err != nil {
  656. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  657. // return err
  658. // }
  659. jobID := strconv.FormatInt(jobResult.JobID, 10)
  660. err = models.CreateCloudbrain(&models.Cloudbrain{
  661. Status: TransTrainJobStatus(jobResult.Status),
  662. UserID: ctx.User.ID,
  663. RepoID: ctx.Repo.Repository.ID,
  664. JobID: jobID,
  665. JobName: req.JobName,
  666. DisplayJobName: req.DisplayJobName,
  667. JobType: string(models.JobTypeInference),
  668. Type: models.TypeCloudBrainTwo,
  669. VersionID: jobResult.VersionID,
  670. VersionName: jobResult.VersionName,
  671. Uuid: req.Uuid,
  672. DatasetName: req.DatasetName,
  673. CommitID: req.CommitID,
  674. EngineID: req.EngineID,
  675. TrainUrl: req.TrainUrl,
  676. BranchName: req.BranchName,
  677. Parameters: req.Params,
  678. BootFile: req.BootFile,
  679. DataUrl: req.DataUrl,
  680. LogUrl: req.LogUrl,
  681. FlavorCode: req.FlavorCode,
  682. Description: req.Description,
  683. WorkServerNumber: req.WorkServerNumber,
  684. FlavorName: req.FlavorName,
  685. EngineName: req.EngineName,
  686. LabelName: req.LabelName,
  687. IsLatestVersion: req.IsLatestVersion,
  688. ComputeResource: models.NPUResource,
  689. VersionCount: req.VersionCount,
  690. TotalVersionCount: req.TotalVersionCount,
  691. ModelName: req.ModelName,
  692. ModelVersion: req.ModelVersion,
  693. CkptName: req.CkptName,
  694. ResultUrl: req.ResultUrl,
  695. CreatedUnix: createTime,
  696. UpdatedUnix: createTime,
  697. })
  698. if err != nil {
  699. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  700. return err
  701. }
  702. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  703. return nil
  704. }
  705. func GetNotebookImageName(imageId string) (string, error) {
  706. var validImage = false
  707. var imageName = ""
  708. if ImageInfos == nil {
  709. json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos)
  710. }
  711. for _, imageInfo := range ImageInfos.ImageInfo {
  712. if imageInfo.Id == imageId {
  713. validImage = true
  714. imageName = imageInfo.Value
  715. }
  716. }
  717. if !validImage {
  718. log.Error("the image id(%s) is invalid", imageId)
  719. return imageName, errors.New("the image id is invalid")
  720. }
  721. return imageName, nil
  722. }
  723. func InitSpecialPool() {
  724. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  725. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  726. }
  727. }
  728. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  729. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  730. if err != nil {
  731. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  732. return err
  733. }
  734. if result != nil {
  735. oldStatus := task.Status
  736. task.Status = TransTrainJobStatus(result.IntStatus)
  737. task.Duration = result.Duration / 1000
  738. task.TrainJobDuration = result.TrainJobDuration
  739. if task.StartTime == 0 && result.StartTime > 0 {
  740. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  741. }
  742. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  743. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  744. task.EndTime = task.StartTime.Add(task.Duration)
  745. }
  746. task.CorrectCreateUnix()
  747. if oldStatus != task.Status {
  748. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  749. }
  750. err = models.UpdateJob(task)
  751. if err != nil {
  752. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  753. return err
  754. }
  755. }
  756. return nil
  757. }
  758. func HandleNotebookInfo(task *models.Cloudbrain) error {
  759. result, err := GetNotebook2(task.JobID)
  760. if err != nil {
  761. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  762. return err
  763. }
  764. if result != nil {
  765. oldStatus := task.Status
  766. task.Status = result.Status
  767. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  768. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  769. }
  770. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  771. task.EndTime = timeutil.TimeStampNow()
  772. }
  773. task.CorrectCreateUnix()
  774. task.ComputeAndSetDuration()
  775. if oldStatus != task.Status {
  776. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  777. }
  778. if task.FlavorCode == "" {
  779. task.FlavorCode = result.Flavor
  780. }
  781. err = models.UpdateJob(task)
  782. if err != nil {
  783. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  784. return err
  785. }
  786. }
  787. return nil
  788. }
  789. func SyncTempStatusJob() {
  790. jobs, err := models.GetCloudBrainTempJobs()
  791. if err != nil {
  792. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  793. return
  794. }
  795. for _, temp := range jobs {
  796. log.Info("start to handle record: %s", temp.JobName)
  797. if temp.Type == models.TypeCloudBrainTwo {
  798. if temp.JobType == string(models.JobTypeDebug) {
  799. err = handleNotebook(temp)
  800. if err != nil {
  801. log.Error("handleNotebook falied:%v", err)
  802. break
  803. }
  804. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  805. _, err = models.GetCloudbrainByJobID(temp.JobID)
  806. if err != nil {
  807. //one version
  808. err = handleTrainJob(temp)
  809. if err != nil {
  810. log.Error("handleTrainJob falied:%v", err)
  811. break
  812. }
  813. } else {
  814. //multi version
  815. err = handleTrainJobMultiVersion(temp)
  816. if err != nil {
  817. log.Error("handleTrainJobMultiVersion falied:%v", err)
  818. break
  819. }
  820. }
  821. }
  822. }
  823. }
  824. return
  825. }
  826. func handleNotebook(temp *models.CloudbrainTemp) error {
  827. if temp.Status == models.TempJobStatus {
  828. err := handleTempNotebook(temp)
  829. if err != nil {
  830. log.Error("handleTempNotebook failed:%v", err)
  831. return err
  832. }
  833. } else if temp.Status == string(models.ModelArtsStopping) {
  834. res, err := GetNotebook2(temp.JobID)
  835. if err != nil {
  836. log.Error("GetNotebook2 failed:%v", err)
  837. return err
  838. }
  839. temp.Status = res.Status
  840. if temp.Status == string(models.ModelArtsStopped) {
  841. err = models.UpdateCloudbrainTemp(temp)
  842. if err != nil {
  843. log.Error("UpdateCloudbrainTemp failed:%v", err)
  844. return err
  845. }
  846. _, err := DelNotebook2(temp.JobID)
  847. if err != nil {
  848. log.Error("DelNotebook2 failed:%v", err)
  849. return err
  850. }
  851. temp.Status = string(models.ModelArtsDeleted)
  852. err = models.UpdateCloudbrainTemp(temp)
  853. if err != nil {
  854. log.Error("UpdateCloudbrainTemp failed:%v", err)
  855. return err
  856. }
  857. }
  858. }
  859. return nil
  860. }
  861. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  862. var err error
  863. var isExist bool
  864. for {
  865. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  866. if err != nil {
  867. log.Error("GetNotebookList failed:%v", err)
  868. break
  869. }
  870. temp.QueryTimes++
  871. err = models.UpdateCloudbrainTemp(temp)
  872. if err != nil {
  873. log.Error("UpdateCloudbrainTemp failed:%v", err)
  874. }
  875. if result != nil {
  876. for _, notebook := range result.NotebookList {
  877. if temp.JobID == models.TempJobId {
  878. //new notebook
  879. if notebook.JobName == temp.JobName {
  880. isExist = true
  881. temp.Status = notebook.Status
  882. temp.JobID = notebook.JobID
  883. break
  884. }
  885. } else {
  886. //restart: always can find one record
  887. if notebook.JobName == temp.JobName {
  888. if notebook.Status != string(models.ModelArtsStopped) {
  889. isExist = true
  890. temp.Status = notebook.Status
  891. temp.JobID = notebook.JobID
  892. break
  893. }
  894. }
  895. }
  896. }
  897. if isExist {
  898. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  899. if temp.Status == string(models.ModelArtsCreateFailed) {
  900. err = models.UpdateCloudbrainTemp(temp)
  901. if err != nil {
  902. log.Error("UpdateCloudbrainTemp failed:%v", err)
  903. break
  904. }
  905. _, err := DelNotebook2(temp.JobID)
  906. if err != nil {
  907. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  908. break
  909. }
  910. temp.Status = string(models.ModelArtsDeleted)
  911. } else {
  912. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  913. if err != nil {
  914. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  915. break
  916. }
  917. temp.Status = string(models.ModelArtsStopping)
  918. }
  919. models.UpdateCloudbrainTemp(temp)
  920. } else {
  921. log.Error("can not find the record(%s) till now", temp.JobName)
  922. err = errors.New("not found")
  923. break
  924. }
  925. } else {
  926. log.Error("can not find the record(%s) till now", temp.JobName)
  927. err = errors.New("not found")
  928. break
  929. }
  930. break
  931. }
  932. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  933. log.Info("reach MaxTempQueryTimes, set the job failed")
  934. temp.Status = string(models.ModelArtsTrainJobFailed)
  935. err = models.UpdateCloudbrainTemp(temp)
  936. if err != nil {
  937. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  938. return err
  939. }
  940. }
  941. return err
  942. }
  943. func handleTrainJob(temp *models.CloudbrainTemp) error {
  944. if temp.Status == models.TempJobStatus {
  945. err := handleTempTrainJob(temp)
  946. if err != nil {
  947. log.Error("handleTempTrainJob failed:%v", err)
  948. return err
  949. }
  950. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  951. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  952. if err != nil {
  953. log.Error("GetTrainJob failed:%v", err)
  954. return err
  955. }
  956. temp.Status = TransTrainJobStatus(res.IntStatus)
  957. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  958. err = models.UpdateCloudbrainTemp(temp)
  959. if err != nil {
  960. log.Error("UpdateCloudbrainTemp failed:%v", err)
  961. return err
  962. }
  963. _, err := DelTrainJob(temp.JobID)
  964. if err != nil {
  965. log.Error("DelTrainJob failed:%v", err)
  966. return err
  967. }
  968. temp.Status = string(models.ModelArtsDeleted)
  969. err = models.UpdateCloudbrainTemp(temp)
  970. if err != nil {
  971. log.Error("UpdateCloudbrainTemp failed:%v", err)
  972. return err
  973. }
  974. }
  975. }
  976. return nil
  977. }
  978. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  979. if temp.Status == models.TempJobStatus {
  980. err := handleTempTrainJobMultiVersion(temp)
  981. if err != nil {
  982. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  983. return err
  984. }
  985. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  986. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  987. if err != nil {
  988. log.Error("GetTrainJob failed:%v", err)
  989. return err
  990. }
  991. temp.Status = TransTrainJobStatus(res.IntStatus)
  992. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  993. err = models.UpdateCloudbrainTemp(temp)
  994. if err != nil {
  995. log.Error("UpdateCloudbrainTemp failed:%v", err)
  996. return err
  997. }
  998. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  999. if err != nil {
  1000. log.Error("DelTrainJob failed:%v", err)
  1001. return err
  1002. }
  1003. temp.Status = string(models.ModelArtsDeleted)
  1004. err = models.UpdateCloudbrainTemp(temp)
  1005. if err != nil {
  1006. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1007. return err
  1008. }
  1009. }
  1010. }
  1011. return nil
  1012. }
  1013. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1014. var err error
  1015. var isExist bool
  1016. for {
  1017. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1018. if err != nil {
  1019. log.Error("GetTrainJobVersionList failed:%v", err)
  1020. break
  1021. }
  1022. temp.QueryTimes++
  1023. err = models.UpdateCloudbrainTemp(temp)
  1024. if err != nil {
  1025. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1026. }
  1027. if result != nil {
  1028. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1029. if result.VersionCount == int64(count+1) {
  1030. isExist = true
  1031. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1032. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1033. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1034. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1035. if err != nil {
  1036. log.Error("StopTrainJob failed:%v", err)
  1037. break
  1038. }
  1039. temp.Status = string(models.ModelArtsTrainJobKilling)
  1040. err = models.UpdateCloudbrainTemp(temp)
  1041. if err != nil {
  1042. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1043. break
  1044. }
  1045. } else {
  1046. log.Error("can not find the record(%s) till now", temp.JobName)
  1047. err = errors.New("not found")
  1048. break
  1049. }
  1050. }
  1051. break
  1052. }
  1053. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1054. log.Info("reach MaxTempQueryTimes, set the job failed")
  1055. temp.Status = string(models.ModelArtsTrainJobFailed)
  1056. err = models.UpdateCloudbrainTemp(temp)
  1057. if err != nil {
  1058. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1059. return err
  1060. }
  1061. }
  1062. return err
  1063. }
  1064. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1065. var err error
  1066. var isExist bool
  1067. for {
  1068. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1069. if err != nil {
  1070. log.Error("GetTrainJobList failed:%v", err)
  1071. break
  1072. }
  1073. temp.QueryTimes++
  1074. err = models.UpdateCloudbrainTemp(temp)
  1075. if err != nil {
  1076. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1077. }
  1078. if result != nil {
  1079. for _, job := range result.JobList {
  1080. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1081. isExist = true
  1082. temp.Status = TransTrainJobStatus(job.IntStatus)
  1083. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1084. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1085. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1086. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1087. if err != nil {
  1088. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1089. break
  1090. }
  1091. temp.Status = string(models.ModelArtsTrainJobKilling)
  1092. err = models.UpdateCloudbrainTemp(temp)
  1093. if err != nil {
  1094. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1095. break
  1096. }
  1097. }
  1098. }
  1099. if !isExist {
  1100. log.Error("can not find the record(%s) till now", temp.JobName)
  1101. err = errors.New("not found")
  1102. break
  1103. }
  1104. }
  1105. break
  1106. }
  1107. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1108. log.Info("reach MaxTempQueryTimes, set the job failed")
  1109. temp.Status = string(models.ModelArtsTrainJobFailed)
  1110. err = models.UpdateCloudbrainTemp(temp)
  1111. if err != nil {
  1112. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1113. return err
  1114. }
  1115. }
  1116. return err
  1117. }