You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 35 kB

4 years ago
3 years ago
3 years ago
2 years ago
2 years ago
4 years ago
2 years ago
4 years ago
4 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
4 years ago
2 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
4 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago

  1. package modelarts
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "strconv"
  7. "strings"
  8. "code.gitea.io/gitea/modules/cloudbrain"
  9. "code.gitea.io/gitea/modules/modelarts_cd"
  10. "code.gitea.io/gitea/models"
  11. "code.gitea.io/gitea/modules/context"
  12. "code.gitea.io/gitea/modules/log"
  13. "code.gitea.io/gitea/modules/notification"
  14. "code.gitea.io/gitea/modules/setting"
  15. "code.gitea.io/gitea/modules/timeutil"
  16. )
// Package-level constants used when building ModelArts notebook, train and
// inference jobs.
const (
	// notebook
	storageTypeOBS = "obs"
	// autoStopDuration is 4*60*60; presumably a 4-hour window in seconds — TODO confirm the unit against callers.
	autoStopDuration = 4 * 60 * 60
	// AutoStopDurationMs is 4*60*60*1000, i.e. the same 4-hour span in milliseconds.
	AutoStopDurationMs = 4 * 60 * 60 * 1000

	// Path segments (with leading and trailing slashes) for job artifacts.
	CodePath = "/code/"
	OutputPath = "/output/"
	ResultPath = "/result/"
	LogPath = "/log/"
	JobPath = "/job/"

	OrderDesc = "desc" // query downward
	OrderAsc = "asc" // query upward
	Lines = 500

	// Parameter label names.
	TrainUrl = "train_url"
	DataUrl = "data_url"
	MultiDataUrl = "multi_data_url"
	ResultUrl = "result_url"
	CkptUrl = "ckpt_url"
	DeviceTarget = "device_target"
	Ascend = "Ascend"

	PerPage = 10
	// IsLatestVersion / NotLatestVersion are the string flags stored in a
	// task's IsLatestVersion field.
	IsLatestVersion = "1"
	NotLatestVersion = "0"
	VersionCountOne = 1
	SortByCreateTime = "create_time"
	ConfigTypeCustom = "custom"
	TotalVersionCount = 1
)
// Package-level configuration caches. poolInfos, SpecialPools and
// MultiNodeConfig are filled lazily from JSON settings by functions in this
// file (GenerateNotebook2, InitSpecialPool, InitMultiNode).
var (
	// poolInfos is decoded from setting.PoolInfos on first use.
	poolInfos *models.PoolInfos
	// TrainFlavorInfos is not assigned in the visible part of this file.
	TrainFlavorInfos *Flavor
	// SpecialPools is decoded from setting.ModelArtsSpecialPools.
	SpecialPools *models.SpecialPools
	// MultiNodeConfig is decoded from setting.ModelArtsMultiNode.
	MultiNodeConfig *MultiNodes
)
// GenerateTrainJobReq carries the inputs consumed by GenerateTrainJob,
// GenerateTrainJobVersion and GenerateModelConvertTrainJob.
type GenerateTrainJobReq struct {
	JobName string
	DisplayJobName string
	Uuid string
	Description string
	// CodeObsPath is passed to the backend as the job's AppUrl.
	CodeObsPath string
	BootFile string
	BootFileUrl string
	DataUrl string
	TrainUrl string
	LogUrl string
	PoolID string
	WorkServerNumber int
	// EngineID < 0 selects the user-image code path in GenerateTrainJob and
	// GenerateTrainJobVersion.
	EngineID int64
	// Parameters is sent to the backend; Params is the value stored on the
	// Cloudbrain record's Parameters field.
	Parameters []models.Parameter
	CommitID string
	IsLatestVersion string
	Params string
	BranchName string
	PreVersionId int64
	PreVersionName string
	// FlavorCode is read only by GenerateModelConvertTrainJob; the other
	// entry points take the flavor code from Spec.SourceSpecId.
	FlavorCode string
	FlavorName string
	VersionCount int
	EngineName string
	TotalVersionCount int
	UserImageUrl string
	UserCommand string
	DatasetName string
	// Spec is dereferenced without a nil check by the train-job entry points.
	Spec *models.Specification
	ModelName string
	LabelName string
	CkptName string
	ModelVersion string
	PreTrainModelUrl string
}
// GenerateInferenceJobReq carries the inputs consumed by GenerateInferenceJob.
type GenerateInferenceJobReq struct {
	JobName string
	DisplayJobName string
	Uuid string
	Description string
	// CodeObsPath is passed to the backend as the job's AppUrl.
	CodeObsPath string
	BootFile string
	BootFileUrl string
	DataUrl string
	// TrainUrl is stored on the Cloudbrain record but is not sent in the
	// backend create request (that field is commented out in GenerateInferenceJob).
	TrainUrl string
	LogUrl string
	PoolID string
	WorkServerNumber int
	// EngineID < 0 selects the user-image code path.
	EngineID int64
	// Parameters is sent to the backend; Params is stored on the record.
	Parameters []models.Parameter
	CommitID string
	Params string
	BranchName string
	FlavorName string
	EngineName string
	LabelName string
	IsLatestVersion string
	VersionCount int
	TotalVersionCount int
	ModelName string
	ModelVersion string
	CkptName string
	ResultUrl string
	// Spec is dereferenced without a nil check by GenerateInferenceJob.
	Spec *models.Specification
	DatasetName string
	// JobType is stored as-is; the value string(models.JobTypeModelSafety)
	// changes which notification GenerateInferenceJob emits.
	JobType string
	UserImageUrl string
	UserCommand string
}
// VersionInfo is the JSON shape {"version": [{"id", "value", "url"}, ...]}.
type VersionInfo struct {
	Version []struct {
		ID int `json:"id"`
		Value string `json:"value"`
		Url string `json:"url"`
	} `json:"version"`
}
// Flavor is the JSON shape {"flavor": [{"code", "value", "unitPrice"}, ...]}.
type Flavor struct {
	Info []struct {
		Code string `json:"code"`
		Value string `json:"value"`
		UnitPrice int64 `json:"unitPrice"`
	} `json:"flavor"`
}
// Engine is the JSON shape {"engine": [{"id", "value"}, ...]}.
type Engine struct {
	Info []struct {
		ID int `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}
// ResourcePool is the JSON shape {"resource_pool": [{"id", "value"}, ...]}.
type ResourcePool struct {
	Info []struct {
		ID string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
// MultiNodes is the JSON shape {"multinode": [OrgMultiNode, ...]}; it is the
// type of MultiNodeConfig.
type MultiNodes struct {
	Info []OrgMultiNode `json:"multinode"`
}
// OrgMultiNode is one entry of MultiNodes: an "org" string paired with a list
// of integers under "node" (presumably the node counts allowed for that
// organisation — TODO confirm against the code that reads MultiNodeConfig).
type OrgMultiNode struct {
	Org string `json:"org"`
	Node []int `json:"node"`
}
// Parameters is the JSON shape {"parameter": [{"label", "value"}, ...]}.
type Parameters struct {
	Parameter []struct {
		Label string `json:"label"`
		Value string `json:"value"`
	} `json:"parameter"`
}
  160. func GenerateNotebook2(ctx *context.Context, req cloudbrain.GenerateModelArtsNotebookReq) (string, error) {
  161. if poolInfos == nil {
  162. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  163. }
  164. imageName, err := GetNotebookImageName(req.ImageId)
  165. if err != nil {
  166. log.Error("GetNotebookImageName failed: %v", err.Error())
  167. return "", err
  168. }
  169. createTime := timeutil.TimeStampNow()
  170. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  171. JobName: req.JobName,
  172. Description: req.Description,
  173. Flavor: req.Spec.SourceSpecId,
  174. Duration: req.AutoStopDurationMs,
  175. ImageID: req.ImageId,
  176. PoolID: poolInfos.PoolInfo[0].PoolId,
  177. Feature: models.NotebookFeature,
  178. Volume: models.VolumeReq{
  179. Capacity: setting.Capacity,
  180. Category: models.EVSCategory,
  181. Ownership: models.ManagedOwnership,
  182. },
  183. WorkspaceID: "0",
  184. })
  185. if err != nil {
  186. log.Error("createNotebook2 failed: %v", err.Error())
  187. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  188. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  189. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  190. JobID: models.TempJobId,
  191. VersionID: models.TempVersionId,
  192. Status: models.TempJobStatus,
  193. Type: models.TypeCloudBrainTwo,
  194. JobName: req.JobName,
  195. JobType: string(models.JobTypeDebug),
  196. })
  197. if errTemp != nil {
  198. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  199. return "", errTemp
  200. }
  201. }
  202. return "", err
  203. }
  204. task := &models.Cloudbrain{
  205. Status: jobResult.Status,
  206. UserID: ctx.User.ID,
  207. RepoID: ctx.Repo.Repository.ID,
  208. JobID: jobResult.ID,
  209. JobName: req.JobName,
  210. FlavorCode: req.Spec.SourceSpecId,
  211. DisplayJobName: req.DisplayJobName,
  212. JobType: string(models.JobTypeDebug),
  213. Type: models.TypeCloudBrainTwo,
  214. Uuid: req.Uuid,
  215. ComputeResource: models.NPUResource,
  216. Image: imageName,
  217. BootFile: req.BootFile,
  218. BranchName: req.BranchName,
  219. Description: req.Description,
  220. CreatedUnix: createTime,
  221. UpdatedUnix: createTime,
  222. Spec: req.Spec,
  223. ModelName: req.ModelName,
  224. ModelVersion: req.ModelVersion,
  225. LabelName: req.LabelName,
  226. PreTrainModelUrl: req.PreTrainModelUrl,
  227. CkptName: req.CkptName,
  228. }
  229. err = models.CreateCloudbrain(task)
  230. if err != nil {
  231. return "", err
  232. }
  233. stringId := strconv.FormatInt(task.ID, 10)
  234. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugNPUTask)
  235. return jobResult.ID, nil
  236. }
  237. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  238. createTime := timeutil.TimeStampNow()
  239. var jobResult *models.CreateTrainJobResult
  240. var createErr error
  241. if req.EngineID < 0 {
  242. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  243. JobName: req.JobName,
  244. Description: req.Description,
  245. Config: models.UserImageConfig{
  246. WorkServerNum: req.WorkServerNumber,
  247. AppUrl: req.CodeObsPath,
  248. BootFileUrl: req.BootFileUrl,
  249. DataUrl: req.DataUrl,
  250. TrainUrl: req.TrainUrl,
  251. LogUrl: req.LogUrl,
  252. PoolID: req.PoolID,
  253. CreateVersion: true,
  254. Flavor: models.Flavor{
  255. Code: req.Spec.SourceSpecId,
  256. },
  257. Parameter: req.Parameters,
  258. UserImageUrl: req.UserImageUrl,
  259. UserCommand: req.UserCommand,
  260. ShareAddr: setting.ModelArtsShareAddr,
  261. MountPath: setting.ModelArtsMountPath,
  262. NasType: setting.ModelArtsNasType,
  263. },
  264. })
  265. } else {
  266. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  267. JobName: req.JobName,
  268. Description: req.Description,
  269. Config: models.Config{
  270. WorkServerNum: req.WorkServerNumber,
  271. AppUrl: req.CodeObsPath,
  272. BootFileUrl: req.BootFileUrl,
  273. DataUrl: req.DataUrl,
  274. EngineID: req.EngineID,
  275. TrainUrl: req.TrainUrl,
  276. LogUrl: req.LogUrl,
  277. PoolID: req.PoolID,
  278. CreateVersion: true,
  279. Flavor: models.Flavor{
  280. Code: req.Spec.SourceSpecId,
  281. },
  282. Parameter: req.Parameters,
  283. ShareAddr: setting.ModelArtsShareAddr,
  284. MountPath: setting.ModelArtsMountPath,
  285. NasType: setting.ModelArtsNasType,
  286. },
  287. })
  288. }
  289. if createErr != nil {
  290. log.Error("createTrainJob failed: %v", createErr.Error())
  291. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  292. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  293. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  294. JobID: models.TempJobId,
  295. VersionID: models.TempVersionId,
  296. Status: models.TempJobStatus,
  297. Type: models.TypeCloudBrainTwo,
  298. JobName: req.JobName,
  299. JobType: string(models.JobTypeTrain),
  300. })
  301. if errTemp != nil {
  302. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  303. return "", errTemp
  304. }
  305. }
  306. return "", createErr
  307. }
  308. jobID := strconv.FormatInt(jobResult.JobID, 10)
  309. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  310. Status: TransTrainJobStatus(jobResult.Status),
  311. UserID: ctx.User.ID,
  312. RepoID: ctx.Repo.Repository.ID,
  313. JobID: jobID,
  314. JobName: req.JobName,
  315. DisplayJobName: req.DisplayJobName,
  316. JobType: string(models.JobTypeTrain),
  317. Type: models.TypeCloudBrainTwo,
  318. VersionID: jobResult.VersionID,
  319. VersionName: jobResult.VersionName,
  320. Uuid: req.Uuid,
  321. DatasetName: req.DatasetName,
  322. CommitID: req.CommitID,
  323. IsLatestVersion: req.IsLatestVersion,
  324. ComputeResource: models.NPUResource,
  325. EngineID: req.EngineID,
  326. TrainUrl: req.TrainUrl,
  327. BranchName: req.BranchName,
  328. Parameters: req.Params,
  329. BootFile: req.BootFile,
  330. DataUrl: req.DataUrl,
  331. LogUrl: req.LogUrl,
  332. FlavorCode: req.Spec.SourceSpecId,
  333. Description: req.Description,
  334. WorkServerNumber: req.WorkServerNumber,
  335. FlavorName: req.FlavorName,
  336. EngineName: req.EngineName,
  337. VersionCount: req.VersionCount,
  338. TotalVersionCount: req.TotalVersionCount,
  339. CreatedUnix: createTime,
  340. UpdatedUnix: createTime,
  341. Spec: req.Spec,
  342. ModelName: req.ModelName,
  343. ModelVersion: req.ModelVersion,
  344. LabelName: req.LabelName,
  345. PreTrainModelUrl: req.PreTrainModelUrl,
  346. CkptName: req.CkptName,
  347. })
  348. if createErr != nil {
  349. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  350. return "", createErr
  351. }
  352. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateTrainTask)
  353. return jobID, nil
  354. }
  355. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  356. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  357. JobName: req.JobName,
  358. Description: req.Description,
  359. Config: models.UserImageConfig{
  360. WorkServerNum: req.WorkServerNumber,
  361. AppUrl: req.CodeObsPath,
  362. BootFileUrl: req.BootFileUrl,
  363. DataUrl: req.DataUrl,
  364. TrainUrl: req.TrainUrl,
  365. LogUrl: req.LogUrl,
  366. PoolID: req.PoolID,
  367. CreateVersion: true,
  368. Flavor: models.Flavor{
  369. Code: req.FlavorCode,
  370. },
  371. Parameter: req.Parameters,
  372. UserImageUrl: req.UserImageUrl,
  373. UserCommand: req.UserCommand,
  374. },
  375. })
  376. }
  377. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  378. createTime := timeutil.TimeStampNow()
  379. var jobResult *models.CreateTrainJobResult
  380. var createErr error
  381. if req.EngineID < 0 {
  382. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  383. Description: req.Description,
  384. Config: models.TrainJobVersionUserImageConfig{
  385. WorkServerNum: req.WorkServerNumber,
  386. AppUrl: req.CodeObsPath,
  387. BootFileUrl: req.BootFileUrl,
  388. DataUrl: req.DataUrl,
  389. TrainUrl: req.TrainUrl,
  390. LogUrl: req.LogUrl,
  391. PoolID: req.PoolID,
  392. Flavor: models.Flavor{
  393. Code: req.Spec.SourceSpecId,
  394. },
  395. Parameter: req.Parameters,
  396. PreVersionId: req.PreVersionId,
  397. UserImageUrl: req.UserImageUrl,
  398. UserCommand: req.UserCommand,
  399. ShareAddr: setting.ModelArtsShareAddr,
  400. MountPath: setting.ModelArtsMountPath,
  401. NasType: setting.ModelArtsNasType,
  402. },
  403. }, jobId)
  404. } else {
  405. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  406. Description: req.Description,
  407. Config: models.TrainJobVersionConfig{
  408. WorkServerNum: req.WorkServerNumber,
  409. AppUrl: req.CodeObsPath,
  410. BootFileUrl: req.BootFileUrl,
  411. DataUrl: req.DataUrl,
  412. EngineID: req.EngineID,
  413. TrainUrl: req.TrainUrl,
  414. LogUrl: req.LogUrl,
  415. PoolID: req.PoolID,
  416. Flavor: models.Flavor{
  417. Code: req.Spec.SourceSpecId,
  418. },
  419. Parameter: req.Parameters,
  420. PreVersionId: req.PreVersionId,
  421. ShareAddr: setting.ModelArtsShareAddr,
  422. MountPath: setting.ModelArtsMountPath,
  423. NasType: setting.ModelArtsNasType,
  424. },
  425. }, jobId)
  426. }
  427. if createErr != nil {
  428. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  429. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  430. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  431. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  432. JobID: jobId,
  433. VersionID: models.TempVersionId,
  434. Status: models.TempJobStatus,
  435. Type: models.TypeCloudBrainTwo,
  436. JobName: req.JobName,
  437. JobType: string(models.JobTypeTrain),
  438. })
  439. if errTemp != nil {
  440. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  441. return errTemp
  442. }
  443. }
  444. return createErr
  445. }
  446. var jobTypes []string
  447. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  448. repo := ctx.Repo.Repository
  449. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  450. RepoID: repo.ID,
  451. Type: models.TypeCloudBrainTwo,
  452. JobTypes: jobTypes,
  453. JobID: strconv.FormatInt(jobResult.JobID, 10),
  454. })
  455. if createErr != nil {
  456. ctx.ServerError("Cloudbrain", createErr)
  457. return createErr
  458. }
  459. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  460. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  461. Status: TransTrainJobStatus(jobResult.Status),
  462. UserID: ctx.User.ID,
  463. RepoID: ctx.Repo.Repository.ID,
  464. JobID: strconv.FormatInt(jobResult.JobID, 10),
  465. JobName: req.JobName,
  466. DisplayJobName: req.DisplayJobName,
  467. JobType: string(models.JobTypeTrain),
  468. Type: models.TypeCloudBrainTwo,
  469. VersionID: jobResult.VersionID,
  470. VersionName: jobResult.VersionName,
  471. Uuid: req.Uuid,
  472. DatasetName: req.DatasetName,
  473. CommitID: req.CommitID,
  474. IsLatestVersion: req.IsLatestVersion,
  475. PreVersionName: req.PreVersionName,
  476. ComputeResource: models.NPUResource,
  477. EngineID: req.EngineID,
  478. TrainUrl: req.TrainUrl,
  479. BranchName: req.BranchName,
  480. Parameters: req.Params,
  481. BootFile: req.BootFile,
  482. DataUrl: req.DataUrl,
  483. LogUrl: req.LogUrl,
  484. PreVersionId: req.PreVersionId,
  485. FlavorCode: req.Spec.SourceSpecId,
  486. Description: req.Description,
  487. WorkServerNumber: req.WorkServerNumber,
  488. FlavorName: req.FlavorName,
  489. EngineName: req.EngineName,
  490. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  491. VersionCount: VersionListCount + 1,
  492. CreatedUnix: createTime,
  493. UpdatedUnix: createTime,
  494. Spec: req.Spec,
  495. ModelName: req.ModelName,
  496. ModelVersion: req.ModelVersion,
  497. LabelName: req.LabelName,
  498. PreTrainModelUrl: req.PreTrainModelUrl,
  499. CkptName: req.CkptName,
  500. })
  501. if createErr != nil {
  502. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  503. return createErr
  504. }
  505. //将训练任务的上一版本的isLatestVersion设置为"0"
  506. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  507. if createErr != nil {
  508. ctx.ServerError("Update IsLatestVersion failed", createErr)
  509. return createErr
  510. }
  511. return createErr
  512. }
  513. func TransTrainJobStatus(status int) string {
  514. switch status {
  515. case 0:
  516. return "UNKNOWN"
  517. case 1:
  518. return "INIT"
  519. case 2:
  520. return "IMAGE_CREATING"
  521. case 3:
  522. return "IMAGE_FAILED"
  523. case 4:
  524. return "SUBMIT_TRYING"
  525. case 5:
  526. return "SUBMIT_FAILED"
  527. case 6:
  528. return "DELETE_FAILED"
  529. case 7:
  530. return "WAITING"
  531. case 8:
  532. return "RUNNING"
  533. case 9:
  534. return "KILLING"
  535. case 10:
  536. return "COMPLETED"
  537. case 11:
  538. return "FAILED"
  539. case 12:
  540. return "KILLED"
  541. case 13:
  542. return "CANCELED"
  543. case 14:
  544. return "LOST"
  545. case 15:
  546. return "SCALING"
  547. case 16:
  548. return "SUBMIT_MODEL_FAILED"
  549. case 17:
  550. return "DEPLOY_SERVICE_FAILED"
  551. case 18:
  552. return "CHECK_INIT"
  553. case 19:
  554. return "CHECK_RUNNING"
  555. case 20:
  556. return "CHECK_RUNNING_COMPLETED"
  557. case 21:
  558. return "CHECK_FAILED"
  559. default:
  560. return strconv.Itoa(status)
  561. }
  562. }
  563. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  564. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  565. VersionOutputPath = "V" + talVersionCountToString
  566. return VersionOutputPath
  567. }
  568. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (jobId string, err error) {
  569. createTime := timeutil.TimeStampNow()
  570. var jobResult *models.CreateTrainJobResult
  571. var createErr error
  572. if req.EngineID < 0 {
  573. jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
  574. JobName: req.JobName,
  575. Description: req.Description,
  576. Config: models.InfUserImageConfig{
  577. WorkServerNum: req.WorkServerNumber,
  578. AppUrl: req.CodeObsPath,
  579. BootFileUrl: req.BootFileUrl,
  580. DataUrl: req.DataUrl,
  581. // TrainUrl: req.TrainUrl,
  582. LogUrl: req.LogUrl,
  583. PoolID: req.PoolID,
  584. CreateVersion: true,
  585. Flavor: models.Flavor{
  586. Code: req.Spec.SourceSpecId,
  587. },
  588. Parameter: req.Parameters,
  589. UserImageUrl: req.UserImageUrl,
  590. UserCommand: req.UserCommand,
  591. },
  592. })
  593. } else {
  594. jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
  595. JobName: req.JobName,
  596. Description: req.Description,
  597. InfConfig: models.InfConfig{
  598. WorkServerNum: req.WorkServerNumber,
  599. AppUrl: req.CodeObsPath,
  600. BootFileUrl: req.BootFileUrl,
  601. DataUrl: req.DataUrl,
  602. EngineID: req.EngineID,
  603. // TrainUrl: req.TrainUrl,
  604. LogUrl: req.LogUrl,
  605. PoolID: req.PoolID,
  606. CreateVersion: true,
  607. Flavor: models.Flavor{
  608. Code: req.Spec.SourceSpecId,
  609. },
  610. Parameter: req.Parameters,
  611. },
  612. })
  613. }
  614. if createErr != nil {
  615. log.Error("createInferenceJob failed: %v", err.Error())
  616. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  617. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  618. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  619. JobID: models.TempJobId,
  620. VersionID: models.TempVersionId,
  621. Status: models.TempJobStatus,
  622. Type: models.TypeCloudBrainTwo,
  623. JobName: req.JobName,
  624. JobType: req.JobType,
  625. })
  626. if err != nil {
  627. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  628. return "", err
  629. }
  630. }
  631. return "", err
  632. }
  633. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  634. // if err != nil {
  635. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  636. // return err
  637. // }
  638. jobID := strconv.FormatInt(jobResult.JobID, 10)
  639. err = models.CreateCloudbrain(&models.Cloudbrain{
  640. Status: TransTrainJobStatus(jobResult.Status),
  641. UserID: ctx.User.ID,
  642. RepoID: ctx.Repo.Repository.ID,
  643. JobID: jobID,
  644. JobName: req.JobName,
  645. DisplayJobName: req.DisplayJobName,
  646. JobType: req.JobType,
  647. Type: models.TypeCloudBrainTwo,
  648. VersionID: jobResult.VersionID,
  649. VersionName: jobResult.VersionName,
  650. Uuid: req.Uuid,
  651. DatasetName: req.DatasetName,
  652. CommitID: req.CommitID,
  653. EngineID: req.EngineID,
  654. TrainUrl: req.TrainUrl,
  655. BranchName: req.BranchName,
  656. Parameters: req.Params,
  657. BootFile: req.BootFile,
  658. DataUrl: req.DataUrl,
  659. LogUrl: req.LogUrl,
  660. FlavorCode: req.Spec.SourceSpecId,
  661. Description: req.Description,
  662. WorkServerNumber: req.WorkServerNumber,
  663. FlavorName: req.FlavorName,
  664. EngineName: req.EngineName,
  665. LabelName: req.LabelName,
  666. IsLatestVersion: req.IsLatestVersion,
  667. ComputeResource: models.NPUResource,
  668. VersionCount: req.VersionCount,
  669. TotalVersionCount: req.TotalVersionCount,
  670. ModelName: req.ModelName,
  671. ModelVersion: req.ModelVersion,
  672. CkptName: req.CkptName,
  673. ResultUrl: req.ResultUrl,
  674. CreatedUnix: createTime,
  675. UpdatedUnix: createTime,
  676. Spec: req.Spec,
  677. })
  678. if err != nil {
  679. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  680. return "", err
  681. }
  682. if req.JobType == string(models.JobTypeModelSafety) {
  683. task, err := models.GetCloudbrainByJobID(jobID)
  684. if err == nil {
  685. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
  686. }
  687. } else {
  688. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  689. }
  690. return jobID, nil
  691. }
  692. func GetNotebookImageName(imageId string) (string, error) {
  693. var validImage = false
  694. var imageName = ""
  695. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  696. if imageInfo.Id == imageId {
  697. validImage = true
  698. imageName = imageInfo.Value
  699. }
  700. }
  701. if !validImage {
  702. log.Error("the image id(%s) is invalid", imageId)
  703. return imageName, errors.New("the image id is invalid")
  704. }
  705. return imageName, nil
  706. }
  707. func InitSpecialPool() {
  708. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  709. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  710. }
  711. }
  712. func InitMultiNode() {
  713. if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" {
  714. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  715. }
  716. }
  717. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  718. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  719. if err != nil {
  720. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  721. return err
  722. }
  723. if result != nil {
  724. oldStatus := task.Status
  725. task.Status = TransTrainJobStatus(result.IntStatus)
  726. task.Duration = result.Duration / 1000
  727. task.TrainJobDuration = result.TrainJobDuration
  728. if task.StartTime == 0 && result.StartTime > 0 {
  729. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  730. }
  731. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  732. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  733. task.EndTime = task.StartTime.Add(task.Duration)
  734. }
  735. task.CorrectCreateUnix()
  736. if oldStatus != task.Status {
  737. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  738. }
  739. err = models.UpdateJob(task)
  740. if err != nil {
  741. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  742. return err
  743. }
  744. }
  745. return nil
  746. }
  747. func HandleNotebookInfo(task *models.Cloudbrain) error {
  748. var result *models.GetNotebook2Result
  749. var err error
  750. if task.Type == models.TypeCloudBrainTwo {
  751. result, err = GetNotebook2(task.JobID)
  752. } else if task.Type == models.TypeCDCenter {
  753. result, err = modelarts_cd.GetNotebook(task.JobID)
  754. }
  755. if err != nil {
  756. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  757. return err
  758. }
  759. if result != nil {
  760. oldStatus := task.Status
  761. task.Status = result.Status
  762. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  763. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  764. }
  765. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  766. task.EndTime = timeutil.TimeStampNow()
  767. }
  768. task.CorrectCreateUnix()
  769. task.ComputeAndSetDuration()
  770. if oldStatus != task.Status {
  771. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  772. }
  773. if task.FlavorCode == "" {
  774. task.FlavorCode = result.Flavor
  775. }
  776. err = models.UpdateJob(task)
  777. if err != nil {
  778. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  779. return err
  780. }
  781. }
  782. return nil
  783. }
  784. func SyncTempStatusJob() {
  785. jobs, err := models.GetCloudBrainTempJobs()
  786. if err != nil {
  787. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  788. return
  789. }
  790. for _, temp := range jobs {
  791. log.Info("start to handle record: %s", temp.JobName)
  792. if temp.Type == models.TypeCloudBrainTwo {
  793. if temp.JobType == string(models.JobTypeDebug) {
  794. err = handleNotebook(temp)
  795. if err != nil {
  796. log.Error("handleNotebook falied:%v", err)
  797. break
  798. }
  799. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  800. _, err = models.GetCloudbrainByJobID(temp.JobID)
  801. if err != nil {
  802. //one version
  803. err = handleTrainJob(temp)
  804. if err != nil {
  805. log.Error("handleTrainJob falied:%v", err)
  806. break
  807. }
  808. } else {
  809. //multi version
  810. err = handleTrainJobMultiVersion(temp)
  811. if err != nil {
  812. log.Error("handleTrainJobMultiVersion falied:%v", err)
  813. break
  814. }
  815. }
  816. }
  817. }
  818. }
  819. return
  820. }
  821. func handleNotebook(temp *models.CloudbrainTemp) error {
  822. if temp.Status == models.TempJobStatus {
  823. err := handleTempNotebook(temp)
  824. if err != nil {
  825. log.Error("handleTempNotebook failed:%v", err)
  826. return err
  827. }
  828. } else if temp.Status == string(models.ModelArtsStopping) {
  829. res, err := GetNotebook2(temp.JobID)
  830. if err != nil {
  831. log.Error("GetNotebook2 failed:%v", err)
  832. return err
  833. }
  834. temp.Status = res.Status
  835. if temp.Status == string(models.ModelArtsStopped) {
  836. err = models.UpdateCloudbrainTemp(temp)
  837. if err != nil {
  838. log.Error("UpdateCloudbrainTemp failed:%v", err)
  839. return err
  840. }
  841. _, err := DelNotebook2(temp.JobID)
  842. if err != nil {
  843. log.Error("DelNotebook2 failed:%v", err)
  844. return err
  845. }
  846. temp.Status = string(models.ModelArtsDeleted)
  847. err = models.UpdateCloudbrainTemp(temp)
  848. if err != nil {
  849. log.Error("UpdateCloudbrainTemp failed:%v", err)
  850. return err
  851. }
  852. }
  853. }
  854. return nil
  855. }
  856. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  857. var err error
  858. var isExist bool
  859. for {
  860. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  861. if err != nil {
  862. log.Error("GetNotebookList failed:%v", err)
  863. break
  864. }
  865. temp.QueryTimes++
  866. err = models.UpdateCloudbrainTemp(temp)
  867. if err != nil {
  868. log.Error("UpdateCloudbrainTemp failed:%v", err)
  869. }
  870. if result != nil {
  871. for _, notebook := range result.NotebookList {
  872. if temp.JobID == models.TempJobId {
  873. //new notebook
  874. if notebook.JobName == temp.JobName {
  875. isExist = true
  876. temp.Status = notebook.Status
  877. temp.JobID = notebook.JobID
  878. break
  879. }
  880. } else {
  881. //restart: always can find one record
  882. if notebook.JobName == temp.JobName {
  883. if notebook.Status != string(models.ModelArtsStopped) {
  884. isExist = true
  885. temp.Status = notebook.Status
  886. temp.JobID = notebook.JobID
  887. break
  888. }
  889. }
  890. }
  891. }
  892. if isExist {
  893. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  894. if temp.Status == string(models.ModelArtsCreateFailed) {
  895. err = models.UpdateCloudbrainTemp(temp)
  896. if err != nil {
  897. log.Error("UpdateCloudbrainTemp failed:%v", err)
  898. break
  899. }
  900. _, err := DelNotebook2(temp.JobID)
  901. if err != nil {
  902. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  903. break
  904. }
  905. temp.Status = string(models.ModelArtsDeleted)
  906. } else {
  907. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  908. if err != nil {
  909. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  910. break
  911. }
  912. temp.Status = string(models.ModelArtsStopping)
  913. }
  914. models.UpdateCloudbrainTemp(temp)
  915. } else {
  916. log.Error("can not find the record(%s) till now", temp.JobName)
  917. err = errors.New("not found")
  918. break
  919. }
  920. } else {
  921. log.Error("can not find the record(%s) till now", temp.JobName)
  922. err = errors.New("not found")
  923. break
  924. }
  925. break
  926. }
  927. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  928. log.Info("reach MaxTempQueryTimes, set the job failed")
  929. temp.Status = string(models.ModelArtsTrainJobFailed)
  930. err = models.UpdateCloudbrainTemp(temp)
  931. if err != nil {
  932. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  933. return err
  934. }
  935. }
  936. return err
  937. }
  938. func handleTrainJob(temp *models.CloudbrainTemp) error {
  939. if temp.Status == models.TempJobStatus {
  940. err := handleTempTrainJob(temp)
  941. if err != nil {
  942. log.Error("handleTempTrainJob failed:%v", err)
  943. return err
  944. }
  945. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  946. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  947. if err != nil {
  948. log.Error("GetTrainJob failed:%v", err)
  949. return err
  950. }
  951. temp.Status = TransTrainJobStatus(res.IntStatus)
  952. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  953. err = models.UpdateCloudbrainTemp(temp)
  954. if err != nil {
  955. log.Error("UpdateCloudbrainTemp failed:%v", err)
  956. return err
  957. }
  958. _, err := DelTrainJob(temp.JobID)
  959. if err != nil {
  960. log.Error("DelTrainJob failed:%v", err)
  961. return err
  962. }
  963. temp.Status = string(models.ModelArtsDeleted)
  964. err = models.UpdateCloudbrainTemp(temp)
  965. if err != nil {
  966. log.Error("UpdateCloudbrainTemp failed:%v", err)
  967. return err
  968. }
  969. }
  970. }
  971. return nil
  972. }
  973. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  974. if temp.Status == models.TempJobStatus {
  975. err := handleTempTrainJobMultiVersion(temp)
  976. if err != nil {
  977. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  978. return err
  979. }
  980. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  981. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  982. if err != nil {
  983. log.Error("GetTrainJob failed:%v", err)
  984. return err
  985. }
  986. temp.Status = TransTrainJobStatus(res.IntStatus)
  987. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  988. err = models.UpdateCloudbrainTemp(temp)
  989. if err != nil {
  990. log.Error("UpdateCloudbrainTemp failed:%v", err)
  991. return err
  992. }
  993. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  994. if err != nil {
  995. log.Error("DelTrainJob failed:%v", err)
  996. return err
  997. }
  998. temp.Status = string(models.ModelArtsDeleted)
  999. err = models.UpdateCloudbrainTemp(temp)
  1000. if err != nil {
  1001. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1002. return err
  1003. }
  1004. }
  1005. }
  1006. return nil
  1007. }
  1008. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1009. var err error
  1010. var isExist bool
  1011. for {
  1012. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1013. if err != nil {
  1014. log.Error("GetTrainJobVersionList failed:%v", err)
  1015. break
  1016. }
  1017. temp.QueryTimes++
  1018. err = models.UpdateCloudbrainTemp(temp)
  1019. if err != nil {
  1020. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1021. }
  1022. if result != nil {
  1023. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1024. if result.VersionCount == int64(count+1) {
  1025. isExist = true
  1026. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1027. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1028. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1029. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1030. if err != nil {
  1031. log.Error("StopTrainJob failed:%v", err)
  1032. break
  1033. }
  1034. temp.Status = string(models.ModelArtsTrainJobKilling)
  1035. err = models.UpdateCloudbrainTemp(temp)
  1036. if err != nil {
  1037. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1038. break
  1039. }
  1040. } else {
  1041. log.Error("can not find the record(%s) till now", temp.JobName)
  1042. err = errors.New("not found")
  1043. break
  1044. }
  1045. }
  1046. break
  1047. }
  1048. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1049. log.Info("reach MaxTempQueryTimes, set the job failed")
  1050. temp.Status = string(models.ModelArtsTrainJobFailed)
  1051. err = models.UpdateCloudbrainTemp(temp)
  1052. if err != nil {
  1053. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1054. return err
  1055. }
  1056. }
  1057. return err
  1058. }
  1059. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1060. var err error
  1061. var isExist bool
  1062. for {
  1063. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1064. if err != nil {
  1065. log.Error("GetTrainJobList failed:%v", err)
  1066. break
  1067. }
  1068. temp.QueryTimes++
  1069. err = models.UpdateCloudbrainTemp(temp)
  1070. if err != nil {
  1071. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1072. }
  1073. if result != nil {
  1074. for _, job := range result.JobList {
  1075. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1076. isExist = true
  1077. temp.Status = TransTrainJobStatus(job.IntStatus)
  1078. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1079. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1080. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1081. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1082. if err != nil {
  1083. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1084. break
  1085. }
  1086. temp.Status = string(models.ModelArtsTrainJobKilling)
  1087. err = models.UpdateCloudbrainTemp(temp)
  1088. if err != nil {
  1089. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1090. break
  1091. }
  1092. }
  1093. }
  1094. if !isExist {
  1095. log.Error("can not find the record(%s) till now", temp.JobName)
  1096. err = errors.New("not found")
  1097. break
  1098. }
  1099. }
  1100. break
  1101. }
  1102. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1103. log.Info("reach MaxTempQueryTimes, set the job failed")
  1104. temp.Status = string(models.ModelArtsTrainJobFailed)
  1105. err = models.UpdateCloudbrainTemp(temp)
  1106. if err != nil {
  1107. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1108. return err
  1109. }
  1110. }
  1111. return err
  1112. }