You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

modelarts.go 24 kB

4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package modelarts
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "path"
  7. "strconv"
  8. "code.gitea.io/gitea/modules/timeutil"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. "code.gitea.io/gitea/modules/storage"
  15. )
  16. const (
  17. //notebook
  18. storageTypeOBS = "obs"
  19. autoStopDuration = 4 * 60 * 60
  20. autoStopDurationMs = 4 * 60 * 60 * 1000
  21. MORDELART_USER_IMAGE_ENGINE_ID = -1
  22. DataSetMountPath = "/home/ma-user/work"
  23. NotebookEnv = "Python3"
  24. NotebookType = "Ascend"
  25. FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  26. //train-job
  27. // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  28. // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  29. // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  30. // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  31. // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  32. // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  33. // "]}"
  34. // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  35. // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  36. // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  37. // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  38. // "]}"
  39. CodePath = "/code/"
  40. OutputPath = "/output/"
  41. ResultPath = "/result/"
  42. LogPath = "/log/"
  43. JobPath = "/job/"
  44. OrderDesc = "desc" //向下查询
  45. OrderAsc = "asc" //向上查询
  46. Lines = 500
  47. TrainUrl = "train_url"
  48. DataUrl = "data_url"
  49. MultiDataUrl = "multi_data_url"
  50. ResultUrl = "result_url"
  51. CkptUrl = "ckpt_url"
  52. DeviceTarget = "device_target"
  53. Ascend = "Ascend"
  54. PerPage = 10
  55. IsLatestVersion = "1"
  56. NotLatestVersion = "0"
  57. VersionCount = 1
  58. SortByCreateTime = "create_time"
  59. ConfigTypeCustom = "custom"
  60. TotalVersionCount = 1
  61. )
  62. var (
  63. poolInfos *models.PoolInfos
  64. FlavorInfos *models.FlavorInfos
  65. ImageInfos *models.ImageInfosModelArts
  66. )
  67. type GenerateTrainJobReq struct {
  68. JobName string
  69. DisplayJobName string
  70. Uuid string
  71. Description string
  72. CodeObsPath string
  73. BootFile string
  74. BootFileUrl string
  75. DataUrl string
  76. TrainUrl string
  77. FlavorCode string
  78. LogUrl string
  79. PoolID string
  80. WorkServerNumber int
  81. EngineID int64
  82. Parameters []models.Parameter
  83. CommitID string
  84. IsLatestVersion string
  85. Params string
  86. BranchName string
  87. PreVersionId int64
  88. PreVersionName string
  89. FlavorName string
  90. VersionCount int
  91. EngineName string
  92. TotalVersionCount int
  93. UserImageUrl string
  94. UserCommand string
  95. DatasetName string
  96. }
  97. type GenerateInferenceJobReq struct {
  98. JobName string
  99. DisplayJobName string
  100. Uuid string
  101. Description string
  102. CodeObsPath string
  103. BootFile string
  104. BootFileUrl string
  105. DataUrl string
  106. TrainUrl string
  107. FlavorCode string
  108. LogUrl string
  109. PoolID string
  110. WorkServerNumber int
  111. EngineID int64
  112. Parameters []models.Parameter
  113. CommitID string
  114. Params string
  115. BranchName string
  116. FlavorName string
  117. EngineName string
  118. LabelName string
  119. IsLatestVersion string
  120. VersionCount int
  121. TotalVersionCount int
  122. ModelName string
  123. ModelVersion string
  124. CkptName string
  125. ResultUrl string
  126. }
  127. type VersionInfo struct {
  128. Version []struct {
  129. ID int `json:"id"`
  130. Value string `json:"value"`
  131. Url string `json:"url"`
  132. } `json:"version"`
  133. }
  134. type Flavor struct {
  135. Info []struct {
  136. Code string `json:"code"`
  137. Value string `json:"value"`
  138. } `json:"flavor"`
  139. }
  140. type Engine struct {
  141. Info []struct {
  142. ID int `json:"id"`
  143. Value string `json:"value"`
  144. } `json:"engine"`
  145. }
  146. type ResourcePool struct {
  147. Info []struct {
  148. ID string `json:"id"`
  149. Value string `json:"value"`
  150. } `json:"resource_pool"`
  151. }
  152. // type Parameter struct {
  153. // Label string `json:"label"`
  154. // Value string `json:"value"`
  155. // }
  156. // type Parameters struct {
  157. // Parameter []Parameter `json:"parameter"`
  158. // }
  159. type Parameters struct {
  160. Parameter []struct {
  161. Label string `json:"label"`
  162. Value string `json:"value"`
  163. } `json:"parameter"`
  164. }
  165. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  166. var dataActualPath string
  167. if uuid != "" {
  168. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  169. } else {
  170. userPath := setting.UserBasePath + ctx.User.Name + "/"
  171. isExist, err := storage.ObsHasObject(userPath)
  172. if err != nil {
  173. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  174. return err
  175. }
  176. if !isExist {
  177. if err = storage.ObsCreateObject(userPath); err != nil {
  178. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  179. return err
  180. }
  181. }
  182. dataActualPath = setting.Bucket + "/" + userPath
  183. }
  184. if poolInfos == nil {
  185. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  186. }
  187. createTime := timeutil.TimeStampNow()
  188. jobResult, err := CreateJob(models.CreateNotebookParams{
  189. JobName: jobName,
  190. Description: description,
  191. ProfileID: setting.ProfileID,
  192. Flavor: flavor,
  193. Pool: models.Pool{
  194. ID: poolInfos.PoolInfo[0].PoolId,
  195. Name: poolInfos.PoolInfo[0].PoolName,
  196. Type: poolInfos.PoolInfo[0].PoolType,
  197. },
  198. Spec: models.Spec{
  199. Storage: models.Storage{
  200. Type: storageTypeOBS,
  201. Location: models.Location{
  202. Path: dataActualPath,
  203. },
  204. },
  205. AutoStop: models.AutoStop{
  206. Enable: true,
  207. Duration: autoStopDuration,
  208. },
  209. },
  210. })
  211. if err != nil {
  212. log.Error("CreateJob failed: %v", err.Error())
  213. return err
  214. }
  215. err = models.CreateCloudbrain(&models.Cloudbrain{
  216. Status: string(models.JobWaiting),
  217. UserID: ctx.User.ID,
  218. RepoID: ctx.Repo.Repository.ID,
  219. JobID: jobResult.ID,
  220. JobName: jobName,
  221. JobType: string(models.JobTypeDebug),
  222. Type: models.TypeCloudBrainTwo,
  223. Uuid: uuid,
  224. ComputeResource: models.NPUResource,
  225. CreatedUnix: createTime,
  226. UpdatedUnix: createTime,
  227. })
  228. if err != nil {
  229. return err
  230. }
  231. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  232. return nil
  233. }
  234. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error {
  235. if poolInfos == nil {
  236. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  237. }
  238. imageName, err := GetNotebookImageName(imageId)
  239. if err != nil {
  240. log.Error("GetNotebookImageName failed: %v", err.Error())
  241. return err
  242. }
  243. createTime := timeutil.TimeStampNow()
  244. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  245. JobName: jobName,
  246. Description: description,
  247. Flavor: flavor,
  248. Duration: autoStopDurationMs,
  249. ImageID: imageId,
  250. PoolID: poolInfos.PoolInfo[0].PoolId,
  251. Feature: models.NotebookFeature,
  252. Volume: models.VolumeReq{
  253. Capacity: setting.Capacity,
  254. Category: models.EVSCategory,
  255. Ownership: models.ManagedOwnership,
  256. },
  257. WorkspaceID: "0",
  258. })
  259. if err != nil {
  260. log.Error("createNotebook2 failed: %v", err.Error())
  261. return err
  262. }
  263. err = models.CreateCloudbrain(&models.Cloudbrain{
  264. Status: jobResult.Status,
  265. UserID: ctx.User.ID,
  266. RepoID: ctx.Repo.Repository.ID,
  267. JobID: jobResult.ID,
  268. JobName: jobName,
  269. FlavorCode: flavor,
  270. DisplayJobName: displayJobName,
  271. JobType: string(models.JobTypeDebug),
  272. Type: models.TypeCloudBrainTwo,
  273. Uuid: uuid,
  274. ComputeResource: models.NPUResource,
  275. Image: imageName,
  276. Description: description,
  277. CreatedUnix: createTime,
  278. UpdatedUnix: createTime,
  279. })
  280. if err != nil {
  281. return err
  282. }
  283. task, err := models.GetCloudbrainByName(jobName)
  284. if err != nil {
  285. log.Error("GetCloudbrainByName failed: %v", err.Error())
  286. return err
  287. }
  288. stringId := strconv.FormatInt(task.ID, 10)
  289. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  290. return nil
  291. }
  292. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  293. createTime := timeutil.TimeStampNow()
  294. var jobResult *models.CreateTrainJobResult
  295. var createErr error
  296. if req.EngineID < 0 {
  297. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  298. JobName: req.JobName,
  299. Description: req.Description,
  300. Config: models.UserImageConfig{
  301. WorkServerNum: req.WorkServerNumber,
  302. AppUrl: req.CodeObsPath,
  303. BootFileUrl: req.BootFileUrl,
  304. DataUrl: req.DataUrl,
  305. TrainUrl: req.TrainUrl,
  306. LogUrl: req.LogUrl,
  307. PoolID: req.PoolID,
  308. CreateVersion: true,
  309. Flavor: models.Flavor{
  310. Code: req.FlavorCode,
  311. },
  312. Parameter: req.Parameters,
  313. UserImageUrl: req.UserImageUrl,
  314. UserCommand: req.UserCommand,
  315. },
  316. })
  317. } else {
  318. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  319. JobName: req.JobName,
  320. Description: req.Description,
  321. Config: models.Config{
  322. WorkServerNum: req.WorkServerNumber,
  323. AppUrl: req.CodeObsPath,
  324. BootFileUrl: req.BootFileUrl,
  325. DataUrl: req.DataUrl,
  326. EngineID: req.EngineID,
  327. TrainUrl: req.TrainUrl,
  328. LogUrl: req.LogUrl,
  329. PoolID: req.PoolID,
  330. CreateVersion: true,
  331. Flavor: models.Flavor{
  332. Code: req.FlavorCode,
  333. },
  334. Parameter: req.Parameters,
  335. },
  336. })
  337. }
  338. if createErr != nil {
  339. log.Error("CreateJob failed: %v", createErr.Error())
  340. return createErr
  341. }
  342. jobId := strconv.FormatInt(jobResult.JobID, 10)
  343. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  344. Status: TransTrainJobStatus(jobResult.Status),
  345. UserID: ctx.User.ID,
  346. RepoID: ctx.Repo.Repository.ID,
  347. JobID: jobId,
  348. JobName: req.JobName,
  349. DisplayJobName: req.DisplayJobName,
  350. JobType: string(models.JobTypeTrain),
  351. Type: models.TypeCloudBrainTwo,
  352. VersionID: jobResult.VersionID,
  353. VersionName: jobResult.VersionName,
  354. Uuid: req.Uuid,
  355. DatasetName: req.DatasetName,
  356. CommitID: req.CommitID,
  357. IsLatestVersion: req.IsLatestVersion,
  358. ComputeResource: models.NPUResource,
  359. EngineID: req.EngineID,
  360. TrainUrl: req.TrainUrl,
  361. BranchName: req.BranchName,
  362. Parameters: req.Params,
  363. BootFile: req.BootFile,
  364. DataUrl: req.DataUrl,
  365. LogUrl: req.LogUrl,
  366. FlavorCode: req.FlavorCode,
  367. Description: req.Description,
  368. WorkServerNumber: req.WorkServerNumber,
  369. FlavorName: req.FlavorName,
  370. EngineName: req.EngineName,
  371. VersionCount: req.VersionCount,
  372. TotalVersionCount: req.TotalVersionCount,
  373. CreatedUnix: createTime,
  374. UpdatedUnix: createTime,
  375. })
  376. if createErr != nil {
  377. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  378. return createErr
  379. }
  380. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
  381. return nil
  382. }
  383. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  384. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  385. JobName: req.JobName,
  386. Description: req.Description,
  387. Config: models.UserImageConfig{
  388. WorkServerNum: req.WorkServerNumber,
  389. AppUrl: req.CodeObsPath,
  390. BootFileUrl: req.BootFileUrl,
  391. DataUrl: req.DataUrl,
  392. TrainUrl: req.TrainUrl,
  393. LogUrl: req.LogUrl,
  394. PoolID: req.PoolID,
  395. CreateVersion: true,
  396. Flavor: models.Flavor{
  397. Code: req.FlavorCode,
  398. },
  399. Parameter: req.Parameters,
  400. UserImageUrl: req.UserImageUrl,
  401. UserCommand: req.UserCommand,
  402. },
  403. })
  404. }
  405. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  406. createTime := timeutil.TimeStampNow()
  407. var jobResult *models.CreateTrainJobResult
  408. var createErr error
  409. log.Info(" req.EngineID =" + fmt.Sprint(req.EngineID))
  410. if req.EngineID < 0 {
  411. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  412. Description: req.Description,
  413. Config: models.TrainJobVersionUserImageConfig{
  414. WorkServerNum: req.WorkServerNumber,
  415. AppUrl: req.CodeObsPath,
  416. BootFileUrl: req.BootFileUrl,
  417. DataUrl: req.DataUrl,
  418. TrainUrl: req.TrainUrl,
  419. LogUrl: req.LogUrl,
  420. PoolID: req.PoolID,
  421. Flavor: models.Flavor{
  422. Code: req.FlavorCode,
  423. },
  424. Parameter: req.Parameters,
  425. PreVersionId: req.PreVersionId,
  426. UserImageUrl: req.UserImageUrl,
  427. UserCommand: req.UserCommand,
  428. },
  429. }, jobId)
  430. } else {
  431. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  432. Description: req.Description,
  433. Config: models.TrainJobVersionConfig{
  434. WorkServerNum: req.WorkServerNumber,
  435. AppUrl: req.CodeObsPath,
  436. BootFileUrl: req.BootFileUrl,
  437. DataUrl: req.DataUrl,
  438. EngineID: req.EngineID,
  439. TrainUrl: req.TrainUrl,
  440. LogUrl: req.LogUrl,
  441. PoolID: req.PoolID,
  442. Flavor: models.Flavor{
  443. Code: req.FlavorCode,
  444. },
  445. Parameter: req.Parameters,
  446. PreVersionId: req.PreVersionId,
  447. },
  448. }, jobId)
  449. }
  450. if createErr != nil {
  451. log.Error("CreateJob failed: %v", createErr.Error())
  452. return createErr
  453. }
  454. var jobTypes []string
  455. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  456. repo := ctx.Repo.Repository
  457. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  458. RepoID: repo.ID,
  459. Type: models.TypeCloudBrainTwo,
  460. JobTypes: jobTypes,
  461. JobID: strconv.FormatInt(jobResult.JobID, 10),
  462. })
  463. if createErr != nil {
  464. ctx.ServerError("Cloudbrain", createErr)
  465. return createErr
  466. }
  467. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  468. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  469. Status: TransTrainJobStatus(jobResult.Status),
  470. UserID: ctx.User.ID,
  471. RepoID: ctx.Repo.Repository.ID,
  472. JobID: strconv.FormatInt(jobResult.JobID, 10),
  473. JobName: req.JobName,
  474. DisplayJobName: req.DisplayJobName,
  475. JobType: string(models.JobTypeTrain),
  476. Type: models.TypeCloudBrainTwo,
  477. VersionID: jobResult.VersionID,
  478. VersionName: jobResult.VersionName,
  479. Uuid: req.Uuid,
  480. DatasetName: req.DatasetName,
  481. CommitID: req.CommitID,
  482. IsLatestVersion: req.IsLatestVersion,
  483. PreVersionName: req.PreVersionName,
  484. ComputeResource: models.NPUResource,
  485. EngineID: req.EngineID,
  486. TrainUrl: req.TrainUrl,
  487. BranchName: req.BranchName,
  488. Parameters: req.Params,
  489. BootFile: req.BootFile,
  490. DataUrl: req.DataUrl,
  491. LogUrl: req.LogUrl,
  492. PreVersionId: req.PreVersionId,
  493. FlavorCode: req.FlavorCode,
  494. Description: req.Description,
  495. WorkServerNumber: req.WorkServerNumber,
  496. FlavorName: req.FlavorName,
  497. EngineName: req.EngineName,
  498. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  499. VersionCount: VersionListCount + 1,
  500. CreatedUnix: createTime,
  501. UpdatedUnix: createTime,
  502. })
  503. if createErr != nil {
  504. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  505. return createErr
  506. }
  507. //将训练任务的上一版本的isLatestVersion设置为"0"
  508. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount)
  509. if createErr != nil {
  510. ctx.ServerError("Update IsLatestVersion failed", createErr)
  511. return createErr
  512. }
  513. return createErr
  514. }
  515. func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  516. createTime := timeutil.TimeStampNow()
  517. jobResult, err := createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  518. JobName: req.JobName,
  519. Description: req.Description,
  520. Config: models.UserImageConfig{
  521. WorkServerNum: req.WorkServerNumber,
  522. AppUrl: req.CodeObsPath,
  523. BootFileUrl: req.BootFileUrl,
  524. DataUrl: req.DataUrl,
  525. TrainUrl: req.TrainUrl,
  526. LogUrl: req.LogUrl,
  527. PoolID: req.PoolID,
  528. CreateVersion: true,
  529. Flavor: models.Flavor{
  530. Code: req.FlavorCode,
  531. },
  532. Parameter: req.Parameters,
  533. UserImageUrl: req.UserImageUrl,
  534. UserCommand: req.UserCommand,
  535. },
  536. })
  537. if err != nil {
  538. log.Error("CreateJob failed: %v", err.Error())
  539. return err
  540. }
  541. var jobTypes []string
  542. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  543. repo := ctx.Repo.Repository
  544. VersionTaskList, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  545. RepoID: repo.ID,
  546. Type: models.TypeCloudBrainTwo,
  547. JobTypes: jobTypes,
  548. JobID: strconv.FormatInt(jobResult.JobID, 10),
  549. })
  550. if err != nil {
  551. ctx.ServerError("Cloudbrain", err)
  552. return err
  553. }
  554. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  555. err = models.CreateCloudbrain(&models.Cloudbrain{
  556. Status: TransTrainJobStatus(jobResult.Status),
  557. UserID: ctx.User.ID,
  558. RepoID: ctx.Repo.Repository.ID,
  559. JobID: strconv.FormatInt(jobResult.JobID, 10),
  560. JobName: req.JobName,
  561. DisplayJobName: req.DisplayJobName,
  562. JobType: string(models.JobTypeTrain),
  563. Type: models.TypeCloudBrainTwo,
  564. VersionID: jobResult.VersionID,
  565. VersionName: jobResult.VersionName,
  566. Uuid: req.Uuid,
  567. DatasetName: req.DatasetName,
  568. CommitID: req.CommitID,
  569. IsLatestVersion: req.IsLatestVersion,
  570. PreVersionName: req.PreVersionName,
  571. ComputeResource: models.NPUResource,
  572. EngineID: MORDELART_USER_IMAGE_ENGINE_ID,
  573. Image: req.UserImageUrl,
  574. TrainUrl: req.TrainUrl,
  575. BranchName: req.BranchName,
  576. Parameters: req.Params,
  577. BootFile: req.BootFile,
  578. DataUrl: req.DataUrl,
  579. LogUrl: req.LogUrl,
  580. PreVersionId: req.PreVersionId,
  581. FlavorCode: req.FlavorCode,
  582. Description: req.Description,
  583. WorkServerNumber: req.WorkServerNumber,
  584. FlavorName: req.FlavorName,
  585. EngineName: req.EngineName,
  586. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  587. VersionCount: VersionListCount + 1,
  588. CreatedUnix: createTime,
  589. UpdatedUnix: createTime,
  590. })
  591. if err != nil {
  592. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  593. return err
  594. }
  595. //将训练任务的上一版本的isLatestVersion设置为"0"
  596. err = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount)
  597. if err != nil {
  598. ctx.ServerError("Update IsLatestVersion failed", err)
  599. return err
  600. }
  601. return err
  602. }
  603. func TransTrainJobStatus(status int) string {
  604. switch status {
  605. case 0:
  606. return "UNKNOWN"
  607. case 1:
  608. return "INIT"
  609. case 2:
  610. return "IMAGE_CREATING"
  611. case 3:
  612. return "IMAGE_FAILED"
  613. case 4:
  614. return "SUBMIT_TRYING"
  615. case 5:
  616. return "SUBMIT_FAILED"
  617. case 6:
  618. return "DELETE_FAILED"
  619. case 7:
  620. return "WAITING"
  621. case 8:
  622. return "RUNNING"
  623. case 9:
  624. return "KILLING"
  625. case 10:
  626. return "COMPLETED"
  627. case 11:
  628. return "FAILED"
  629. case 12:
  630. return "KILLED"
  631. case 13:
  632. return "CANCELED"
  633. case 14:
  634. return "LOST"
  635. case 15:
  636. return "SCALING"
  637. case 16:
  638. return "SUBMIT_MODEL_FAILED"
  639. case 17:
  640. return "DEPLOY_SERVICE_FAILED"
  641. case 18:
  642. return "CHECK_INIT"
  643. case 19:
  644. return "CHECK_RUNNING"
  645. case 20:
  646. return "CHECK_RUNNING_COMPLETED"
  647. case 21:
  648. return "CHECK_FAILED"
  649. default:
  650. return strconv.Itoa(status)
  651. }
  652. }
  653. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  654. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  655. VersionOutputPath = "V" + talVersionCountToString
  656. return VersionOutputPath
  657. }
  658. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
  659. createTime := timeutil.TimeStampNow()
  660. jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
  661. JobName: req.JobName,
  662. Description: req.Description,
  663. InfConfig: models.InfConfig{
  664. WorkServerNum: req.WorkServerNumber,
  665. AppUrl: req.CodeObsPath,
  666. BootFileUrl: req.BootFileUrl,
  667. DataUrl: req.DataUrl,
  668. EngineID: req.EngineID,
  669. // TrainUrl: req.TrainUrl,
  670. LogUrl: req.LogUrl,
  671. PoolID: req.PoolID,
  672. CreateVersion: true,
  673. Flavor: models.Flavor{
  674. Code: req.FlavorCode,
  675. },
  676. Parameter: req.Parameters,
  677. },
  678. })
  679. if err != nil {
  680. log.Error("CreateJob failed: %v", err.Error())
  681. return err
  682. }
  683. attach, err := models.GetAttachmentByUUID(req.Uuid)
  684. if err != nil {
  685. log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  686. return err
  687. }
  688. jobID := strconv.FormatInt(jobResult.JobID, 10)
  689. err = models.CreateCloudbrain(&models.Cloudbrain{
  690. Status: TransTrainJobStatus(jobResult.Status),
  691. UserID: ctx.User.ID,
  692. RepoID: ctx.Repo.Repository.ID,
  693. JobID: jobID,
  694. JobName: req.JobName,
  695. DisplayJobName: req.DisplayJobName,
  696. JobType: string(models.JobTypeInference),
  697. Type: models.TypeCloudBrainTwo,
  698. VersionID: jobResult.VersionID,
  699. VersionName: jobResult.VersionName,
  700. Uuid: req.Uuid,
  701. DatasetName: attach.Name,
  702. CommitID: req.CommitID,
  703. EngineID: req.EngineID,
  704. TrainUrl: req.TrainUrl,
  705. BranchName: req.BranchName,
  706. Parameters: req.Params,
  707. BootFile: req.BootFile,
  708. DataUrl: req.DataUrl,
  709. LogUrl: req.LogUrl,
  710. FlavorCode: req.FlavorCode,
  711. Description: req.Description,
  712. WorkServerNumber: req.WorkServerNumber,
  713. FlavorName: req.FlavorName,
  714. EngineName: req.EngineName,
  715. LabelName: req.LabelName,
  716. IsLatestVersion: req.IsLatestVersion,
  717. ComputeResource: models.NPUResource,
  718. VersionCount: req.VersionCount,
  719. TotalVersionCount: req.TotalVersionCount,
  720. ModelName: req.ModelName,
  721. ModelVersion: req.ModelVersion,
  722. CkptName: req.CkptName,
  723. ResultUrl: req.ResultUrl,
  724. CreatedUnix: createTime,
  725. UpdatedUnix: createTime,
  726. })
  727. if err != nil {
  728. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  729. return err
  730. }
  731. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  732. return nil
  733. }
  734. func GetNotebookImageName(imageId string) (string, error) {
  735. var validImage = false
  736. var imageName = ""
  737. if ImageInfos == nil {
  738. json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos)
  739. }
  740. for _, imageInfo := range ImageInfos.ImageInfo {
  741. if imageInfo.Id == imageId {
  742. validImage = true
  743. imageName = imageInfo.Value
  744. }
  745. }
  746. if !validImage {
  747. log.Error("the image id(%s) is invalid", imageId)
  748. return imageName, errors.New("the image id is invalid")
  749. }
  750. return imageName, nil
  751. }