You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

modelarts.go 37 kB

4 years ago
3 years ago
3 years ago
4 years ago
2 years ago
4 years ago
2 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
4 years ago
2 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
4 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300
  1. package modelarts
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "path"
  7. "strconv"
  8. "strings"
  9. "code.gitea.io/gitea/modules/modelarts_cd"
  10. "code.gitea.io/gitea/models"
  11. "code.gitea.io/gitea/modules/context"
  12. "code.gitea.io/gitea/modules/log"
  13. "code.gitea.io/gitea/modules/notification"
  14. "code.gitea.io/gitea/modules/setting"
  15. "code.gitea.io/gitea/modules/storage"
  16. "code.gitea.io/gitea/modules/timeutil"
  17. )
  // Notebook (debug task) settings.
  const (
  	storageTypeOBS = "obs"
  	// Auto-stop window of four hours; the Ms variant is the same span scaled
  	// by 1000 (presumably seconds and milliseconds respectively).
  	autoStopDuration   = 4 * 60 * 60
  	autoStopDurationMs = autoStopDuration * 1000
  	// MORDELART_USER_IMAGE_ENGINE_ID is the engine id used for user-supplied images.
  	MORDELART_USER_IMAGE_ENGINE_ID = -1

  	DataSetMountPath = "/home/ma-user/work"
  	NotebookEnv      = "Python3"
  	NotebookType     = "Ascend"
  	FlavorInfo       = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  )

  // Train-job settings.
  //
  // Legacy hard-coded catalogue values, kept commented out for reference:
  // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  // "]}"
  // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  // "]}"

  // Sub-directory names used when laying out a job's storage.
  const (
  	CodePath   = "/code/"
  	OutputPath = "/output/"
  	ResultPath = "/result/"
  	LogPath    = "/log/"
  	JobPath    = "/job/"
  )

  // Paging options.
  const (
  	OrderDesc = "desc" // query downward
  	OrderAsc  = "asc"  // query upward
  	Lines     = 500
  	PerPage   = 10
  )

  // Well-known run-parameter labels and values.
  const (
  	TrainUrl     = "train_url"
  	DataUrl      = "data_url"
  	MultiDataUrl = "multi_data_url"
  	ResultUrl    = "result_url"
  	CkptUrl      = "ckpt_url"
  	DeviceTarget = "device_target"
  	Ascend       = "Ascend"
  )

  // Version bookkeeping and miscellaneous values.
  const (
  	IsLatestVersion   = "1"
  	NotLatestVersion  = "0"
  	VersionCountOne   = 1
  	SortByCreateTime  = "create_time"
  	ConfigTypeCustom  = "custom"
  	TotalVersionCount = 1
  )
  64. var (
  65. poolInfos *models.PoolInfos
  66. TrainFlavorInfos *Flavor
  67. SpecialPools *models.SpecialPools
  68. MultiNodeConfig *MultiNodes
  69. )
  70. type GenerateTrainJobReq struct {
  71. JobName string
  72. DisplayJobName string
  73. Uuid string
  74. Description string
  75. CodeObsPath string
  76. BootFile string
  77. BootFileUrl string
  78. DataUrl string
  79. TrainUrl string
  80. LogUrl string
  81. PoolID string
  82. WorkServerNumber int
  83. EngineID int64
  84. Parameters []models.Parameter
  85. CommitID string
  86. IsLatestVersion string
  87. Params string
  88. BranchName string
  89. PreVersionId int64
  90. PreVersionName string
  91. FlavorCode string
  92. FlavorName string
  93. VersionCount int
  94. EngineName string
  95. TotalVersionCount int
  96. UserImageUrl string
  97. UserCommand string
  98. DatasetName string
  99. Spec *models.Specification
  100. ModelName string
  101. LabelName string
  102. CkptName string
  103. ModelVersion string
  104. PreTrainModelUrl string
  105. }
  106. type GenerateInferenceJobReq struct {
  107. JobName string
  108. DisplayJobName string
  109. Uuid string
  110. Description string
  111. CodeObsPath string
  112. BootFile string
  113. BootFileUrl string
  114. DataUrl string
  115. TrainUrl string
  116. LogUrl string
  117. PoolID string
  118. WorkServerNumber int
  119. EngineID int64
  120. Parameters []models.Parameter
  121. CommitID string
  122. Params string
  123. BranchName string
  124. FlavorName string
  125. EngineName string
  126. LabelName string
  127. IsLatestVersion string
  128. VersionCount int
  129. TotalVersionCount int
  130. ModelName string
  131. ModelVersion string
  132. CkptName string
  133. ResultUrl string
  134. Spec *models.Specification
  135. DatasetName string
  136. JobType string
  137. UserImageUrl string
  138. UserCommand string
  139. }
  // VersionInfo is the JSON shape {"version":[{"id":..,"value":..,"url":..}]}.
  type VersionInfo struct {
  	// Version lists the available entries, one per id/value/url triple.
  	Version []struct {
  		ID    int    `json:"id"`
  		Value string `json:"value"`
  		Url   string `json:"url"`
  	} `json:"version"`
  }
  // Flavor is the JSON shape {"flavor":[{"code":..,"value":..,"unitPrice":..}]}.
  type Flavor struct {
  	// Info lists the flavors, each with its code, display value and unit price.
  	Info []struct {
  		Code      string `json:"code"`
  		Value     string `json:"value"`
  		UnitPrice int64  `json:"unitPrice"`
  	} `json:"flavor"`
  }
  // Engine is the JSON shape {"engine":[{"id":..,"value":..}]}.
  type Engine struct {
  	// Info lists the engines as id/value pairs.
  	Info []struct {
  		ID    int    `json:"id"`
  		Value string `json:"value"`
  	} `json:"engine"`
  }
  // ResourcePool is the JSON shape {"resource_pool":[{"id":..,"value":..}]}.
  type ResourcePool struct {
  	// Info lists the pools as id/value pairs; unlike Engine, the id is a string.
  	Info []struct {
  		ID    string `json:"id"`
  		Value string `json:"value"`
  	} `json:"resource_pool"`
  }
  // MultiNodes is the JSON shape {"multinode":[{"org":..,"node":[..]}]}.
  type MultiNodes struct {
  	// Info holds one entry per organisation.
  	Info []OrgMultiNode `json:"multinode"`
  }

  // OrgMultiNode pairs an organisation with a list of node values.
  // NOTE(review): Node presumably lists the node counts that organisation may
  // request — confirm against the code that reads MultiNodeConfig.
  type OrgMultiNode struct {
  	Org  string `json:"org"`
  	Node []int  `json:"node"`
  }
  173. // type Parameter struct {
  174. // Label string `json:"label"`
  175. // Value string `json:"value"`
  176. // }
  177. // type Parameters struct {
  178. // Parameter []Parameter `json:"parameter"`
  179. // }
  // Parameters is the JSON shape {"parameter":[{"label":..,"value":..}]}.
  type Parameters struct {
  	// Parameter lists the run parameters as label/value string pairs.
  	Parameter []struct {
  		Label string `json:"label"`
  		Value string `json:"value"`
  	} `json:"parameter"`
  }
  186. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  187. var dataActualPath string
  188. if uuid != "" {
  189. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  190. } else {
  191. userPath := setting.UserBasePath + ctx.User.Name + "/"
  192. isExist, err := storage.ObsHasObject(userPath)
  193. if err != nil {
  194. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  195. return err
  196. }
  197. if !isExist {
  198. if err = storage.ObsCreateObject(userPath); err != nil {
  199. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  200. return err
  201. }
  202. }
  203. dataActualPath = setting.Bucket + "/" + userPath
  204. }
  205. if poolInfos == nil {
  206. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  207. }
  208. createTime := timeutil.TimeStampNow()
  209. jobResult, err := CreateJob(models.CreateNotebookParams{
  210. JobName: jobName,
  211. Description: description,
  212. ProfileID: setting.ProfileID,
  213. Flavor: flavor,
  214. Pool: models.Pool{
  215. ID: poolInfos.PoolInfo[0].PoolId,
  216. Name: poolInfos.PoolInfo[0].PoolName,
  217. Type: poolInfos.PoolInfo[0].PoolType,
  218. },
  219. Spec: models.Spec{
  220. Storage: models.Storage{
  221. Type: storageTypeOBS,
  222. Location: models.Location{
  223. Path: dataActualPath,
  224. },
  225. },
  226. AutoStop: models.AutoStop{
  227. Enable: true,
  228. Duration: autoStopDuration,
  229. },
  230. },
  231. })
  232. if err != nil {
  233. log.Error("CreateJob failed: %v", err.Error())
  234. return err
  235. }
  236. err = models.CreateCloudbrain(&models.Cloudbrain{
  237. Status: string(models.JobWaiting),
  238. UserID: ctx.User.ID,
  239. RepoID: ctx.Repo.Repository.ID,
  240. JobID: jobResult.ID,
  241. JobName: jobName,
  242. JobType: string(models.JobTypeDebug),
  243. Type: models.TypeCloudBrainTwo,
  244. Uuid: uuid,
  245. ComputeResource: models.NPUResource,
  246. CreatedUnix: createTime,
  247. UpdatedUnix: createTime,
  248. })
  249. if err != nil {
  250. return err
  251. }
  252. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  253. return nil
  254. }
  255. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error {
  256. if poolInfos == nil {
  257. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  258. }
  259. imageName, err := GetNotebookImageName(imageId)
  260. if err != nil {
  261. log.Error("GetNotebookImageName failed: %v", err.Error())
  262. return err
  263. }
  264. createTime := timeutil.TimeStampNow()
  265. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  266. JobName: jobName,
  267. Description: description,
  268. Flavor: spec.SourceSpecId,
  269. Duration: autoStopDurationMs,
  270. ImageID: imageId,
  271. PoolID: poolInfos.PoolInfo[0].PoolId,
  272. Feature: models.NotebookFeature,
  273. Volume: models.VolumeReq{
  274. Capacity: setting.Capacity,
  275. Category: models.EVSCategory,
  276. Ownership: models.ManagedOwnership,
  277. },
  278. WorkspaceID: "0",
  279. })
  280. if err != nil {
  281. log.Error("createNotebook2 failed: %v", err.Error())
  282. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  283. log.Info("(%s)unknown error, set temp status", displayJobName)
  284. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  285. JobID: models.TempJobId,
  286. VersionID: models.TempVersionId,
  287. Status: models.TempJobStatus,
  288. Type: models.TypeCloudBrainTwo,
  289. JobName: jobName,
  290. JobType: string(models.JobTypeDebug),
  291. })
  292. if errTemp != nil {
  293. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  294. return errTemp
  295. }
  296. }
  297. return err
  298. }
  299. task := &models.Cloudbrain{
  300. Status: jobResult.Status,
  301. UserID: ctx.User.ID,
  302. RepoID: ctx.Repo.Repository.ID,
  303. JobID: jobResult.ID,
  304. JobName: jobName,
  305. FlavorCode: spec.SourceSpecId,
  306. DisplayJobName: displayJobName,
  307. JobType: string(models.JobTypeDebug),
  308. Type: models.TypeCloudBrainTwo,
  309. Uuid: uuid,
  310. ComputeResource: models.NPUResource,
  311. Image: imageName,
  312. Description: description,
  313. CreatedUnix: createTime,
  314. UpdatedUnix: createTime,
  315. Spec: spec,
  316. }
  317. err = models.CreateCloudbrain(task)
  318. if err != nil {
  319. return err
  320. }
  321. stringId := strconv.FormatInt(task.ID, 10)
  322. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  323. return nil
  324. }
  325. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  326. createTime := timeutil.TimeStampNow()
  327. var jobResult *models.CreateTrainJobResult
  328. var createErr error
  329. if req.EngineID < 0 {
  330. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  331. JobName: req.JobName,
  332. Description: req.Description,
  333. Config: models.UserImageConfig{
  334. WorkServerNum: req.WorkServerNumber,
  335. AppUrl: req.CodeObsPath,
  336. BootFileUrl: req.BootFileUrl,
  337. DataUrl: req.DataUrl,
  338. TrainUrl: req.TrainUrl,
  339. LogUrl: req.LogUrl,
  340. PoolID: req.PoolID,
  341. CreateVersion: true,
  342. Flavor: models.Flavor{
  343. Code: req.Spec.SourceSpecId,
  344. },
  345. Parameter: req.Parameters,
  346. UserImageUrl: req.UserImageUrl,
  347. UserCommand: req.UserCommand,
  348. },
  349. })
  350. } else {
  351. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  352. JobName: req.JobName,
  353. Description: req.Description,
  354. Config: models.Config{
  355. WorkServerNum: req.WorkServerNumber,
  356. AppUrl: req.CodeObsPath,
  357. BootFileUrl: req.BootFileUrl,
  358. DataUrl: req.DataUrl,
  359. EngineID: req.EngineID,
  360. TrainUrl: req.TrainUrl,
  361. LogUrl: req.LogUrl,
  362. PoolID: req.PoolID,
  363. CreateVersion: true,
  364. Flavor: models.Flavor{
  365. Code: req.Spec.SourceSpecId,
  366. },
  367. Parameter: req.Parameters,
  368. },
  369. })
  370. }
  371. if createErr != nil {
  372. log.Error("createTrainJob failed: %v", createErr.Error())
  373. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  374. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  375. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  376. JobID: models.TempJobId,
  377. VersionID: models.TempVersionId,
  378. Status: models.TempJobStatus,
  379. Type: models.TypeCloudBrainTwo,
  380. JobName: req.JobName,
  381. JobType: string(models.JobTypeTrain),
  382. })
  383. if errTemp != nil {
  384. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  385. return errTemp
  386. }
  387. }
  388. return createErr
  389. }
  390. jobId := strconv.FormatInt(jobResult.JobID, 10)
  391. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  392. Status: TransTrainJobStatus(jobResult.Status),
  393. UserID: ctx.User.ID,
  394. RepoID: ctx.Repo.Repository.ID,
  395. JobID: jobId,
  396. JobName: req.JobName,
  397. DisplayJobName: req.DisplayJobName,
  398. JobType: string(models.JobTypeTrain),
  399. Type: models.TypeCloudBrainTwo,
  400. VersionID: jobResult.VersionID,
  401. VersionName: jobResult.VersionName,
  402. Uuid: req.Uuid,
  403. DatasetName: req.DatasetName,
  404. CommitID: req.CommitID,
  405. IsLatestVersion: req.IsLatestVersion,
  406. ComputeResource: models.NPUResource,
  407. EngineID: req.EngineID,
  408. TrainUrl: req.TrainUrl,
  409. BranchName: req.BranchName,
  410. Parameters: req.Params,
  411. BootFile: req.BootFile,
  412. DataUrl: req.DataUrl,
  413. LogUrl: req.LogUrl,
  414. FlavorCode: req.Spec.SourceSpecId,
  415. Description: req.Description,
  416. WorkServerNumber: req.WorkServerNumber,
  417. FlavorName: req.FlavorName,
  418. EngineName: req.EngineName,
  419. VersionCount: req.VersionCount,
  420. TotalVersionCount: req.TotalVersionCount,
  421. CreatedUnix: createTime,
  422. UpdatedUnix: createTime,
  423. Spec: req.Spec,
  424. ModelName: req.ModelName,
  425. ModelVersion: req.ModelVersion,
  426. LabelName: req.LabelName,
  427. PreTrainModelUrl: req.PreTrainModelUrl,
  428. CkptName: req.CkptName,
  429. })
  430. if createErr != nil {
  431. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  432. return createErr
  433. }
  434. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
  435. return nil
  436. }
  437. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  438. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  439. JobName: req.JobName,
  440. Description: req.Description,
  441. Config: models.UserImageConfig{
  442. WorkServerNum: req.WorkServerNumber,
  443. AppUrl: req.CodeObsPath,
  444. BootFileUrl: req.BootFileUrl,
  445. DataUrl: req.DataUrl,
  446. TrainUrl: req.TrainUrl,
  447. LogUrl: req.LogUrl,
  448. PoolID: req.PoolID,
  449. CreateVersion: true,
  450. Flavor: models.Flavor{
  451. Code: req.FlavorCode,
  452. },
  453. Parameter: req.Parameters,
  454. UserImageUrl: req.UserImageUrl,
  455. UserCommand: req.UserCommand,
  456. },
  457. })
  458. }
  459. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  460. createTime := timeutil.TimeStampNow()
  461. var jobResult *models.CreateTrainJobResult
  462. var createErr error
  463. if req.EngineID < 0 {
  464. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  465. Description: req.Description,
  466. Config: models.TrainJobVersionUserImageConfig{
  467. WorkServerNum: req.WorkServerNumber,
  468. AppUrl: req.CodeObsPath,
  469. BootFileUrl: req.BootFileUrl,
  470. DataUrl: req.DataUrl,
  471. TrainUrl: req.TrainUrl,
  472. LogUrl: req.LogUrl,
  473. PoolID: req.PoolID,
  474. Flavor: models.Flavor{
  475. Code: req.Spec.SourceSpecId,
  476. },
  477. Parameter: req.Parameters,
  478. PreVersionId: req.PreVersionId,
  479. UserImageUrl: req.UserImageUrl,
  480. UserCommand: req.UserCommand,
  481. },
  482. }, jobId)
  483. } else {
  484. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  485. Description: req.Description,
  486. Config: models.TrainJobVersionConfig{
  487. WorkServerNum: req.WorkServerNumber,
  488. AppUrl: req.CodeObsPath,
  489. BootFileUrl: req.BootFileUrl,
  490. DataUrl: req.DataUrl,
  491. EngineID: req.EngineID,
  492. TrainUrl: req.TrainUrl,
  493. LogUrl: req.LogUrl,
  494. PoolID: req.PoolID,
  495. Flavor: models.Flavor{
  496. Code: req.Spec.SourceSpecId,
  497. },
  498. Parameter: req.Parameters,
  499. PreVersionId: req.PreVersionId,
  500. },
  501. }, jobId)
  502. }
  503. if createErr != nil {
  504. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  505. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  506. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  507. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  508. JobID: jobId,
  509. VersionID: models.TempVersionId,
  510. Status: models.TempJobStatus,
  511. Type: models.TypeCloudBrainTwo,
  512. JobName: req.JobName,
  513. JobType: string(models.JobTypeTrain),
  514. })
  515. if errTemp != nil {
  516. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  517. return errTemp
  518. }
  519. }
  520. return createErr
  521. }
  522. var jobTypes []string
  523. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  524. repo := ctx.Repo.Repository
  525. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  526. RepoID: repo.ID,
  527. Type: models.TypeCloudBrainTwo,
  528. JobTypes: jobTypes,
  529. JobID: strconv.FormatInt(jobResult.JobID, 10),
  530. })
  531. if createErr != nil {
  532. ctx.ServerError("Cloudbrain", createErr)
  533. return createErr
  534. }
  535. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  536. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  537. Status: TransTrainJobStatus(jobResult.Status),
  538. UserID: ctx.User.ID,
  539. RepoID: ctx.Repo.Repository.ID,
  540. JobID: strconv.FormatInt(jobResult.JobID, 10),
  541. JobName: req.JobName,
  542. DisplayJobName: req.DisplayJobName,
  543. JobType: string(models.JobTypeTrain),
  544. Type: models.TypeCloudBrainTwo,
  545. VersionID: jobResult.VersionID,
  546. VersionName: jobResult.VersionName,
  547. Uuid: req.Uuid,
  548. DatasetName: req.DatasetName,
  549. CommitID: req.CommitID,
  550. IsLatestVersion: req.IsLatestVersion,
  551. PreVersionName: req.PreVersionName,
  552. ComputeResource: models.NPUResource,
  553. EngineID: req.EngineID,
  554. TrainUrl: req.TrainUrl,
  555. BranchName: req.BranchName,
  556. Parameters: req.Params,
  557. BootFile: req.BootFile,
  558. DataUrl: req.DataUrl,
  559. LogUrl: req.LogUrl,
  560. PreVersionId: req.PreVersionId,
  561. FlavorCode: req.Spec.SourceSpecId,
  562. Description: req.Description,
  563. WorkServerNumber: req.WorkServerNumber,
  564. FlavorName: req.FlavorName,
  565. EngineName: req.EngineName,
  566. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  567. VersionCount: VersionListCount + 1,
  568. CreatedUnix: createTime,
  569. UpdatedUnix: createTime,
  570. Spec: req.Spec,
  571. ModelName: req.ModelName,
  572. ModelVersion: req.ModelVersion,
  573. LabelName: req.LabelName,
  574. PreTrainModelUrl: req.PreTrainModelUrl,
  575. CkptName: req.CkptName,
  576. })
  577. if createErr != nil {
  578. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  579. return createErr
  580. }
  581. //将训练任务的上一版本的isLatestVersion设置为"0"
  582. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  583. if createErr != nil {
  584. ctx.ServerError("Update IsLatestVersion failed", createErr)
  585. return createErr
  586. }
  587. return createErr
  588. }
  // TransTrainJobStatus maps a numeric ModelArts train-job status code to its
  // symbolic name. Codes 0 through 21 have names; any other value (including
  // negative ones) is returned as its decimal string.
  func TransTrainJobStatus(status int) string {
  	// Indexed by status code.
  	names := [...]string{
  		0:  "UNKNOWN",
  		1:  "INIT",
  		2:  "IMAGE_CREATING",
  		3:  "IMAGE_FAILED",
  		4:  "SUBMIT_TRYING",
  		5:  "SUBMIT_FAILED",
  		6:  "DELETE_FAILED",
  		7:  "WAITING",
  		8:  "RUNNING",
  		9:  "KILLING",
  		10: "COMPLETED",
  		11: "FAILED",
  		12: "KILLED",
  		13: "CANCELED",
  		14: "LOST",
  		15: "SCALING",
  		16: "SUBMIT_MODEL_FAILED",
  		17: "DEPLOY_SERVICE_FAILED",
  		18: "CHECK_INIT",
  		19: "CHECK_RUNNING",
  		20: "CHECK_RUNNING_COMPLETED",
  		21: "CHECK_FAILED",
  	}
  	if status < 0 || status >= len(names) {
  		return strconv.Itoa(status)
  	}
  	return names[status]
  }
  // GetOutputPathByCount builds a version directory name: the letter "V"
  // followed by TotalVersionCount zero-padded to at least four digits
  // (for example 1 becomes "V0001").
  func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  	return fmt.Sprintf("V%04d", TotalVersionCount)
  }
  644. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
  645. createTime := timeutil.TimeStampNow()
  646. var jobResult *models.CreateTrainJobResult
  647. var createErr error
  648. if req.EngineID < 0 {
  649. jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
  650. JobName: req.JobName,
  651. Description: req.Description,
  652. Config: models.InfUserImageConfig{
  653. WorkServerNum: req.WorkServerNumber,
  654. AppUrl: req.CodeObsPath,
  655. BootFileUrl: req.BootFileUrl,
  656. DataUrl: req.DataUrl,
  657. // TrainUrl: req.TrainUrl,
  658. LogUrl: req.LogUrl,
  659. PoolID: req.PoolID,
  660. CreateVersion: true,
  661. Flavor: models.Flavor{
  662. Code: req.Spec.SourceSpecId,
  663. },
  664. Parameter: req.Parameters,
  665. UserImageUrl: req.UserImageUrl,
  666. UserCommand: req.UserCommand,
  667. },
  668. })
  669. } else {
  670. jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
  671. JobName: req.JobName,
  672. Description: req.Description,
  673. InfConfig: models.InfConfig{
  674. WorkServerNum: req.WorkServerNumber,
  675. AppUrl: req.CodeObsPath,
  676. BootFileUrl: req.BootFileUrl,
  677. DataUrl: req.DataUrl,
  678. EngineID: req.EngineID,
  679. // TrainUrl: req.TrainUrl,
  680. LogUrl: req.LogUrl,
  681. PoolID: req.PoolID,
  682. CreateVersion: true,
  683. Flavor: models.Flavor{
  684. Code: req.Spec.SourceSpecId,
  685. },
  686. Parameter: req.Parameters,
  687. },
  688. })
  689. }
  690. if createErr != nil {
  691. log.Error("createInferenceJob failed: %v", err.Error())
  692. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  693. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  694. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  695. JobID: models.TempJobId,
  696. VersionID: models.TempVersionId,
  697. Status: models.TempJobStatus,
  698. Type: models.TypeCloudBrainTwo,
  699. JobName: req.JobName,
  700. JobType: req.JobType,
  701. })
  702. if err != nil {
  703. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  704. return err
  705. }
  706. }
  707. return err
  708. }
  709. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  710. // if err != nil {
  711. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  712. // return err
  713. // }
  714. jobID := strconv.FormatInt(jobResult.JobID, 10)
  715. err = models.CreateCloudbrain(&models.Cloudbrain{
  716. Status: TransTrainJobStatus(jobResult.Status),
  717. UserID: ctx.User.ID,
  718. RepoID: ctx.Repo.Repository.ID,
  719. JobID: jobID,
  720. JobName: req.JobName,
  721. DisplayJobName: req.DisplayJobName,
  722. JobType: req.JobType,
  723. Type: models.TypeCloudBrainTwo,
  724. VersionID: jobResult.VersionID,
  725. VersionName: jobResult.VersionName,
  726. Uuid: req.Uuid,
  727. DatasetName: req.DatasetName,
  728. CommitID: req.CommitID,
  729. EngineID: req.EngineID,
  730. TrainUrl: req.TrainUrl,
  731. BranchName: req.BranchName,
  732. Parameters: req.Params,
  733. BootFile: req.BootFile,
  734. DataUrl: req.DataUrl,
  735. LogUrl: req.LogUrl,
  736. FlavorCode: req.Spec.SourceSpecId,
  737. Description: req.Description,
  738. WorkServerNumber: req.WorkServerNumber,
  739. FlavorName: req.FlavorName,
  740. EngineName: req.EngineName,
  741. LabelName: req.LabelName,
  742. IsLatestVersion: req.IsLatestVersion,
  743. ComputeResource: models.NPUResource,
  744. VersionCount: req.VersionCount,
  745. TotalVersionCount: req.TotalVersionCount,
  746. ModelName: req.ModelName,
  747. ModelVersion: req.ModelVersion,
  748. CkptName: req.CkptName,
  749. ResultUrl: req.ResultUrl,
  750. CreatedUnix: createTime,
  751. UpdatedUnix: createTime,
  752. Spec: req.Spec,
  753. })
  754. if err != nil {
  755. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  756. return err
  757. }
  758. if req.JobType == string(models.JobTypeModelSafety) {
  759. task, err := models.GetCloudbrainByJobID(jobID)
  760. if err == nil {
  761. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
  762. }
  763. } else {
  764. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  765. }
  766. return nil
  767. }
  768. func GetNotebookImageName(imageId string) (string, error) {
  769. var validImage = false
  770. var imageName = ""
  771. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  772. if imageInfo.Id == imageId {
  773. validImage = true
  774. imageName = imageInfo.Value
  775. }
  776. }
  777. if !validImage {
  778. log.Error("the image id(%s) is invalid", imageId)
  779. return imageName, errors.New("the image id is invalid")
  780. }
  781. return imageName, nil
  782. }
  783. func InitSpecialPool() {
  784. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  785. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  786. }
  787. }
  788. func InitMultiNode() {
  789. if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" {
  790. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  791. }
  792. }
  793. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  794. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  795. if err != nil {
  796. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  797. return err
  798. }
  799. if result != nil {
  800. oldStatus := task.Status
  801. task.Status = TransTrainJobStatus(result.IntStatus)
  802. task.Duration = result.Duration / 1000
  803. task.TrainJobDuration = result.TrainJobDuration
  804. if task.StartTime == 0 && result.StartTime > 0 {
  805. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  806. }
  807. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  808. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  809. task.EndTime = task.StartTime.Add(task.Duration)
  810. }
  811. task.CorrectCreateUnix()
  812. if oldStatus != task.Status {
  813. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  814. }
  815. err = models.UpdateJob(task)
  816. if err != nil {
  817. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  818. return err
  819. }
  820. }
  821. return nil
  822. }
  823. func HandleNotebookInfo(task *models.Cloudbrain) error {
  824. var result *models.GetNotebook2Result
  825. var err error
  826. if task.Type == models.TypeCloudBrainTwo {
  827. result, err = GetNotebook2(task.JobID)
  828. } else if task.Type == models.TypeCDCenter {
  829. result, err = modelarts_cd.GetNotebook(task.JobID)
  830. }
  831. if err != nil {
  832. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  833. return err
  834. }
  835. if result != nil {
  836. oldStatus := task.Status
  837. task.Status = result.Status
  838. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  839. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  840. }
  841. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  842. task.EndTime = timeutil.TimeStampNow()
  843. }
  844. task.CorrectCreateUnix()
  845. task.ComputeAndSetDuration()
  846. if oldStatus != task.Status {
  847. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  848. }
  849. if task.FlavorCode == "" {
  850. task.FlavorCode = result.Flavor
  851. }
  852. err = models.UpdateJob(task)
  853. if err != nil {
  854. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  855. return err
  856. }
  857. }
  858. return nil
  859. }
  860. func SyncTempStatusJob() {
  861. jobs, err := models.GetCloudBrainTempJobs()
  862. if err != nil {
  863. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  864. return
  865. }
  866. for _, temp := range jobs {
  867. log.Info("start to handle record: %s", temp.JobName)
  868. if temp.Type == models.TypeCloudBrainTwo {
  869. if temp.JobType == string(models.JobTypeDebug) {
  870. err = handleNotebook(temp)
  871. if err != nil {
  872. log.Error("handleNotebook falied:%v", err)
  873. break
  874. }
  875. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  876. _, err = models.GetCloudbrainByJobID(temp.JobID)
  877. if err != nil {
  878. //one version
  879. err = handleTrainJob(temp)
  880. if err != nil {
  881. log.Error("handleTrainJob falied:%v", err)
  882. break
  883. }
  884. } else {
  885. //multi version
  886. err = handleTrainJobMultiVersion(temp)
  887. if err != nil {
  888. log.Error("handleTrainJobMultiVersion falied:%v", err)
  889. break
  890. }
  891. }
  892. }
  893. }
  894. }
  895. return
  896. }
  897. func handleNotebook(temp *models.CloudbrainTemp) error {
  898. if temp.Status == models.TempJobStatus {
  899. err := handleTempNotebook(temp)
  900. if err != nil {
  901. log.Error("handleTempNotebook failed:%v", err)
  902. return err
  903. }
  904. } else if temp.Status == string(models.ModelArtsStopping) {
  905. res, err := GetNotebook2(temp.JobID)
  906. if err != nil {
  907. log.Error("GetNotebook2 failed:%v", err)
  908. return err
  909. }
  910. temp.Status = res.Status
  911. if temp.Status == string(models.ModelArtsStopped) {
  912. err = models.UpdateCloudbrainTemp(temp)
  913. if err != nil {
  914. log.Error("UpdateCloudbrainTemp failed:%v", err)
  915. return err
  916. }
  917. _, err := DelNotebook2(temp.JobID)
  918. if err != nil {
  919. log.Error("DelNotebook2 failed:%v", err)
  920. return err
  921. }
  922. temp.Status = string(models.ModelArtsDeleted)
  923. err = models.UpdateCloudbrainTemp(temp)
  924. if err != nil {
  925. log.Error("UpdateCloudbrainTemp failed:%v", err)
  926. return err
  927. }
  928. }
  929. }
  930. return nil
  931. }
  932. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  933. var err error
  934. var isExist bool
  935. for {
  936. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  937. if err != nil {
  938. log.Error("GetNotebookList failed:%v", err)
  939. break
  940. }
  941. temp.QueryTimes++
  942. err = models.UpdateCloudbrainTemp(temp)
  943. if err != nil {
  944. log.Error("UpdateCloudbrainTemp failed:%v", err)
  945. }
  946. if result != nil {
  947. for _, notebook := range result.NotebookList {
  948. if temp.JobID == models.TempJobId {
  949. //new notebook
  950. if notebook.JobName == temp.JobName {
  951. isExist = true
  952. temp.Status = notebook.Status
  953. temp.JobID = notebook.JobID
  954. break
  955. }
  956. } else {
  957. //restart: always can find one record
  958. if notebook.JobName == temp.JobName {
  959. if notebook.Status != string(models.ModelArtsStopped) {
  960. isExist = true
  961. temp.Status = notebook.Status
  962. temp.JobID = notebook.JobID
  963. break
  964. }
  965. }
  966. }
  967. }
  968. if isExist {
  969. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  970. if temp.Status == string(models.ModelArtsCreateFailed) {
  971. err = models.UpdateCloudbrainTemp(temp)
  972. if err != nil {
  973. log.Error("UpdateCloudbrainTemp failed:%v", err)
  974. break
  975. }
  976. _, err := DelNotebook2(temp.JobID)
  977. if err != nil {
  978. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  979. break
  980. }
  981. temp.Status = string(models.ModelArtsDeleted)
  982. } else {
  983. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  984. if err != nil {
  985. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  986. break
  987. }
  988. temp.Status = string(models.ModelArtsStopping)
  989. }
  990. models.UpdateCloudbrainTemp(temp)
  991. } else {
  992. log.Error("can not find the record(%s) till now", temp.JobName)
  993. err = errors.New("not found")
  994. break
  995. }
  996. } else {
  997. log.Error("can not find the record(%s) till now", temp.JobName)
  998. err = errors.New("not found")
  999. break
  1000. }
  1001. break
  1002. }
  1003. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1004. log.Info("reach MaxTempQueryTimes, set the job failed")
  1005. temp.Status = string(models.ModelArtsTrainJobFailed)
  1006. err = models.UpdateCloudbrainTemp(temp)
  1007. if err != nil {
  1008. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1009. return err
  1010. }
  1011. }
  1012. return err
  1013. }
  1014. func handleTrainJob(temp *models.CloudbrainTemp) error {
  1015. if temp.Status == models.TempJobStatus {
  1016. err := handleTempTrainJob(temp)
  1017. if err != nil {
  1018. log.Error("handleTempTrainJob failed:%v", err)
  1019. return err
  1020. }
  1021. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1022. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1023. if err != nil {
  1024. log.Error("GetTrainJob failed:%v", err)
  1025. return err
  1026. }
  1027. temp.Status = TransTrainJobStatus(res.IntStatus)
  1028. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1029. err = models.UpdateCloudbrainTemp(temp)
  1030. if err != nil {
  1031. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1032. return err
  1033. }
  1034. _, err := DelTrainJob(temp.JobID)
  1035. if err != nil {
  1036. log.Error("DelTrainJob failed:%v", err)
  1037. return err
  1038. }
  1039. temp.Status = string(models.ModelArtsDeleted)
  1040. err = models.UpdateCloudbrainTemp(temp)
  1041. if err != nil {
  1042. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1043. return err
  1044. }
  1045. }
  1046. }
  1047. return nil
  1048. }
  1049. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1050. if temp.Status == models.TempJobStatus {
  1051. err := handleTempTrainJobMultiVersion(temp)
  1052. if err != nil {
  1053. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  1054. return err
  1055. }
  1056. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1057. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1058. if err != nil {
  1059. log.Error("GetTrainJob failed:%v", err)
  1060. return err
  1061. }
  1062. temp.Status = TransTrainJobStatus(res.IntStatus)
  1063. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1064. err = models.UpdateCloudbrainTemp(temp)
  1065. if err != nil {
  1066. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1067. return err
  1068. }
  1069. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1070. if err != nil {
  1071. log.Error("DelTrainJob failed:%v", err)
  1072. return err
  1073. }
  1074. temp.Status = string(models.ModelArtsDeleted)
  1075. err = models.UpdateCloudbrainTemp(temp)
  1076. if err != nil {
  1077. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1078. return err
  1079. }
  1080. }
  1081. }
  1082. return nil
  1083. }
  1084. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1085. var err error
  1086. var isExist bool
  1087. for {
  1088. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1089. if err != nil {
  1090. log.Error("GetTrainJobVersionList failed:%v", err)
  1091. break
  1092. }
  1093. temp.QueryTimes++
  1094. err = models.UpdateCloudbrainTemp(temp)
  1095. if err != nil {
  1096. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1097. }
  1098. if result != nil {
  1099. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1100. if result.VersionCount == int64(count+1) {
  1101. isExist = true
  1102. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1103. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1104. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1105. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1106. if err != nil {
  1107. log.Error("StopTrainJob failed:%v", err)
  1108. break
  1109. }
  1110. temp.Status = string(models.ModelArtsTrainJobKilling)
  1111. err = models.UpdateCloudbrainTemp(temp)
  1112. if err != nil {
  1113. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1114. break
  1115. }
  1116. } else {
  1117. log.Error("can not find the record(%s) till now", temp.JobName)
  1118. err = errors.New("not found")
  1119. break
  1120. }
  1121. }
  1122. break
  1123. }
  1124. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1125. log.Info("reach MaxTempQueryTimes, set the job failed")
  1126. temp.Status = string(models.ModelArtsTrainJobFailed)
  1127. err = models.UpdateCloudbrainTemp(temp)
  1128. if err != nil {
  1129. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1130. return err
  1131. }
  1132. }
  1133. return err
  1134. }
  1135. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1136. var err error
  1137. var isExist bool
  1138. for {
  1139. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1140. if err != nil {
  1141. log.Error("GetTrainJobList failed:%v", err)
  1142. break
  1143. }
  1144. temp.QueryTimes++
  1145. err = models.UpdateCloudbrainTemp(temp)
  1146. if err != nil {
  1147. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1148. }
  1149. if result != nil {
  1150. for _, job := range result.JobList {
  1151. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1152. isExist = true
  1153. temp.Status = TransTrainJobStatus(job.IntStatus)
  1154. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1155. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1156. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1157. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1158. if err != nil {
  1159. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1160. break
  1161. }
  1162. temp.Status = string(models.ModelArtsTrainJobKilling)
  1163. err = models.UpdateCloudbrainTemp(temp)
  1164. if err != nil {
  1165. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1166. break
  1167. }
  1168. }
  1169. }
  1170. if !isExist {
  1171. log.Error("can not find the record(%s) till now", temp.JobName)
  1172. err = errors.New("not found")
  1173. break
  1174. }
  1175. }
  1176. break
  1177. }
  1178. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1179. log.Info("reach MaxTempQueryTimes, set the job failed")
  1180. temp.Status = string(models.ModelArtsTrainJobFailed)
  1181. err = models.UpdateCloudbrainTemp(temp)
  1182. if err != nil {
  1183. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1184. return err
  1185. }
  1186. }
  1187. return err
  1188. }