You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 19 kB

3 years ago
2 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
2 years ago
4 years ago
4 years ago
2 years ago
4 years ago
2 years ago
4 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662
  1. // Copyright 2016 The Gogs Authors. All rights reserved.
  2. // Copyright 2018 The Gitea Authors. All rights reserved.
  3. // Use of this source code is governed by a MIT-style
  4. // license that can be found in the LICENSE file.
  5. package repo
import (
	"encoding/json"
	"fmt"
	"net/http"
	"path"
	"strconv"
	"strings"

	"code.gitea.io/gitea/models"
	"code.gitea.io/gitea/modules/cloudbrain"
	"code.gitea.io/gitea/modules/context"
	"code.gitea.io/gitea/modules/grampus"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/modelarts"
	"code.gitea.io/gitea/modules/notification"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/storage"
	"code.gitea.io/gitea/modules/timeutil"
	routerRepo "code.gitea.io/gitea/routers/repo"
)
  24. func GetModelArtsNotebook(ctx *context.APIContext) {
  25. var (
  26. err error
  27. )
  28. jobID := ctx.Params(":jobid")
  29. repoID := ctx.Repo.Repository.ID
  30. job, err := models.GetRepoCloudBrainByJobID(repoID, jobID)
  31. if err != nil {
  32. ctx.NotFound(err)
  33. return
  34. }
  35. result, err := modelarts.GetJob(jobID)
  36. if err != nil {
  37. ctx.NotFound(err)
  38. return
  39. }
  40. oldStatus := job.Status
  41. job.Status = result.Status
  42. if oldStatus != result.Status {
  43. notification.NotifyChangeCloudbrainStatus(job, oldStatus)
  44. }
  45. err = models.UpdateJob(job)
  46. if err != nil {
  47. log.Error("UpdateJob failed:", err)
  48. }
  49. ctx.JSON(http.StatusOK, map[string]interface{}{
  50. "JobID": jobID,
  51. "JobStatus": result.Status,
  52. })
  53. }
  54. func GetModelArtsNotebook2(ctx *context.APIContext) {
  55. var (
  56. err error
  57. )
  58. ID := ctx.Params(":id")
  59. job, err := models.GetCloudbrainByID(ID)
  60. if err != nil {
  61. ctx.NotFound(err)
  62. return
  63. }
  64. result, err := modelarts.GetNotebook2(job.JobID)
  65. if err != nil {
  66. ctx.NotFound(err)
  67. return
  68. }
  69. if job.StartTime == 0 && result.Lease.UpdateTime > 0 {
  70. job.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  71. }
  72. oldStatus := job.Status
  73. job.Status = result.Status
  74. if job.EndTime == 0 && models.IsModelArtsDebugJobTerminal(job.Status) {
  75. job.EndTime = timeutil.TimeStampNow()
  76. }
  77. job.CorrectCreateUnix()
  78. job.ComputeAndSetDuration()
  79. if oldStatus != result.Status {
  80. notification.NotifyChangeCloudbrainStatus(job, oldStatus)
  81. }
  82. err = models.UpdateJob(job)
  83. if err != nil {
  84. log.Error("UpdateJob failed:", err)
  85. }
  86. ctx.JSON(http.StatusOK, map[string]interface{}{
  87. "ID": ID,
  88. "JobName": job.JobName,
  89. "JobStatus": result.Status,
  90. })
  91. }
  92. func GetModelArtsTrainJob(ctx *context.APIContext) {
  93. var (
  94. err error
  95. )
  96. jobID := ctx.Params(":jobid")
  97. repoID := ctx.Repo.Repository.ID
  98. job, err := models.GetRepoCloudBrainByJobID(repoID, jobID)
  99. if err != nil {
  100. ctx.NotFound(err)
  101. return
  102. }
  103. result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10))
  104. if err != nil {
  105. ctx.NotFound(err)
  106. return
  107. }
  108. oldStatus := job.Status
  109. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  110. job.Duration = result.Duration
  111. job.TrainJobDuration = result.TrainJobDuration
  112. if oldStatus != job.Status {
  113. notification.NotifyChangeCloudbrainStatus(job, oldStatus)
  114. }
  115. err = models.UpdateJob(job)
  116. if err != nil {
  117. log.Error("UpdateJob failed:", err)
  118. }
  119. ctx.JSON(http.StatusOK, map[string]interface{}{
  120. "JobID": jobID,
  121. "JobStatus": job.Status,
  122. "JobDuration": job.Duration,
  123. })
  124. }
  125. func GetModelArtsTrainJobVersion(ctx *context.APIContext) {
  126. var (
  127. err error
  128. aiCenterName string
  129. )
  130. jobID := ctx.Params(":jobid")
  131. versionName := ctx.Query("version_name")
  132. job, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  133. if err != nil {
  134. ctx.NotFound(err)
  135. return
  136. }
  137. if job.Type == models.TypeCloudBrainOne {
  138. jobResult, err := cloudbrain.GetJob(job.JobID)
  139. if err != nil {
  140. ctx.NotFound(err)
  141. log.Error("GetJob failed:", err)
  142. return
  143. }
  144. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  145. if err != nil {
  146. ctx.NotFound(err)
  147. log.Error("ConvertToJobResultPayload failed:", err)
  148. return
  149. }
  150. oldStatus := job.Status
  151. job.Status = result.JobStatus.State
  152. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  153. taskRoles := result.TaskRoles
  154. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  155. job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
  156. job.ContainerID = taskRes.TaskStatuses[0].ContainerID
  157. job.Status = taskRes.TaskStatuses[0].State
  158. }
  159. if result.JobStatus.State != string(models.JobWaiting) {
  160. models.ParseAndSetDurationFromCloudBrainOne(result, job)
  161. if oldStatus != job.Status {
  162. notification.NotifyChangeCloudbrainStatus(job, oldStatus)
  163. }
  164. err = models.UpdateJob(job)
  165. if err != nil {
  166. log.Error("UpdateJob failed:", err)
  167. }
  168. }
  169. } else if job.Type == models.TypeCloudBrainTwo {
  170. result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10))
  171. if err != nil {
  172. ctx.NotFound(err)
  173. return
  174. }
  175. if job.StartTime == 0 && result.StartTime > 0 {
  176. job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  177. }
  178. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  179. job.Duration = result.Duration / 1000
  180. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  181. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  182. job.EndTime = job.StartTime.Add(job.Duration)
  183. }
  184. job.CorrectCreateUnix()
  185. err = models.UpdateTrainJobVersion(job)
  186. if err != nil {
  187. log.Error("UpdateJob failed:", err)
  188. }
  189. } else if job.Type == models.TypeC2Net {
  190. result, err := grampus.GetJob(jobID)
  191. if err != nil {
  192. log.Error("GetJob(%s) failed:%v", job.JobName, err)
  193. ctx.NotFound(err)
  194. return
  195. }
  196. if job.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  197. job.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  198. }
  199. job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  200. job.Duration = result.JobInfo.RunSec
  201. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  202. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  203. job.EndTime = job.StartTime.Add(job.Duration)
  204. }
  205. job.CorrectCreateUnix()
  206. if len(job.AiCenter) == 0 {
  207. if len(result.JobInfo.Tasks) > 0 {
  208. if len(result.JobInfo.Tasks[0].CenterID) > 0 && len(result.JobInfo.Tasks[0].CenterName) > 0 {
  209. job.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  210. aiCenterName = result.JobInfo.Tasks[0].CenterName[0]
  211. }
  212. }
  213. } else {
  214. temp := strings.Split(job.AiCenter, "+")
  215. if len(temp) > 1 {
  216. aiCenterName = temp[1]
  217. }
  218. }
  219. err = models.UpdateTrainJobVersion(job)
  220. if err != nil {
  221. log.Error("UpdateJob failed:", err)
  222. }
  223. }
  224. ctx.JSON(http.StatusOK, map[string]interface{}{
  225. "JobID": jobID,
  226. "JobStatus": job.Status,
  227. "JobDuration": job.TrainJobDuration,
  228. "AiCenter": aiCenterName,
  229. })
  230. }
  231. func TrainJobForModelConvertGetLog(ctx *context.APIContext) {
  232. var (
  233. err error
  234. )
  235. var jobID = ctx.Params(":id")
  236. var baseLine = ctx.Query("base_line")
  237. var order = ctx.Query("order")
  238. var lines = ctx.Query("lines")
  239. lines_int, err := strconv.Atoi(lines)
  240. if err != nil {
  241. log.Error("change lines(%d) string to int failed", lines_int)
  242. }
  243. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  244. log.Error("order(%s) check failed", order)
  245. ctx.JSON(http.StatusBadRequest, map[string]interface{}{
  246. "err_msg": "order check failed",
  247. })
  248. return
  249. }
  250. resultLogFile, result, err := trainJobForModelConvertGetLogContent(jobID, baseLine, order, lines_int)
  251. if err != nil {
  252. log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error())
  253. // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  254. ctx.JSON(http.StatusOK, map[string]interface{}{
  255. "JobID": jobID,
  256. "LogFileName": "",
  257. "StartLine": "0",
  258. "EndLine": "0",
  259. "Content": "",
  260. "Lines": 0,
  261. })
  262. return
  263. }
  264. ctx.Data["log_file_name"] = resultLogFile.LogFileList[0]
  265. ctx.JSON(http.StatusOK, map[string]interface{}{
  266. "JobID": jobID,
  267. "LogFileName": resultLogFile.LogFileList[0],
  268. "StartLine": result.StartLine,
  269. "EndLine": result.EndLine,
  270. "Content": result.Content,
  271. "Lines": result.Lines,
  272. })
  273. }
  274. func trainJobForModelConvertGetLogContent(jobID string, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
  275. task, err := models.QueryModelConvertById(jobID)
  276. if err != nil {
  277. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  278. return nil, nil, err
  279. }
  280. resultLogFile, err := modelarts.GetTrainJobLogFileNames(task.CloudBrainTaskId, task.ModelArtsVersionId)
  281. if err != nil {
  282. log.Error("GetTrainJobLogFileNames(%s) failed:%v", task.CloudBrainTaskId, err.Error())
  283. return nil, nil, err
  284. }
  285. result, err := modelarts.GetTrainJobLog(task.CloudBrainTaskId, task.ModelArtsVersionId, baseLine, resultLogFile.LogFileList[0], order, lines)
  286. if err != nil {
  287. log.Error("GetTrainJobLog(%s) failed:%v", task.CloudBrainTaskId, err.Error())
  288. return nil, nil, err
  289. }
  290. return resultLogFile, result, err
  291. }
  292. func TrainJobGetLog(ctx *context.APIContext) {
  293. var (
  294. err error
  295. )
  296. var jobID = ctx.Params(":jobid")
  297. var versionName = ctx.Query("version_name")
  298. var baseLine = ctx.Query("base_line")
  299. var order = ctx.Query("order")
  300. var lines = ctx.Query("lines")
  301. lines_int, err := strconv.Atoi(lines)
  302. if err != nil {
  303. log.Error("change lines(%d) string to int failed", lines_int)
  304. }
  305. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  306. log.Error("order(%s) check failed", order)
  307. ctx.JSON(http.StatusBadRequest, map[string]interface{}{
  308. "err_msg": "order check failed",
  309. })
  310. return
  311. }
  312. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  313. if err != nil {
  314. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  315. return
  316. }
  317. resultLogFile, result, err := trainJobGetLogContent(jobID, task.VersionID, baseLine, order, lines_int)
  318. if err != nil {
  319. log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error())
  320. // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  321. return
  322. }
  323. prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job"
  324. _, err = storage.GetObsLogFileName(prefix)
  325. var canLogDownload bool
  326. if err != nil {
  327. canLogDownload = false
  328. } else {
  329. canLogDownload = true
  330. }
  331. ctx.Data["log_file_name"] = resultLogFile.LogFileList[0]
  332. ctx.JSON(http.StatusOK, map[string]interface{}{
  333. "JobID": jobID,
  334. "LogFileName": resultLogFile.LogFileList[0],
  335. "StartLine": result.StartLine,
  336. "EndLine": result.EndLine,
  337. "Content": result.Content,
  338. "Lines": result.Lines,
  339. "CanLogDownload": canLogDownload,
  340. })
  341. }
  342. func trainJobGetLogContent(jobID string, versionID int64, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
  343. resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(versionID, 10))
  344. if err != nil {
  345. log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
  346. return nil, nil, err
  347. }
  348. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(versionID, 10), baseLine, resultLogFile.LogFileList[0], order, lines)
  349. if err != nil {
  350. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  351. return nil, nil, err
  352. }
  353. return resultLogFile, result, err
  354. }
  355. func DelTrainJobVersion(ctx *context.APIContext) {
  356. var (
  357. err error
  358. )
  359. var jobID = ctx.Params(":jobid")
  360. var versionName = ctx.Query("version_name")
  361. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  362. if err != nil {
  363. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  364. ctx.NotFound(err)
  365. return
  366. }
  367. //删除modelarts上的记录
  368. _, err = modelarts.DelTrainJobVersion(jobID, strconv.FormatInt(task.VersionID, 10))
  369. if err != nil {
  370. log.Error("DelTrainJobVersion(%s) failed:%v", task.JobName, err.Error())
  371. ctx.NotFound(err)
  372. return
  373. }
  374. //删除数据库记录
  375. err = models.DeleteJob(task)
  376. if err != nil {
  377. ctx.ServerError("DeleteJob failed", err)
  378. ctx.NotFound(err)
  379. return
  380. }
  381. //获取删除后的版本数量
  382. var jobTypes []string
  383. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  384. repo := ctx.Repo.Repository
  385. VersionTaskList, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  386. RepoID: repo.ID,
  387. Type: models.TypeCloudBrainTwo,
  388. JobTypes: jobTypes,
  389. JobID: jobID,
  390. })
  391. if err != nil {
  392. ctx.ServerError("get VersionListCount failed", err)
  393. return
  394. }
  395. if VersionListCount > 0 {
  396. // 判断当前删掉的任务是否是最新版本,若是,将排序后的TotalVersionCount置为删掉的最新版本的TotalVersionCount,若不是,按时间排序后的版本列表的第一个版本设置为最新版本,TotalVersionCount不变
  397. if task.IsLatestVersion == modelarts.IsLatestVersion {
  398. err = models.SetVersionCountAndLatestVersion(jobID, VersionTaskList[0].Cloudbrain.VersionName, VersionListCount, modelarts.IsLatestVersion, task.TotalVersionCount)
  399. if err != nil {
  400. ctx.ServerError("UpdateJobVersionCount failed", err)
  401. return
  402. }
  403. } else {
  404. err = models.SetVersionCountAndLatestVersion(jobID, VersionTaskList[0].VersionName, VersionListCount, modelarts.IsLatestVersion, VersionTaskList[0].Cloudbrain.TotalVersionCount)
  405. if err != nil {
  406. ctx.ServerError("UpdateJobVersionCount failed", err)
  407. return
  408. }
  409. }
  410. } else { //已删除该任务下的所有版本
  411. routerRepo.DeleteJobStorage(task.JobName)
  412. }
  413. ctx.JSON(http.StatusOK, map[string]interface{}{
  414. "JobID": jobID,
  415. "VersionName": versionName,
  416. "StatusOK": 0,
  417. "VersionListCount": VersionListCount,
  418. })
  419. }
  420. func StopTrainJobVersion(ctx *context.APIContext) {
  421. var (
  422. err error
  423. )
  424. var jobID = ctx.Params(":jobid")
  425. var versionName = ctx.Query("version_name")
  426. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  427. if err != nil {
  428. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  429. return
  430. }
  431. _, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
  432. if err != nil {
  433. log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
  434. return
  435. }
  436. ctx.JSON(http.StatusOK, map[string]interface{}{
  437. "JobID": jobID,
  438. "VersionName": versionName,
  439. "StatusOK": 0,
  440. })
  441. }
  442. func ModelList(ctx *context.APIContext) {
  443. var (
  444. err error
  445. )
  446. var jobID = ctx.Params(":jobid")
  447. var versionName = ctx.Query("version_name")
  448. parentDir := ctx.Query("parentDir")
  449. dirArray := strings.Split(parentDir, "/")
  450. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  451. if err != nil {
  452. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  453. return
  454. }
  455. var fileInfos []storage.FileInfo
  456. if task.ComputeResource == models.NPUResource {
  457. fileInfos, err = storage.GetObsListObject(task.JobName, "output/", parentDir, versionName)
  458. if err != nil {
  459. log.Info("get TrainJobListModel failed:", err)
  460. ctx.ServerError("GetObsListObject:", err)
  461. return
  462. }
  463. } else if task.ComputeResource == models.GPUResource {
  464. files, err := routerRepo.GetModelDirs(task.JobName, parentDir)
  465. if err != nil {
  466. log.Info("GetModelDirs failed:", err)
  467. ctx.ServerError("GetModelDirs:", err)
  468. return
  469. }
  470. err = json.Unmarshal([]byte(files), &fileInfos)
  471. if err != nil {
  472. log.Error("json.Unmarshal failed:%v", err.Error(), ctx.Data["msgID"])
  473. ctx.ServerError("json.Unmarshal failed:", err)
  474. return
  475. }
  476. }
  477. ctx.JSON(http.StatusOK, map[string]interface{}{
  478. "JobID": jobID,
  479. "VersionName": versionName,
  480. "StatusOK": 0,
  481. "Path": dirArray,
  482. "Dirs": fileInfos,
  483. "task": task,
  484. "PageIsCloudBrain": true,
  485. })
  486. }
  487. func GetModelArtsInferenceJob(ctx *context.APIContext) {
  488. var (
  489. err error
  490. )
  491. jobID := ctx.Params(":jobid")
  492. job, err := models.GetCloudbrainByJobID(jobID)
  493. if err != nil {
  494. ctx.NotFound(err)
  495. return
  496. }
  497. result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10))
  498. if err != nil {
  499. ctx.NotFound(err)
  500. return
  501. }
  502. if job.StartTime == 0 && result.StartTime > 0 {
  503. job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  504. }
  505. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  506. job.Duration = result.Duration / 1000
  507. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  508. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  509. job.EndTime = job.StartTime.Add(job.Duration)
  510. }
  511. job.CorrectCreateUnix()
  512. err = models.UpdateInferenceJob(job)
  513. if err != nil {
  514. log.Error("UpdateJob failed:", err)
  515. }
  516. ctx.JSON(http.StatusOK, map[string]interface{}{
  517. "JobID": jobID,
  518. "JobStatus": job.Status,
  519. "JobDuration": job.TrainJobDuration,
  520. })
  521. }
  522. func ResultList(ctx *context.APIContext) {
  523. var (
  524. err error
  525. )
  526. var jobID = ctx.Params(":jobid")
  527. var versionName = ctx.Query("version_name")
  528. parentDir := ctx.Query("parentDir")
  529. dirArray := strings.Split(parentDir, "/")
  530. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  531. if err != nil {
  532. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  533. return
  534. }
  535. models, err := storage.GetObsListObject(task.JobName, "result/", parentDir, versionName)
  536. if err != nil {
  537. log.Info("get TrainJobListModel failed:", err)
  538. ctx.ServerError("GetObsListObject:", err)
  539. return
  540. }
  541. ctx.JSON(http.StatusOK, map[string]interface{}{
  542. "JobID": jobID,
  543. "VersionName": versionName,
  544. "StatusOK": 0,
  545. "Path": dirArray,
  546. "Dirs": models,
  547. "task": task,
  548. "PageIsCloudBrain": true,
  549. })
  550. }
  551. func TrainJobGetMetricStatistic(ctx *context.APIContext) {
  552. var (
  553. err error
  554. )
  555. var jobID = ctx.Params(":jobid")
  556. var versionName = ctx.Query("version_name")
  557. result, err := trainJobGetMetricStatistic(jobID, versionName)
  558. if err != nil {
  559. log.Error("trainJobGetMetricStatistic(%s) failed:%v", jobID, err.Error())
  560. return
  561. }
  562. ctx.JSON(http.StatusOK, map[string]interface{}{
  563. "JobID": jobID,
  564. "Interval": result.Interval,
  565. "MetricsInfo": result.MetricsInfo,
  566. })
  567. }
  568. func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTrainJobMetricStatisticResult, error) {
  569. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  570. if err != nil {
  571. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  572. return nil, err
  573. }
  574. resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
  575. if err != nil {
  576. log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
  577. return nil, err
  578. }
  579. result, err := modelarts.GetTrainJobMetricStatistic(jobID, strconv.FormatInt(task.VersionID, 10), resultLogFile.LogFileList[0])
  580. if err != nil {
  581. log.Error("GetTrainJobMetricStatistic(%s) failed:%v", jobID, err.Error())
  582. return nil, err
  583. }
  584. return result, err
  585. }