You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 16 kB

3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
2 years ago
4 years ago
4 years ago
2 years ago
4 years ago
2 years ago
4 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. // Copyright 2016 The Gogs Authors. All rights reserved.
  2. // Copyright 2018 The Gitea Authors. All rights reserved.
  3. // Use of this source code is governed by a MIT-style
  4. // license that can be found in the LICENSE file.
  5. package repo
import (
	"encoding/json"
	"errors"
	"net/http"
	"path"
	"strconv"
	"strings"

	"code.gitea.io/gitea/models"
	"code.gitea.io/gitea/modules/cloudbrain"
	"code.gitea.io/gitea/modules/context"
	"code.gitea.io/gitea/modules/grampus"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/modelarts"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/storage"
	"code.gitea.io/gitea/modules/timeutil"
	routerRepo "code.gitea.io/gitea/routers/repo"
)
  23. func GetModelArtsNotebook(ctx *context.APIContext) {
  24. var (
  25. err error
  26. )
  27. jobID := ctx.Params(":jobid")
  28. repoID := ctx.Repo.Repository.ID
  29. job, err := models.GetRepoCloudBrainByJobID(repoID, jobID)
  30. if err != nil {
  31. ctx.NotFound(err)
  32. return
  33. }
  34. result, err := modelarts.GetJob(jobID)
  35. if err != nil {
  36. ctx.NotFound(err)
  37. return
  38. }
  39. job.Status = result.Status
  40. err = models.UpdateJob(job)
  41. if err != nil {
  42. log.Error("UpdateJob failed:", err)
  43. }
  44. ctx.JSON(http.StatusOK, map[string]interface{}{
  45. "JobID": jobID,
  46. "JobStatus": result.Status,
  47. })
  48. }
  49. func GetModelArtsNotebook2(ctx *context.APIContext) {
  50. var (
  51. err error
  52. )
  53. ID := ctx.Params(":id")
  54. job, err := models.GetCloudbrainByID(ID)
  55. if err != nil {
  56. ctx.NotFound(err)
  57. return
  58. }
  59. result, err := modelarts.GetNotebook2(job.JobID)
  60. if err != nil {
  61. ctx.NotFound(err)
  62. return
  63. }
  64. if job.StartTime == 0 && result.Lease.UpdateTime > 0 {
  65. job.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  66. }
  67. job.Status = result.Status
  68. if job.EndTime == 0 && models.IsModelArtsDebugJobTerminal(job.Status) {
  69. job.EndTime = timeutil.TimeStampNow()
  70. }
  71. job.CorrectCreateUnix()
  72. job.ComputeAndSetDuration()
  73. err = models.UpdateJob(job)
  74. if err != nil {
  75. log.Error("UpdateJob failed:", err)
  76. }
  77. ctx.JSON(http.StatusOK, map[string]interface{}{
  78. "ID": ID,
  79. "JobName": job.JobName,
  80. "JobStatus": result.Status,
  81. })
  82. }
  83. func GetModelArtsTrainJob(ctx *context.APIContext) {
  84. var (
  85. err error
  86. )
  87. jobID := ctx.Params(":jobid")
  88. repoID := ctx.Repo.Repository.ID
  89. job, err := models.GetRepoCloudBrainByJobID(repoID, jobID)
  90. if err != nil {
  91. ctx.NotFound(err)
  92. return
  93. }
  94. result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10))
  95. if err != nil {
  96. ctx.NotFound(err)
  97. return
  98. }
  99. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  100. job.Duration = result.Duration
  101. job.TrainJobDuration = result.TrainJobDuration
  102. err = models.UpdateJob(job)
  103. if err != nil {
  104. log.Error("UpdateJob failed:", err)
  105. }
  106. ctx.JSON(http.StatusOK, map[string]interface{}{
  107. "JobID": jobID,
  108. "JobStatus": job.Status,
  109. "JobDuration": job.Duration,
  110. })
  111. }
  112. func GetModelArtsTrainJobVersion(ctx *context.APIContext) {
  113. var (
  114. err error
  115. aiCenterName string
  116. )
  117. jobID := ctx.Params(":jobid")
  118. versionName := ctx.Query("version_name")
  119. job, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  120. if err != nil {
  121. ctx.NotFound(err)
  122. return
  123. }
  124. if job.Type == models.TypeCloudBrainOne {
  125. jobResult, err := cloudbrain.GetJob(job.JobID)
  126. if err != nil {
  127. ctx.NotFound(err)
  128. log.Error("GetJob failed:", err)
  129. return
  130. }
  131. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  132. if err != nil {
  133. ctx.NotFound(err)
  134. log.Error("ConvertToJobResultPayload failed:", err)
  135. return
  136. }
  137. job.Status = result.JobStatus.State
  138. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  139. taskRoles := result.TaskRoles
  140. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  141. job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
  142. job.ContainerID = taskRes.TaskStatuses[0].ContainerID
  143. job.Status = taskRes.TaskStatuses[0].State
  144. }
  145. if result.JobStatus.State != string(models.JobWaiting) {
  146. models.ParseAndSetDurationFromCloudBrainOne(result, job)
  147. err = models.UpdateJob(job)
  148. if err != nil {
  149. log.Error("UpdateJob failed:", err)
  150. }
  151. }
  152. } else if job.Type == models.TypeCloudBrainTwo {
  153. result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10))
  154. if err != nil {
  155. ctx.NotFound(err)
  156. return
  157. }
  158. if job.StartTime == 0 && result.StartTime > 0 {
  159. job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  160. }
  161. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  162. job.Duration = result.Duration / 1000
  163. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  164. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  165. job.EndTime = job.StartTime.Add(job.Duration)
  166. }
  167. job.CorrectCreateUnix()
  168. err = models.UpdateTrainJobVersion(job)
  169. if err != nil {
  170. log.Error("UpdateJob failed:", err)
  171. }
  172. } else if job.Type == models.TypeC2Net {
  173. result, err := grampus.GetJob(jobID)
  174. if err != nil {
  175. log.Error("GetJob(%s) failed:%v", job.JobName, err)
  176. ctx.NotFound(err)
  177. return
  178. }
  179. if job.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  180. job.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  181. }
  182. job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  183. job.Duration = result.JobInfo.RunSec
  184. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  185. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  186. job.EndTime = job.StartTime.Add(job.Duration)
  187. }
  188. job.CorrectCreateUnix()
  189. if len(job.AiCenter) == 0 {
  190. if len(result.JobInfo.Tasks) > 0 {
  191. if len(result.JobInfo.Tasks[0].CenterID) > 0 && len(result.JobInfo.Tasks[0].CenterName) > 0 {
  192. job.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  193. aiCenterName = result.JobInfo.Tasks[0].CenterName[0]
  194. }
  195. }
  196. } else {
  197. temp := strings.Split(job.AiCenter, "+")
  198. if len(temp) > 1 {
  199. aiCenterName = temp[1]
  200. }
  201. }
  202. err = models.UpdateTrainJobVersion(job)
  203. if err != nil {
  204. log.Error("UpdateJob failed:", err)
  205. }
  206. }
  207. ctx.JSON(http.StatusOK, map[string]interface{}{
  208. "JobID": jobID,
  209. "JobStatus": job.Status,
  210. "JobDuration": job.TrainJobDuration,
  211. "AiCenter": aiCenterName,
  212. })
  213. }
  214. func TrainJobGetLog(ctx *context.APIContext) {
  215. var (
  216. err error
  217. )
  218. var jobID = ctx.Params(":jobid")
  219. var versionName = ctx.Query("version_name")
  220. var baseLine = ctx.Query("base_line")
  221. var order = ctx.Query("order")
  222. var lines = ctx.Query("lines")
  223. lines_int, err := strconv.Atoi(lines)
  224. if err != nil {
  225. log.Error("change lines(%d) string to int failed", lines_int)
  226. }
  227. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  228. log.Error("order(%s) check failed", order)
  229. ctx.JSON(http.StatusBadRequest, map[string]interface{}{
  230. "err_msg": "order check failed",
  231. })
  232. return
  233. }
  234. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  235. if err != nil {
  236. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  237. return
  238. }
  239. resultLogFile, result, err := trainJobGetLogContent(jobID, task.VersionID, baseLine, order, lines_int)
  240. if err != nil {
  241. log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error())
  242. // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  243. return
  244. }
  245. prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job"
  246. _, err = storage.GetObsLogFileName(prefix)
  247. var canLogDownload bool
  248. if err != nil {
  249. canLogDownload = false
  250. } else {
  251. canLogDownload = true
  252. }
  253. ctx.Data["log_file_name"] = resultLogFile.LogFileList[0]
  254. ctx.JSON(http.StatusOK, map[string]interface{}{
  255. "JobID": jobID,
  256. "LogFileName": resultLogFile.LogFileList[0],
  257. "StartLine": result.StartLine,
  258. "EndLine": result.EndLine,
  259. "Content": result.Content,
  260. "Lines": result.Lines,
  261. "CanLogDownload": canLogDownload,
  262. })
  263. }
  264. func trainJobGetLogContent(jobID string, versionID int64, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
  265. resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(versionID, 10))
  266. if err != nil {
  267. log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
  268. return nil, nil, err
  269. }
  270. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(versionID, 10), baseLine, resultLogFile.LogFileList[0], order, lines)
  271. if err != nil {
  272. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  273. return nil, nil, err
  274. }
  275. return resultLogFile, result, err
  276. }
  277. func DelTrainJobVersion(ctx *context.APIContext) {
  278. var (
  279. err error
  280. )
  281. var jobID = ctx.Params(":jobid")
  282. var versionName = ctx.Query("version_name")
  283. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  284. if err != nil {
  285. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  286. ctx.NotFound(err)
  287. return
  288. }
  289. //删除modelarts上的记录
  290. _, err = modelarts.DelTrainJobVersion(jobID, strconv.FormatInt(task.VersionID, 10))
  291. if err != nil {
  292. log.Error("DelTrainJobVersion(%s) failed:%v", task.JobName, err.Error())
  293. ctx.NotFound(err)
  294. return
  295. }
  296. //删除数据库记录
  297. err = models.DeleteJob(task)
  298. if err != nil {
  299. ctx.ServerError("DeleteJob failed", err)
  300. ctx.NotFound(err)
  301. return
  302. }
  303. //获取删除后的版本数量
  304. var jobTypes []string
  305. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  306. repo := ctx.Repo.Repository
  307. VersionTaskList, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  308. RepoID: repo.ID,
  309. Type: models.TypeCloudBrainTwo,
  310. JobTypes: jobTypes,
  311. JobID: jobID,
  312. })
  313. if err != nil {
  314. ctx.ServerError("get VersionListCount failed", err)
  315. return
  316. }
  317. if VersionListCount > 0 {
  318. // 判断当前删掉的任务是否是最新版本,若是,将排序后的TotalVersionCount置为删掉的最新版本的TotalVersionCount,若不是,按时间排序后的版本列表的第一个版本设置为最新版本,TotalVersionCount不变
  319. if task.IsLatestVersion == modelarts.IsLatestVersion {
  320. err = models.SetVersionCountAndLatestVersion(jobID, VersionTaskList[0].Cloudbrain.VersionName, VersionListCount, modelarts.IsLatestVersion, task.TotalVersionCount)
  321. if err != nil {
  322. ctx.ServerError("UpdateJobVersionCount failed", err)
  323. return
  324. }
  325. } else {
  326. err = models.SetVersionCountAndLatestVersion(jobID, VersionTaskList[0].VersionName, VersionListCount, modelarts.IsLatestVersion, VersionTaskList[0].Cloudbrain.TotalVersionCount)
  327. if err != nil {
  328. ctx.ServerError("UpdateJobVersionCount failed", err)
  329. return
  330. }
  331. }
  332. } else { //已删除该任务下的所有版本
  333. routerRepo.DeleteJobStorage(task.JobName)
  334. }
  335. ctx.JSON(http.StatusOK, map[string]interface{}{
  336. "JobID": jobID,
  337. "VersionName": versionName,
  338. "StatusOK": 0,
  339. "VersionListCount": VersionListCount,
  340. })
  341. }
  342. func StopTrainJobVersion(ctx *context.APIContext) {
  343. var (
  344. err error
  345. )
  346. var jobID = ctx.Params(":jobid")
  347. var versionName = ctx.Query("version_name")
  348. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  349. if err != nil {
  350. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  351. return
  352. }
  353. _, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
  354. if err != nil {
  355. log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
  356. return
  357. }
  358. ctx.JSON(http.StatusOK, map[string]interface{}{
  359. "JobID": jobID,
  360. "VersionName": versionName,
  361. "StatusOK": 0,
  362. })
  363. }
  364. func ModelList(ctx *context.APIContext) {
  365. var (
  366. err error
  367. )
  368. var jobID = ctx.Params(":jobid")
  369. var versionName = ctx.Query("version_name")
  370. parentDir := ctx.Query("parentDir")
  371. dirArray := strings.Split(parentDir, "/")
  372. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  373. if err != nil {
  374. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  375. return
  376. }
  377. var fileInfos []storage.FileInfo
  378. if task.ComputeResource == models.NPUResource {
  379. fileInfos, err = storage.GetObsListObject(task.JobName, "output/", parentDir, versionName)
  380. if err != nil {
  381. log.Info("get TrainJobListModel failed:", err)
  382. ctx.ServerError("GetObsListObject:", err)
  383. return
  384. }
  385. } else if task.ComputeResource == models.GPUResource {
  386. files, err := routerRepo.GetModelDirs(task.JobName, parentDir)
  387. if err != nil {
  388. log.Info("GetModelDirs failed:", err)
  389. ctx.ServerError("GetModelDirs:", err)
  390. return
  391. }
  392. err = json.Unmarshal([]byte(files), &fileInfos)
  393. if err != nil {
  394. log.Error("json.Unmarshal failed:%v", err.Error(), ctx.Data["msgID"])
  395. ctx.ServerError("json.Unmarshal failed:", err)
  396. return
  397. }
  398. }
  399. ctx.JSON(http.StatusOK, map[string]interface{}{
  400. "JobID": jobID,
  401. "VersionName": versionName,
  402. "StatusOK": 0,
  403. "Path": dirArray,
  404. "Dirs": fileInfos,
  405. "task": task,
  406. "PageIsCloudBrain": true,
  407. })
  408. }
  409. func GetModelArtsInferenceJob(ctx *context.APIContext) {
  410. var (
  411. err error
  412. )
  413. jobID := ctx.Params(":jobid")
  414. job, err := models.GetCloudbrainByJobID(jobID)
  415. if err != nil {
  416. ctx.NotFound(err)
  417. return
  418. }
  419. result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10))
  420. if err != nil {
  421. ctx.NotFound(err)
  422. return
  423. }
  424. if job.StartTime == 0 && result.StartTime > 0 {
  425. job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  426. }
  427. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  428. job.Duration = result.Duration / 1000
  429. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  430. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  431. job.EndTime = job.StartTime.Add(job.Duration)
  432. }
  433. job.CorrectCreateUnix()
  434. err = models.UpdateInferenceJob(job)
  435. if err != nil {
  436. log.Error("UpdateJob failed:", err)
  437. }
  438. ctx.JSON(http.StatusOK, map[string]interface{}{
  439. "JobID": jobID,
  440. "JobStatus": job.Status,
  441. "JobDuration": job.TrainJobDuration,
  442. })
  443. }
  444. func ResultList(ctx *context.APIContext) {
  445. var (
  446. err error
  447. )
  448. var jobID = ctx.Params(":jobid")
  449. var versionName = ctx.Query("version_name")
  450. parentDir := ctx.Query("parentDir")
  451. dirArray := strings.Split(parentDir, "/")
  452. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  453. if err != nil {
  454. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  455. return
  456. }
  457. models, err := storage.GetObsListObject(task.JobName, "result/", parentDir, versionName)
  458. if err != nil {
  459. log.Info("get TrainJobListModel failed:", err)
  460. ctx.ServerError("GetObsListObject:", err)
  461. return
  462. }
  463. ctx.JSON(http.StatusOK, map[string]interface{}{
  464. "JobID": jobID,
  465. "VersionName": versionName,
  466. "StatusOK": 0,
  467. "Path": dirArray,
  468. "Dirs": models,
  469. "task": task,
  470. "PageIsCloudBrain": true,
  471. })
  472. }
  473. func TrainJobGetMetricStatistic(ctx *context.APIContext) {
  474. var (
  475. err error
  476. )
  477. var jobID = ctx.Params(":jobid")
  478. var versionName = ctx.Query("version_name")
  479. result, err := trainJobGetMetricStatistic(jobID, versionName)
  480. if err != nil {
  481. log.Error("trainJobGetMetricStatistic(%s) failed:%v", jobID, err.Error())
  482. return
  483. }
  484. ctx.JSON(http.StatusOK, map[string]interface{}{
  485. "JobID": jobID,
  486. "Interval": result.Interval,
  487. "MetricsInfo": result.MetricsInfo,
  488. })
  489. }
  490. func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTrainJobMetricStatisticResult, error) {
  491. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  492. if err != nil {
  493. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  494. return nil, err
  495. }
  496. resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
  497. if err != nil {
  498. log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
  499. return nil, err
  500. }
  501. result, err := modelarts.GetTrainJobMetricStatistic(jobID, strconv.FormatInt(task.VersionID, 10), resultLogFile.LogFileList[0])
  502. if err != nil {
  503. log.Error("GetTrainJobMetricStatistic(%s) failed:%v", jobID, err.Error())
  504. return nil, err
  505. }
  506. return result, err
  507. }