You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 96 kB

4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
2 years ago
3 years ago
2 years ago
4 years ago
3 years ago
2 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
2 years ago
3 years ago
4 years ago
3 years ago
2 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
2 years ago
4 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
2 years ago
4 years ago
4 years ago
2 years ago
4 years ago
2 years ago
4 years ago
2 years ago
3 years ago
2 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
2 years ago
3 years ago
4 years ago
2 years ago
3 years ago
4 years ago
2 years ago
4 years ago
3 years ago
4 years ago
2 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
2 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
4 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
4 years ago
2 years ago
4 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
4 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
4 years ago
4 years ago
2 years ago
4 years ago
4 years ago
2 years ago
4 years ago
2 years ago
3 years ago
4 years ago
2 years ago
4 years ago
4 years ago
3 years ago
4 years ago
2 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
4 years ago
2 years ago
4 years ago
3 years ago
4 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
2 years ago
4 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package repo
  2. import (
  3. "archive/zip"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "path"
  12. "strconv"
  13. "strings"
  14. "time"
  15. "unicode/utf8"
  16. cloudbrainService "code.gitea.io/gitea/services/cloudbrain"
  17. "code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
  18. "code.gitea.io/gitea/modules/dataset"
  19. "code.gitea.io/gitea/modules/modelarts_cd"
  20. "code.gitea.io/gitea/services/cloudbrain/resource"
  21. "code.gitea.io/gitea/services/reward/point/account"
  22. "code.gitea.io/gitea/models"
  23. "code.gitea.io/gitea/modules/auth"
  24. "code.gitea.io/gitea/modules/base"
  25. "code.gitea.io/gitea/modules/cloudbrain"
  26. "code.gitea.io/gitea/modules/context"
  27. "code.gitea.io/gitea/modules/git"
  28. "code.gitea.io/gitea/modules/log"
  29. "code.gitea.io/gitea/modules/modelarts"
  30. "code.gitea.io/gitea/modules/notification"
  31. "code.gitea.io/gitea/modules/obs"
  32. "code.gitea.io/gitea/modules/redis/redis_key"
  33. "code.gitea.io/gitea/modules/redis/redis_lock"
  34. "code.gitea.io/gitea/modules/setting"
  35. "code.gitea.io/gitea/modules/storage"
  36. "code.gitea.io/gitea/modules/timeutil"
  37. "code.gitea.io/gitea/modules/util"
  38. )
// Template paths used when rendering the debug-job list and the ModelArts
// notebook, train-job and inference-job pages.
const (
	// Debug-job list page.
	tplDebugJobIndex base.TplName = "repo/debugjob/index"
	// Notebook pages: list, creation form and detail view.
	tplModelArtsNotebookIndex base.TplName = "repo/modelarts/notebook/index"
	tplModelArtsNotebookNew base.TplName = "repo/modelarts/notebook/new"
	tplModelArtsNotebookShow base.TplName = "repo/modelarts/notebook/show"
	// Train-job pages: list, creation form, detail view and new-version form.
	tplModelArtsTrainJobIndex base.TplName = "repo/modelarts/trainjob/index"
	tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new"
	tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show"
	tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new"
	// Inference-job pages: list, creation form and detail view.
	tplModelArtsInferenceJobIndex base.TplName = "repo/modelarts/inferencejob/index"
	tplModelArtsInferenceJobNew base.TplName = "repo/modelarts/inferencejob/new"
	tplModelArtsInferenceJobShow base.TplName = "repo/modelarts/inferencejob/show"
)
  52. func DebugJobIndex(ctx *context.Context) {
  53. listType := ctx.Query("debugListType")
  54. if listType == "" {
  55. listType = models.AllResource
  56. }
  57. ctx.Data["ListType"] = listType
  58. MustEnableCloudbrain(ctx)
  59. repo := ctx.Repo.Repository
  60. page := ctx.QueryInt("page")
  61. if page <= 0 {
  62. page = 1
  63. }
  64. jobTypeNot := false
  65. var computeResource string
  66. if listType != models.AllResource {
  67. computeResource = listType
  68. }
  69. var jobTypes []string
  70. jobTypes = append(jobTypes, string(models.JobTypeDebug))
  71. ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  72. ListOptions: models.ListOptions{
  73. Page: page,
  74. PageSize: setting.UI.IssuePagingNum,
  75. },
  76. RepoID: repo.ID,
  77. ComputeResource: computeResource,
  78. Type: models.TypeCloudBrainAll,
  79. JobTypeNot: jobTypeNot,
  80. JobTypes: jobTypes,
  81. })
  82. if err != nil {
  83. ctx.ServerError("Get debugjob faild:", err)
  84. return
  85. }
  86. for i, task := range ciTasks {
  87. ciTasks[i].CanDebug = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  88. ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  89. ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource
  90. }
  91. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  92. pager.AddParam(ctx, "debugListType", "ListType")
  93. ctx.Data["Page"] = pager
  94. ctx.Data["PageIsCloudBrain"] = true
  95. ctx.Data["Tasks"] = ciTasks
  96. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  97. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  98. ctx.Data["debugListType"] = listType
  99. ctx.HTML(200, tplDebugJobIndex)
  100. }
  101. // MustEnableDataset check if repository enable internal cb
  102. func MustEnableModelArts(ctx *context.Context) {
  103. if !ctx.Repo.CanRead(models.UnitTypeCloudBrain) {
  104. ctx.NotFound("MustEnableCloudbrain", nil)
  105. return
  106. }
  107. }
  108. func NotebookNew(ctx *context.Context) {
  109. notebookNewDataPrepare(ctx)
  110. ctx.HTML(200, tplModelArtsNotebookNew)
  111. }
  112. func notebookNewDataPrepare(ctx *context.Context) error {
  113. ctx.Data["PageIsCloudBrain"] = true
  114. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  115. ctx.Data["display_job_name"] = displayJobName
  116. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  117. if err != nil {
  118. ctx.ServerError("GetAllUserAttachments failed:", err)
  119. return err
  120. }
  121. ctx.Data["attachments"] = attachs
  122. ctx.Data["images"] = setting.StImageInfos.ImageInfo
  123. prepareCloudbrainTwoDebugSpecs(ctx)
  124. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  125. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  126. ctx.Data["WaitCount"] = waitCount
  127. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainTwo, string(models.JobTypeDebug))
  128. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  129. return nil
  130. }
  131. func prepareCloudbrainTwoDebugSpecs(ctx *context.Context) {
  132. aiCenterCode := models.AICenterOfCloudBrainTwo
  133. if setting.ModelartsCD.Enabled {
  134. aiCenterCode = models.AICenterOfChengdu
  135. }
  136. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  137. JobType: models.JobTypeDebug,
  138. ComputeResource: models.NPU,
  139. Cluster: models.OpenICluster,
  140. AiCenterCode: aiCenterCode,
  141. })
  142. ctx.Data["Specs"] = noteBookSpecs
  143. }
  144. func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
  145. ctx.Data["PageIsNotebook"] = true
  146. displayJobName := form.DisplayJobName
  147. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  148. uuid := form.Attachment
  149. description := form.Description
  150. imageId := form.ImageId
  151. repo := ctx.Repo.Repository
  152. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeDebug), displayJobName))
  153. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  154. if !isOk {
  155. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  156. notebookNewDataPrepare(ctx)
  157. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplModelArtsNotebookNew, &form)
  158. return
  159. }
  160. defer lock.UnLock()
  161. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainTwo, string(models.JobTypeDebug))
  162. if err != nil {
  163. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  164. notebookNewDataPrepare(ctx)
  165. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  166. return
  167. } else {
  168. if count >= 1 {
  169. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  170. notebookNewDataPrepare(ctx)
  171. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form)
  172. return
  173. }
  174. }
  175. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName)
  176. if err == nil {
  177. if len(tasks) != 0 {
  178. log.Error("the job name did already exist", ctx.Data["MsgID"])
  179. notebookNewDataPrepare(ctx)
  180. ctx.RenderWithErr("the job name did already exist", tplModelArtsNotebookNew, &form)
  181. return
  182. }
  183. } else {
  184. if !models.IsErrJobNotExist(err) {
  185. log.Error("system error, %v", err, ctx.Data["MsgID"])
  186. notebookNewDataPrepare(ctx)
  187. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  188. return
  189. }
  190. }
  191. var datasetInfos map[string]models.DatasetInfo
  192. var attachSize int64
  193. if uuid != "" {
  194. datasetInfos, _, err = models.GetDatasetInfo(uuid)
  195. for _, infos := range datasetInfos {
  196. attachSize += infos.Size
  197. }
  198. if attachSize > int64(setting.DebugAttachSize*1000*1000*1000) {
  199. log.Error("The DatasetSize exceeds the limit (%dGB)", setting.DebugAttachSize) //GB
  200. notebookNewDataPrepare(ctx)
  201. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.debug_datasetsize", setting.DebugAttachSize), tplModelArtsNotebookNew, &form)
  202. return
  203. }
  204. }
  205. var aiCenterCode = models.AICenterOfCloudBrainTwo
  206. if setting.ModelartsCD.Enabled {
  207. aiCenterCode = models.AICenterOfChengdu
  208. }
  209. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  210. JobType: models.JobTypeDebug,
  211. ComputeResource: models.NPU,
  212. Cluster: models.OpenICluster,
  213. AiCenterCode: aiCenterCode})
  214. if err != nil || spec == nil {
  215. notebookNewDataPrepare(ctx)
  216. ctx.RenderWithErr("Resource specification not available", tplModelArtsNotebookNew, &form)
  217. return
  218. }
  219. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  220. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  221. notebookNewDataPrepare(ctx)
  222. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplModelArtsNotebookNew, &form)
  223. return
  224. }
  225. req := cloudbrain.GenerateModelArtsNotebookReq{
  226. DisplayJobName: displayJobName,
  227. JobName: jobName,
  228. Description: description,
  229. Uuid: uuid,
  230. ImageId: imageId,
  231. Spec: spec,
  232. BootFile: "",
  233. AutoStopDurationMs: modelarts.AutoStopDurationMs,
  234. }
  235. if form.ModelName != "" { //使用预训练模型训练
  236. _, err := models.QueryModelByPath(form.PreTrainModelUrl)
  237. if err != nil {
  238. log.Error("Can not find model", err)
  239. notebookNewDataPrepare(ctx)
  240. ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_not_exist"), tplModelArtsNotebookNew, &form)
  241. return
  242. }
  243. req.ModelName = form.ModelName
  244. req.LabelName = form.LabelName
  245. req.CkptName = form.CkptName
  246. req.ModelVersion = form.ModelVersion
  247. req.PreTrainModelUrl = form.PreTrainModelUrl
  248. }
  249. if setting.ModelartsCD.Enabled {
  250. _, err = modelarts_cd.GenerateNotebook(ctx, req)
  251. } else {
  252. _, err = modelarts.GenerateNotebook2(ctx, req)
  253. }
  254. if err != nil {
  255. log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"])
  256. notebookNewDataPrepare(ctx)
  257. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
  258. return
  259. }
  260. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  261. }
  262. func NotebookShow(ctx *context.Context) {
  263. ctx.Data["PageIsCloudBrain"] = true
  264. debugListType := ctx.Query("debugListType")
  265. if debugListType == "" {
  266. debugListType = "all"
  267. }
  268. var ID = ctx.Params(":id")
  269. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  270. if err != nil {
  271. log.Error("GET job error", err.Error())
  272. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  273. return
  274. }
  275. if task.DeletedAt.IsZero() { //normal record
  276. err := modelarts.HandleNotebookInfo(task)
  277. if err != nil {
  278. ctx.Data["error"] = err.Error()
  279. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  280. return
  281. }
  282. } else { //deleted record
  283. }
  284. datasetDownload := make([]*models.DatasetDownload, 0)
  285. var modelDownload models.ModelDownload
  286. if ctx.IsSigned {
  287. if task.Uuid != "" && task.UserID == ctx.User.ID {
  288. datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, true)
  289. }
  290. if task.ModelName != "" && task.UserID == ctx.User.ID {
  291. modelDownload = GetModelDownload(task)
  292. }
  293. }
  294. user, err := models.GetUserByID(task.UserID)
  295. if err == nil {
  296. task.User = user
  297. }
  298. prepareSpec4Show(ctx, task)
  299. if task.TrainJobDuration == "" {
  300. if task.Duration == 0 {
  301. var duration int64
  302. if task.Status == string(models.JobRunning) {
  303. duration = time.Now().Unix() - int64(task.CreatedUnix)
  304. } else {
  305. duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)
  306. }
  307. task.Duration = duration
  308. }
  309. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  310. }
  311. ctx.Data["duration"] = task.TrainJobDuration
  312. ctx.Data["datasetDownload"] = datasetDownload
  313. ctx.Data["modelDownload"] = modelDownload
  314. ctx.Data["task"] = task
  315. ctx.Data["ID"] = ID
  316. ctx.Data["jobName"] = task.JobName
  317. ctx.Data["debugListType"] = debugListType
  318. ctx.HTML(200, tplModelArtsNotebookShow)
  319. }
  320. func GetModelDownload(task *models.Cloudbrain) models.ModelDownload {
  321. index := strings.Index(task.PreTrainModelUrl, "/")
  322. key := task.PreTrainModelUrl[index+1:] + task.CkptName
  323. url, _ := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, key)
  324. modelDownload := models.ModelDownload{
  325. Name: task.CkptName,
  326. DownloadLink: url,
  327. IsDelete: false,
  328. }
  329. if !HasModelFile(task) {
  330. log.Warn("Can not get model by path:" + task.PreTrainModelUrl)
  331. modelDownload.IsDelete = true
  332. }
  333. return modelDownload
  334. }
  335. func GetCloudBrainDataSetInfo(uuid string, datasetname string, isNeedDown bool) []*models.DatasetDownload {
  336. datasetDownload := make([]*models.DatasetDownload, 0)
  337. if len(uuid) == 0 {
  338. return datasetDownload
  339. }
  340. uuidList := strings.Split(uuid, ";")
  341. datasetnameList := strings.Split(datasetname, ";")
  342. for i, uuidStr := range uuidList {
  343. name := ""
  344. link := ""
  345. url := ""
  346. isDelete := false
  347. attachment, err := models.GetAttachmentByUUID(uuidStr)
  348. if err != nil {
  349. log.Error("GetAttachmentByUUID failed:%v", err.Error())
  350. if len(datasetnameList) <= i || len(datasetname) == 0 {
  351. continue
  352. }
  353. name = datasetnameList[i]
  354. isDelete = true
  355. } else {
  356. name = attachment.Name
  357. dataset, err := models.GetDatasetByID(attachment.DatasetID)
  358. if err != nil {
  359. log.Error("GetDatasetByID failed:%v", err.Error())
  360. } else {
  361. repo, err := models.GetRepositoryByID(dataset.RepoID)
  362. if err != nil {
  363. log.Error("GetRepositoryByID failed:%v", err.Error())
  364. } else {
  365. link = repo.Link() + "/datasets"
  366. }
  367. }
  368. if isNeedDown {
  369. url = attachment.S3DownloadURL()
  370. }
  371. }
  372. datasetDownload = append(datasetDownload, &models.DatasetDownload{
  373. DatasetName: name,
  374. DatasetDownloadLink: url,
  375. RepositoryLink: link,
  376. IsDelete: isDelete,
  377. })
  378. }
  379. log.Info("dataset length=" + fmt.Sprint(len(datasetDownload)))
  380. return datasetDownload
  381. }
  382. func setShowSpecBySpecialPoolConfig(ctx *context.Context, findSpec bool, task *models.Cloudbrain) {
  383. modelarts.InitSpecialPool()
  384. if modelarts.SpecialPools != nil && !findSpec {
  385. for _, pool := range modelarts.SpecialPools.Pools {
  386. for _, flavor := range pool.Flavor {
  387. if flavor.Value == task.FlavorCode {
  388. ctx.Data["resource_spec"] = flavor.Desc
  389. }
  390. }
  391. }
  392. }
  393. }
  394. func NotebookDebug2(ctx *context.Context) {
  395. var err error
  396. var result *models.GetNotebook2Result
  397. task := ctx.Cloudbrain
  398. if task.Type == models.TypeCloudBrainTwo {
  399. result, err = modelarts.GetNotebook2(task.JobID)
  400. } else if task.Type == models.TypeCDCenter {
  401. result, err = modelarts_cd.GetNotebook(task.JobID)
  402. }
  403. if err != nil {
  404. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  405. return
  406. }
  407. if ctx.QueryTrim("file") != "" {
  408. ctx.Redirect(getFileUrl(result.Url, ctx.QueryTrim("file")) + "?token=" + result.Token)
  409. } else {
  410. if task.BootFile != "" {
  411. go cloudbrainTask.UploadNotebookFiles(task)
  412. }
  413. ctx.Redirect(result.Url + "?token=" + result.Token)
  414. }
  415. }
  416. func getFileUrl(url string, filename string) string {
  417. middle := ""
  418. if url[len(url)-3:] == "lab" || url[len(url)-4:] == "lab/" {
  419. if url[len(url)-1] == '/' {
  420. middle = "tree/"
  421. } else {
  422. middle = "/tree/"
  423. }
  424. } else {
  425. if url[len(url)-1] == '/' {
  426. middle = "lab/tree/"
  427. } else {
  428. middle = "/lab/tree/"
  429. }
  430. }
  431. return url + middle + filename + "?reset"
  432. }
  433. func NotebookRestart(ctx *context.Context) {
  434. var id = ctx.Params(":id")
  435. var resultCode = "-1"
  436. var errorMsg = ""
  437. var status = ""
  438. var spec *models.Specification
  439. task := ctx.Cloudbrain
  440. for {
  441. ctx.CheckWechatBind()
  442. if ctx.Written() {
  443. return
  444. }
  445. if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
  446. log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
  447. errorMsg = "the job is not stopped"
  448. break
  449. }
  450. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainTwo, string(models.JobTypeDebug))
  451. if err != nil {
  452. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  453. errorMsg = "system error"
  454. break
  455. } else {
  456. if count >= 1 {
  457. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  458. resultCode = "2"
  459. errorMsg = ctx.Tr("repo.cloudbrain.morethanonejob")
  460. break
  461. }
  462. }
  463. oldSpec, err := resource.GetCloudbrainSpec(task.ID)
  464. if err != nil || oldSpec == nil {
  465. log.Error("NotebookManage GetCloudbrainSpec error.%v", err)
  466. errorMsg = "Resource specification not available"
  467. break
  468. }
  469. aiCenterCode := models.AICenterOfCloudBrainTwo
  470. if task.Type == models.TypeCDCenter {
  471. aiCenterCode = models.AICenterOfChengdu
  472. }
  473. spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{
  474. JobType: models.JobType(task.JobType),
  475. ComputeResource: models.NPU,
  476. Cluster: models.OpenICluster,
  477. AiCenterCode: aiCenterCode})
  478. if err != nil || spec == nil {
  479. log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID)
  480. errorMsg = "Resource specification not support any more"
  481. break
  482. }
  483. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  484. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  485. errorMsg = ctx.Tr("points.insufficient_points_balance")
  486. break
  487. }
  488. if !HasModelFile(task) { //使用预训练模型训练
  489. errorMsg = ctx.Tr("repo.debug.manage.model_not_exist")
  490. break
  491. }
  492. if hasDatasetDeleted(task) {
  493. errorMsg = ctx.Tr("repo.debug.manage.dataset_not_exist")
  494. break
  495. }
  496. createTime := timeutil.TimeStampNow()
  497. param := models.NotebookAction{
  498. Action: models.ActionStart,
  499. }
  500. var res *models.NotebookActionResult
  501. if task.Type == models.TypeCloudBrainTwo {
  502. res, err = modelarts.ManageNotebook2(task.JobID, param)
  503. } else if task.Type == models.TypeCDCenter {
  504. res, err = modelarts_cd.ManageNotebook(task.JobID, param)
  505. }
  506. if err != nil {
  507. log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"])
  508. /* 暂不处理再次调试502的场景,详情见方案
  509. if strings.HasPrefix(err.Error(), modelarts.UnknownErrorPrefix) {
  510. log.Info("(%s)unknown error, set temp status", task.DisplayJobName)
  511. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  512. JobID: task.JobID,
  513. VersionID: models.TempVersionId,
  514. Status: models.TempJobStatus,
  515. Type: task.Type,
  516. JobName: task.JobName,
  517. JobType: task.JobType,
  518. })
  519. if errTemp != nil {
  520. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  521. }
  522. }
  523. */
  524. errorMsg = err.Error()
  525. break
  526. }
  527. newTask := &models.Cloudbrain{
  528. Status: res.Status,
  529. UserID: task.UserID,
  530. RepoID: task.RepoID,
  531. JobID: task.JobID,
  532. JobName: task.JobName,
  533. DisplayJobName: task.DisplayJobName,
  534. JobType: task.JobType,
  535. Type: task.Type,
  536. Uuid: task.Uuid,
  537. Image: task.Image,
  538. ComputeResource: task.ComputeResource,
  539. Description: task.Description,
  540. CreatedUnix: createTime,
  541. UpdatedUnix: createTime,
  542. Spec: spec,
  543. ModelName: task.ModelName,
  544. ModelVersion: task.ModelVersion,
  545. LabelName: task.LabelName,
  546. PreTrainModelUrl: task.PreTrainModelUrl,
  547. CkptName: task.CkptName,
  548. }
  549. err = models.RestartCloudbrain(task, newTask)
  550. if err != nil {
  551. log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  552. errorMsg = "system error"
  553. break
  554. }
  555. id = strconv.FormatInt(newTask.ID, 10)
  556. status = res.Status
  557. resultCode = "0"
  558. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, id, newTask.DisplayJobName, models.ActionCreateDebugNPUTask)
  559. break
  560. }
  561. ctx.JSON(200, map[string]string{
  562. "result_code": resultCode,
  563. "error_msg": errorMsg,
  564. "status": status,
  565. "id": id,
  566. })
  567. }
  568. func NotebookStop(ctx *context.Context) {
  569. var id = ctx.Params(":id")
  570. var resultCode = "0"
  571. var errorMsg = ""
  572. var status = ""
  573. task := ctx.Cloudbrain
  574. for {
  575. if task.Status != string(models.ModelArtsRunning) {
  576. log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
  577. resultCode = "-1"
  578. errorMsg = ctx.Tr("cloudbrain.Already_stopped")
  579. break
  580. }
  581. err, res := StopModelArtsNotebook(task)
  582. if err != nil {
  583. log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  584. resultCode = "-1"
  585. errorMsg = err.Error()
  586. if strings.Contains(err.Error(), modelarts.NotebookNotFound) {
  587. errorMsg = "the job's version is too old and can not be restarted"
  588. }
  589. break
  590. }
  591. status = res.Status
  592. oldStatus := task.Status
  593. task.Status = res.Status
  594. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  595. task.EndTime = timeutil.TimeStampNow()
  596. }
  597. task.ComputeAndSetDuration()
  598. if oldStatus != task.Status {
  599. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  600. }
  601. err = models.UpdateJob(task)
  602. if err != nil {
  603. log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  604. resultCode = "-1"
  605. errorMsg = "system error"
  606. break
  607. }
  608. break
  609. }
  610. ctx.JSON(200, map[string]string{
  611. "result_code": resultCode,
  612. "error_msg": errorMsg,
  613. "status": status,
  614. "id": id,
  615. })
  616. }
  617. func StopModelArtsNotebook(task *models.Cloudbrain) (error, *models.NotebookActionResult) {
  618. param := models.NotebookAction{
  619. Action: models.ActionStop,
  620. }
  621. var err error
  622. var res *models.NotebookActionResult
  623. if task.Type == models.TypeCloudBrainTwo {
  624. res, err = modelarts.ManageNotebook2(task.JobID, param)
  625. } else if task.Type == models.TypeCDCenter {
  626. res, err = modelarts_cd.ManageNotebook(task.JobID, param)
  627. }
  628. return err, res
  629. }
  630. func NotebookDel(ctx *context.Context) {
  631. var listType = ctx.Query("debugListType")
  632. task := ctx.Cloudbrain
  633. if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsDeleted) {
  634. log.Error("the job(%s) has not been stopped", task.JobName)
  635. ctx.RenderWithErr("the job has not been stopped", tplDebugJobIndex, nil)
  636. return
  637. }
  638. var err error
  639. if task.Type == models.TypeCloudBrainTwo {
  640. _, err = modelarts.DelNotebook2(task.JobID)
  641. } else if task.Type == models.TypeCDCenter {
  642. _, err = modelarts_cd.DelNotebook(task.JobID)
  643. }
  644. if err != nil {
  645. log.Error("DelNotebook2(%s) failed:%v", task.JobName, err.Error())
  646. if strings.Contains(err.Error(), modelarts.NotebookNotFound) || strings.Contains(err.Error(), modelarts.NotebookNoPermission) || strings.Contains(err.Error(), modelarts.NotebookInvalid) {
  647. log.Info("old notebook version")
  648. } else {
  649. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  650. return
  651. }
  652. }
  653. err = models.DeleteJob(task)
  654. if err != nil {
  655. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  656. return
  657. }
  658. var isAdminPage = ctx.Query("isadminpage")
  659. var isHomePage = ctx.Query("ishomepage")
  660. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  661. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  662. } else if isHomePage == "true" {
  663. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  664. } else {
  665. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
  666. }
  667. }
  668. func TrainJobIndex(ctx *context.Context) {
  669. MustEnableModelArts(ctx)
  670. repo := ctx.Repo.Repository
  671. page := ctx.QueryInt("page")
  672. if page <= 0 {
  673. page = 1
  674. }
  675. listType := ctx.Query("listType")
  676. ctx.Data["ListType"] = listType
  677. if listType == models.AllResource {
  678. listType = ""
  679. }
  680. var jobTypes []string
  681. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  682. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  683. ListOptions: models.ListOptions{
  684. Page: page,
  685. PageSize: setting.UI.IssuePagingNum,
  686. },
  687. RepoID: repo.ID,
  688. JobTypeNot: false,
  689. JobTypes: jobTypes,
  690. IsLatestVersion: modelarts.IsLatestVersion,
  691. ComputeResource: listType,
  692. Type: models.TypeCloudBrainAll,
  693. })
  694. if err != nil {
  695. ctx.ServerError("Cloudbrain", err)
  696. return
  697. }
  698. for i, task := range tasks {
  699. tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  700. tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  701. }
  702. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  703. pager.SetDefaultParams(ctx)
  704. pager.AddParam(ctx, "listType", "ListType")
  705. ctx.Data["Page"] = pager
  706. ctx.Data["PageIsCloudBrain"] = true
  707. ctx.Data["Tasks"] = tasks
  708. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  709. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  710. ctx.HTML(200, tplModelArtsTrainJobIndex)
  711. }
  712. func TrainJobNew(ctx *context.Context) {
  713. err := trainJobNewDataPrepare(ctx)
  714. if err != nil {
  715. ctx.ServerError("get new train-job info failed", err)
  716. return
  717. }
  718. ctx.HTML(200, tplModelArtsTrainJobNew)
  719. }
  720. func trainJobNewDataPrepare(ctx *context.Context) error {
  721. ctx.Data["PageIsCloudBrain"] = true
  722. //can, err := canUserCreateTrainJob(ctx.User.ID)
  723. //if err != nil {
  724. // ctx.ServerError("canUserCreateTrainJob", err)
  725. // return
  726. //}
  727. //
  728. //if !can {
  729. // log.Error("the user can not create train-job")
  730. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  731. // return
  732. //}
  733. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  734. ctx.Data["display_job_name"] = displayJobName
  735. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  736. if err != nil {
  737. ctx.ServerError("GetAllUserAttachments failed:", err)
  738. return err
  739. }
  740. ctx.Data["attachments"] = attachs
  741. var resourcePools modelarts.ResourcePool
  742. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  743. ctx.ServerError("json.Unmarshal failed:", err)
  744. return err
  745. }
  746. ctx.Data["resource_pools"] = resourcePools.Info
  747. var engines modelarts.Engine
  748. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  749. ctx.ServerError("json.Unmarshal failed:", err)
  750. return err
  751. }
  752. ctx.Data["engines"] = engines.Info
  753. var versionInfos modelarts.VersionInfo
  754. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  755. ctx.ServerError("json.Unmarshal failed:", err)
  756. return err
  757. }
  758. ctx.Data["engine_versions"] = versionInfos.Version
  759. prepareCloudbrainTwoTrainSpecs(ctx)
  760. ctx.Data["params"] = ""
  761. ctx.Data["branchName"] = ctx.Repo.BranchName
  762. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  763. if err != nil {
  764. ctx.ServerError("getConfigList failed:", err)
  765. return err
  766. }
  767. ctx.Data["config_list"] = configList.ParaConfigs
  768. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  769. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  770. ctx.Data["WaitCount"] = waitCount
  771. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainTwo, string(models.JobTypeTrain))
  772. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  773. setMultiNodeIfConfigureMatch(ctx)
  774. return nil
  775. }
  776. func prepareCloudbrainTwoTrainSpecs(ctx *context.Context) {
  777. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  778. JobType: models.JobTypeTrain,
  779. ComputeResource: models.NPU,
  780. Cluster: models.OpenICluster,
  781. AiCenterCode: models.AICenterOfCloudBrainTwo,
  782. })
  783. ctx.Data["Specs"] = noteBookSpecs
  784. }
  785. func setMultiNodeIfConfigureMatch(ctx *context.Context) {
  786. modelarts.InitMultiNode()
  787. if modelarts.MultiNodeConfig != nil {
  788. for _, info := range modelarts.MultiNodeConfig.Info {
  789. if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, ctx.User.ID); isInOrg {
  790. ctx.Data["WorkNode"] = info.Node
  791. break
  792. }
  793. }
  794. }
  795. }
  796. func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) {
  797. modelarts.InitSpecialPool()
  798. if modelarts.SpecialPools != nil {
  799. for _, specialPool := range modelarts.SpecialPools.Pools {
  800. if cloudbrain.IsElementExist(specialPool.JobType, jobType) {
  801. if isInOrg, _ := models.IsOrganizationMemberByOrgName(specialPool.Org, ctx.User.ID); isInOrg {
  802. var specialFlavor []struct {
  803. Code string
  804. Value string
  805. }
  806. if jobType == string(models.JobTypeDebug) {
  807. ctx.Data["flavors"] = specialPool.Flavor
  808. } else {
  809. for _, tempFlavor := range specialPool.Flavor {
  810. specialFlavor = append(specialFlavor, struct {
  811. Code string
  812. Value string
  813. }{Code: tempFlavor.Value, Value: tempFlavor.Desc})
  814. }
  815. ctx.Data["flavor_infos"] = specialFlavor
  816. }
  817. }
  818. }
  819. }
  820. }
  821. }
  822. func TrainJobNewVersion(ctx *context.Context) {
  823. err := trainJobNewVersionDataPrepare(ctx)
  824. if err != nil {
  825. ctx.ServerError("get new train-job info failed", err)
  826. return
  827. }
  828. ctx.HTML(200, tplModelArtsTrainJobVersionNew)
  829. }
  830. func trainJobNewVersionDataPrepare(ctx *context.Context) error {
  831. ctx.Data["PageIsCloudBrain"] = true
  832. var jobID = ctx.Params(":jobid")
  833. var versionName = ctx.Query("version_name")
  834. // canNewJob, err := canUserCreateTrainJobVersion(ctx, jobID, versionName)
  835. // if err != nil {
  836. // ctx.ServerError("canNewJob can info failed", err)
  837. // return err
  838. // }
  839. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  840. if err != nil {
  841. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  842. return err
  843. }
  844. ctx.Data["display_job_name"] = task.DisplayJobName
  845. ctx.Data["job_name"] = task.JobName
  846. // attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  847. // if err != nil {
  848. // ctx.ServerError("GetAllUserAttachments failed:", err)
  849. // return err
  850. // }
  851. // ctx.Data["attachments"] = attachs
  852. var resourcePools modelarts.ResourcePool
  853. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  854. ctx.ServerError("json.Unmarshal failed:", err)
  855. return err
  856. }
  857. ctx.Data["resource_pools"] = resourcePools.Info
  858. var engines modelarts.Engine
  859. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  860. ctx.ServerError("json.Unmarshal failed:", err)
  861. return err
  862. }
  863. ctx.Data["engines"] = engines.Info
  864. var versionInfos modelarts.VersionInfo
  865. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  866. ctx.ServerError("json.Unmarshal failed:", err)
  867. return err
  868. }
  869. ctx.Data["engine_versions"] = versionInfos.Version
  870. prepareCloudbrainTwoTrainSpecs(ctx)
  871. spec, _ := resource.GetCloudbrainSpec(task.ID)
  872. if spec != nil {
  873. log.Info("spec_id = %d", spec.ID)
  874. ctx.Data["spec_id"] = spec.ID
  875. }
  876. ctx.Data["run_para_list"] = task.Parameters
  877. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  878. if err != nil {
  879. ctx.ServerError("GetBranches error:", err)
  880. return err
  881. }
  882. uuids, datasetNames := dataset.GetFilterDeletedAttachments(task.Uuid)
  883. ctx.Data["dataset_name"] = datasetNames
  884. ctx.Data["branches"] = branches
  885. ctx.Data["branch_name"] = task.BranchName
  886. ctx.Data["description"] = task.Description
  887. ctx.Data["boot_file"] = task.BootFile
  888. ctx.Data["work_server_number"] = task.WorkServerNumber
  889. ctx.Data["flavor_name"] = task.FlavorName
  890. ctx.Data["engine_name"] = task.EngineName
  891. ctx.Data["attachment"] = uuids
  892. ctx.Data["flavor_code"] = task.FlavorCode
  893. ctx.Data["engine_id"] = task.EngineID
  894. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  895. //pretrain model
  896. ctx.Data["model_name"] = task.ModelName
  897. ctx.Data["model_version"] = task.ModelVersion
  898. ctx.Data["ckpt_name"] = task.CkptName
  899. ctx.Data["label_names"] = task.LabelName
  900. ctx.Data["pre_train_model_url"] = task.PreTrainModelUrl
  901. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  902. if err != nil {
  903. ctx.ServerError("getConfigList failed:", err)
  904. return err
  905. }
  906. ctx.Data["config_list"] = configList.ParaConfigs
  907. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  908. ctx.Data["WaitCount"] = waitCount
  909. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainTwo, string(models.JobTypeTrain))
  910. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  911. return nil
  912. }
  913. func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  914. ctx.Data["PageIsTrainJob"] = true
  915. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  916. displayJobName := form.DisplayJobName
  917. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  918. uuid := form.Attachment
  919. description := form.Description
  920. workServerNumber := form.WorkServerNumber
  921. engineID := form.EngineID
  922. bootFile := strings.TrimSpace(form.BootFile)
  923. params := form.Params
  924. poolID := form.PoolID
  925. //isSaveParam := form.IsSaveParam
  926. repo := ctx.Repo.Repository
  927. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  928. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
  929. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  930. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  931. // dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  932. branchName := form.BranchName
  933. isLatestVersion := modelarts.IsLatestVersion
  934. FlavorName := form.FlavorName
  935. VersionCount := modelarts.VersionCountOne
  936. EngineName := form.EngineName
  937. errStr := checkMultiNode(ctx.User.ID, form.WorkServerNumber)
  938. if errStr != "" {
  939. trainJobNewDataPrepare(ctx)
  940. ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form)
  941. return
  942. }
  943. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  944. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  945. if !isOk {
  946. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  947. trainJobNewDataPrepare(ctx)
  948. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplModelArtsTrainJobNew, &form)
  949. return
  950. }
  951. defer lock.UnLock()
  952. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainTwo, string(models.JobTypeTrain))
  953. if err != nil {
  954. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  955. trainJobNewDataPrepare(ctx)
  956. ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
  957. return
  958. } else {
  959. if count >= 1 {
  960. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  961. trainJobNewDataPrepare(ctx)
  962. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form)
  963. return
  964. }
  965. }
  966. if err := paramCheckCreateTrainJob(form); err != nil {
  967. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  968. trainJobNewDataPrepare(ctx)
  969. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  970. return
  971. }
  972. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  973. if err != nil || !bootFileExist {
  974. log.Error("Get bootfile error:", err)
  975. trainJobNewDataPrepare(ctx)
  976. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsTrainJobNew, &form)
  977. return
  978. }
  979. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  980. JobType: models.JobTypeTrain,
  981. ComputeResource: models.NPU,
  982. Cluster: models.OpenICluster,
  983. AiCenterCode: models.AICenterOfCloudBrainTwo})
  984. if err != nil || spec == nil {
  985. trainJobNewDataPrepare(ctx)
  986. ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobNew, &form)
  987. return
  988. }
  989. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice*form.WorkServerNumber) {
  990. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  991. trainJobNewDataPrepare(ctx)
  992. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplModelArtsTrainJobNew, &form)
  993. return
  994. }
  995. //Determine whether the task name of the task in the project is duplicated
  996. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  997. if err == nil {
  998. if len(tasks) != 0 {
  999. log.Error("the job name did already exist", ctx.Data["MsgID"])
  1000. trainJobNewDataPrepare(ctx)
  1001. ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form)
  1002. return
  1003. }
  1004. } else {
  1005. if !models.IsErrJobNotExist(err) {
  1006. log.Error("system error, %v", err, ctx.Data["MsgID"])
  1007. trainJobNewDataPrepare(ctx)
  1008. ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
  1009. return
  1010. }
  1011. }
  1012. //todo: del the codeLocalPath
  1013. _, err = ioutil.ReadDir(codeLocalPath)
  1014. if err == nil {
  1015. os.RemoveAll(codeLocalPath)
  1016. }
  1017. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  1018. commitID, _ := gitRepo.GetBranchCommitID(branchName)
  1019. if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
  1020. log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err)
  1021. trainJobNewDataPrepare(ctx)
  1022. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobNew, &form)
  1023. return
  1024. }
  1025. //todo: upload code (send to file_server todo this work?)
  1026. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  1027. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  1028. trainJobNewDataPrepare(ctx)
  1029. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
  1030. return
  1031. }
  1032. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  1033. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  1034. trainJobNewDataPrepare(ctx)
  1035. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
  1036. return
  1037. }
  1038. parentDir := VersionOutputPath + "/"
  1039. if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  1040. // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  1041. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  1042. trainJobNewDataPrepare(ctx)
  1043. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobNew, &form)
  1044. return
  1045. }
  1046. var parameters models.Parameters
  1047. param := make([]models.Parameter, 0)
  1048. existDeviceTarget := false
  1049. if len(params) != 0 {
  1050. err := json.Unmarshal([]byte(params), &parameters)
  1051. if err != nil {
  1052. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1053. trainJobNewDataPrepare(ctx)
  1054. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
  1055. return
  1056. }
  1057. for _, parameter := range parameters.Parameter {
  1058. if parameter.Label == modelarts.DeviceTarget {
  1059. existDeviceTarget = true
  1060. }
  1061. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  1062. param = append(param, models.Parameter{
  1063. Label: parameter.Label,
  1064. Value: parameter.Value,
  1065. })
  1066. }
  1067. }
  1068. }
  1069. if !existDeviceTarget {
  1070. param = append(param, models.Parameter{
  1071. Label: modelarts.DeviceTarget,
  1072. Value: modelarts.Ascend,
  1073. })
  1074. }
  1075. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  1076. if err != nil {
  1077. log.Error("Failed to getDatasUrlListByUUIDS: %v", err)
  1078. trainJobNewDataPrepare(ctx)
  1079. ctx.RenderWithErr("Failed to getDatasUrlListByUUIDS:"+err.Error(), tplModelArtsTrainJobNew, &form)
  1080. return
  1081. }
  1082. dataPath := dataUrl
  1083. jsondatas, err := json.Marshal(datasUrlList)
  1084. if err != nil {
  1085. log.Error("Failed to Marshal: %v", err)
  1086. trainJobNewDataPrepare(ctx)
  1087. ctx.RenderWithErr("json error:"+err.Error(), tplModelArtsTrainJobNew, &form)
  1088. return
  1089. }
  1090. if isMultiDataset {
  1091. param = append(param, models.Parameter{
  1092. Label: modelarts.MultiDataUrl,
  1093. Value: string(jsondatas),
  1094. })
  1095. }
  1096. if form.ModelName != "" { //使用预训练模型训练
  1097. ckptUrl := "/" + form.PreTrainModelUrl + form.CkptName
  1098. param = append(param, models.Parameter{
  1099. Label: modelarts.CkptUrl,
  1100. Value: "s3:/" + ckptUrl,
  1101. })
  1102. }
  1103. //save param config
  1104. // if isSaveParam == "on" {
  1105. // saveparams := append(param, models.Parameter{
  1106. // Label: modelarts.TrainUrl,
  1107. // Value: outputObsPath,
  1108. // }, models.Parameter{
  1109. // Label: modelarts.DataUrl,
  1110. // Value: dataPath,
  1111. // })
  1112. // if form.ParameterTemplateName == "" {
  1113. // log.Error("ParameterTemplateName is empty")
  1114. // trainJobNewDataPrepare(ctx)
  1115. // ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
  1116. // return
  1117. // }
  1118. // _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  1119. // ConfigName: form.ParameterTemplateName,
  1120. // Description: form.PrameterDescription,
  1121. // DataUrl: dataPath,
  1122. // AppUrl: codeObsPath,
  1123. // BootFileUrl: codeObsPath + bootFile,
  1124. // TrainUrl: outputObsPath,
  1125. // Flavor: models.Flavor{
  1126. // Code: flavorCode,
  1127. // },
  1128. // WorkServerNum: workServerNumber,
  1129. // EngineID: int64(engineID),
  1130. // LogUrl: logObsPath,
  1131. // PoolID: poolID,
  1132. // Parameter: saveparams,
  1133. // })
  1134. // if err != nil {
  1135. // log.Error("Failed to CreateTrainJobConfig: %v", err)
  1136. // trainJobErrorNewDataPrepare(ctx, form)
  1137. // ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
  1138. // return
  1139. // }
  1140. // }
  1141. req := &modelarts.GenerateTrainJobReq{
  1142. JobName: jobName,
  1143. DisplayJobName: displayJobName,
  1144. DataUrl: dataPath,
  1145. Description: description,
  1146. CodeObsPath: codeObsPath,
  1147. BootFileUrl: codeObsPath + bootFile,
  1148. BootFile: bootFile,
  1149. TrainUrl: outputObsPath,
  1150. WorkServerNumber: workServerNumber,
  1151. EngineID: int64(engineID),
  1152. LogUrl: logObsPath,
  1153. PoolID: poolID,
  1154. Uuid: uuid,
  1155. Parameters: param,
  1156. CommitID: commitID,
  1157. IsLatestVersion: isLatestVersion,
  1158. BranchName: branchName,
  1159. Params: form.Params,
  1160. FlavorName: FlavorName,
  1161. EngineName: EngineName,
  1162. VersionCount: VersionCount,
  1163. TotalVersionCount: modelarts.TotalVersionCount,
  1164. DatasetName: datasetNames,
  1165. Spec: spec,
  1166. }
  1167. if form.ModelName != "" { //使用预训练模型训练
  1168. req.ModelName = form.ModelName
  1169. req.LabelName = form.LabelName
  1170. req.CkptName = form.CkptName
  1171. req.ModelVersion = form.ModelVersion
  1172. req.PreTrainModelUrl = form.PreTrainModelUrl
  1173. }
  1174. userCommand, userImageUrl := getUserCommand(engineID, req)
  1175. req.UserCommand = userCommand
  1176. req.UserImageUrl = userImageUrl
  1177. //将params转换Parameters.Parameter,出错时返回给前端
  1178. var Parameters modelarts.Parameters
  1179. if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
  1180. ctx.ServerError("json.Unmarshal failed:", err)
  1181. return
  1182. }
  1183. _, err = modelarts.GenerateTrainJob(ctx, req)
  1184. if err != nil {
  1185. log.Error("GenerateTrainJob failed:%v", err.Error())
  1186. trainJobNewDataPrepare(ctx)
  1187. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  1188. return
  1189. }
  1190. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  1191. }
  1192. func checkMultiNode(userId int64, serverNum int) string {
  1193. if serverNum == 1 {
  1194. return ""
  1195. }
  1196. modelarts.InitMultiNode()
  1197. var isServerNumValid = false
  1198. if modelarts.MultiNodeConfig != nil {
  1199. for _, info := range modelarts.MultiNodeConfig.Info {
  1200. if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, userId); isInOrg {
  1201. if isInNodes(info.Node, serverNum) {
  1202. isServerNumValid = true
  1203. break
  1204. }
  1205. }
  1206. }
  1207. }
  1208. if isServerNumValid {
  1209. return ""
  1210. } else {
  1211. return "repo.modelarts.no_node_right"
  1212. }
  1213. }
  // checkInferenceJobMultiNode validates the node count of an inference job:
  // only a single node is accepted. It returns "" for serverNum == 1 and the
  // translation key "repo.modelarts.no_node_right" for every other value.
  // userId is not consulted.
  func checkInferenceJobMultiNode(userId int64, serverNum int) string {
  	if serverNum != 1 {
  		return "repo.modelarts.no_node_right"
  	}
  	return ""
  }
  // isInNodes reports whether num occurs in nodes. A nil or empty slice never
  // contains num.
  func isInNodes(nodes []int, num int) bool {
  	for idx := range nodes {
  		if nodes[idx] == num {
  			return true
  		}
  	}
  	return false
  }
  1228. func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, string) {
  1229. userImageUrl := ""
  1230. userCommand := ""
  1231. if engineId < 0 {
  1232. tmpCodeObsPath := strings.Trim(req.CodeObsPath, "/")
  1233. tmpCodeObsPaths := strings.Split(tmpCodeObsPath, "/")
  1234. lastCodeDir := "code"
  1235. if len(tmpCodeObsPaths) > 0 {
  1236. lastCodeDir = tmpCodeObsPaths[len(tmpCodeObsPaths)-1]
  1237. }
  1238. userCommand = "/bin/bash /home/work/run_train.sh 's3://" + req.CodeObsPath + "' '" + lastCodeDir + "/" + req.BootFile + "' '/tmp/log/train.log' --'data_url'='s3://" + req.DataUrl + "' --'train_url'='s3://" + req.TrainUrl + "'"
  1239. var versionInfos modelarts.VersionInfo
  1240. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  1241. log.Info("json parse err." + err.Error())
  1242. } else {
  1243. for _, engine := range versionInfos.Version {
  1244. if engine.ID == engineId {
  1245. userImageUrl = engine.Url
  1246. break
  1247. }
  1248. }
  1249. }
  1250. for _, param := range req.Parameters {
  1251. userCommand += " --'" + param.Label + "'='" + param.Value + "'"
  1252. }
  1253. return userCommand, userImageUrl
  1254. }
  1255. return userCommand, userImageUrl
  1256. }
  1257. func getInfJobUserCommand(engineId int, req *modelarts.GenerateInferenceJobReq) (string, string) {
  1258. userImageUrl := ""
  1259. userCommand := ""
  1260. if engineId < 0 {
  1261. tmpCodeObsPath := strings.Trim(req.CodeObsPath, "/")
  1262. tmpCodeObsPaths := strings.Split(tmpCodeObsPath, "/")
  1263. lastCodeDir := "code"
  1264. if len(tmpCodeObsPaths) > 0 {
  1265. lastCodeDir = tmpCodeObsPaths[len(tmpCodeObsPaths)-1]
  1266. }
  1267. userCommand = "/bin/bash /home/work/run_train.sh 's3://" + req.CodeObsPath + "' '" + lastCodeDir + "/" + req.BootFile + "' '/tmp/log/train.log' --'data_url'='s3://" + req.DataUrl + "' --'train_url'='s3://" + req.TrainUrl + "'"
  1268. var versionInfos modelarts.VersionInfo
  1269. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  1270. log.Info("json parse err." + err.Error())
  1271. } else {
  1272. for _, engine := range versionInfos.Version {
  1273. if engine.ID == engineId {
  1274. userImageUrl = engine.Url
  1275. break
  1276. }
  1277. }
  1278. }
  1279. for _, param := range req.Parameters {
  1280. userCommand += " --'" + param.Label + "'='" + param.Value + "'"
  1281. }
  1282. return userCommand, userImageUrl
  1283. }
  1284. return userCommand, userImageUrl
  1285. }
  1286. func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  1287. ctx.Data["PageIsTrainJob"] = true
  1288. var jobID = ctx.Params(":jobid")
  1289. errStr := checkMultiNode(ctx.User.ID, form.WorkServerNumber)
  1290. if errStr != "" {
  1291. trainJobNewVersionDataPrepare(ctx)
  1292. ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form)
  1293. return
  1294. }
  1295. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainTwo, string(models.JobTypeTrain))
  1296. if err != nil {
  1297. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1298. trainJobNewVersionDataPrepare(ctx)
  1299. ctx.RenderWithErr("system error", tplModelArtsTrainJobVersionNew, &form)
  1300. return
  1301. } else {
  1302. if count >= 1 {
  1303. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  1304. trainJobNewVersionDataPrepare(ctx)
  1305. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobVersionNew, &form)
  1306. return
  1307. }
  1308. }
  1309. latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(jobID, modelarts.IsLatestVersion)
  1310. if err != nil {
  1311. ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
  1312. return
  1313. }
  1314. VersionOutputPath := modelarts.GetOutputPathByCount(latestTask.TotalVersionCount + 1)
  1315. displayJobName := form.DisplayJobName
  1316. jobName := form.JobName
  1317. uuid := form.Attachment
  1318. description := form.Description
  1319. workServerNumber := form.WorkServerNumber
  1320. engineID := form.EngineID
  1321. bootFile := strings.TrimSpace(form.BootFile)
  1322. params := form.Params
  1323. poolID := form.PoolID
  1324. //isSaveParam := form.IsSaveParam
  1325. repo := ctx.Repo.Repository
  1326. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  1327. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
  1328. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  1329. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  1330. // dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  1331. branchName := form.BranchName
  1332. PreVersionName := form.VersionName
  1333. FlavorName := form.FlavorName
  1334. EngineName := form.EngineName
  1335. isLatestVersion := modelarts.IsLatestVersion
  1336. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  1337. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  1338. if !isOk {
  1339. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  1340. trainJobNewVersionDataPrepare(ctx)
  1341. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplModelArtsTrainJobVersionNew, &form)
  1342. return
  1343. }
  1344. defer lock.UnLock()
  1345. canNewJob, _ := canUserCreateTrainJobVersion(ctx, latestTask.UserID)
  1346. if !canNewJob {
  1347. trainJobNewVersionDataPrepare(ctx)
  1348. ctx.RenderWithErr("user cann't new trainjob", tplModelArtsTrainJobVersionNew, &form)
  1349. return
  1350. }
  1351. if err := paramCheckCreateTrainJob(form); err != nil {
  1352. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  1353. trainJobNewVersionDataPrepare(ctx)
  1354. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1355. return
  1356. }
  1357. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  1358. if err != nil || !bootFileExist {
  1359. log.Error("Get bootfile error:", err)
  1360. trainJobNewVersionDataPrepare(ctx)
  1361. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsTrainJobVersionNew, &form)
  1362. return
  1363. }
  1364. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  1365. JobType: models.JobTypeTrain,
  1366. ComputeResource: models.NPU,
  1367. Cluster: models.OpenICluster,
  1368. AiCenterCode: models.AICenterOfCloudBrainTwo})
  1369. if err != nil || spec == nil {
  1370. trainJobNewVersionDataPrepare(ctx)
  1371. ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobVersionNew, &form)
  1372. return
  1373. }
  1374. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  1375. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  1376. trainJobNewVersionDataPrepare(ctx)
  1377. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplModelArtsTrainJobVersionNew, &form)
  1378. return
  1379. }
  1380. //todo: del the codeLocalPath
  1381. _, err = ioutil.ReadDir(codeLocalPath)
  1382. if err == nil {
  1383. os.RemoveAll(codeLocalPath)
  1384. }
  1385. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  1386. commitID, _ := gitRepo.GetBranchCommitID(branchName)
  1387. if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
  1388. log.Error("Failed git clone repo to local(!: %s (%v)", repo.FullName(), err)
  1389. trainJobNewVersionDataPrepare(ctx)
  1390. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobVersionNew, &form)
  1391. return
  1392. }
  1393. //todo: upload code (send to file_server todo this work?)
  1394. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  1395. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  1396. trainJobNewVersionDataPrepare(ctx)
  1397. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
  1398. return
  1399. }
  1400. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  1401. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  1402. trainJobNewVersionDataPrepare(ctx)
  1403. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
  1404. return
  1405. }
  1406. parentDir := VersionOutputPath + "/"
  1407. // parentDir := ""
  1408. // if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  1409. if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  1410. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  1411. trainJobNewVersionDataPrepare(ctx)
  1412. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobVersionNew, &form)
  1413. return
  1414. }
  1415. //todo: del local code?
  1416. var parameters models.Parameters
  1417. param := make([]models.Parameter, 0)
  1418. existDeviceTarget := false
  1419. if len(params) != 0 {
  1420. err := json.Unmarshal([]byte(params), &parameters)
  1421. if err != nil {
  1422. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1423. trainJobNewVersionDataPrepare(ctx)
  1424. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
  1425. return
  1426. }
  1427. for _, parameter := range parameters.Parameter {
  1428. if parameter.Label == modelarts.DeviceTarget {
  1429. existDeviceTarget = true
  1430. }
  1431. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  1432. param = append(param, models.Parameter{
  1433. Label: parameter.Label,
  1434. Value: parameter.Value,
  1435. })
  1436. }
  1437. }
  1438. }
  1439. if !existDeviceTarget {
  1440. param = append(param, models.Parameter{
  1441. Label: modelarts.DeviceTarget,
  1442. Value: modelarts.Ascend,
  1443. })
  1444. }
  1445. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  1446. if err != nil {
  1447. log.Error("Failed to getDatasUrlListByUUIDS: %v", err)
  1448. trainJobNewVersionDataPrepare(ctx)
  1449. ctx.RenderWithErr("Failed to getDatasUrlListByUUIDS:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1450. return
  1451. }
  1452. dataPath := dataUrl
  1453. jsondatas, err := json.Marshal(datasUrlList)
  1454. if err != nil {
  1455. log.Error("Failed to Marshal: %v", err)
  1456. trainJobNewVersionDataPrepare(ctx)
  1457. ctx.RenderWithErr("json error:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1458. return
  1459. }
  1460. if isMultiDataset {
  1461. param = append(param, models.Parameter{
  1462. Label: modelarts.MultiDataUrl,
  1463. Value: string(jsondatas),
  1464. })
  1465. }
  1466. if form.ModelName != "" { //使用预训练模型训练
  1467. ckptUrl := "/" + form.PreTrainModelUrl + form.CkptName
  1468. param = append(param, models.Parameter{
  1469. Label: modelarts.CkptUrl,
  1470. Value: "s3:/" + ckptUrl,
  1471. })
  1472. }
  1473. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, PreVersionName)
  1474. if err != nil {
  1475. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  1476. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1477. return
  1478. }
  1479. req := &modelarts.GenerateTrainJobReq{
  1480. JobName: jobName,
  1481. DisplayJobName: displayJobName,
  1482. DataUrl: dataPath,
  1483. Description: description,
  1484. CodeObsPath: codeObsPath,
  1485. BootFileUrl: codeObsPath + bootFile,
  1486. BootFile: bootFile,
  1487. TrainUrl: outputObsPath,
  1488. WorkServerNumber: workServerNumber,
  1489. IsLatestVersion: isLatestVersion,
  1490. EngineID: int64(engineID),
  1491. LogUrl: logObsPath,
  1492. PoolID: poolID,
  1493. Uuid: uuid,
  1494. Params: form.Params,
  1495. Parameters: param,
  1496. PreVersionId: task.VersionID,
  1497. CommitID: commitID,
  1498. BranchName: branchName,
  1499. FlavorName: FlavorName,
  1500. EngineName: EngineName,
  1501. PreVersionName: PreVersionName,
  1502. TotalVersionCount: latestTask.TotalVersionCount + 1,
  1503. DatasetName: datasetNames,
  1504. Spec: spec,
  1505. }
  1506. if form.ModelName != "" { //使用预训练模型训练
  1507. req.ModelName = form.ModelName
  1508. req.LabelName = form.LabelName
  1509. req.CkptName = form.CkptName
  1510. req.ModelVersion = form.ModelVersion
  1511. req.PreTrainModelUrl = form.PreTrainModelUrl
  1512. }
  1513. userCommand, userImageUrl := getUserCommand(engineID, req)
  1514. req.UserCommand = userCommand
  1515. req.UserImageUrl = userImageUrl
  1516. err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
  1517. if err != nil {
  1518. log.Error("GenerateTrainJob failed:%v", err.Error())
  1519. trainJobNewVersionDataPrepare(ctx)
  1520. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1521. return
  1522. }
  1523. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job/" + jobID)
  1524. // ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1525. }
  1526. // readDir reads the directory named by dirname and returns
  1527. // a list of directory entries sorted by filename.
  1528. func readDir(dirname string) ([]os.FileInfo, error) {
  1529. f, err := os.Open(dirname)
  1530. if err != nil {
  1531. return nil, err
  1532. }
  1533. list, err := f.Readdir(0)
  1534. f.Close()
  1535. if err != nil {
  1536. //todo: can not upload empty folder
  1537. if err == io.EOF {
  1538. return nil, nil
  1539. }
  1540. return nil, err
  1541. }
  1542. //sort.Slice(list, func(i, j int) bool { return list[i].Name() < list[j].Name() })
  1543. return list, nil
  1544. }
  1545. func uploadCodeToObs(codePath, jobName, parentDir string) error {
  1546. files, err := readDir(codePath)
  1547. if err != nil {
  1548. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  1549. return err
  1550. }
  1551. for _, file := range files {
  1552. if file.IsDir() {
  1553. input := &obs.PutObjectInput{}
  1554. input.Bucket = setting.Bucket
  1555. input.Key = parentDir + file.Name() + "/"
  1556. _, err = storage.ObsCli.PutObject(input)
  1557. if err != nil {
  1558. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1559. return err
  1560. }
  1561. if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  1562. log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
  1563. return err
  1564. }
  1565. } else {
  1566. input := &obs.PutFileInput{}
  1567. input.Bucket = setting.Bucket
  1568. input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
  1569. input.SourceFile = codePath + file.Name()
  1570. _, err = storage.ObsCli.PutFile(input)
  1571. if err != nil {
  1572. log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
  1573. return err
  1574. }
  1575. }
  1576. }
  1577. return nil
  1578. }
  1579. func obsMkdir(dir string) error {
  1580. input := &obs.PutObjectInput{}
  1581. input.Bucket = setting.Bucket
  1582. input.Key = dir
  1583. _, err := storage.ObsCli.PutObject(input)
  1584. if err != nil {
  1585. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1586. return err
  1587. }
  1588. return nil
  1589. }
  1590. func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
  1591. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  1592. log.Error("the boot file(%s) must be a python file", strings.TrimSpace(form.BootFile))
  1593. return errors.New("启动文件必须是python文件")
  1594. }
  1595. if form.BranchName == "" {
  1596. log.Error("the branch must not be null!", form.BranchName)
  1597. return errors.New("代码分支不能为空!")
  1598. }
  1599. return nil
  1600. }
  1601. func paramCheckCreateInferenceJob(form auth.CreateModelArtsInferenceJobForm) error {
  1602. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  1603. log.Error("the boot file(%s) must be a python file", strings.TrimSpace(form.BootFile))
  1604. return errors.New("启动文件必须是python文件")
  1605. }
  1606. if form.WorkServerNumber > 2 || form.WorkServerNumber < 1 {
  1607. log.Error("the WorkServerNumber(%d) must be in (1,2)", form.WorkServerNumber)
  1608. return errors.New("计算节点数必须在1-2之间")
  1609. }
  1610. if form.ModelName == "" {
  1611. log.Error("the ModelName(%d) must not be nil", form.ModelName)
  1612. return errors.New("模型名称不能为空")
  1613. }
  1614. if form.ModelVersion == "" {
  1615. log.Error("the ModelVersion(%d) must not be nil", form.ModelVersion)
  1616. return errors.New("模型版本不能为空")
  1617. }
  1618. if form.CkptName == "" {
  1619. log.Error("the CkptName(%d) must not be nil", form.CkptName)
  1620. return errors.New("权重文件不能为空")
  1621. }
  1622. if form.BranchName == "" {
  1623. log.Error("the Branch(%d) must not be nil", form.BranchName)
  1624. return errors.New("分支名不能为空")
  1625. }
  1626. if utf8.RuneCountInString(form.Description) > 255 {
  1627. log.Error("the Description length(%d) must not more than 255", form.Description)
  1628. return errors.New("描述字符不能超过255个字符")
  1629. }
  1630. return nil
  1631. }
  1632. func TrainJobShow(ctx *context.Context) {
  1633. ctx.Data["PageIsCloudBrain"] = true
  1634. var jobID = ctx.Params(":jobid")
  1635. repo := ctx.Repo.Repository
  1636. page := ctx.QueryInt("page")
  1637. if page <= 0 {
  1638. page = 1
  1639. }
  1640. var jobTypes []string
  1641. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  1642. VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1643. ListOptions: models.ListOptions{
  1644. Page: page,
  1645. PageSize: setting.UI.IssuePagingNum,
  1646. },
  1647. RepoID: repo.ID,
  1648. Type: models.TypeCloudBrainTwo,
  1649. JobTypes: jobTypes,
  1650. JobID: jobID,
  1651. })
  1652. if err != nil {
  1653. log.Error("GetVersionListTasks(%s) failed:%v", jobID, err.Error())
  1654. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1655. return
  1656. }
  1657. if len(VersionListTasks) == 0 {
  1658. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1659. return
  1660. }
  1661. //设置权限
  1662. canNewJob, err := canUserCreateTrainJobVersion(ctx, VersionListTasks[0].UserID)
  1663. if err != nil {
  1664. ctx.ServerError("canNewJob failed", err)
  1665. return
  1666. }
  1667. ctx.Data["canNewJob"] = canNewJob
  1668. datasetList := make([][]*models.DatasetDownload, 0)
  1669. //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
  1670. for i, task := range VersionListTasks {
  1671. var parameters models.Parameters
  1672. err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), &parameters)
  1673. if err != nil {
  1674. log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err)
  1675. trainJobNewDataPrepare(ctx)
  1676. return
  1677. }
  1678. if len(parameters.Parameter) > 0 {
  1679. paramTemp := ""
  1680. for _, Parameter := range parameters.Parameter {
  1681. param := Parameter.Label + " = " + Parameter.Value + "; "
  1682. paramTemp = paramTemp + param
  1683. }
  1684. VersionListTasks[i].Parameters = paramTemp[:len(paramTemp)-2]
  1685. } else {
  1686. VersionListTasks[i].Parameters = ""
  1687. }
  1688. datasetList = append(datasetList, GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false))
  1689. VersionListTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  1690. VersionListTasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  1691. VersionListTasks[i].ContainerIp = ""
  1692. //add spec
  1693. s, err := resource.GetCloudbrainSpec(task.Cloudbrain.ID)
  1694. if err != nil {
  1695. log.Error("TrainJobShow GetCloudbrainSpec error:" + err.Error())
  1696. continue
  1697. }
  1698. VersionListTasks[i].Cloudbrain.Spec = s
  1699. }
  1700. pager := context.NewPagination(VersionListCount, setting.UI.IssuePagingNum, page, 5)
  1701. pager.SetDefaultParams(ctx)
  1702. ctx.Data["Page"] = pager
  1703. ctx.Data["jobID"] = jobID
  1704. ctx.Data["displayJobName"] = VersionListTasks[0].DisplayJobName
  1705. ctx.Data["version_list_task"] = VersionListTasks
  1706. ctx.Data["version_list_count"] = VersionListCount
  1707. ctx.Data["datasetList"] = datasetList
  1708. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, &VersionListTasks[0].Cloudbrain)
  1709. ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1710. }
  1711. func TrainJobDel(ctx *context.Context) {
  1712. var jobID = ctx.Params(":jobid")
  1713. var listType = ctx.Query("listType")
  1714. repo := ctx.Repo.Repository
  1715. var jobTypes []string
  1716. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  1717. VersionListTasks, _, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1718. RepoID: repo.ID,
  1719. Type: models.TypeCloudBrainTwo,
  1720. JobTypes: jobTypes,
  1721. JobID: jobID,
  1722. })
  1723. if err != nil {
  1724. ctx.ServerError("get VersionListTasks failed", err)
  1725. return
  1726. }
  1727. for _, task := range VersionListTasks {
  1728. if task.Status != string(models.ModelArtsTrainJobImageFailed) && task.Status != string(models.ModelArtsTrainJobSubmitFailed) && task.Status != string(models.ModelArtsTrainJobDeleteFailed) &&
  1729. task.Status != string(models.ModelArtsTrainJobCompleted) && task.Status != string(models.ModelArtsTrainJobFailed) &&
  1730. task.Status != string(models.ModelArtsTrainJobKilled) && task.Status != string(models.ModelArtsTrainJobCanceled) && task.Status != string(models.ModelArtsTrainJobLost) {
  1731. log.Error("the job(%s) version has not been stopped", task.JobName)
  1732. ctx.RenderWithErr("the job version has not been stopped", tplModelArtsTrainJobIndex, nil)
  1733. return
  1734. }
  1735. }
  1736. //删除modelarts上的任务记录
  1737. _, err = modelarts.DelTrainJob(jobID)
  1738. if err != nil {
  1739. log.Error("DelTrainJob(%s) failed:%v", jobID, err.Error())
  1740. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1741. return
  1742. }
  1743. //删除数据库Cloudbrain表的记录
  1744. for _, task := range VersionListTasks {
  1745. err = models.DeleteJob(&task.Cloudbrain)
  1746. if err != nil {
  1747. ctx.ServerError("DeleteJob failed", err)
  1748. return
  1749. }
  1750. }
  1751. //删除存储
  1752. if len(VersionListTasks) > 0 {
  1753. DeleteJobStorage(VersionListTasks[0].JobName)
  1754. }
  1755. var isAdminPage = ctx.Query("isadminpage")
  1756. var isHomePage = ctx.Query("ishomepage")
  1757. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  1758. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  1759. } else if isHomePage == "true" {
  1760. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  1761. } else {
  1762. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  1763. }
  1764. }
  1765. func TrainJobStop(ctx *context.Context) {
  1766. var jobID = ctx.Params(":jobid")
  1767. var listType = ctx.Query("listType")
  1768. task := ctx.Cloudbrain
  1769. _, err := modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
  1770. if err != nil {
  1771. log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
  1772. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1773. return
  1774. }
  1775. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  1776. }
  1777. func canUserCreateTrainJobVersion(ctx *context.Context, userID int64) (bool, error) {
  1778. if ctx == nil || ctx.User == nil {
  1779. log.Error("user unlogin!")
  1780. return false, nil
  1781. }
  1782. if userID == ctx.User.ID || ctx.User.IsAdmin {
  1783. return true, nil
  1784. } else {
  1785. log.Error("Only user itself and admin can new trainjob!")
  1786. return false, nil
  1787. }
  1788. }
  1789. func TrainJobGetConfigList(ctx *context.Context) {
  1790. ctx.Data["PageIsTrainJob"] = true
  1791. var jobID = ctx.Params(":jobid")
  1792. var logFileName = ctx.Query("file_name")
  1793. var baseLine = ctx.Query("base_line")
  1794. var order = ctx.Query("order")
  1795. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  1796. log.Error("order(%s) check failed", order)
  1797. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  1798. return
  1799. }
  1800. task, err := models.GetCloudbrainByJobID(jobID)
  1801. if err != nil {
  1802. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1803. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1804. return
  1805. }
  1806. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  1807. if err != nil {
  1808. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1809. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1810. return
  1811. }
  1812. ctx.Data["log"] = result
  1813. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1814. }
  1815. func getConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
  1816. var result models.GetConfigListResult
  1817. list, err := modelarts.GetConfigList(perPage, page, sortBy, order, searchContent, configType)
  1818. if err != nil {
  1819. log.Error("GetConfigList failed:", err)
  1820. return &result, err
  1821. }
  1822. for _, config := range list.ParaConfigs {
  1823. paraConfig, err := modelarts.GetParaConfig(config.ConfigName, configType)
  1824. if err != nil {
  1825. log.Error("GetParaConfig failed:", err)
  1826. return &result, err
  1827. }
  1828. config.Result = paraConfig
  1829. }
  1830. return list, nil
  1831. }
  1832. func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) {
  1833. ctx.Data["PageIsTrainJob"] = true
  1834. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  1835. displayJobName := form.DisplayJobName
  1836. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  1837. uuid := form.Attachment
  1838. description := form.Description
  1839. workServerNumber := form.WorkServerNumber
  1840. engineID := form.EngineID
  1841. bootFile := strings.TrimSpace(form.BootFile)
  1842. params := form.Params
  1843. poolID := form.PoolID
  1844. repo := ctx.Repo.Repository
  1845. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  1846. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  1847. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  1848. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  1849. //dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  1850. branchName := form.BranchName
  1851. FlavorName := form.FlavorName
  1852. EngineName := form.EngineName
  1853. LabelName := form.LabelName
  1854. isLatestVersion := modelarts.IsLatestVersion
  1855. VersionCount := modelarts.VersionCountOne
  1856. trainUrl := form.TrainUrl
  1857. modelName := form.ModelName
  1858. modelVersion := form.ModelVersion
  1859. ckptName := form.CkptName
  1860. ckptUrl := "/" + form.TrainUrl + form.CkptName
  1861. log.Info("ckpt url:" + ckptUrl)
  1862. errStr := checkInferenceJobMultiNode(ctx.User.ID, form.WorkServerNumber)
  1863. if errStr != "" {
  1864. inferenceJobErrorNewDataPrepare(ctx, form)
  1865. ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form)
  1866. return
  1867. }
  1868. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeInference), displayJobName))
  1869. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  1870. if !isOk {
  1871. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  1872. inferenceJobErrorNewDataPrepare(ctx, form)
  1873. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplModelArtsInferenceJobNew, &form)
  1874. return
  1875. }
  1876. defer lock.UnLock()
  1877. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainTwo, string(models.JobTypeInference))
  1878. if err != nil {
  1879. log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1880. inferenceJobErrorNewDataPrepare(ctx, form)
  1881. ctx.RenderWithErr("system error", tplModelArtsInferenceJobNew, &form)
  1882. return
  1883. } else {
  1884. if count >= 1 {
  1885. log.Error("the user already has running or waiting inference task", ctx.Data["MsgID"])
  1886. inferenceJobErrorNewDataPrepare(ctx, form)
  1887. ctx.RenderWithErr("you have already a running or waiting inference task, can not create more", tplModelArtsInferenceJobNew, &form)
  1888. return
  1889. }
  1890. }
  1891. if err := paramCheckCreateInferenceJob(form); err != nil {
  1892. log.Error("paramCheckCreateInferenceJob failed:(%v)", err)
  1893. inferenceJobErrorNewDataPrepare(ctx, form)
  1894. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
  1895. return
  1896. }
  1897. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  1898. if err != nil || !bootFileExist {
  1899. log.Error("Get bootfile error:", err)
  1900. inferenceJobErrorNewDataPrepare(ctx, form)
  1901. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsInferenceJobNew, &form)
  1902. return
  1903. }
  1904. //Determine whether the task name of the task in the project is duplicated
  1905. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeInference), displayJobName)
  1906. if err == nil {
  1907. if len(tasks) != 0 {
  1908. log.Error("the job name did already exist", ctx.Data["MsgID"])
  1909. inferenceJobErrorNewDataPrepare(ctx, form)
  1910. ctx.RenderWithErr("the job name did already exist", tplModelArtsInferenceJobNew, &form)
  1911. return
  1912. }
  1913. } else {
  1914. if !models.IsErrJobNotExist(err) {
  1915. log.Error("system error, %v", err, ctx.Data["MsgID"])
  1916. inferenceJobErrorNewDataPrepare(ctx, form)
  1917. ctx.RenderWithErr("system error", tplModelArtsInferenceJobNew, &form)
  1918. return
  1919. }
  1920. }
  1921. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  1922. JobType: models.JobTypeInference,
  1923. ComputeResource: models.NPU,
  1924. Cluster: models.OpenICluster,
  1925. AiCenterCode: models.AICenterOfCloudBrainTwo})
  1926. if err != nil || spec == nil {
  1927. inferenceJobErrorNewDataPrepare(ctx, form)
  1928. ctx.RenderWithErr("Resource specification not available", tplModelArtsInferenceJobNew, &form)
  1929. return
  1930. }
  1931. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  1932. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  1933. inferenceJobErrorNewDataPrepare(ctx, form)
  1934. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplModelArtsInferenceJobNew, &form)
  1935. return
  1936. }
  1937. //todo: del the codeLocalPath
  1938. _, err = ioutil.ReadDir(codeLocalPath)
  1939. if err == nil {
  1940. os.RemoveAll(codeLocalPath)
  1941. }
  1942. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  1943. commitID, _ := gitRepo.GetBranchCommitID(branchName)
  1944. if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
  1945. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  1946. inferenceJobErrorNewDataPrepare(ctx, form)
  1947. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsInferenceJobNew, &form)
  1948. return
  1949. }
  1950. //todo: upload code (send to file_server todo this work?)
  1951. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  1952. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  1953. inferenceJobErrorNewDataPrepare(ctx, form)
  1954. ctx.RenderWithErr("Failed to obsMkdir_result", tplModelArtsInferenceJobNew, &form)
  1955. return
  1956. }
  1957. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  1958. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  1959. inferenceJobErrorNewDataPrepare(ctx, form)
  1960. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsInferenceJobNew, &form)
  1961. return
  1962. }
  1963. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  1964. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  1965. inferenceJobErrorNewDataPrepare(ctx, form)
  1966. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsInferenceJobNew, &form)
  1967. return
  1968. }
  1969. var parameters models.Parameters
  1970. param := make([]models.Parameter, 0)
  1971. param = append(param, models.Parameter{
  1972. Label: modelarts.ResultUrl,
  1973. Value: "s3:/" + resultObsPath,
  1974. }, models.Parameter{
  1975. Label: modelarts.CkptUrl,
  1976. Value: "s3:/" + ckptUrl,
  1977. })
  1978. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  1979. if err != nil {
  1980. inferenceJobErrorNewDataPrepare(ctx, form)
  1981. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
  1982. return
  1983. }
  1984. dataPath := dataUrl
  1985. jsondatas, err := json.Marshal(datasUrlList)
  1986. if err != nil {
  1987. log.Error("Failed to Marshal: %v", err)
  1988. inferenceJobErrorNewDataPrepare(ctx, form)
  1989. ctx.RenderWithErr("json error:"+err.Error(), tplModelArtsInferenceJobNew, &form)
  1990. return
  1991. }
  1992. if isMultiDataset {
  1993. param = append(param, models.Parameter{
  1994. Label: modelarts.MultiDataUrl,
  1995. Value: string(jsondatas),
  1996. })
  1997. }
  1998. existDeviceTarget := false
  1999. if len(params) != 0 {
  2000. err := json.Unmarshal([]byte(params), &parameters)
  2001. if err != nil {
  2002. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  2003. inferenceJobErrorNewDataPrepare(ctx, form)
  2004. ctx.RenderWithErr("运行参数错误", tplModelArtsInferenceJobNew, &form)
  2005. return
  2006. }
  2007. for _, parameter := range parameters.Parameter {
  2008. if parameter.Label == modelarts.DeviceTarget {
  2009. existDeviceTarget = true
  2010. }
  2011. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  2012. param = append(param, models.Parameter{
  2013. Label: parameter.Label,
  2014. Value: parameter.Value,
  2015. })
  2016. }
  2017. }
  2018. }
  2019. if !existDeviceTarget {
  2020. param = append(param, models.Parameter{
  2021. Label: modelarts.DeviceTarget,
  2022. Value: modelarts.Ascend,
  2023. })
  2024. }
  2025. req := &modelarts.GenerateInferenceJobReq{
  2026. JobName: jobName,
  2027. DisplayJobName: displayJobName,
  2028. DataUrl: dataPath,
  2029. Description: description,
  2030. CodeObsPath: codeObsPath,
  2031. BootFileUrl: codeObsPath + bootFile,
  2032. BootFile: bootFile,
  2033. TrainUrl: trainUrl,
  2034. WorkServerNumber: workServerNumber,
  2035. EngineID: int64(engineID),
  2036. LogUrl: logObsPath,
  2037. PoolID: poolID,
  2038. Uuid: uuid,
  2039. Parameters: param, //modelarts train parameters
  2040. CommitID: commitID,
  2041. BranchName: branchName,
  2042. Params: form.Params,
  2043. FlavorName: FlavorName,
  2044. EngineName: EngineName,
  2045. LabelName: LabelName,
  2046. IsLatestVersion: isLatestVersion,
  2047. VersionCount: VersionCount,
  2048. TotalVersionCount: modelarts.TotalVersionCount,
  2049. ModelName: modelName,
  2050. ModelVersion: modelVersion,
  2051. CkptName: ckptName,
  2052. ResultUrl: resultObsPath,
  2053. Spec: spec,
  2054. DatasetName: datasetNames,
  2055. JobType: string(models.JobTypeInference),
  2056. }
  2057. userCommand, userImageUrl := getInfJobUserCommand(engineID, req)
  2058. req.UserCommand = userCommand
  2059. req.UserImageUrl = userImageUrl
  2060. _, err = modelarts.GenerateInferenceJob(ctx, req)
  2061. if err != nil {
  2062. log.Error("GenerateTrainJob failed:%v", err.Error())
  2063. inferenceJobErrorNewDataPrepare(ctx, form)
  2064. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
  2065. return
  2066. }
  2067. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job")
  2068. }
  2069. func checkModelArtsSpecialPool(ctx *context.Context, flavorCode string, jobType string) string {
  2070. if modelarts.SpecialPools != nil {
  2071. isMatchPool := false
  2072. for _, specialPool := range modelarts.SpecialPools.Pools {
  2073. if cloudbrain.IsElementExist(specialPool.JobType, jobType) {
  2074. if isInOrg, _ := models.IsOrganizationMemberByOrgName(specialPool.Org, ctx.User.ID); isInOrg {
  2075. isMatchPool = true
  2076. isMatchSpec := false
  2077. for _, flavor := range specialPool.Flavor {
  2078. if flavor.Value == flavorCode {
  2079. isMatchSpec = true
  2080. break
  2081. }
  2082. }
  2083. if !isMatchSpec {
  2084. return "cloudbrain.wrong_specification"
  2085. }
  2086. }
  2087. }
  2088. }
  2089. if !isMatchPool {
  2090. isMatchSpec := false
  2091. if jobType == string(models.JobTypeDebug) {
  2092. for _, flavor := range setting.StFlavorInfo.FlavorInfo {
  2093. if flavor.Value == flavorCode {
  2094. isMatchSpec = true
  2095. break
  2096. }
  2097. }
  2098. } else {
  2099. var flavorInfos modelarts.Flavor
  2100. json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos)
  2101. for _, flavor := range flavorInfos.Info {
  2102. if flavor.Code == flavorCode {
  2103. isMatchSpec = true
  2104. break
  2105. }
  2106. }
  2107. }
  2108. if !isMatchSpec {
  2109. return "cloudbrain.wrong_specification"
  2110. }
  2111. }
  2112. }
  2113. return ""
  2114. }
  2115. func InferenceJobIndex(ctx *context.Context) {
  2116. MustEnableModelArts(ctx)
  2117. repo := ctx.Repo.Repository
  2118. page := ctx.QueryInt("page")
  2119. if page <= 0 {
  2120. page = 1
  2121. }
  2122. listType := ctx.Query("listType")
  2123. ctx.Data["ListType"] = listType
  2124. if listType == models.AllResource {
  2125. listType = ""
  2126. }
  2127. var jobTypes []string
  2128. jobTypes = append(jobTypes, string(models.JobTypeInference))
  2129. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  2130. ListOptions: models.ListOptions{
  2131. Page: page,
  2132. PageSize: setting.UI.IssuePagingNum,
  2133. },
  2134. RepoID: repo.ID,
  2135. ComputeResource: listType,
  2136. JobTypes: jobTypes,
  2137. Type: models.TypeCloudBrainAll,
  2138. })
  2139. if err != nil {
  2140. ctx.ServerError("Cloudbrain", err)
  2141. return
  2142. }
  2143. for i, task := range tasks {
  2144. tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  2145. tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  2146. if tasks[i].ComputeResource == "" {
  2147. tasks[i].ComputeResource = models.NPUResource
  2148. }
  2149. }
  2150. isQueryPrivate := isQueryPrivateModel(ctx)
  2151. repoId := ctx.Repo.Repository.ID
  2152. Type := -1
  2153. _, model_count, _ := models.QueryModel(&models.AiModelQueryOptions{
  2154. ListOptions: models.ListOptions{
  2155. Page: 1,
  2156. PageSize: 2,
  2157. },
  2158. RepoID: repoId,
  2159. Type: Type,
  2160. New: MODEL_LATEST,
  2161. IsOnlyThisRepo: true,
  2162. Status: 0,
  2163. IsQueryPrivate: isQueryPrivate,
  2164. })
  2165. ctx.Data["MODEL_COUNT"] = model_count
  2166. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  2167. pager.SetDefaultParams(ctx)
  2168. ctx.Data["Page"] = pager
  2169. ctx.Data["PageIsCloudBrain"] = true
  2170. ctx.Data["Tasks"] = tasks
  2171. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  2172. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  2173. ctx.HTML(200, tplModelArtsInferenceJobIndex)
  2174. }
  2175. func InferenceJobNew(ctx *context.Context) {
  2176. err := inferenceJobNewDataPrepare(ctx)
  2177. if err != nil {
  2178. ctx.ServerError("get new inference-job info failed", err)
  2179. return
  2180. }
  2181. ctx.HTML(200, tplModelArtsInferenceJobNew)
  2182. }
  2183. func inferenceJobNewDataPrepare(ctx *context.Context) error {
  2184. ctx.Data["PageIsCloudBrain"] = true
  2185. ctx.Data["newInference"] = true
  2186. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  2187. ctx.Data["display_job_name"] = displayJobName
  2188. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  2189. if err != nil {
  2190. ctx.ServerError("GetAllUserAttachments failed:", err)
  2191. return err
  2192. }
  2193. ctx.Data["attachments"] = attachs
  2194. var resourcePools modelarts.ResourcePool
  2195. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  2196. ctx.ServerError("json.Unmarshal failed:", err)
  2197. return err
  2198. }
  2199. ctx.Data["resource_pools"] = resourcePools.Info
  2200. var engines modelarts.Engine
  2201. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  2202. ctx.ServerError("json.Unmarshal failed:", err)
  2203. return err
  2204. }
  2205. ctx.Data["engines"] = engines.Info
  2206. var versionInfos modelarts.VersionInfo
  2207. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  2208. ctx.ServerError("json.Unmarshal failed:", err)
  2209. return err
  2210. }
  2211. ctx.Data["engine_versions"] = versionInfos.Version
  2212. prepareCloudbrainTwoInferenceSpecs(ctx)
  2213. ctx.Data["params"] = ""
  2214. ctx.Data["branchName"] = ctx.Repo.BranchName
  2215. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  2216. if err != nil {
  2217. ctx.ServerError("getConfigList failed:", err)
  2218. return err
  2219. }
  2220. ctx.Data["config_list"] = configList.ParaConfigs
  2221. isQueryPrivate := isQueryPrivateModel(ctx)
  2222. repoId := ctx.Repo.Repository.ID
  2223. Type := -1
  2224. _, model_count, _ := models.QueryModel(&models.AiModelQueryOptions{
  2225. ListOptions: models.ListOptions{
  2226. Page: 1,
  2227. PageSize: 2,
  2228. },
  2229. RepoID: repoId,
  2230. Type: Type,
  2231. New: MODEL_LATEST,
  2232. IsOnlyThisRepo: true,
  2233. Status: 0,
  2234. IsQueryPrivate: isQueryPrivate,
  2235. })
  2236. ctx.Data["MODEL_COUNT"] = model_count
  2237. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  2238. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  2239. ctx.Data["WaitCount"] = waitCount
  2240. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainTwo, string(models.JobTypeInference))
  2241. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  2242. return nil
  2243. }
  2244. func prepareCloudbrainTwoInferenceSpecs(ctx *context.Context) {
  2245. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  2246. JobType: models.JobTypeInference,
  2247. ComputeResource: models.NPU,
  2248. Cluster: models.OpenICluster,
  2249. AiCenterCode: models.AICenterOfCloudBrainTwo,
  2250. })
  2251. ctx.Data["Specs"] = noteBookSpecs
  2252. }
  2253. func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error {
  2254. ctx.Data["PageIsCloudBrain"] = true
  2255. t := time.Now()
  2256. var jobName = "inference" + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  2257. ctx.Data["job_name"] = jobName
  2258. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  2259. if err != nil {
  2260. ctx.ServerError("GetAllUserAttachments failed:", err)
  2261. return err
  2262. }
  2263. ctx.Data["attachments"] = attachs
  2264. var resourcePools modelarts.ResourcePool
  2265. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  2266. ctx.ServerError("json.Unmarshal failed:", err)
  2267. return err
  2268. }
  2269. ctx.Data["resource_pools"] = resourcePools.Info
  2270. var engines modelarts.Engine
  2271. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  2272. ctx.ServerError("json.Unmarshal failed:", err)
  2273. return err
  2274. }
  2275. ctx.Data["engines"] = engines.Info
  2276. var versionInfos modelarts.VersionInfo
  2277. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  2278. ctx.ServerError("json.Unmarshal failed:", err)
  2279. return err
  2280. }
  2281. ctx.Data["engine_versions"] = versionInfos.Version
  2282. prepareCloudbrainTwoInferenceSpecs(ctx)
  2283. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  2284. if err != nil {
  2285. ctx.ServerError("getConfigList failed:", err)
  2286. return err
  2287. }
  2288. var Parameters modelarts.Parameters
  2289. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  2290. ctx.ServerError("json.Unmarshal failed:", err)
  2291. return err
  2292. }
  2293. ctx.Data["params"] = Parameters.Parameter
  2294. ctx.Data["config_list"] = configList.ParaConfigs
  2295. ctx.Data["bootFile"] = form.BootFile
  2296. ctx.Data["uuid"] = form.Attachment
  2297. _, datasetNames, err := models.GetDatasetInfo(form.Attachment)
  2298. if err != nil {
  2299. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  2300. return nil
  2301. }
  2302. ctx.Data["dataset_name"] = datasetNames
  2303. ctx.Data["branch_name"] = form.BranchName
  2304. ctx.Data["model_name"] = form.ModelName
  2305. ctx.Data["model_version"] = form.ModelVersion
  2306. ctx.Data["ckpt_name"] = form.CkptName
  2307. ctx.Data["train_url"] = form.TrainUrl
  2308. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  2309. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  2310. ctx.Data["WaitCount"] = waitCount
  2311. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainTwo, string(models.JobTypeInference))
  2312. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  2313. return nil
  2314. }
  2315. func InferenceJobShow(ctx *context.Context) {
  2316. ctx.Data["PageIsCloudBrain"] = true
  2317. var jobID = ctx.Params(":jobid")
  2318. page := ctx.QueryInt("page")
  2319. if page <= 0 {
  2320. page = 1
  2321. }
  2322. task, err := models.GetCloudbrainByJobID(jobID)
  2323. if err != nil {
  2324. log.Error("GetInferenceTask(%s) failed:%v", jobID, err.Error())
  2325. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  2326. return
  2327. }
  2328. //设置权限
  2329. canNewJob, err := canUserCreateTrainJobVersion(ctx, task.UserID)
  2330. if err != nil {
  2331. ctx.ServerError("canNewJob failed", err)
  2332. return
  2333. }
  2334. ctx.Data["canNewJob"] = canNewJob
  2335. //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
  2336. var parameters models.Parameters
  2337. err = json.Unmarshal([]byte(task.Parameters), &parameters)
  2338. if err != nil {
  2339. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  2340. trainJobNewDataPrepare(ctx)
  2341. return
  2342. }
  2343. if len(parameters.Parameter) > 0 {
  2344. paramTemp := ""
  2345. for _, Parameter := range parameters.Parameter {
  2346. param := Parameter.Label + " = " + Parameter.Value + "; "
  2347. paramTemp = paramTemp + param
  2348. }
  2349. task.Parameters = paramTemp[:len(paramTemp)-2]
  2350. } else {
  2351. task.Parameters = ""
  2352. }
  2353. prepareSpec4Show(ctx, task)
  2354. LabelName := strings.Fields(task.LabelName)
  2355. ctx.Data["labelName"] = LabelName
  2356. ctx.Data["jobID"] = jobID
  2357. ctx.Data["jobName"] = task.JobName
  2358. ctx.Data["displayJobName"] = task.DisplayJobName
  2359. ctx.Data["task"] = task
  2360. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  2361. ctx.Data["datasetDownload"] = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  2362. tempUids := []int64{}
  2363. tempUids = append(tempUids, task.UserID)
  2364. JobCreater, err := models.GetUserNamesByIDs(tempUids)
  2365. if err != nil {
  2366. log.Error("GetUserNamesByIDs (WhitelistUserIDs): %v", err)
  2367. }
  2368. ctx.Data["userName"] = JobCreater[0]
  2369. ctx.HTML(http.StatusOK, tplModelArtsInferenceJobShow)
  2370. }
  2371. func MultiModelDownload(ctx *context.Context) {
  2372. var (
  2373. err error
  2374. )
  2375. jobID := ctx.Params(":jobid")
  2376. versionName := ctx.Query("version_name")
  2377. parentDir := ctx.Query("parent_dir")
  2378. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  2379. if err != nil {
  2380. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", task.JobName, err.Error())
  2381. return
  2382. }
  2383. if task.ComputeResource == models.NPUResource {
  2384. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName, parentDir), "/")
  2385. path = strings.TrimSuffix(path, "/")
  2386. path += "/"
  2387. allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
  2388. if err == nil {
  2389. returnFileName := task.DisplayJobName + ".zip"
  2390. ObsDownloadManyFile(path, ctx, returnFileName, allFile)
  2391. } else {
  2392. log.Info("error,msg=" + err.Error())
  2393. ctx.ServerError("no file to download.", err)
  2394. }
  2395. } else if task.ComputeResource == models.GPUResource {
  2396. filePath := setting.CBCodePathPrefix + task.JobName + cloudbrain.ModelMountPath + "/" + parentDir
  2397. allFile, err := storage.GetAllObjectByBucketAndPrefixMinio(setting.Attachment.Minio.Bucket, filePath)
  2398. if err == nil {
  2399. returnFileName := task.DisplayJobName + ".zip"
  2400. MinioDownloadManyFile(filePath, ctx, returnFileName, allFile)
  2401. } else {
  2402. log.Info("error,msg=" + err.Error())
  2403. ctx.ServerError("no file to download.", err)
  2404. }
  2405. }
  2406. }
  2407. func ModelDownload(ctx *context.Context) {
  2408. var (
  2409. err error
  2410. )
  2411. jobID := ctx.Params(":jobid")
  2412. versionName := ctx.Query("version_name")
  2413. parentDir := ctx.Query("parent_dir")
  2414. fileName := ctx.Query("file_name")
  2415. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  2416. if err != nil {
  2417. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", task.JobName, err.Error())
  2418. return
  2419. }
  2420. var url string
  2421. if task.ComputeResource == models.NPUResource {
  2422. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName, parentDir, fileName), "/")
  2423. url, err = storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
  2424. if err != nil {
  2425. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  2426. ctx.ServerError("GetObsCreateSignedUrl", err)
  2427. return
  2428. }
  2429. } else if task.ComputeResource == models.GPUResource {
  2430. filePath := setting.CBCodePathPrefix + task.JobName + cloudbrain.ModelMountPath + "/" + parentDir
  2431. url, err = storage.Attachments.PresignedGetURL(filePath, fileName)
  2432. if err != nil {
  2433. log.Error("PresignedGetURL failed: %v", err.Error(), ctx.Data["msgID"])
  2434. ctx.ServerError("PresignedGetURL", err)
  2435. return
  2436. }
  2437. }
  2438. ctx.Resp.Header().Set("Cache-Control", "max-age=0")
  2439. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  2440. }
  2441. func ResultDownload(ctx *context.Context) {
  2442. var (
  2443. err error
  2444. )
  2445. versionName := ctx.Query("version_name")
  2446. parentDir := ctx.Query("parent_dir")
  2447. fileName := ctx.Query("file_name")
  2448. task := ctx.Cloudbrain
  2449. if err != nil {
  2450. ctx.Data["error"] = err.Error()
  2451. }
  2452. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName, parentDir, fileName), "/")
  2453. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
  2454. if err != nil {
  2455. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  2456. ctx.ServerError("GetObsCreateSignedUrl", err)
  2457. return
  2458. }
  2459. ctx.Resp.Header().Set("Cache-Control", "max-age=0")
  2460. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  2461. }
  2462. func DeleteJobStorage(jobName string) error {
  2463. //delete local
  2464. localJobPath := setting.JobPath + jobName
  2465. err := os.RemoveAll(localJobPath)
  2466. if err != nil {
  2467. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  2468. }
  2469. //delete oss
  2470. dirPath := setting.CodePathPrefix + jobName + "/"
  2471. err = storage.ObsRemoveObject(setting.Bucket, dirPath)
  2472. if err != nil {
  2473. log.Error("ObsRemoveObject(%s) failed:%v", localJobPath, err)
  2474. }
  2475. return nil
  2476. }
  2477. func DownloadMultiResultFile(ctx *context.Context) {
  2478. var jobID = ctx.Params(":jobid")
  2479. var versionName = ctx.Query("version_name")
  2480. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  2481. if err != nil {
  2482. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  2483. return
  2484. }
  2485. // if !isCanDeleteOrDownload(ctx, task) {
  2486. // ctx.ServerError("no right.", errors.New(ctx.Tr("repo.model_noright")))
  2487. // return
  2488. // }
  2489. // path := Model_prefix + models.AttachmentRelativePath(id) + "/"
  2490. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName), "/") + "/"
  2491. allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
  2492. if err == nil {
  2493. //count++
  2494. // models.ModifyModelDownloadCount(id)
  2495. returnFileName := task.DisplayJobName + ".zip"
  2496. ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+returnFileName)
  2497. ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
  2498. w := zip.NewWriter(ctx.Resp)
  2499. defer w.Close()
  2500. for _, oneFile := range allFile {
  2501. if oneFile.IsDir {
  2502. log.Info("zip dir name:" + oneFile.FileName)
  2503. } else {
  2504. log.Info("zip file name:" + oneFile.FileName)
  2505. fDest, err := w.Create(oneFile.FileName)
  2506. if err != nil {
  2507. log.Info("create zip entry error, download file failed: %s\n", err.Error())
  2508. ctx.ServerError("download file failed:", err)
  2509. return
  2510. }
  2511. body, err := storage.ObsDownloadAFile(setting.Bucket, path+oneFile.FileName)
  2512. if err != nil {
  2513. log.Info("download file failed: %s\n", err.Error())
  2514. ctx.ServerError("download file failed:", err)
  2515. return
  2516. } else {
  2517. defer body.Close()
  2518. p := make([]byte, 1024)
  2519. var readErr error
  2520. var readCount int
  2521. // 读取对象内容
  2522. for {
  2523. readCount, readErr = body.Read(p)
  2524. if readCount > 0 {
  2525. fDest.Write(p[:readCount])
  2526. }
  2527. if readErr != nil {
  2528. break
  2529. }
  2530. }
  2531. }
  2532. }
  2533. }
  2534. } else {
  2535. log.Info("error,msg=" + err.Error())
  2536. ctx.ServerError("no file to download.", err)
  2537. }
  2538. }
  2539. func SetJobCount(ctx *context.Context) {
  2540. repoId := ctx.Repo.Repository.ID
  2541. _, jobCount, err := models.Cloudbrains(&models.CloudbrainsOptions{
  2542. RepoID: repoId,
  2543. Type: models.TypeCloudBrainAll,
  2544. })
  2545. if err != nil {
  2546. ctx.ServerError("Get job faild:", err)
  2547. return
  2548. }
  2549. ctx.Data["jobCount"] = jobCount
  2550. }
  2551. func TrainJobDownloadLogFile(ctx *context.Context) {
  2552. var (
  2553. err error
  2554. )
  2555. var jobID = ctx.Params(":jobid")
  2556. versionName := ctx.Query("version_name")
  2557. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  2558. if err != nil {
  2559. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", task.JobName, err.Error(), ctx.Data["msgID"])
  2560. ctx.ServerError("GetCloudbrainByJobIDAndVersionName", err)
  2561. return
  2562. }
  2563. prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job"
  2564. key, err := storage.GetObsLogFileName(prefix)
  2565. if err != nil {
  2566. log.Error("GetObsLogFileName(%s) failed:%v", jobID, err.Error(), ctx.Data["msgID"])
  2567. ctx.ServerError("GetObsLogFileName", err)
  2568. return
  2569. }
  2570. if len(key) > 1 {
  2571. ObsDownloadManyFile(prefix[0:len(prefix)-3], ctx, task.DisplayJobName+".zip", key)
  2572. } else {
  2573. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, key[0].ParenDir+key[0].FileName)
  2574. if err != nil {
  2575. log.Error("GetObsCreateSignedUrlByBucketAndKey failed: %v", err.Error(), ctx.Data["msgID"])
  2576. ctx.ServerError("GetObsCreateSignedUrlByBucketAndKey", err)
  2577. return
  2578. }
  2579. ctx.Resp.Header().Set("Cache-Control", "max-age=0")
  2580. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusTemporaryRedirect)
  2581. }
  2582. }
  2583. func getDatasUrlListByUUIDS(uuidStr string) ([]models.Datasurl, string, string, bool, error) {
  2584. var isMultiDataset bool
  2585. var dataUrl string
  2586. var datasetNames string
  2587. var datasUrlList []models.Datasurl
  2588. uuids := strings.Split(uuidStr, ";")
  2589. if len(uuids) > setting.MaxDatasetNum {
  2590. log.Error("the dataset count(%d) exceed the limit", len(uuids))
  2591. return datasUrlList, dataUrl, datasetNames, isMultiDataset, errors.New("the dataset count exceed the limit")
  2592. }
  2593. datasetInfos := make(map[string]models.DatasetInfo)
  2594. attachs, err := models.GetAttachmentsByUUIDs(uuids)
  2595. if err != nil || len(attachs) != len(uuids) {
  2596. log.Error("GetAttachmentsByUUIDs failed: %v", err)
  2597. return datasUrlList, dataUrl, datasetNames, isMultiDataset, errors.New("GetAttachmentsByUUIDs failed")
  2598. }
  2599. for i, tmpUuid := range uuids {
  2600. var attach *models.Attachment
  2601. for _, tmpAttach := range attachs {
  2602. if tmpAttach.UUID == tmpUuid {
  2603. attach = tmpAttach
  2604. break
  2605. }
  2606. }
  2607. if attach == nil {
  2608. log.Error("GetAttachmentsByUUIDs failed: %v", err)
  2609. return datasUrlList, dataUrl, datasetNames, isMultiDataset, errors.New("GetAttachmentsByUUIDs failed")
  2610. }
  2611. fileName := strings.TrimSuffix(strings.TrimSuffix(strings.TrimSuffix(attach.Name, ".zip"), ".tar.gz"), ".tgz")
  2612. for _, datasetInfo := range datasetInfos {
  2613. if fileName == datasetInfo.Name {
  2614. log.Error("the dataset name is same: %v", attach.Name)
  2615. return datasUrlList, dataUrl, datasetNames, isMultiDataset, errors.New("the dataset name is same")
  2616. }
  2617. }
  2618. if len(attachs) <= 1 {
  2619. dataUrl = "/" + setting.Bucket + "/" + setting.BasePath + path.Join(attach.UUID[0:1], attach.UUID[1:2]) + "/" + attach.UUID + attach.UUID + "/"
  2620. isMultiDataset = false
  2621. } else {
  2622. dataUrl = "/" + setting.Bucket + "/" + setting.BasePath + path.Join(attachs[0].UUID[0:1], attachs[0].UUID[1:2]) + "/" + attachs[0].UUID + attachs[0].UUID + "/"
  2623. datasetUrl := "s3://" + setting.Bucket + "/" + setting.BasePath + path.Join(attach.UUID[0:1], attach.UUID[1:2]) + "/" + attach.UUID + attach.UUID + "/"
  2624. datasUrlList = append(datasUrlList, models.Datasurl{
  2625. DatasetUrl: datasetUrl,
  2626. DatasetName: fileName,
  2627. })
  2628. isMultiDataset = true
  2629. }
  2630. if i == 0 {
  2631. datasetNames = attach.Name
  2632. } else {
  2633. datasetNames += ";" + attach.Name
  2634. }
  2635. }
  2636. return datasUrlList, dataUrl, datasetNames, isMultiDataset, nil
  2637. }