You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 82 kB

4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package repo
  2. import (
  3. "archive/zip"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "path"
  12. "strconv"
  13. "strings"
  14. "time"
  15. "unicode/utf8"
  16. "code.gitea.io/gitea/modules/notification"
  17. "code.gitea.io/gitea/modules/timeutil"
  18. "code.gitea.io/gitea/models"
  19. "code.gitea.io/gitea/modules/auth"
  20. "code.gitea.io/gitea/modules/base"
  21. "code.gitea.io/gitea/modules/cloudbrain"
  22. "code.gitea.io/gitea/modules/context"
  23. "code.gitea.io/gitea/modules/git"
  24. "code.gitea.io/gitea/modules/log"
  25. "code.gitea.io/gitea/modules/modelarts"
  26. "code.gitea.io/gitea/modules/obs"
  27. "code.gitea.io/gitea/modules/setting"
  28. "code.gitea.io/gitea/modules/storage"
  29. "code.gitea.io/gitea/modules/util"
  30. )
  31. const (
  32. tplDebugJobIndex base.TplName = "repo/debugjob/index"
  33. tplModelArtsNotebookIndex base.TplName = "repo/modelarts/notebook/index"
  34. tplModelArtsNotebookNew base.TplName = "repo/modelarts/notebook/new"
  35. tplModelArtsNotebookShow base.TplName = "repo/modelarts/notebook/show"
  36. tplModelArtsTrainJobIndex base.TplName = "repo/modelarts/trainjob/index"
  37. tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new"
  38. tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show"
  39. tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new"
  40. tplModelArtsInferenceJobIndex base.TplName = "repo/modelarts/inferencejob/index"
  41. tplModelArtsInferenceJobNew base.TplName = "repo/modelarts/inferencejob/new"
  42. tplModelArtsInferenceJobShow base.TplName = "repo/modelarts/inferencejob/show"
  43. )
  44. func DebugJobIndex(ctx *context.Context) {
  45. listType := ctx.Query("debugListType")
  46. if listType == "" {
  47. listType = models.AllResource
  48. }
  49. ctx.Data["ListType"] = listType
  50. MustEnableCloudbrain(ctx)
  51. repo := ctx.Repo.Repository
  52. page := ctx.QueryInt("page")
  53. if page <= 0 {
  54. page = 1
  55. }
  56. typeCloudBrain := models.TypeCloudBrainAll
  57. jobTypeNot := false
  58. if listType == models.GPUResource {
  59. typeCloudBrain = models.TypeCloudBrainOne
  60. } else if listType == models.NPUResource {
  61. typeCloudBrain = models.TypeCloudBrainTwo
  62. } else if listType == models.AllResource {
  63. typeCloudBrain = models.TypeCloudBrainAll
  64. } else {
  65. log.Error("listType(%s) error", listType)
  66. ctx.ServerError("listType error", errors.New("listType error"))
  67. return
  68. }
  69. var jobTypes []string
  70. jobTypes = append(jobTypes, string(models.JobTypeDebug))
  71. ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  72. ListOptions: models.ListOptions{
  73. Page: page,
  74. PageSize: setting.UI.IssuePagingNum,
  75. },
  76. RepoID: repo.ID,
  77. Type: typeCloudBrain,
  78. JobTypeNot: jobTypeNot,
  79. JobTypes: jobTypes,
  80. })
  81. if err != nil {
  82. ctx.ServerError("Get debugjob faild:", err)
  83. return
  84. }
  85. for i, task := range ciTasks {
  86. ciTasks[i].CanDebug = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  87. ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  88. ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource
  89. }
  90. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  91. pager.AddParam(ctx, "debugListType", "ListType")
  92. ctx.Data["Page"] = pager
  93. ctx.Data["PageIsCloudBrain"] = true
  94. ctx.Data["Tasks"] = ciTasks
  95. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  96. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  97. ctx.Data["debugListType"] = listType
  98. ctx.HTML(200, tplDebugJobIndex)
  99. }
  100. // MustEnableDataset check if repository enable internal cb
  101. func MustEnableModelArts(ctx *context.Context) {
  102. if !ctx.Repo.CanRead(models.UnitTypeCloudBrain) {
  103. ctx.NotFound("MustEnableCloudbrain", nil)
  104. return
  105. }
  106. }
  107. func NotebookNew(ctx *context.Context) {
  108. notebookNewDataPrepare(ctx)
  109. ctx.HTML(200, tplModelArtsNotebookNew)
  110. }
  111. func notebookNewDataPrepare(ctx *context.Context) error {
  112. ctx.Data["PageIsCloudBrain"] = true
  113. t := time.Now()
  114. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  115. ctx.Data["display_job_name"] = displayJobName
  116. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  117. if err != nil {
  118. ctx.ServerError("GetAllUserAttachments failed:", err)
  119. return err
  120. }
  121. ctx.Data["attachments"] = attachs
  122. if modelarts.ImageInfos == nil {
  123. json.Unmarshal([]byte(setting.ImageInfos), &modelarts.ImageInfos)
  124. }
  125. ctx.Data["images"] = modelarts.ImageInfos.ImageInfo
  126. if modelarts.FlavorInfos == nil {
  127. json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos)
  128. }
  129. ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo
  130. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  131. return nil
  132. }
  133. func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
  134. ctx.Data["PageIsNotebook"] = true
  135. jobName := form.JobName
  136. uuid := form.Attachment
  137. description := form.Description
  138. flavor := form.Flavor
  139. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  140. if err != nil {
  141. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  142. cloudBrainNewDataPrepare(ctx)
  143. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  144. return
  145. } else {
  146. if count >= 1 {
  147. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  148. cloudBrainNewDataPrepare(ctx)
  149. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form)
  150. return
  151. }
  152. }
  153. _, err = models.GetCloudbrainByName(jobName)
  154. if err == nil {
  155. log.Error("the job name did already exist", ctx.Data["MsgID"])
  156. cloudBrainNewDataPrepare(ctx)
  157. ctx.RenderWithErr("the job name did already exist", tplModelArtsNotebookNew, &form)
  158. return
  159. } else {
  160. if !models.IsErrJobNotExist(err) {
  161. log.Error("system error, %v", err, ctx.Data["MsgID"])
  162. cloudBrainNewDataPrepare(ctx)
  163. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  164. return
  165. }
  166. }
  167. err = modelarts.GenerateTask(ctx, jobName, uuid, description, flavor)
  168. if err != nil {
  169. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
  170. return
  171. }
  172. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  173. }
  174. func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
  175. ctx.Data["PageIsNotebook"] = true
  176. displayJobName := form.DisplayJobName
  177. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  178. uuid := form.Attachment
  179. description := form.Description
  180. flavor := form.Flavor
  181. imageId := form.ImageId
  182. repo := ctx.Repo.Repository
  183. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  184. if err != nil {
  185. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  186. notebookNewDataPrepare(ctx)
  187. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  188. return
  189. } else {
  190. if count >= 1 {
  191. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  192. notebookNewDataPrepare(ctx)
  193. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form)
  194. return
  195. }
  196. }
  197. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName)
  198. if err == nil {
  199. if len(tasks) != 0 {
  200. log.Error("the job name did already exist", ctx.Data["MsgID"])
  201. notebookNewDataPrepare(ctx)
  202. ctx.RenderWithErr("the job name did already exist", tplModelArtsNotebookNew, &form)
  203. return
  204. }
  205. } else {
  206. if !models.IsErrJobNotExist(err) {
  207. log.Error("system error, %v", err, ctx.Data["MsgID"])
  208. notebookNewDataPrepare(ctx)
  209. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  210. return
  211. }
  212. }
  213. err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, flavor, imageId)
  214. if err != nil {
  215. log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"])
  216. notebookNewDataPrepare(ctx)
  217. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
  218. return
  219. }
  220. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  221. }
  222. func NotebookShow(ctx *context.Context) {
  223. ctx.Data["PageIsCloudBrain"] = true
  224. debugListType := ctx.Query("debugListType")
  225. if debugListType == "" {
  226. debugListType = "all"
  227. }
  228. var ID = ctx.Params(":id")
  229. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  230. if err != nil {
  231. ctx.Data["error"] = err.Error()
  232. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  233. return
  234. }
  235. result, err := modelarts.GetNotebook2(task.JobID)
  236. if err != nil {
  237. ctx.Data["error"] = err.Error()
  238. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  239. return
  240. }
  241. if result != nil {
  242. if task.DeletedAt.IsZero() { //normal record
  243. if task.Status != result.Status {
  244. task.Status = result.Status
  245. models.ParseAndSetDurationFromModelArtsNotebook(result, task)
  246. err = models.UpdateJob(task)
  247. if err != nil {
  248. ctx.Data["error"] = err.Error()
  249. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  250. return
  251. }
  252. }
  253. } else { //deleted record
  254. }
  255. }
  256. datasetDownloadLink := ""
  257. datasetDownload := make([]models.DatasetDownload, 0)
  258. if ctx.IsSigned {
  259. if task.Uuid != "" && task.UserID == ctx.User.ID {
  260. uuidList := strings.Split(task.Uuid, ";")
  261. for _, uuidStr := range uuidList {
  262. attachment, err := models.GetAttachmentByUUID(uuidStr)
  263. if err == nil {
  264. datasetDownloadLink = datasetDownloadLink + attachment.S3DownloadURL()
  265. datasetDownload = append(datasetDownload, models.DatasetDownload{
  266. DatasetName: attachment.Name,
  267. DatasetDownloadLink: datasetDownloadLink,
  268. })
  269. }
  270. }
  271. // datasetName, err := GetDatasetNameByUUID(task.Uuid)
  272. // if err == nil {
  273. // task.DatasetName = datasetName
  274. // }
  275. }
  276. }
  277. user, err := models.GetUserByID(task.UserID)
  278. if err == nil {
  279. task.User = user
  280. }
  281. if modelarts.FlavorInfos == nil {
  282. json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos)
  283. }
  284. if modelarts.FlavorInfos != nil {
  285. ctx.Data["resource_spec"] = modelarts.FlavorInfos.FlavorInfo[0].Desc
  286. for _, f := range modelarts.FlavorInfos.FlavorInfo {
  287. if fmt.Sprint(f.Value) == task.FlavorCode {
  288. ctx.Data["resource_spec"] = f.Desc
  289. break
  290. }
  291. }
  292. }
  293. if task.TrainJobDuration == "" {
  294. if task.Duration == 0 {
  295. var duration int64
  296. if task.Status == string(models.JobRunning) {
  297. duration = time.Now().Unix() - int64(task.CreatedUnix)
  298. } else {
  299. duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)
  300. }
  301. task.Duration = duration
  302. }
  303. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  304. }
  305. ctx.Data["duration"] = task.TrainJobDuration
  306. ctx.Data["datasetDownloadLink"] = datasetDownloadLink
  307. ctx.Data["datasetDownload"] = datasetDownload
  308. ctx.Data["task"] = task
  309. ctx.Data["ID"] = ID
  310. ctx.Data["jobName"] = task.JobName
  311. ctx.Data["debugListType"] = debugListType
  312. ctx.HTML(200, tplModelArtsNotebookShow)
  313. }
  314. func NotebookDebug(ctx *context.Context) {
  315. var jobID = ctx.Params(":jobid")
  316. result, err := modelarts.GetJob(jobID)
  317. if err != nil {
  318. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  319. return
  320. }
  321. res, err := modelarts.GetJobToken(jobID)
  322. if err != nil {
  323. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  324. return
  325. }
  326. urls := strings.Split(result.Spec.Annotations.Url, "/")
  327. urlPrefix := result.Spec.Annotations.TargetDomain
  328. for i, url := range urls {
  329. if i > 2 {
  330. urlPrefix += "/" + url
  331. }
  332. }
  333. debugUrl := urlPrefix + "?token=" + res.Token
  334. ctx.Redirect(debugUrl)
  335. }
  336. func NotebookDebug2(ctx *context.Context) {
  337. task := ctx.Cloudbrain
  338. result, err := modelarts.GetNotebook2(task.JobID)
  339. if err != nil {
  340. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  341. return
  342. }
  343. ctx.Redirect(result.Url + "?token=" + result.Token)
  344. }
  345. func NotebookManage(ctx *context.Context) {
  346. var ID = ctx.Params(":id")
  347. var action = ctx.Params(":action")
  348. var resultCode = "0"
  349. var errorMsg = ""
  350. var status = ""
  351. for {
  352. task, err := models.GetCloudbrainByID(ID)
  353. if err != nil {
  354. log.Error("get task(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  355. resultCode = "-1"
  356. errorMsg = "system error"
  357. break
  358. }
  359. if action == models.ActionStop {
  360. if task.Status != string(models.ModelArtsRunning) {
  361. log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
  362. resultCode = "-1"
  363. errorMsg = "the job is not running"
  364. break
  365. }
  366. if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin() && !ctx.IsUserRepoOwner()) {
  367. log.Error("the user has no right ro stop the job", task.JobName, ctx.Data["MsgID"])
  368. resultCode = "-1"
  369. errorMsg = "you have no right to stop the job"
  370. break
  371. }
  372. } else if action == models.ActionRestart {
  373. ctx.CheckWechatBind()
  374. if ctx.Written() {
  375. return
  376. }
  377. if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
  378. log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
  379. resultCode = "-1"
  380. errorMsg = "the job is not stopped"
  381. break
  382. }
  383. if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin()) {
  384. log.Error("the user has no right ro restart the job", task.JobName, ctx.Data["MsgID"])
  385. resultCode = "-1"
  386. errorMsg = "you have no right to restart the job"
  387. break
  388. }
  389. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  390. if err != nil {
  391. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  392. resultCode = "-1"
  393. errorMsg = "system error"
  394. break
  395. } else {
  396. if count >= 1 {
  397. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  398. resultCode = "-1"
  399. errorMsg = "you have already a running or waiting task, can not create more"
  400. break
  401. }
  402. }
  403. action = models.ActionStart
  404. } else {
  405. log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"])
  406. resultCode = "-1"
  407. errorMsg = "非法操作"
  408. break
  409. }
  410. param := models.NotebookAction{
  411. Action: action,
  412. }
  413. createTime := timeutil.TimeStampNow()
  414. res, err := modelarts.ManageNotebook2(task.JobID, param)
  415. if err != nil {
  416. log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  417. resultCode = "-1"
  418. errorMsg = err.Error()
  419. if strings.Contains(err.Error(), modelarts.NotebookNotFound) {
  420. errorMsg = "the job's version is too old and can not be restarted"
  421. }
  422. break
  423. }
  424. status = res.Status
  425. if action == models.ActionStart {
  426. newTask := &models.Cloudbrain{
  427. Status: status,
  428. UserID: task.UserID,
  429. RepoID: task.RepoID,
  430. JobID: task.JobID,
  431. JobName: task.JobName,
  432. DisplayJobName: task.DisplayJobName,
  433. JobType: task.JobType,
  434. Type: task.Type,
  435. Uuid: task.Uuid,
  436. Image: task.Image,
  437. ComputeResource: task.ComputeResource,
  438. Description: task.Description,
  439. CreatedUnix: createTime,
  440. UpdatedUnix: createTime,
  441. }
  442. err = models.RestartCloudbrain(task, newTask)
  443. if err != nil {
  444. log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  445. resultCode = "-1"
  446. errorMsg = "system error"
  447. break
  448. }
  449. ID = strconv.FormatInt(newTask.ID, 10)
  450. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, ID, task.DisplayJobName, models.ActionCreateDebugNPUTask)
  451. } else {
  452. task.Status = res.Status
  453. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  454. task.EndTime = timeutil.TimeStampNow()
  455. }
  456. task.ComputeAndSetDuration()
  457. err = models.UpdateJob(task)
  458. if err != nil {
  459. log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  460. resultCode = "-1"
  461. errorMsg = "system error"
  462. break
  463. }
  464. }
  465. break
  466. }
  467. ctx.JSON(200, map[string]string{
  468. "result_code": resultCode,
  469. "error_msg": errorMsg,
  470. "status": status,
  471. "id": ID,
  472. })
  473. }
  474. func NotebookDel(ctx *context.Context) {
  475. var listType = ctx.Query("debugListType")
  476. task := ctx.Cloudbrain
  477. if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) {
  478. log.Error("the job(%s) has not been stopped", task.JobName)
  479. ctx.RenderWithErr("the job has not been stopped", tplDebugJobIndex, nil)
  480. return
  481. }
  482. _, err := modelarts.DelNotebook2(task.JobID)
  483. if err != nil {
  484. log.Error("DelNotebook2(%s) failed:%v", task.JobName, err.Error())
  485. if strings.Contains(err.Error(), modelarts.NotebookNotFound) || strings.Contains(err.Error(), modelarts.NotebookNoPermission) || strings.Contains(err.Error(), modelarts.NotebookInvalid) {
  486. log.Info("old notebook version")
  487. } else {
  488. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  489. return
  490. }
  491. }
  492. err = models.DeleteJob(task)
  493. if err != nil {
  494. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  495. return
  496. }
  497. var isAdminPage = ctx.Query("isadminpage")
  498. var isHomePage = ctx.Query("ishomepage")
  499. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  500. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  501. } else if isHomePage == "true" {
  502. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  503. } else {
  504. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
  505. }
  506. }
  507. func TrainJobIndex(ctx *context.Context) {
  508. MustEnableModelArts(ctx)
  509. repo := ctx.Repo.Repository
  510. page := ctx.QueryInt("page")
  511. if page <= 0 {
  512. page = 1
  513. }
  514. listType := ctx.Query("listType")
  515. ctx.Data["ListType"] = listType
  516. if listType == models.AllResource {
  517. listType = ""
  518. }
  519. var jobTypes []string
  520. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  521. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  522. ListOptions: models.ListOptions{
  523. Page: page,
  524. PageSize: setting.UI.IssuePagingNum,
  525. },
  526. RepoID: repo.ID,
  527. JobTypeNot: false,
  528. JobTypes: jobTypes,
  529. IsLatestVersion: modelarts.IsLatestVersion,
  530. ComputeResource: listType,
  531. Type: models.TypeCloudBrainAll,
  532. })
  533. if err != nil {
  534. ctx.ServerError("Cloudbrain", err)
  535. return
  536. }
  537. for i, task := range tasks {
  538. tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  539. tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  540. }
  541. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  542. pager.SetDefaultParams(ctx)
  543. pager.AddParam(ctx, "listType", "ListType")
  544. ctx.Data["Page"] = pager
  545. ctx.Data["PageIsCloudBrain"] = true
  546. ctx.Data["Tasks"] = tasks
  547. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  548. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  549. ctx.HTML(200, tplModelArtsTrainJobIndex)
  550. }
  551. func TrainJobNew(ctx *context.Context) {
  552. err := trainJobNewDataPrepare(ctx)
  553. if err != nil {
  554. ctx.ServerError("get new train-job info failed", err)
  555. return
  556. }
  557. ctx.HTML(200, tplModelArtsTrainJobNew)
  558. }
  559. func trainJobNewDataPrepare(ctx *context.Context) error {
  560. ctx.Data["PageIsCloudBrain"] = true
  561. //can, err := canUserCreateTrainJob(ctx.User.ID)
  562. //if err != nil {
  563. // ctx.ServerError("canUserCreateTrainJob", err)
  564. // return
  565. //}
  566. //
  567. //if !can {
  568. // log.Error("the user can not create train-job")
  569. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  570. // return
  571. //}
  572. t := time.Now()
  573. var displayJobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  574. ctx.Data["display_job_name"] = displayJobName
  575. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  576. if err != nil {
  577. ctx.ServerError("GetAllUserAttachments failed:", err)
  578. return err
  579. }
  580. ctx.Data["attachments"] = attachs
  581. var resourcePools modelarts.ResourcePool
  582. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  583. ctx.ServerError("json.Unmarshal failed:", err)
  584. return err
  585. }
  586. ctx.Data["resource_pools"] = resourcePools.Info
  587. var engines modelarts.Engine
  588. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  589. ctx.ServerError("json.Unmarshal failed:", err)
  590. return err
  591. }
  592. ctx.Data["engines"] = engines.Info
  593. var versionInfos modelarts.VersionInfo
  594. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  595. ctx.ServerError("json.Unmarshal failed:", err)
  596. return err
  597. }
  598. ctx.Data["engine_versions"] = versionInfos.Version
  599. var flavorInfos modelarts.Flavor
  600. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  601. ctx.ServerError("json.Unmarshal failed:", err)
  602. return err
  603. }
  604. ctx.Data["flavor_infos"] = flavorInfos.Info
  605. ctx.Data["params"] = ""
  606. ctx.Data["branchName"] = ctx.Repo.BranchName
  607. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  608. if err != nil {
  609. ctx.ServerError("getConfigList failed:", err)
  610. return err
  611. }
  612. ctx.Data["config_list"] = configList.ParaConfigs
  613. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  614. return nil
  615. }
  616. func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
  617. ctx.Data["PageIsCloudBrain"] = true
  618. //can, err := canUserCreateTrainJob(ctx.User.ID)
  619. //if err != nil {
  620. // ctx.ServerError("canUserCreateTrainJob", err)
  621. // return
  622. //}
  623. //
  624. //if !can {
  625. // log.Error("the user can not create train-job")
  626. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  627. // return
  628. //}
  629. t := time.Now()
  630. var displayJobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  631. ctx.Data["display_job_name"] = displayJobName
  632. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  633. if err != nil {
  634. ctx.ServerError("GetAllUserAttachments failed:", err)
  635. return err
  636. }
  637. ctx.Data["attachments"] = attachs
  638. var resourcePools modelarts.ResourcePool
  639. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  640. ctx.ServerError("json.Unmarshal failed:", err)
  641. return err
  642. }
  643. ctx.Data["resource_pools"] = resourcePools.Info
  644. var engines modelarts.Engine
  645. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  646. ctx.ServerError("json.Unmarshal failed:", err)
  647. return err
  648. }
  649. ctx.Data["engines"] = engines.Info
  650. var versionInfos modelarts.VersionInfo
  651. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  652. ctx.ServerError("json.Unmarshal failed:", err)
  653. return err
  654. }
  655. ctx.Data["engine_versions"] = versionInfos.Version
  656. var flavorInfos modelarts.Flavor
  657. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  658. ctx.ServerError("json.Unmarshal failed:", err)
  659. return err
  660. }
  661. ctx.Data["flavor_infos"] = flavorInfos.Info
  662. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  663. if err != nil {
  664. ctx.ServerError("getConfigList failed:", err)
  665. return err
  666. }
  667. var Parameters modelarts.Parameters
  668. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  669. ctx.ServerError("json.Unmarshal failed:", err)
  670. return err
  671. }
  672. ctx.Data["params"] = Parameters.Parameter
  673. ctx.Data["config_list"] = configList.ParaConfigs
  674. ctx.Data["bootFile"] = form.BootFile
  675. ctx.Data["uuid"] = form.Attachment
  676. ctx.Data["branch_name"] = form.BranchName
  677. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  678. return nil
  679. }
  680. func TrainJobNewVersion(ctx *context.Context) {
  681. err := trainJobNewVersionDataPrepare(ctx)
  682. if err != nil {
  683. ctx.ServerError("get new train-job info failed", err)
  684. return
  685. }
  686. ctx.HTML(200, tplModelArtsTrainJobVersionNew)
  687. }
  688. func trainJobNewVersionDataPrepare(ctx *context.Context) error {
  689. ctx.Data["PageIsCloudBrain"] = true
  690. var jobID = ctx.Params(":jobid")
  691. var versionName = ctx.Query("version_name")
  692. // canNewJob, err := canUserCreateTrainJobVersion(ctx, jobID, versionName)
  693. // if err != nil {
  694. // ctx.ServerError("canNewJob can info failed", err)
  695. // return err
  696. // }
  697. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  698. if err != nil {
  699. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  700. return err
  701. }
  702. ctx.Data["display_job_name"] = task.DisplayJobName
  703. ctx.Data["job_name"] = task.JobName
  704. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  705. if err != nil {
  706. ctx.ServerError("GetAllUserAttachments failed:", err)
  707. return err
  708. }
  709. ctx.Data["attachments"] = attachs
  710. var resourcePools modelarts.ResourcePool
  711. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  712. ctx.ServerError("json.Unmarshal failed:", err)
  713. return err
  714. }
  715. ctx.Data["resource_pools"] = resourcePools.Info
  716. var engines modelarts.Engine
  717. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  718. ctx.ServerError("json.Unmarshal failed:", err)
  719. return err
  720. }
  721. ctx.Data["engines"] = engines.Info
  722. var versionInfos modelarts.VersionInfo
  723. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  724. ctx.ServerError("json.Unmarshal failed:", err)
  725. return err
  726. }
  727. ctx.Data["engine_versions"] = versionInfos.Version
  728. var flavorInfos modelarts.Flavor
  729. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  730. ctx.ServerError("json.Unmarshal failed:", err)
  731. return err
  732. }
  733. ctx.Data["flavor_infos"] = flavorInfos.Info
  734. var Parameters modelarts.Parameters
  735. if err = json.Unmarshal([]byte(task.Parameters), &Parameters); err != nil {
  736. ctx.ServerError("json.Unmarshal failed:", err)
  737. return err
  738. }
  739. ctx.Data["params"] = Parameters.Parameter
  740. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  741. if err != nil {
  742. ctx.ServerError("GetBranches error:", err)
  743. return err
  744. }
  745. ctx.Data["branches"] = branches
  746. ctx.Data["branch_name"] = task.BranchName
  747. ctx.Data["description"] = task.Description
  748. ctx.Data["boot_file"] = task.BootFile
  749. ctx.Data["dataset_name"] = task.DatasetName
  750. ctx.Data["work_server_number"] = task.WorkServerNumber
  751. ctx.Data["flavor_name"] = task.FlavorName
  752. ctx.Data["engine_name"] = task.EngineName
  753. ctx.Data["uuid"] = task.Uuid
  754. ctx.Data["flavor_code"] = task.FlavorCode
  755. ctx.Data["engine_id"] = task.EngineID
  756. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  757. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  758. if err != nil {
  759. ctx.ServerError("getConfigList failed:", err)
  760. return err
  761. }
  762. ctx.Data["config_list"] = configList.ParaConfigs
  763. return nil
  764. }
  765. func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
  766. ctx.Data["PageIsCloudBrain"] = true
  767. var jobID = ctx.Params(":jobid")
  768. // var versionName = ctx.Params(":version-name")
  769. var versionName = ctx.Query("version_name")
  770. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  771. if err != nil {
  772. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  773. return err
  774. }
  775. t := time.Now()
  776. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  777. ctx.Data["job_name"] = task.JobName
  778. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  779. if err != nil {
  780. ctx.ServerError("GetAllUserAttachments failed:", err)
  781. return err
  782. }
  783. ctx.Data["attachments"] = attachs
  784. var resourcePools modelarts.ResourcePool
  785. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  786. ctx.ServerError("json.Unmarshal failed:", err)
  787. return err
  788. }
  789. ctx.Data["resource_pools"] = resourcePools.Info
  790. var engines modelarts.Engine
  791. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  792. ctx.ServerError("json.Unmarshal failed:", err)
  793. return err
  794. }
  795. ctx.Data["engines"] = engines.Info
  796. var versionInfos modelarts.VersionInfo
  797. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  798. ctx.ServerError("json.Unmarshal failed:", err)
  799. return err
  800. }
  801. ctx.Data["engine_versions"] = versionInfos.Version
  802. var flavorInfos modelarts.Flavor
  803. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  804. ctx.ServerError("json.Unmarshal failed:", err)
  805. return err
  806. }
  807. ctx.Data["flavor_infos"] = flavorInfos.Info
  808. var Parameters modelarts.Parameters
  809. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  810. ctx.ServerError("json.Unmarshal failed:", err)
  811. return err
  812. }
  813. ctx.Data["params"] = Parameters.Parameter
  814. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  815. ctx.Data["train_url"] = outputObsPath
  816. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  817. if err != nil {
  818. ctx.ServerError("GetBranches error:", err)
  819. return err
  820. }
  821. ctx.Data["branches"] = branches
  822. ctx.Data["description"] = form.Description
  823. ctx.Data["dataset_name"] = task.DatasetName
  824. ctx.Data["work_server_number"] = form.WorkServerNumber
  825. ctx.Data["flavor_name"] = form.FlavorName
  826. ctx.Data["engine_name"] = form.EngineName
  827. ctx.Data["flavor_code"] = task.FlavorCode
  828. ctx.Data["engine_id"] = task.EngineID
  829. ctx.Data["version_name"] = form.VersionName
  830. ctx.Data["bootFile"] = form.BootFile
  831. ctx.Data["uuid"] = form.Attachment
  832. ctx.Data["branch_name"] = form.BranchName
  833. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  834. if err != nil {
  835. ctx.ServerError("getConfigList failed:", err)
  836. return err
  837. }
  838. ctx.Data["config_list"] = configList.ParaConfigs
  839. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  840. return nil
  841. }
  842. func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  843. ctx.Data["PageIsTrainJob"] = true
  844. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  845. displayJobName := form.DisplayJobName
  846. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  847. uuid := form.Attachment
  848. description := form.Description
  849. workServerNumber := form.WorkServerNumber
  850. engineID := form.EngineID
  851. bootFile := strings.TrimSpace(form.BootFile)
  852. flavorCode := form.Flavor
  853. params := form.Params
  854. poolID := form.PoolID
  855. isSaveParam := form.IsSaveParam
  856. repo := ctx.Repo.Repository
  857. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  858. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  859. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  860. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  861. // dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  862. branch_name := form.BranchName
  863. isLatestVersion := modelarts.IsLatestVersion
  864. FlavorName := form.FlavorName
  865. VersionCount := modelarts.VersionCount
  866. EngineName := form.EngineName
  867. if IsDatasetUseCountExceed(uuid) {
  868. log.Error("DatasetUseCount is Exceed:%v")
  869. trainJobErrorNewDataPrepare(ctx, form)
  870. ctx.RenderWithErr("DatasetUseCount is Exceed", tplModelArtsTrainJobNew, &form)
  871. return
  872. }
  873. _, datasetNames, _, err := models.GetDatasetInfo(uuid)
  874. if err != nil {
  875. log.Error("GetDatasetInfo failed:%v", err, ctx.Data["MsgID"])
  876. trainJobErrorNewDataPrepare(ctx, form)
  877. ctx.RenderWithErr("GetDatasetInfo error", tplModelArtsTrainJobNew, &form)
  878. return
  879. }
  880. count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
  881. if err != nil {
  882. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  883. trainJobErrorNewDataPrepare(ctx, form)
  884. ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
  885. return
  886. } else {
  887. if count >= 1 {
  888. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  889. trainJobErrorNewDataPrepare(ctx, form)
  890. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form)
  891. return
  892. }
  893. }
  894. if err := paramCheckCreateTrainJob(form); err != nil {
  895. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  896. trainJobErrorNewDataPrepare(ctx, form)
  897. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  898. return
  899. }
  900. //Determine whether the task name of the task in the project is duplicated
  901. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  902. if err == nil {
  903. if len(tasks) != 0 {
  904. log.Error("the job name did already exist", ctx.Data["MsgID"])
  905. trainJobErrorNewDataPrepare(ctx, form)
  906. ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form)
  907. return
  908. }
  909. } else {
  910. if !models.IsErrJobNotExist(err) {
  911. log.Error("system error, %v", err, ctx.Data["MsgID"])
  912. trainJobErrorNewDataPrepare(ctx, form)
  913. ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
  914. return
  915. }
  916. }
  917. //todo: del the codeLocalPath
  918. _, err = ioutil.ReadDir(codeLocalPath)
  919. if err == nil {
  920. os.RemoveAll(codeLocalPath)
  921. }
  922. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  923. commitID, _ := gitRepo.GetBranchCommitID(branch_name)
  924. if err := downloadCode(repo, codeLocalPath, branch_name); err != nil {
  925. log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err)
  926. trainJobErrorNewDataPrepare(ctx, form)
  927. ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsTrainJobNew, &form)
  928. return
  929. }
  930. //todo: upload code (send to file_server todo this work?)
  931. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  932. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  933. trainJobErrorNewDataPrepare(ctx, form)
  934. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
  935. return
  936. }
  937. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  938. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  939. trainJobErrorNewDataPrepare(ctx, form)
  940. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
  941. return
  942. }
  943. // parentDir := VersionOutputPath + "/"
  944. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  945. // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  946. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  947. trainJobErrorNewDataPrepare(ctx, form)
  948. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
  949. return
  950. }
  951. var parameters models.Parameters
  952. param := make([]models.Parameter, 0)
  953. existDeviceTarget := false
  954. if len(params) != 0 {
  955. err := json.Unmarshal([]byte(params), &parameters)
  956. if err != nil {
  957. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  958. trainJobErrorNewDataPrepare(ctx, form)
  959. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
  960. return
  961. }
  962. for _, parameter := range parameters.Parameter {
  963. if parameter.Label == modelarts.DeviceTarget {
  964. existDeviceTarget = true
  965. }
  966. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  967. param = append(param, models.Parameter{
  968. Label: parameter.Label,
  969. Value: parameter.Value,
  970. })
  971. }
  972. }
  973. }
  974. if !existDeviceTarget {
  975. param = append(param, models.Parameter{
  976. Label: modelarts.DeviceTarget,
  977. Value: modelarts.Ascend,
  978. })
  979. }
  980. DatasUrlList, dataUrl, isMultiDataset, err := GetDatasUrlListByUUIDS(uuid)
  981. if err != nil {
  982. log.Error("Failed to GetDatasUrlListByUUIDS: %v", err)
  983. trainJobErrorNewDataPrepare(ctx, form)
  984. ctx.RenderWithErr("Failed to GetDatasUrlListByUUIDS:"+err.Error(), tplModelArtsTrainJobNew, &form)
  985. return
  986. }
  987. dataPath := dataUrl
  988. jsondatas, err := json.Marshal(DatasUrlList)
  989. if err != nil {
  990. log.Error("Failed to Marshal: %v", err)
  991. trainJobErrorNewDataPrepare(ctx, form)
  992. ctx.RenderWithErr("json error:"+err.Error(), tplModelArtsTrainJobNew, &form)
  993. return
  994. }
  995. if isMultiDataset {
  996. param = append(param, models.Parameter{
  997. Label: modelarts.MultiDataUrl,
  998. Value: string(jsondatas),
  999. })
  1000. }
  1001. //save param config
  1002. if isSaveParam == "on" {
  1003. saveparams := append(param, models.Parameter{
  1004. Label: modelarts.TrainUrl,
  1005. Value: outputObsPath,
  1006. }, models.Parameter{
  1007. Label: modelarts.DataUrl,
  1008. Value: dataPath,
  1009. })
  1010. if form.ParameterTemplateName == "" {
  1011. log.Error("ParameterTemplateName is empty")
  1012. trainJobNewDataPrepare(ctx)
  1013. ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
  1014. return
  1015. }
  1016. _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  1017. ConfigName: form.ParameterTemplateName,
  1018. Description: form.PrameterDescription,
  1019. DataUrl: dataPath,
  1020. AppUrl: codeObsPath,
  1021. BootFileUrl: codeObsPath + bootFile,
  1022. TrainUrl: outputObsPath,
  1023. Flavor: models.Flavor{
  1024. Code: flavorCode,
  1025. },
  1026. WorkServerNum: workServerNumber,
  1027. EngineID: int64(engineID),
  1028. LogUrl: logObsPath,
  1029. PoolID: poolID,
  1030. Parameter: saveparams,
  1031. })
  1032. if err != nil {
  1033. log.Error("Failed to CreateTrainJobConfig: %v", err)
  1034. trainJobErrorNewDataPrepare(ctx, form)
  1035. ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
  1036. return
  1037. }
  1038. }
  1039. req := &modelarts.GenerateTrainJobReq{
  1040. JobName: jobName,
  1041. DisplayJobName: displayJobName,
  1042. DataUrl: dataPath,
  1043. Description: description,
  1044. CodeObsPath: codeObsPath,
  1045. BootFileUrl: codeObsPath + bootFile,
  1046. BootFile: bootFile,
  1047. TrainUrl: outputObsPath,
  1048. FlavorCode: flavorCode,
  1049. WorkServerNumber: workServerNumber,
  1050. EngineID: int64(engineID),
  1051. LogUrl: logObsPath,
  1052. PoolID: poolID,
  1053. Uuid: uuid,
  1054. Parameters: param,
  1055. CommitID: commitID,
  1056. IsLatestVersion: isLatestVersion,
  1057. BranchName: branch_name,
  1058. Params: form.Params,
  1059. FlavorName: FlavorName,
  1060. EngineName: EngineName,
  1061. VersionCount: VersionCount,
  1062. TotalVersionCount: modelarts.TotalVersionCount,
  1063. DatasetName: datasetNames,
  1064. }
  1065. //将params转换Parameters.Parameter,出错时返回给前端
  1066. var Parameters modelarts.Parameters
  1067. if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
  1068. ctx.ServerError("json.Unmarshal failed:", err)
  1069. return
  1070. }
  1071. err = modelarts.GenerateTrainJob(ctx, req)
  1072. if err != nil {
  1073. log.Error("GenerateTrainJob failed:%v", err.Error())
  1074. trainJobErrorNewDataPrepare(ctx, form)
  1075. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  1076. return
  1077. }
  1078. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  1079. }
  1080. func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  1081. ctx.Data["PageIsTrainJob"] = true
  1082. var jobID = ctx.Params(":jobid")
  1083. count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
  1084. if err != nil {
  1085. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1086. versionErrorDataPrepare(ctx, form)
  1087. ctx.RenderWithErr("system error", tplModelArtsTrainJobVersionNew, &form)
  1088. return
  1089. } else {
  1090. if count >= 1 {
  1091. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  1092. versionErrorDataPrepare(ctx, form)
  1093. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobVersionNew, &form)
  1094. return
  1095. }
  1096. }
  1097. latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(jobID, modelarts.IsLatestVersion)
  1098. if err != nil {
  1099. ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
  1100. return
  1101. }
  1102. VersionOutputPath := modelarts.GetOutputPathByCount(latestTask.TotalVersionCount + 1)
  1103. displayJobName := form.DisplayJobName
  1104. jobName := form.JobName
  1105. uuid := form.Attachment
  1106. description := form.Description
  1107. workServerNumber := form.WorkServerNumber
  1108. engineID := form.EngineID
  1109. bootFile := strings.TrimSpace(form.BootFile)
  1110. flavorCode := form.Flavor
  1111. params := form.Params
  1112. poolID := form.PoolID
  1113. isSaveParam := form.IsSaveParam
  1114. repo := ctx.Repo.Repository
  1115. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  1116. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
  1117. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  1118. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  1119. // dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  1120. branch_name := form.BranchName
  1121. PreVersionName := form.VersionName
  1122. FlavorName := form.FlavorName
  1123. EngineName := form.EngineName
  1124. isLatestVersion := modelarts.IsLatestVersion
  1125. if IsDatasetUseCountExceed(uuid) {
  1126. log.Error("DatasetUseCount is Exceed:%v")
  1127. versionErrorDataPrepare(ctx, form)
  1128. ctx.RenderWithErr("DatasetUseCount is Exceed", tplModelArtsTrainJobVersionNew, &form)
  1129. return
  1130. }
  1131. _, datasetNames, _, err := models.GetDatasetInfo(uuid)
  1132. if err != nil {
  1133. log.Error("GetDatasetInfo failed:%v", err, ctx.Data["MsgID"])
  1134. versionErrorDataPrepare(ctx, form)
  1135. ctx.RenderWithErr("GetDatasetInfo error", tplModelArtsTrainJobVersionNew, &form)
  1136. return
  1137. }
  1138. canNewJob, _ := canUserCreateTrainJobVersion(ctx, latestTask.UserID)
  1139. if !canNewJob {
  1140. ctx.RenderWithErr("user cann't new trainjob", tplModelArtsTrainJobVersionNew, &form)
  1141. return
  1142. }
  1143. if err := paramCheckCreateTrainJob(form); err != nil {
  1144. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  1145. versionErrorDataPrepare(ctx, form)
  1146. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1147. return
  1148. }
  1149. //todo: del the codeLocalPath
  1150. _, err = ioutil.ReadDir(codeLocalPath)
  1151. if err == nil {
  1152. os.RemoveAll(codeLocalPath)
  1153. }
  1154. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  1155. commitID, _ := gitRepo.GetBranchCommitID(branch_name)
  1156. if err := downloadCode(repo, codeLocalPath, branch_name); err != nil {
  1157. log.Error("Failed git clone repo to local(!: %s (%v)", repo.FullName(), err)
  1158. versionErrorDataPrepare(ctx, form)
  1159. ctx.RenderWithErr("Failed git clone repo to local!", tplModelArtsTrainJobVersionNew, &form)
  1160. return
  1161. }
  1162. //todo: upload code (send to file_server todo this work?)
  1163. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  1164. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  1165. versionErrorDataPrepare(ctx, form)
  1166. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
  1167. return
  1168. }
  1169. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  1170. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  1171. versionErrorDataPrepare(ctx, form)
  1172. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
  1173. return
  1174. }
  1175. parentDir := VersionOutputPath + "/"
  1176. // parentDir := ""
  1177. // if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  1178. if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  1179. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  1180. versionErrorDataPrepare(ctx, form)
  1181. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form)
  1182. return
  1183. }
  1184. //todo: del local code?
  1185. var parameters models.Parameters
  1186. param := make([]models.Parameter, 0)
  1187. existDeviceTarget := false
  1188. if len(params) != 0 {
  1189. err := json.Unmarshal([]byte(params), &parameters)
  1190. if err != nil {
  1191. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1192. versionErrorDataPrepare(ctx, form)
  1193. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
  1194. return
  1195. }
  1196. for _, parameter := range parameters.Parameter {
  1197. if parameter.Label == modelarts.DeviceTarget {
  1198. existDeviceTarget = true
  1199. }
  1200. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  1201. param = append(param, models.Parameter{
  1202. Label: parameter.Label,
  1203. Value: parameter.Value,
  1204. })
  1205. }
  1206. }
  1207. }
  1208. if !existDeviceTarget {
  1209. param = append(param, models.Parameter{
  1210. Label: modelarts.DeviceTarget,
  1211. Value: modelarts.Ascend,
  1212. })
  1213. }
  1214. DatasUrlList, dataUrl, isMultiDataset, err := GetDatasUrlListByUUIDS(uuid)
  1215. if err != nil {
  1216. log.Error("Failed to GetDatasUrlListByUUIDS: %v", err)
  1217. versionErrorDataPrepare(ctx, form)
  1218. ctx.RenderWithErr("Failed to GetDatasUrlListByUUIDS:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1219. return
  1220. }
  1221. dataPath := dataUrl
  1222. jsondatas, err := json.Marshal(DatasUrlList)
  1223. if err != nil {
  1224. log.Error("Failed to Marshal: %v", err)
  1225. versionErrorDataPrepare(ctx, form)
  1226. ctx.RenderWithErr("json error:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1227. return
  1228. }
  1229. if isMultiDataset {
  1230. param = append(param, models.Parameter{
  1231. Label: modelarts.MultiDataUrl,
  1232. Value: string(jsondatas),
  1233. })
  1234. }
  1235. //save param config
  1236. if isSaveParam == "on" {
  1237. saveparams := append(param, models.Parameter{
  1238. Label: modelarts.TrainUrl,
  1239. Value: outputObsPath,
  1240. }, models.Parameter{
  1241. Label: modelarts.DataUrl,
  1242. Value: dataPath,
  1243. })
  1244. if form.ParameterTemplateName == "" {
  1245. log.Error("ParameterTemplateName is empty")
  1246. versionErrorDataPrepare(ctx, form)
  1247. ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
  1248. return
  1249. }
  1250. _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  1251. ConfigName: form.ParameterTemplateName,
  1252. Description: form.PrameterDescription,
  1253. DataUrl: dataPath,
  1254. AppUrl: codeObsPath,
  1255. BootFileUrl: codeObsPath + bootFile,
  1256. TrainUrl: outputObsPath,
  1257. Flavor: models.Flavor{
  1258. Code: flavorCode,
  1259. },
  1260. WorkServerNum: workServerNumber,
  1261. EngineID: int64(engineID),
  1262. LogUrl: logObsPath,
  1263. PoolID: poolID,
  1264. Parameter: saveparams,
  1265. })
  1266. if err != nil {
  1267. log.Error("Failed to CreateTrainJobConfig: %v", err)
  1268. versionErrorDataPrepare(ctx, form)
  1269. ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1270. return
  1271. }
  1272. }
  1273. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, PreVersionName)
  1274. if err != nil {
  1275. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  1276. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1277. return
  1278. }
  1279. req := &modelarts.GenerateTrainJobReq{
  1280. JobName: jobName,
  1281. DisplayJobName: displayJobName,
  1282. DataUrl: dataPath,
  1283. Description: description,
  1284. CodeObsPath: codeObsPath,
  1285. BootFileUrl: codeObsPath + bootFile,
  1286. BootFile: bootFile,
  1287. TrainUrl: outputObsPath,
  1288. FlavorCode: flavorCode,
  1289. WorkServerNumber: workServerNumber,
  1290. IsLatestVersion: isLatestVersion,
  1291. EngineID: int64(engineID),
  1292. LogUrl: logObsPath,
  1293. PoolID: poolID,
  1294. Uuid: uuid,
  1295. Params: form.Params,
  1296. Parameters: param,
  1297. PreVersionId: task.VersionID,
  1298. CommitID: commitID,
  1299. BranchName: branch_name,
  1300. FlavorName: FlavorName,
  1301. EngineName: EngineName,
  1302. PreVersionName: PreVersionName,
  1303. TotalVersionCount: latestTask.TotalVersionCount + 1,
  1304. DatasetName: datasetNames,
  1305. }
  1306. err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
  1307. if err != nil {
  1308. log.Error("GenerateTrainJob failed:%v", err.Error())
  1309. versionErrorDataPrepare(ctx, form)
  1310. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1311. return
  1312. }
  1313. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job/" + jobID)
  1314. // ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1315. }
  // readDir returns the entries of the directory dirname in the order the
  // operating system reports them (the list is not sorted).
  //
  // An error from opening or reading the directory is returned as is, except
  // io.EOF, which is mapped to an empty result without error.
  func readDir(dirname string) ([]os.FileInfo, error) {
  	dir, openErr := os.Open(dirname)
  	if openErr != nil {
  		return nil, openErr
  	}
  	defer dir.Close()

  	entries, readErr := dir.Readdir(0)
  	switch readErr {
  	case nil:
  		return entries, nil
  	case io.EOF:
  		//todo: can not upload empty folder
  		return nil, nil
  	default:
  		return nil, readErr
  	}
  }
  1335. func uploadCodeToObs(codePath, jobName, parentDir string) error {
  1336. files, err := readDir(codePath)
  1337. if err != nil {
  1338. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  1339. return err
  1340. }
  1341. for _, file := range files {
  1342. if file.IsDir() {
  1343. input := &obs.PutObjectInput{}
  1344. input.Bucket = setting.Bucket
  1345. input.Key = parentDir + file.Name() + "/"
  1346. _, err = storage.ObsCli.PutObject(input)
  1347. if err != nil {
  1348. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1349. return err
  1350. }
  1351. if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  1352. log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
  1353. return err
  1354. }
  1355. } else {
  1356. input := &obs.PutFileInput{}
  1357. input.Bucket = setting.Bucket
  1358. input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
  1359. input.SourceFile = codePath + file.Name()
  1360. _, err = storage.ObsCli.PutFile(input)
  1361. if err != nil {
  1362. log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
  1363. return err
  1364. }
  1365. }
  1366. }
  1367. return nil
  1368. }
  1369. func obsMkdir(dir string) error {
  1370. input := &obs.PutObjectInput{}
  1371. input.Bucket = setting.Bucket
  1372. input.Key = dir
  1373. _, err := storage.ObsCli.PutObject(input)
  1374. if err != nil {
  1375. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1376. return err
  1377. }
  1378. return nil
  1379. }
  1380. func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
  1381. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  1382. log.Error("the boot file(%s) must be a python file", strings.TrimSpace(form.BootFile))
  1383. return errors.New("启动文件必须是python文件")
  1384. }
  1385. if form.WorkServerNumber > 2 || form.WorkServerNumber < 1 {
  1386. log.Error("the WorkServerNumber(%d) must be in (1,2)", form.WorkServerNumber)
  1387. return errors.New("计算节点数必须在1-2之间")
  1388. }
  1389. if form.BranchName == "" {
  1390. log.Error("the branch must not be null!", form.BranchName)
  1391. return errors.New("代码分支不能为空!")
  1392. }
  1393. return nil
  1394. }
  1395. func paramCheckCreateInferenceJob(form auth.CreateModelArtsInferenceJobForm) error {
  1396. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  1397. log.Error("the boot file(%s) must be a python file", strings.TrimSpace(form.BootFile))
  1398. return errors.New("启动文件必须是python文件")
  1399. }
  1400. if form.WorkServerNumber > 2 || form.WorkServerNumber < 1 {
  1401. log.Error("the WorkServerNumber(%d) must be in (1,2)", form.WorkServerNumber)
  1402. return errors.New("计算节点数必须在1-2之间")
  1403. }
  1404. if form.ModelName == "" {
  1405. log.Error("the ModelName(%d) must not be nil", form.ModelName)
  1406. return errors.New("模型名称不能为空")
  1407. }
  1408. if form.ModelVersion == "" {
  1409. log.Error("the ModelVersion(%d) must not be nil", form.ModelVersion)
  1410. return errors.New("模型版本不能为空")
  1411. }
  1412. if form.CkptName == "" {
  1413. log.Error("the CkptName(%d) must not be nil", form.CkptName)
  1414. return errors.New("权重文件不能为空")
  1415. }
  1416. if form.BranchName == "" {
  1417. log.Error("the Branch(%d) must not be nil", form.BranchName)
  1418. return errors.New("分支名不能为空")
  1419. }
  1420. if utf8.RuneCountInString(form.Description) > 255 {
  1421. log.Error("the Description length(%d) must not more than 255", form.Description)
  1422. return errors.New("描述字符不能超过255个字符")
  1423. }
  1424. return nil
  1425. }
  1426. func TrainJobShow(ctx *context.Context) {
  1427. ctx.Data["PageIsCloudBrain"] = true
  1428. var jobID = ctx.Params(":jobid")
  1429. repo := ctx.Repo.Repository
  1430. page := ctx.QueryInt("page")
  1431. if page <= 0 {
  1432. page = 1
  1433. }
  1434. var jobTypes []string
  1435. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  1436. VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1437. ListOptions: models.ListOptions{
  1438. Page: page,
  1439. PageSize: setting.UI.IssuePagingNum,
  1440. },
  1441. RepoID: repo.ID,
  1442. Type: models.TypeCloudBrainTwo,
  1443. JobTypes: jobTypes,
  1444. JobID: jobID,
  1445. })
  1446. if err != nil {
  1447. log.Error("GetVersionListTasks(%s) failed:%v", jobID, err.Error())
  1448. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1449. return
  1450. }
  1451. //设置权限
  1452. canNewJob, err := canUserCreateTrainJobVersion(ctx, VersionListTasks[0].UserID)
  1453. if err != nil {
  1454. ctx.ServerError("canNewJob failed", err)
  1455. return
  1456. }
  1457. ctx.Data["canNewJob"] = canNewJob
  1458. //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
  1459. for i, task := range VersionListTasks {
  1460. var parameters models.Parameters
  1461. err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), &parameters)
  1462. if err != nil {
  1463. log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err)
  1464. trainJobNewDataPrepare(ctx)
  1465. return
  1466. }
  1467. if len(parameters.Parameter) > 0 {
  1468. paramTemp := ""
  1469. for _, Parameter := range parameters.Parameter {
  1470. param := Parameter.Label + " = " + Parameter.Value + "; "
  1471. paramTemp = paramTemp + param
  1472. }
  1473. VersionListTasks[i].Parameters = paramTemp[:len(paramTemp)-2]
  1474. } else {
  1475. VersionListTasks[i].Parameters = ""
  1476. }
  1477. VersionListTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  1478. VersionListTasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  1479. }
  1480. pager := context.NewPagination(VersionListCount, setting.UI.IssuePagingNum, page, 5)
  1481. pager.SetDefaultParams(ctx)
  1482. ctx.Data["Page"] = pager
  1483. ctx.Data["jobID"] = jobID
  1484. ctx.Data["displayJobName"] = VersionListTasks[0].DisplayJobName
  1485. ctx.Data["version_list_task"] = VersionListTasks
  1486. ctx.Data["version_list_count"] = VersionListCount
  1487. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, &VersionListTasks[0].Cloudbrain)
  1488. ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1489. }
  1490. func TrainJobGetLog(ctx *context.Context) {
  1491. ctx.Data["PageIsTrainJob"] = true
  1492. var jobID = ctx.Params(":jobid")
  1493. var logFileName = ctx.Query("file_name")
  1494. var baseLine = ctx.Query("base_line")
  1495. var order = ctx.Query("order")
  1496. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  1497. log.Error("order(%s) check failed", order)
  1498. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  1499. return
  1500. }
  1501. task, err := models.GetCloudbrainByJobID(jobID)
  1502. if err != nil {
  1503. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1504. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1505. return
  1506. }
  1507. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  1508. if err != nil {
  1509. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1510. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1511. return
  1512. }
  1513. ctx.Data["log"] = result
  1514. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1515. }
  1516. func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
  1517. task, err := models.GetCloudbrainByJobID(jobID)
  1518. if err != nil {
  1519. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1520. return nil, nil, err
  1521. }
  1522. resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
  1523. if err != nil {
  1524. log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
  1525. return nil, nil, err
  1526. }
  1527. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, modelarts.Lines)
  1528. if err != nil {
  1529. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1530. return nil, nil, err
  1531. }
  1532. return resultLogFile, result, err
  1533. }
  1534. func TrainJobDel(ctx *context.Context) {
  1535. var jobID = ctx.Params(":jobid")
  1536. var listType = ctx.Query("listType")
  1537. repo := ctx.Repo.Repository
  1538. var jobTypes []string
  1539. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  1540. VersionListTasks, _, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1541. RepoID: repo.ID,
  1542. Type: models.TypeCloudBrainTwo,
  1543. JobTypes: jobTypes,
  1544. JobID: jobID,
  1545. })
  1546. if err != nil {
  1547. ctx.ServerError("get VersionListTasks failed", err)
  1548. return
  1549. }
  1550. //删除modelarts上的任务记录
  1551. _, err = modelarts.DelTrainJob(jobID)
  1552. if err != nil {
  1553. log.Error("DelTrainJob(%s) failed:%v", jobID, err.Error())
  1554. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1555. return
  1556. }
  1557. //删除数据库Cloudbrain表的记录
  1558. for _, task := range VersionListTasks {
  1559. err = models.DeleteJob(&task.Cloudbrain)
  1560. if err != nil {
  1561. ctx.ServerError("DeleteJob failed", err)
  1562. return
  1563. }
  1564. }
  1565. //删除存储
  1566. if len(VersionListTasks) > 0 {
  1567. DeleteJobStorage(VersionListTasks[0].JobName)
  1568. }
  1569. var isAdminPage = ctx.Query("isadminpage")
  1570. var isHomePage = ctx.Query("ishomepage")
  1571. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  1572. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  1573. } else if isHomePage == "true" {
  1574. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  1575. } else {
  1576. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  1577. }
  1578. }
  1579. func TrainJobStop(ctx *context.Context) {
  1580. var jobID = ctx.Params(":jobid")
  1581. var listType = ctx.Query("listType")
  1582. task := ctx.Cloudbrain
  1583. _, err := modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
  1584. if err != nil {
  1585. log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
  1586. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1587. return
  1588. }
  1589. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  1590. }
  1591. func canUserCreateTrainJob(uid int64) (bool, error) {
  1592. org, err := models.GetOrgByName(setting.AllowedOrg)
  1593. if err != nil {
  1594. log.Error("get allowed org failed: ", setting.AllowedOrg)
  1595. return false, err
  1596. }
  1597. return org.IsOrgMember(uid)
  1598. }
  1599. func canUserCreateTrainJobVersion(ctx *context.Context, userID int64) (bool, error) {
  1600. if ctx == nil || ctx.User == nil {
  1601. log.Error("user unlogin!")
  1602. return false, nil
  1603. }
  1604. if userID == ctx.User.ID || ctx.User.IsAdmin {
  1605. return true, nil
  1606. } else {
  1607. log.Error("Only user itself and admin can new trainjob!")
  1608. return false, nil
  1609. }
  1610. }
  1611. func TrainJobGetConfigList(ctx *context.Context) {
  1612. ctx.Data["PageIsTrainJob"] = true
  1613. var jobID = ctx.Params(":jobid")
  1614. var logFileName = ctx.Query("file_name")
  1615. var baseLine = ctx.Query("base_line")
  1616. var order = ctx.Query("order")
  1617. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  1618. log.Error("order(%s) check failed", order)
  1619. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  1620. return
  1621. }
  1622. task, err := models.GetCloudbrainByJobID(jobID)
  1623. if err != nil {
  1624. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1625. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1626. return
  1627. }
  1628. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  1629. if err != nil {
  1630. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1631. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1632. return
  1633. }
  1634. ctx.Data["log"] = result
  1635. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1636. }
  1637. func getConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
  1638. var result models.GetConfigListResult
  1639. list, err := modelarts.GetConfigList(perPage, page, sortBy, order, searchContent, configType)
  1640. if err != nil {
  1641. log.Error("GetConfigList failed:", err)
  1642. return &result, err
  1643. }
  1644. for _, config := range list.ParaConfigs {
  1645. paraConfig, err := modelarts.GetParaConfig(config.ConfigName, configType)
  1646. if err != nil {
  1647. log.Error("GetParaConfig failed:", err)
  1648. return &result, err
  1649. }
  1650. config.Result = paraConfig
  1651. }
  1652. return list, nil
  1653. }
  1654. func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) {
  1655. ctx.Data["PageIsTrainJob"] = true
  1656. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  1657. displayJobName := form.DisplayJobName
  1658. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  1659. uuid := form.Attachment
  1660. description := form.Description
  1661. workServerNumber := form.WorkServerNumber
  1662. engineID := form.EngineID
  1663. bootFile := strings.TrimSpace(form.BootFile)
  1664. flavorCode := form.Flavor
  1665. params := form.Params
  1666. poolID := form.PoolID
  1667. repo := ctx.Repo.Repository
  1668. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  1669. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  1670. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  1671. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  1672. dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  1673. branch_name := form.BranchName
  1674. FlavorName := form.FlavorName
  1675. EngineName := form.EngineName
  1676. LabelName := form.LabelName
  1677. isLatestVersion := modelarts.IsLatestVersion
  1678. VersionCount := modelarts.VersionCount
  1679. trainUrl := form.TrainUrl
  1680. modelName := form.ModelName
  1681. modelVersion := form.ModelVersion
  1682. ckptName := form.CkptName
  1683. ckptUrl := form.TrainUrl + form.CkptName
  1684. count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID)
  1685. if err != nil {
  1686. log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1687. inferenceJobErrorNewDataPrepare(ctx, form)
  1688. ctx.RenderWithErr("system error", tplModelArtsInferenceJobNew, &form)
  1689. return
  1690. } else {
  1691. if count >= 1 {
  1692. log.Error("the user already has running or waiting inference task", ctx.Data["MsgID"])
  1693. inferenceJobErrorNewDataPrepare(ctx, form)
  1694. ctx.RenderWithErr("you have already a running or waiting inference task, can not create more", tplModelArtsInferenceJobNew, &form)
  1695. return
  1696. }
  1697. }
  1698. if err := paramCheckCreateInferenceJob(form); err != nil {
  1699. log.Error("paramCheckCreateInferenceJob failed:(%v)", err)
  1700. inferenceJobErrorNewDataPrepare(ctx, form)
  1701. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
  1702. return
  1703. }
  1704. //Determine whether the task name of the task in the project is duplicated
  1705. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeInference), displayJobName)
  1706. if err == nil {
  1707. if len(tasks) != 0 {
  1708. log.Error("the job name did already exist", ctx.Data["MsgID"])
  1709. inferenceJobErrorNewDataPrepare(ctx, form)
  1710. ctx.RenderWithErr("the job name did already exist", tplModelArtsInferenceJobNew, &form)
  1711. return
  1712. }
  1713. } else {
  1714. if !models.IsErrJobNotExist(err) {
  1715. log.Error("system error, %v", err, ctx.Data["MsgID"])
  1716. inferenceJobErrorNewDataPrepare(ctx, form)
  1717. ctx.RenderWithErr("system error", tplModelArtsInferenceJobNew, &form)
  1718. return
  1719. }
  1720. }
  1721. //todo: del the codeLocalPath
  1722. _, err = ioutil.ReadDir(codeLocalPath)
  1723. if err == nil {
  1724. os.RemoveAll(codeLocalPath)
  1725. }
  1726. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  1727. commitID, _ := gitRepo.GetBranchCommitID(branch_name)
  1728. if err := downloadCode(repo, codeLocalPath, branch_name); err != nil {
  1729. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  1730. inferenceJobErrorNewDataPrepare(ctx, form)
  1731. ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsInferenceJobNew, &form)
  1732. return
  1733. }
  1734. //todo: upload code (send to file_server todo this work?)
  1735. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  1736. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  1737. inferenceJobErrorNewDataPrepare(ctx, form)
  1738. ctx.RenderWithErr("Failed to obsMkdir_result", tplModelArtsInferenceJobNew, &form)
  1739. return
  1740. }
  1741. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  1742. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  1743. inferenceJobErrorNewDataPrepare(ctx, form)
  1744. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsInferenceJobNew, &form)
  1745. return
  1746. }
  1747. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  1748. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  1749. inferenceJobErrorNewDataPrepare(ctx, form)
  1750. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsInferenceJobNew, &form)
  1751. return
  1752. }
  1753. var parameters models.Parameters
  1754. param := make([]models.Parameter, 0)
  1755. param = append(param, models.Parameter{
  1756. Label: modelarts.ResultUrl,
  1757. Value: "s3:/" + resultObsPath,
  1758. }, models.Parameter{
  1759. Label: modelarts.CkptUrl,
  1760. Value: "s3:/" + ckptUrl,
  1761. })
  1762. existDeviceTarget := false
  1763. if len(params) != 0 {
  1764. err := json.Unmarshal([]byte(params), &parameters)
  1765. if err != nil {
  1766. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1767. inferenceJobErrorNewDataPrepare(ctx, form)
  1768. ctx.RenderWithErr("运行参数错误", tplModelArtsInferenceJobNew, &form)
  1769. return
  1770. }
  1771. for _, parameter := range parameters.Parameter {
  1772. if parameter.Label == modelarts.DeviceTarget {
  1773. existDeviceTarget = true
  1774. }
  1775. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  1776. param = append(param, models.Parameter{
  1777. Label: parameter.Label,
  1778. Value: parameter.Value,
  1779. })
  1780. }
  1781. }
  1782. }
  1783. if !existDeviceTarget {
  1784. param = append(param, models.Parameter{
  1785. Label: modelarts.DeviceTarget,
  1786. Value: modelarts.Ascend,
  1787. })
  1788. }
  1789. req := &modelarts.GenerateInferenceJobReq{
  1790. JobName: jobName,
  1791. DisplayJobName: displayJobName,
  1792. DataUrl: dataPath,
  1793. Description: description,
  1794. CodeObsPath: codeObsPath,
  1795. BootFileUrl: codeObsPath + bootFile,
  1796. BootFile: bootFile,
  1797. TrainUrl: trainUrl,
  1798. FlavorCode: flavorCode,
  1799. WorkServerNumber: workServerNumber,
  1800. EngineID: int64(engineID),
  1801. LogUrl: logObsPath,
  1802. PoolID: poolID,
  1803. Uuid: uuid,
  1804. Parameters: param, //modelarts train parameters
  1805. CommitID: commitID,
  1806. BranchName: branch_name,
  1807. Params: form.Params,
  1808. FlavorName: FlavorName,
  1809. EngineName: EngineName,
  1810. LabelName: LabelName,
  1811. IsLatestVersion: isLatestVersion,
  1812. VersionCount: VersionCount,
  1813. TotalVersionCount: modelarts.TotalVersionCount,
  1814. ModelName: modelName,
  1815. ModelVersion: modelVersion,
  1816. CkptName: ckptName,
  1817. ResultUrl: resultObsPath,
  1818. }
  1819. err = modelarts.GenerateInferenceJob(ctx, req)
  1820. if err != nil {
  1821. log.Error("GenerateTrainJob failed:%v", err.Error())
  1822. inferenceJobErrorNewDataPrepare(ctx, form)
  1823. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
  1824. return
  1825. }
  1826. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job")
  1827. }
  1828. func InferenceJobIndex(ctx *context.Context) {
  1829. MustEnableModelArts(ctx)
  1830. repo := ctx.Repo.Repository
  1831. page := ctx.QueryInt("page")
  1832. if page <= 0 {
  1833. page = 1
  1834. }
  1835. var jobTypes []string
  1836. jobTypes = append(jobTypes, string(models.JobTypeInference))
  1837. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  1838. ListOptions: models.ListOptions{
  1839. Page: page,
  1840. PageSize: setting.UI.IssuePagingNum,
  1841. },
  1842. RepoID: repo.ID,
  1843. Type: models.TypeCloudBrainTwo,
  1844. JobTypes: jobTypes,
  1845. })
  1846. if err != nil {
  1847. ctx.ServerError("Cloudbrain", err)
  1848. return
  1849. }
  1850. for i, task := range tasks {
  1851. tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  1852. tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  1853. tasks[i].ComputeResource = models.NPUResource
  1854. }
  1855. repoId := ctx.Repo.Repository.ID
  1856. Type := -1
  1857. _, model_count, _ := models.QueryModel(&models.AiModelQueryOptions{
  1858. ListOptions: models.ListOptions{
  1859. Page: 1,
  1860. PageSize: 2,
  1861. },
  1862. RepoID: repoId,
  1863. Type: Type,
  1864. New: MODEL_LATEST,
  1865. })
  1866. ctx.Data["MODEL_COUNT"] = model_count
  1867. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  1868. pager.SetDefaultParams(ctx)
  1869. ctx.Data["Page"] = pager
  1870. ctx.Data["PageIsCloudBrain"] = true
  1871. ctx.Data["Tasks"] = tasks
  1872. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  1873. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  1874. ctx.HTML(200, tplModelArtsInferenceJobIndex)
  1875. }
  1876. func InferenceJobNew(ctx *context.Context) {
  1877. err := inferenceJobNewDataPrepare(ctx)
  1878. if err != nil {
  1879. ctx.ServerError("get new inference-job info failed", err)
  1880. return
  1881. }
  1882. ctx.HTML(200, tplModelArtsInferenceJobNew)
  1883. }
  1884. func inferenceJobNewDataPrepare(ctx *context.Context) error {
  1885. ctx.Data["PageIsCloudBrain"] = true
  1886. ctx.Data["newInference"] = true
  1887. t := time.Now()
  1888. var displayJobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  1889. ctx.Data["display_job_name"] = displayJobName
  1890. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  1891. if err != nil {
  1892. ctx.ServerError("GetAllUserAttachments failed:", err)
  1893. return err
  1894. }
  1895. ctx.Data["attachments"] = attachs
  1896. var resourcePools modelarts.ResourcePool
  1897. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  1898. ctx.ServerError("json.Unmarshal failed:", err)
  1899. return err
  1900. }
  1901. ctx.Data["resource_pools"] = resourcePools.Info
  1902. var engines modelarts.Engine
  1903. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  1904. ctx.ServerError("json.Unmarshal failed:", err)
  1905. return err
  1906. }
  1907. ctx.Data["engines"] = engines.Info
  1908. var versionInfos modelarts.VersionInfo
  1909. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  1910. ctx.ServerError("json.Unmarshal failed:", err)
  1911. return err
  1912. }
  1913. ctx.Data["engine_versions"] = versionInfos.Version
  1914. var flavorInfos modelarts.Flavor
  1915. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  1916. ctx.ServerError("json.Unmarshal failed:", err)
  1917. return err
  1918. }
  1919. ctx.Data["flavor_infos"] = flavorInfos.Info
  1920. ctx.Data["params"] = ""
  1921. ctx.Data["branchName"] = ctx.Repo.BranchName
  1922. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  1923. if err != nil {
  1924. ctx.ServerError("getConfigList failed:", err)
  1925. return err
  1926. }
  1927. ctx.Data["config_list"] = configList.ParaConfigs
  1928. repoId := ctx.Repo.Repository.ID
  1929. Type := -1
  1930. _, model_count, _ := models.QueryModel(&models.AiModelQueryOptions{
  1931. ListOptions: models.ListOptions{
  1932. Page: 1,
  1933. PageSize: 2,
  1934. },
  1935. RepoID: repoId,
  1936. Type: Type,
  1937. New: MODEL_LATEST,
  1938. })
  1939. ctx.Data["MODEL_COUNT"] = model_count
  1940. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  1941. return nil
  1942. }
  1943. func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error {
  1944. ctx.Data["PageIsCloudBrain"] = true
  1945. t := time.Now()
  1946. var jobName = "inference" + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  1947. ctx.Data["job_name"] = jobName
  1948. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  1949. if err != nil {
  1950. ctx.ServerError("GetAllUserAttachments failed:", err)
  1951. return err
  1952. }
  1953. ctx.Data["attachments"] = attachs
  1954. var resourcePools modelarts.ResourcePool
  1955. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  1956. ctx.ServerError("json.Unmarshal failed:", err)
  1957. return err
  1958. }
  1959. ctx.Data["resource_pools"] = resourcePools.Info
  1960. var engines modelarts.Engine
  1961. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  1962. ctx.ServerError("json.Unmarshal failed:", err)
  1963. return err
  1964. }
  1965. ctx.Data["engines"] = engines.Info
  1966. var versionInfos modelarts.VersionInfo
  1967. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  1968. ctx.ServerError("json.Unmarshal failed:", err)
  1969. return err
  1970. }
  1971. ctx.Data["engine_versions"] = versionInfos.Version
  1972. var flavorInfos modelarts.Flavor
  1973. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  1974. ctx.ServerError("json.Unmarshal failed:", err)
  1975. return err
  1976. }
  1977. ctx.Data["flavor_infos"] = flavorInfos.Info
  1978. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  1979. if err != nil {
  1980. ctx.ServerError("getConfigList failed:", err)
  1981. return err
  1982. }
  1983. var Parameters modelarts.Parameters
  1984. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  1985. ctx.ServerError("json.Unmarshal failed:", err)
  1986. return err
  1987. }
  1988. ctx.Data["params"] = Parameters.Parameter
  1989. ctx.Data["config_list"] = configList.ParaConfigs
  1990. ctx.Data["bootFile"] = form.BootFile
  1991. ctx.Data["uuid"] = form.Attachment
  1992. ctx.Data["branch_name"] = form.BranchName
  1993. ctx.Data["model_name"] = form.ModelName
  1994. ctx.Data["model_version"] = form.ModelVersion
  1995. ctx.Data["ckpt_name"] = form.CkptName
  1996. ctx.Data["train_url"] = form.TrainUrl
  1997. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  1998. return nil
  1999. }
  2000. func InferenceJobShow(ctx *context.Context) {
  2001. ctx.Data["PageIsCloudBrain"] = true
  2002. var jobID = ctx.Params(":jobid")
  2003. page := ctx.QueryInt("page")
  2004. if page <= 0 {
  2005. page = 1
  2006. }
  2007. task, err := models.GetCloudbrainByJobID(jobID)
  2008. if err != nil {
  2009. log.Error("GetInferenceTask(%s) failed:%v", jobID, err.Error())
  2010. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobShow, nil)
  2011. return
  2012. }
  2013. //设置权限
  2014. canNewJob, err := canUserCreateTrainJobVersion(ctx, task.UserID)
  2015. if err != nil {
  2016. ctx.ServerError("canNewJob failed", err)
  2017. return
  2018. }
  2019. ctx.Data["canNewJob"] = canNewJob
  2020. //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
  2021. var parameters models.Parameters
  2022. err = json.Unmarshal([]byte(task.Parameters), &parameters)
  2023. if err != nil {
  2024. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  2025. trainJobNewDataPrepare(ctx)
  2026. return
  2027. }
  2028. if len(parameters.Parameter) > 0 {
  2029. paramTemp := ""
  2030. for _, Parameter := range parameters.Parameter {
  2031. param := Parameter.Label + " = " + Parameter.Value + "; "
  2032. paramTemp = paramTemp + param
  2033. }
  2034. task.Parameters = paramTemp[:len(paramTemp)-2]
  2035. } else {
  2036. task.Parameters = ""
  2037. }
  2038. LabelName := strings.Fields(task.LabelName)
  2039. ctx.Data["labelName"] = LabelName
  2040. ctx.Data["jobID"] = jobID
  2041. ctx.Data["jobName"] = task.JobName
  2042. ctx.Data["displayJobName"] = task.DisplayJobName
  2043. ctx.Data["task"] = task
  2044. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  2045. tempUids := []int64{}
  2046. tempUids = append(tempUids, task.UserID)
  2047. JobCreater, err := models.GetUserNamesByIDs(tempUids)
  2048. if err != nil {
  2049. log.Error("GetUserNamesByIDs (WhitelistUserIDs): %v", err)
  2050. }
  2051. ctx.Data["userName"] = JobCreater[0]
  2052. ctx.HTML(http.StatusOK, tplModelArtsInferenceJobShow)
  2053. }
  2054. func ModelDownload(ctx *context.Context) {
  2055. var (
  2056. err error
  2057. )
  2058. jobID := ctx.Params(":jobid")
  2059. versionName := ctx.Query("version_name")
  2060. parentDir := ctx.Query("parent_dir")
  2061. fileName := ctx.Query("file_name")
  2062. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  2063. if err != nil {
  2064. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", task.JobName, err.Error())
  2065. return
  2066. }
  2067. var url string
  2068. if task.ComputeResource == models.NPUResource {
  2069. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName, parentDir, fileName), "/")
  2070. url, err = storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
  2071. if err != nil {
  2072. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  2073. ctx.ServerError("GetObsCreateSignedUrl", err)
  2074. return
  2075. }
  2076. } else if task.ComputeResource == models.GPUResource {
  2077. filePath := setting.CBCodePathPrefix + task.JobName + cloudbrain.ModelMountPath + "/" + parentDir
  2078. url, err = storage.Attachments.PresignedGetURL(filePath, fileName)
  2079. if err != nil {
  2080. log.Error("PresignedGetURL failed: %v", err.Error(), ctx.Data["msgID"])
  2081. ctx.ServerError("PresignedGetURL", err)
  2082. return
  2083. }
  2084. }
  2085. ctx.Resp.Header().Set("Cache-Control", "max-age=0")
  2086. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  2087. }
  2088. func ResultDownload(ctx *context.Context) {
  2089. var (
  2090. err error
  2091. )
  2092. versionName := ctx.Query("version_name")
  2093. parentDir := ctx.Query("parent_dir")
  2094. fileName := ctx.Query("file_name")
  2095. task := ctx.Cloudbrain
  2096. if err != nil {
  2097. ctx.Data["error"] = err.Error()
  2098. }
  2099. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName, parentDir, fileName), "/")
  2100. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
  2101. if err != nil {
  2102. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  2103. ctx.ServerError("GetObsCreateSignedUrl", err)
  2104. return
  2105. }
  2106. ctx.Resp.Header().Set("Cache-Control", "max-age=0")
  2107. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  2108. }
  2109. func DeleteJobStorage(jobName string) error {
  2110. //delete local
  2111. localJobPath := setting.JobPath + jobName
  2112. err := os.RemoveAll(localJobPath)
  2113. if err != nil {
  2114. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  2115. }
  2116. //delete oss
  2117. dirPath := setting.CodePathPrefix + jobName + "/"
  2118. err = storage.ObsRemoveObject(setting.Bucket, dirPath)
  2119. if err != nil {
  2120. log.Error("ObsRemoveObject(%s) failed:%v", localJobPath, err)
  2121. }
  2122. return nil
  2123. }
  2124. func DownloadMultiResultFile(ctx *context.Context) {
  2125. var jobID = ctx.Params(":jobid")
  2126. var versionName = ctx.Query("version_name")
  2127. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  2128. if err != nil {
  2129. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  2130. return
  2131. }
  2132. // if !isCanDeleteOrDownload(ctx, task) {
  2133. // ctx.ServerError("no right.", errors.New(ctx.Tr("repo.model_noright")))
  2134. // return
  2135. // }
  2136. // path := Model_prefix + models.AttachmentRelativePath(id) + "/"
  2137. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName), "/") + "/"
  2138. allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
  2139. if err == nil {
  2140. //count++
  2141. // models.ModifyModelDownloadCount(id)
  2142. returnFileName := task.DisplayJobName + ".zip"
  2143. ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+returnFileName)
  2144. ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
  2145. w := zip.NewWriter(ctx.Resp)
  2146. defer w.Close()
  2147. for _, oneFile := range allFile {
  2148. if oneFile.IsDir {
  2149. log.Info("zip dir name:" + oneFile.FileName)
  2150. } else {
  2151. log.Info("zip file name:" + oneFile.FileName)
  2152. fDest, err := w.Create(oneFile.FileName)
  2153. if err != nil {
  2154. log.Info("create zip entry error, download file failed: %s\n", err.Error())
  2155. ctx.ServerError("download file failed:", err)
  2156. return
  2157. }
  2158. body, err := storage.ObsDownloadAFile(setting.Bucket, path+oneFile.FileName)
  2159. if err != nil {
  2160. log.Info("download file failed: %s\n", err.Error())
  2161. ctx.ServerError("download file failed:", err)
  2162. return
  2163. } else {
  2164. defer body.Close()
  2165. p := make([]byte, 1024)
  2166. var readErr error
  2167. var readCount int
  2168. // 读取对象内容
  2169. for {
  2170. readCount, readErr = body.Read(p)
  2171. if readCount > 0 {
  2172. fDest.Write(p[:readCount])
  2173. }
  2174. if readErr != nil {
  2175. break
  2176. }
  2177. }
  2178. }
  2179. }
  2180. }
  2181. } else {
  2182. log.Info("error,msg=" + err.Error())
  2183. ctx.ServerError("no file to download.", err)
  2184. }
  2185. }
  2186. func SetJobCount(ctx *context.Context) {
  2187. repoId := ctx.Repo.Repository.ID
  2188. _, jobCount, err := models.Cloudbrains(&models.CloudbrainsOptions{
  2189. RepoID: repoId,
  2190. Type: models.TypeCloudBrainAll,
  2191. })
  2192. if err != nil {
  2193. ctx.ServerError("Get job faild:", err)
  2194. return
  2195. }
  2196. ctx.Data["jobCount"] = jobCount
  2197. }
  2198. func TrainJobDownloadLogFile(ctx *context.Context) {
  2199. var (
  2200. err error
  2201. )
  2202. var jobID = ctx.Params(":jobid")
  2203. versionName := ctx.Query("version_name")
  2204. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  2205. if err != nil {
  2206. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", task.JobName, err.Error(), ctx.Data["msgID"])
  2207. ctx.ServerError("GetCloudbrainByJobIDAndVersionName", err)
  2208. return
  2209. }
  2210. prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job"
  2211. key, err := storage.GetObsLogFileName(prefix)
  2212. if err != nil {
  2213. log.Error("GetObsLogFileName(%s) failed:%v", jobID, err.Error(), ctx.Data["msgID"])
  2214. ctx.ServerError("GetObsLogFileName", err)
  2215. return
  2216. }
  2217. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, key)
  2218. if err != nil {
  2219. log.Error("GetObsCreateSignedUrlByBucketAndKey failed: %v", err.Error(), ctx.Data["msgID"])
  2220. ctx.ServerError("GetObsCreateSignedUrlByBucketAndKey", err)
  2221. return
  2222. }
  2223. ctx.Resp.Header().Set("Cache-Control", "max-age=0")
  2224. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  2225. }
  2226. func GetDatasUrlListByUUIDS(uuidStr string) ([]models.Datasurl, string, bool, error) {
  2227. var dataUrl string
  2228. var DatasUrlList []models.Datasurl
  2229. uuidList := strings.Split(uuidStr, ";")
  2230. isMultiDataset := false
  2231. if len(uuidList) > 1 {
  2232. for _, uuid := range uuidList {
  2233. attach, err := models.GetAttachmentByUUID(uuid)
  2234. if err != nil {
  2235. log.Error("GetAttachmentByUUID failed: %v", err)
  2236. return nil, "", isMultiDataset, err
  2237. }
  2238. datasetName := attach.Name
  2239. index := strings.LastIndex(datasetName, ".")
  2240. if index <= 0 {
  2241. index = 0
  2242. }
  2243. datasetNameHead := datasetName[:index]
  2244. datasetUrl := "s3://" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  2245. DatasUrlList = append(DatasUrlList, models.Datasurl{
  2246. DatasetUrl: datasetUrl,
  2247. DatasetName: datasetNameHead,
  2248. })
  2249. }
  2250. firstDataset := uuidList[0]
  2251. dataUrl = "/" + setting.Bucket + "/" + setting.BasePath + path.Join(firstDataset[0:1], firstDataset[1:2]) + "/" + firstDataset + firstDataset + "/"
  2252. isMultiDataset = true
  2253. return DatasUrlList, dataUrl, isMultiDataset, nil
  2254. }
  2255. return nil, "", isMultiDataset, nil
  2256. }
  2257. func IsDatasetUseCountExceed(uuid string) bool {
  2258. uuidList := strings.Split(uuid, ";")
  2259. if len(uuidList) > setting.MaxDatasetNum {
  2260. return true
  2261. } else {
  2262. return false
  2263. }
  2264. }