You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

grampus.go 58 kB

3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753
  1. package repo
  2. import (
  3. "code.gitea.io/gitea/services/lock"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io/ioutil"
  8. "net/http"
  9. "os"
  10. "path"
  11. "strconv"
  12. "strings"
  13. "code.gitea.io/gitea/modules/urfs_client/urchin"
  14. "code.gitea.io/gitea/routers/response"
  15. "code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
  16. "code.gitea.io/gitea/modules/dataset"
  17. "code.gitea.io/gitea/services/cloudbrain/resource"
  18. "code.gitea.io/gitea/services/reward/point/account"
  19. "code.gitea.io/gitea/modules/auth"
  20. "code.gitea.io/gitea/modules/git"
  21. "code.gitea.io/gitea/modules/grampus"
  22. "code.gitea.io/gitea/modules/modelarts"
  23. "code.gitea.io/gitea/modules/notification"
  24. "code.gitea.io/gitea/modules/redis/redis_key"
  25. "code.gitea.io/gitea/modules/redis/redis_lock"
  26. "code.gitea.io/gitea/modules/timeutil"
  27. "code.gitea.io/gitea/modules/util"
  28. "github.com/unknwon/com"
  29. "code.gitea.io/gitea/models"
  30. "code.gitea.io/gitea/modules/base"
  31. "code.gitea.io/gitea/modules/cloudbrain"
  32. "code.gitea.io/gitea/modules/context"
  33. "code.gitea.io/gitea/modules/log"
  34. "code.gitea.io/gitea/modules/setting"
  35. cloudbrainService "code.gitea.io/gitea/services/cloudbrain"
  36. )
  37. const (
  38. tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show"
  39. tplGrampusNotebookShow base.TplName = "repo/grampus/notebook/show"
  40. //GPU
  41. tplGrampusNotebookGPUNew base.TplName = "repo/grampus/notebook/gpu/new"
  42. tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"
  43. //NPU
  44. tplGrampusNotebookNPUNew base.TplName = "repo/grampus/notebook/npu/new"
  45. tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
  46. )
  47. func GrampusNotebookNew(ctx *context.Context) {
  48. ctx.Data["IsCreate"] = true
  49. notebookType := ctx.QueryInt("type")
  50. processType := grampus.ProcessorTypeGPU
  51. if notebookType == 1 {
  52. processType = grampus.ProcessorTypeNPU
  53. }
  54. err := grampusNotebookNewDataPrepare(ctx, processType)
  55. if err != nil {
  56. ctx.ServerError("get new notebook-job info failed", err)
  57. return
  58. }
  59. if processType == grampus.ProcessorTypeGPU {
  60. ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew)
  61. } else {
  62. ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew)
  63. }
  64. }
  65. func GrampusTrainJobGPUNew(ctx *context.Context) {
  66. ctx.Data["IsCreate"] = true
  67. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  68. if err != nil {
  69. ctx.ServerError("get new train-job info failed", err)
  70. return
  71. }
  72. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  73. }
  74. func GrampusTrainJobNPUNew(ctx *context.Context) {
  75. ctx.Data["IsCreate"] = true
  76. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  77. if err != nil {
  78. ctx.ServerError("get new train-job info failed", err)
  79. return
  80. }
  81. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  82. }
  83. func GrampusNotebookCreate(ctx *context.Context, form auth.CreateGrampusNotebookForm) {
  84. ctx.Data["IsCreate"] = true
  85. displayJobName := form.DisplayJobName
  86. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  87. uuid := form.Attachment
  88. description := form.Description
  89. repo := ctx.Repo.Repository
  90. branchName := form.BranchName
  91. image := strings.TrimSpace(form.Image)
  92. codeStoragePath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  93. tpl := tplGrampusNotebookGPUNew
  94. processType := grampus.ProcessorTypeGPU
  95. computeSource := models.GPUResource
  96. computeSourceSimple := models.GPU
  97. if form.Type == 1 {
  98. tpl = tplGrampusNotebookNPUNew
  99. processType = grampus.ProcessorTypeNPU
  100. computeSource = models.NPUResource
  101. computeSourceSimple = models.NPU
  102. codeStoragePath = grampus.JobPath + jobName + modelarts.CodePath
  103. }
  104. limiterCtx := &lock.LockContext{Repo: ctx.Repo.Repository, DisplayJobName: displayJobName, User: ctx.User}
  105. lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(limiterCtx)
  106. defer func() {
  107. if lockOperator != nil {
  108. lockOperator.Unlock(limiterCtx)
  109. }
  110. }()
  111. if errMsg != "" {
  112. log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
  113. grampusNotebookNewDataPrepare(ctx, processType)
  114. ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
  115. return
  116. }
  117. if !jobNamePattern.MatchString(displayJobName) {
  118. grampusNotebookNewDataPrepare(ctx, processType)
  119. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  120. return
  121. }
  122. //check count limit
  123. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeSource)
  124. if err != nil {
  125. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  126. grampusNotebookNewDataPrepare(ctx, processType)
  127. ctx.RenderWithErr("system error", tpl, &form)
  128. return
  129. } else {
  130. if count >= 1 {
  131. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  132. grampusNotebookNewDataPrepare(ctx, processType)
  133. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  134. return
  135. }
  136. }
  137. //check whether the task name in the project is duplicated
  138. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName)
  139. if err == nil {
  140. if len(tasks) != 0 {
  141. log.Error("the job name did already exist", ctx.Data["MsgID"])
  142. grampusNotebookNewDataPrepare(ctx, processType)
  143. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  144. return
  145. }
  146. } else {
  147. if !models.IsErrJobNotExist(err) {
  148. log.Error("system error, %v", err, ctx.Data["MsgID"])
  149. grampusNotebookNewDataPrepare(ctx, processType)
  150. ctx.RenderWithErr("system error", tpl, &form)
  151. return
  152. }
  153. }
  154. //check specification
  155. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  156. JobType: models.JobTypeDebug,
  157. ComputeResource: computeSourceSimple,
  158. Cluster: models.C2NetCluster,
  159. })
  160. if err != nil || spec == nil {
  161. grampusNotebookNewDataPrepare(ctx, processType)
  162. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  163. return
  164. }
  165. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  166. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  167. grampusNotebookNewDataPrepare(ctx, processType)
  168. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
  169. return
  170. }
  171. var datasetInfos map[string]models.DatasetInfo
  172. var datasetNames string
  173. //var
  174. if uuid != "" {
  175. datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid, computeSourceSimple)
  176. if err != nil {
  177. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  178. grampusNotebookNewDataPrepare(ctx, processType)
  179. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  180. return
  181. }
  182. }
  183. //prepare code and out path
  184. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  185. _, err = ioutil.ReadDir(codeLocalPath)
  186. if err == nil {
  187. os.RemoveAll(codeLocalPath)
  188. }
  189. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  190. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  191. grampusNotebookNewDataPrepare(ctx, processType)
  192. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  193. return
  194. }
  195. if processType == grampus.ProcessorTypeGPU {
  196. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  197. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  198. grampusNotebookNewDataPrepare(ctx, processType)
  199. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  200. return
  201. }
  202. } else {
  203. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  204. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  205. grampusNotebookNewDataPrepare(ctx, processType)
  206. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  207. return
  208. }
  209. }
  210. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  211. req := &grampus.GenerateNotebookJobReq{
  212. JobName: jobName,
  213. DisplayJobName: displayJobName,
  214. ComputeResource: computeSource,
  215. ProcessType: processType,
  216. ImageUrl: image,
  217. ImageId: form.ImageID,
  218. Description: description,
  219. Uuid: uuid,
  220. CommitID: commitID,
  221. BranchName: branchName,
  222. DatasetNames: datasetNames,
  223. DatasetInfos: datasetInfos,
  224. Spec: spec,
  225. CodeStoragePath: codeStoragePath,
  226. CodeName: strings.ToLower(repo.Name),
  227. }
  228. if form.ModelName != "" { //使用预训练模型训练
  229. _, err := models.QueryModelByPath(form.PreTrainModelUrl)
  230. if err != nil {
  231. log.Error("Can not find model", err)
  232. grampusNotebookNewDataPrepare(ctx, processType)
  233. ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_not_exist"), tpl, &form)
  234. return
  235. }
  236. req.ModelName = form.ModelName
  237. req.LabelName = form.LabelName
  238. req.CkptName = form.CkptName
  239. req.ModelVersion = form.ModelVersion
  240. req.PreTrainModelUrl = form.PreTrainModelUrl
  241. req.PreTrainModelPath = getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  242. }
  243. _, err = grampus.GenerateNotebookJob(ctx, req)
  244. if err != nil {
  245. log.Error("GenerateNotebookJob failed:%v", err.Error(), ctx.Data["MsgID"])
  246. grampusTrainJobNewDataPrepare(ctx, processType)
  247. ctx.RenderWithErr(err.Error(), tpl, &form)
  248. return
  249. }
  250. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  251. }
  252. func grampusNotebookNewDataPrepare(ctx *context.Context, processType string) error {
  253. ctx.Data["PageIsCloudBrain"] = true
  254. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  255. ctx.Data["display_job_name"] = displayJobName
  256. //get valid images
  257. if processType == grampus.ProcessorTypeNPU {
  258. images, err := grampus.GetImages(processType, string(models.JobTypeDebug))
  259. if err != nil {
  260. log.Error("GetImages failed:", err.Error())
  261. } else {
  262. ctx.Data["images"] = images.Infos
  263. }
  264. }
  265. //prepare available specs
  266. computeResourceSimple := models.GPU
  267. datasetType := models.TypeCloudBrainOne
  268. computeResource := models.GPUResource
  269. if processType == grampus.ProcessorTypeNPU {
  270. computeResourceSimple = models.NPU
  271. datasetType = models.TypeCloudBrainTwo
  272. computeResource = models.NPUResource
  273. }
  274. prepareGrampusSpecs(ctx, computeResourceSimple, models.JobTypeDebug)
  275. //get branches
  276. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  277. if err != nil {
  278. log.Error("GetBranches error:", err.Error())
  279. } else {
  280. ctx.Data["branches"] = branches
  281. }
  282. ctx.Data["branchName"] = ctx.Repo.BranchName
  283. ctx.Data["datasetType"] = datasetType
  284. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, computeResource, models.JobTypeDebug)
  285. ctx.Data["WaitCount"] = waitCount
  286. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeResource)
  287. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  288. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  289. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  290. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  291. return nil
  292. }
  293. func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error {
  294. ctx.Data["PageIsCloudBrain"] = true
  295. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  296. ctx.Data["display_job_name"] = displayJobName
  297. //get valid images
  298. if processType == grampus.ProcessorTypeNPU {
  299. images, err := grampus.GetImages(processType, string(models.JobTypeTrain))
  300. if err != nil {
  301. log.Error("GetImages failed:", err.Error())
  302. } else {
  303. ctx.Data["images"] = images.Infos
  304. }
  305. }
  306. //prepare available specs
  307. if processType == grampus.ProcessorTypeNPU {
  308. prepareGrampusSpecs(ctx, models.NPU)
  309. } else if processType == grampus.ProcessorTypeGPU {
  310. prepareGrampusSpecs(ctx, models.GPU)
  311. }
  312. //get branches
  313. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  314. if err != nil {
  315. log.Error("GetBranches error:", err.Error())
  316. } else {
  317. ctx.Data["branches"] = branches
  318. }
  319. ctx.Data["branchName"] = ctx.Repo.BranchName
  320. if processType == grampus.ProcessorTypeGPU {
  321. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  322. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.GPUResource, models.JobTypeTrain)
  323. ctx.Data["WaitCount"] = waitCount
  324. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
  325. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  326. } else if processType == grampus.ProcessorTypeNPU {
  327. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  328. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.NPUResource, models.JobTypeTrain)
  329. ctx.Data["WaitCount"] = waitCount
  330. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
  331. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  332. }
  333. if ctx.Cloudbrain != nil {
  334. uuids, datasetNames := dataset.GetFilterDeletedAttachments(ctx.Cloudbrain.Uuid)
  335. ctx.Data["attachment"] = uuids
  336. ctx.Data["boot_file"] = ctx.Cloudbrain.BootFile
  337. ctx.Data["image_id"] = ctx.Cloudbrain.ImageID
  338. ctx.Data["run_para_list"] = ctx.Cloudbrain.Parameters
  339. ctx.Data["description"] = ctx.Cloudbrain.Description
  340. ctx.Data["branch_name"] = ctx.Cloudbrain.BranchName
  341. ctx.Data["engine_name"] = ctx.Cloudbrain.EngineName
  342. ctx.Data["work_server_number"] = ctx.Cloudbrain.WorkServerNumber
  343. if ctx.Cloudbrain.Image != "" {
  344. ctx.Data["image"] = ctx.Cloudbrain.Image
  345. } else {
  346. ctx.Data["image"] = ctx.Cloudbrain.EngineName
  347. }
  348. ctx.Data["dataset_name"] = datasetNames
  349. ctx.Data["model_name"] = ctx.Cloudbrain.ModelName
  350. ctx.Data["model_version"] = ctx.Cloudbrain.ModelVersion
  351. ctx.Data["ckpt_name"] = ctx.Cloudbrain.CkptName
  352. ctx.Data["label_names"] = ctx.Cloudbrain.LabelName
  353. ctx.Data["pre_train_model_url"] = ctx.Cloudbrain.PreTrainModelUrl
  354. spec, _ := resource.GetCloudbrainSpec(ctx.Cloudbrain.ID)
  355. if spec != nil {
  356. ctx.Data["spec_id"] = spec.ID
  357. }
  358. }
  359. return nil
  360. }
  361. func GrampusTrainJobVersionNew(ctx *context.Context) {
  362. task := ctx.Cloudbrain
  363. ctx.Data["IsCreate"] = false
  364. if task.ComputeResource == models.GPUResource {
  365. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  366. if err != nil {
  367. ctx.ServerError("get new train-job version info failed", err)
  368. return
  369. }
  370. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  371. } else if task.ComputeResource == models.NPUResource {
  372. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  373. if err != nil {
  374. ctx.ServerError("get new train-job version info failed", err)
  375. return
  376. }
  377. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  378. }
  379. }
  380. func prepareGrampusSpecs(ctx *context.Context, computeResource string, jobType ...models.JobType) {
  381. tempJobType := models.JobTypeTrain
  382. if len(jobType) > 0 {
  383. tempJobType = jobType[0]
  384. }
  385. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  386. JobType: tempJobType,
  387. ComputeResource: computeResource,
  388. Cluster: models.C2NetCluster,
  389. })
  390. ctx.Data["Specs"] = noteBookSpecs
  391. }
  392. func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error {
  393. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  394. log.Error("the boot file(%s) must be a python file", form.BootFile)
  395. return errors.New("启动文件必须是python文件")
  396. }
  397. if form.BranchName == "" {
  398. log.Error("the branch must not be null!", form.BranchName)
  399. return errors.New("代码分支不能为空!")
  400. }
  401. return nil
  402. }
  // GrampusTrainJobGpuCreate handles submission of the GPU train-job creation
  // form. It sets ctx.Data["IsCreate"] to true and then delegates the whole
  // request to grampusTrainJobGpuCreate.
  403. func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  404. ctx.Data["IsCreate"] = true
  405. grampusTrainJobGpuCreate(ctx, form)
  406. }
  407. func grampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  408. displayJobName := form.DisplayJobName
  409. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  410. uuid := form.Attachment
  411. description := form.Description
  412. bootFile := strings.TrimSpace(form.BootFile)
  413. params := form.Params
  414. repo := ctx.Repo.Repository
  415. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  416. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  417. branchName := form.BranchName
  418. image := strings.TrimSpace(form.Image)
  419. tpl := tplGrampusTrainJobGPUNew
  420. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  421. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  422. if !isOk {
  423. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  424. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  425. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplGrampusTrainJobGPUNew, &form)
  426. return
  427. }
  428. defer lock.UnLock()
  429. if !jobNamePattern.MatchString(displayJobName) {
  430. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  431. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  432. return
  433. }
  434. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  435. if err != nil || !bootFileExist {
  436. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  437. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  438. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
  439. return
  440. }
  441. //check count limit
  442. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
  443. if err != nil {
  444. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  445. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  446. ctx.RenderWithErr("system error", tpl, &form)
  447. return
  448. } else {
  449. if count >= 1 {
  450. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  451. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  452. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  453. return
  454. }
  455. }
  456. //check param
  457. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  458. log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"])
  459. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  460. ctx.RenderWithErr(err.Error(), tpl, &form)
  461. return
  462. }
  463. //check whether the task name in the project is duplicated
  464. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  465. if err == nil {
  466. if len(tasks) != 0 {
  467. log.Error("the job name did already exist", ctx.Data["MsgID"])
  468. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  469. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  470. return
  471. }
  472. } else {
  473. if !models.IsErrJobNotExist(err) {
  474. log.Error("system error, %v", err, ctx.Data["MsgID"])
  475. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  476. ctx.RenderWithErr("system error", tpl, &form)
  477. return
  478. }
  479. }
  480. //check specification
  481. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  482. JobType: models.JobTypeTrain,
  483. ComputeResource: models.GPU,
  484. Cluster: models.C2NetCluster,
  485. })
  486. if err != nil || spec == nil {
  487. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  488. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  489. return
  490. }
  491. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  492. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  493. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  494. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobGPUNew, &form)
  495. return
  496. }
  497. //check dataset
  498. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
  499. if err != nil {
  500. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  501. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  502. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  503. return
  504. }
  505. //prepare code and out path
  506. _, err = ioutil.ReadDir(codeLocalPath)
  507. if err == nil {
  508. os.RemoveAll(codeLocalPath)
  509. }
  510. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  511. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  512. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  513. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  514. return
  515. }
  516. //todo: upload code (send to file_server todo this work?)
  517. //upload code
  518. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  519. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  520. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  521. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  522. return
  523. }
  524. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  525. if err := mkModelPath(modelPath); err != nil {
  526. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  527. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  528. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  529. return
  530. }
  531. //init model readme
  532. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  533. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  534. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  535. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  536. return
  537. }
  538. var datasetRemotePath, allFileName string
  539. for _, datasetInfo := range datasetInfos {
  540. if datasetRemotePath == "" {
  541. datasetRemotePath = datasetInfo.DataLocalPath
  542. allFileName = datasetInfo.FullName
  543. } else {
  544. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  545. allFileName = allFileName + ";" + datasetInfo.FullName
  546. }
  547. }
  548. //prepare command
  549. preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  550. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, form.CkptName, "")
  551. if err != nil {
  552. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  553. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  554. ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
  555. return
  556. }
  557. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  558. req := &grampus.GenerateTrainJobReq{
  559. JobName: jobName,
  560. DisplayJobName: displayJobName,
  561. ComputeResource: models.GPUResource,
  562. ProcessType: grampus.ProcessorTypeGPU,
  563. Command: command,
  564. ImageUrl: image,
  565. Description: description,
  566. BootFile: bootFile,
  567. Uuid: uuid,
  568. CommitID: commitID,
  569. BranchName: branchName,
  570. Params: form.Params,
  571. EngineName: image,
  572. DatasetNames: datasetNames,
  573. DatasetInfos: datasetInfos,
  574. IsLatestVersion: modelarts.IsLatestVersion,
  575. VersionCount: modelarts.VersionCountOne,
  576. WorkServerNumber: 1,
  577. Spec: spec,
  578. }
  579. if form.ModelName != "" { //使用预训练模型训练
  580. req.ModelName = form.ModelName
  581. req.LabelName = form.LabelName
  582. req.CkptName = form.CkptName
  583. req.ModelVersion = form.ModelVersion
  584. req.PreTrainModelUrl = form.PreTrainModelUrl
  585. }
  586. _, err = grampus.GenerateTrainJob(ctx, req)
  587. if err != nil {
  588. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  589. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  590. ctx.RenderWithErr(err.Error(), tpl, &form)
  591. return
  592. }
  593. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  594. }
  595. func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
  596. index := strings.Index(pretrainModelDir, "/")
  597. if index > 0 {
  598. filterBucket := pretrainModelDir[index+1:]
  599. return filterBucket + fileName
  600. } else {
  601. return ""
  602. }
  603. }
  604. func GrampusTrainJobVersionCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  605. ctx.Data["IsCreate"] = false
  606. computeResource := ctx.Query("compute_resource")
  607. if computeResource == models.GPUResource {
  608. grampusTrainJobGpuCreate(ctx, form)
  609. } else if computeResource == models.NPUResource {
  610. grampusTrainJobNpuCreate(ctx, form)
  611. } else {
  612. ctx.ServerError("resource error", errors.New("compute resource is not support"))
  613. return
  614. }
  615. }
  616. func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  617. ctx.Data["IsCreate"] = true
  618. grampusTrainJobNpuCreate(ctx, form)
  619. }
  620. func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  621. displayJobName := form.DisplayJobName
  622. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  623. uuid := form.Attachment
  624. description := form.Description
  625. bootFile := strings.TrimSpace(form.BootFile)
  626. params := form.Params
  627. repo := ctx.Repo.Repository
  628. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  629. codeObsPath := grampus.JobPath + jobName + modelarts.CodePath
  630. //dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  631. branchName := form.BranchName
  632. isLatestVersion := modelarts.IsLatestVersion
  633. versionCount := modelarts.VersionCountOne
  634. engineName := form.EngineName
  635. tpl := tplGrampusTrainJobNPUNew
  636. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  637. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  638. if !isOk {
  639. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  640. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  641. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplGrampusTrainJobNPUNew, &form)
  642. return
  643. }
  644. defer lock.UnLock()
  645. if !jobNamePattern.MatchString(displayJobName) {
  646. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  647. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  648. return
  649. }
  650. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  651. if err != nil || !bootFileExist {
  652. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  653. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  654. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
  655. return
  656. }
  657. //check count limit
  658. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
  659. if err != nil {
  660. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  661. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  662. ctx.RenderWithErr("system error", tpl, &form)
  663. return
  664. } else {
  665. if count >= 1 {
  666. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  667. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  668. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  669. return
  670. }
  671. }
  672. //check param
  673. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  674. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  675. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  676. ctx.RenderWithErr(err.Error(), tpl, &form)
  677. return
  678. }
  679. //check whether the task name in the project is duplicated
  680. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  681. if err == nil {
  682. if len(tasks) != 0 {
  683. log.Error("the job name did already exist", ctx.Data["MsgID"])
  684. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  685. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  686. return
  687. }
  688. } else {
  689. if !models.IsErrJobNotExist(err) {
  690. log.Error("system error, %v", err, ctx.Data["MsgID"])
  691. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  692. ctx.RenderWithErr("system error", tpl, &form)
  693. return
  694. }
  695. }
  696. //check specification
  697. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  698. JobType: models.JobTypeTrain,
  699. ComputeResource: models.NPU,
  700. Cluster: models.C2NetCluster,
  701. })
  702. if err != nil || spec == nil {
  703. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  704. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  705. return
  706. }
  707. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  708. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  709. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  710. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobNPUNew, &form)
  711. return
  712. }
  713. //check dataset
  714. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.NPU)
  715. if err != nil {
  716. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  717. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  718. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  719. return
  720. }
  721. //prepare code and out path
  722. _, err = ioutil.ReadDir(codeLocalPath)
  723. if err == nil {
  724. os.RemoveAll(codeLocalPath)
  725. }
  726. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  727. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  728. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  729. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  730. return
  731. }
  732. //todo: upload code (send to file_server todo this work?)
  733. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
  734. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  735. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  736. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  737. return
  738. }
  739. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  740. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  741. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  742. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  743. return
  744. }
  745. var datasetRemotePath, allFileName string
  746. for _, datasetInfo := range datasetInfos {
  747. if datasetRemotePath == "" {
  748. datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  749. allFileName = datasetInfo.FullName
  750. } else {
  751. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  752. allFileName = allFileName + ";" + datasetInfo.FullName
  753. }
  754. }
  755. //prepare command
  756. preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  757. command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, form.CkptName, grampus.GetNpuModelRemoteObsUrl(jobName))
  758. if err != nil {
  759. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  760. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  761. ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
  762. return
  763. }
  764. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  765. req := &grampus.GenerateTrainJobReq{
  766. JobName: jobName,
  767. DisplayJobName: displayJobName,
  768. ComputeResource: models.NPUResource,
  769. ProcessType: grampus.ProcessorTypeNPU,
  770. Command: command,
  771. ImageId: form.ImageID,
  772. Description: description,
  773. CodeObsPath: codeObsPath,
  774. BootFileUrl: codeObsPath + bootFile,
  775. BootFile: bootFile,
  776. WorkServerNumber: form.WorkServerNumber,
  777. Uuid: uuid,
  778. CommitID: commitID,
  779. IsLatestVersion: isLatestVersion,
  780. BranchName: branchName,
  781. Params: form.Params,
  782. EngineName: engineName,
  783. VersionCount: versionCount,
  784. TotalVersionCount: modelarts.TotalVersionCount,
  785. DatasetNames: datasetNames,
  786. DatasetInfos: datasetInfos,
  787. Spec: spec,
  788. CodeName: strings.ToLower(repo.Name),
  789. }
  790. if form.ModelName != "" { //使用预训练模型训练
  791. req.ModelName = form.ModelName
  792. req.LabelName = form.LabelName
  793. req.CkptName = form.CkptName
  794. req.ModelVersion = form.ModelVersion
  795. req.PreTrainModelUrl = form.PreTrainModelUrl
  796. req.PreTrainModelPath = preTrainModelPath
  797. }
  798. _, err = grampus.GenerateTrainJob(ctx, req)
  799. if err != nil {
  800. log.Error("GenerateTrainJob failed:%v", err.Error())
  801. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  802. ctx.RenderWithErr(err.Error(), tpl, &form)
  803. return
  804. }
  805. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  806. }
  807. func GetGrampusNotebook(ctx *context.APIContext) {
  808. var (
  809. err error
  810. )
  811. ID := ctx.Params(":id")
  812. job, err := models.GetCloudbrainByID(ID)
  813. if err != nil {
  814. ctx.NotFound("", err)
  815. log.Error("GetCloudbrainByID failed:", err)
  816. return
  817. }
  818. jobAfter, err := cloudbrainTask.SyncGrampusNotebookStatus(job)
  819. aiCenterName := cloudbrainService.GetAiCenterShow(jobAfter.AiCenter, ctx.Context)
  820. if err != nil {
  821. ctx.NotFound(err)
  822. log.Error("Sync cloud brain one status failed:", err)
  823. return
  824. }
  825. ctx.JSON(http.StatusOK, map[string]interface{}{
  826. "ID": ID,
  827. "JobName": jobAfter.JobName,
  828. "JobStatus": jobAfter.Status,
  829. "AiCenter": aiCenterName,
  830. "CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"),
  831. "CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"),
  832. "JobDuration": jobAfter.TrainJobDuration,
  833. })
  834. }
  835. func GrampusStopJob(ctx *context.Context) {
  836. var ID = ctx.Params(":id")
  837. var resultCode = "0"
  838. var errorMsg = ""
  839. var status = ""
  840. task := ctx.Cloudbrain
  841. for {
  842. if task.Status == models.GrampusStatusStopped || task.Status == models.GrampusStatusFailed || task.Status == models.GrampusStatusSucceeded {
  843. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  844. resultCode = "-1"
  845. errorMsg = ctx.Tr("cloudbrain.Already_stopped")
  846. break
  847. }
  848. res, err := grampus.StopJob(task.JobID, task.JobType)
  849. if err != nil {
  850. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  851. resultCode = strconv.Itoa(res.ErrorCode)
  852. errorMsg = ctx.Tr("cloudbrain.Stopped_failed")
  853. break
  854. }
  855. oldStatus := task.Status
  856. task.Status = getStopJobResponseStatus(res)
  857. if task.EndTime == 0 {
  858. task.EndTime = timeutil.TimeStampNow()
  859. }
  860. task.ComputeAndSetDuration()
  861. if oldStatus != task.Status {
  862. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  863. }
  864. err = models.UpdateJob(task)
  865. if err != nil {
  866. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  867. resultCode = "-1"
  868. errorMsg = "system error"
  869. break
  870. }
  871. status = task.Status
  872. break
  873. }
  874. ctx.JSON(200, map[string]interface{}{
  875. "result_code": resultCode,
  876. "error_msg": errorMsg,
  877. "status": status,
  878. "id": ID,
  879. "StatusOK": 0,
  880. })
  881. }
  882. func getStopJobResponseStatus(res *models.GrampusStopJobResponse) string {
  883. newStatus := models.GrampusStatusStopping
  884. if res.Status != "" {
  885. newStatus = grampus.TransTrainJobStatus(res.Status)
  886. }
  887. return newStatus
  888. }
  889. func GrampusNotebookDel(ctx *context.Context) {
  890. var listType = ctx.Query("listType")
  891. if err := deleteGrampusJob(ctx); err != nil {
  892. log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
  893. ctx.ServerError(err.Error(), err)
  894. return
  895. }
  896. var isAdminPage = ctx.Query("isadminpage")
  897. var isHomePage = ctx.Query("ishomepage")
  898. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  899. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  900. } else if isHomePage == "true" {
  901. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  902. } else {
  903. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
  904. }
  905. }
  906. func GrampusTrainJobDel(ctx *context.Context) {
  907. var listType = ctx.Query("listType")
  908. if err := deleteGrampusJob(ctx); err != nil {
  909. log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
  910. ctx.ServerError(err.Error(), err)
  911. return
  912. }
  913. var isAdminPage = ctx.Query("isadminpage")
  914. var isHomePage = ctx.Query("ishomepage")
  915. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  916. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  917. } else if isHomePage == "true" {
  918. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  919. } else {
  920. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  921. }
  922. }
  923. func deleteGrampusJob(ctx *context.Context) error {
  924. task := ctx.Cloudbrain
  925. if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
  926. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  927. return errors.New(ctx.Tr("cloudbrain.Not_Stopped"))
  928. }
  929. err := models.DeleteJob(task)
  930. if err != nil {
  931. log.Error("DeleteJob failed: %v", err, ctx.Data["msgID"])
  932. return err
  933. }
  934. storageType := models.TypeCloudBrainOne
  935. if task.ComputeResource == models.NPUResource {
  936. storageType = models.TypeCloudBrainTwo
  937. }
  938. DeleteCloudbrainJobStorage(task.JobName, storageType)
  939. return nil
  940. }
  941. type NotebookDataset struct {
  942. DatasetUrl string `json:"dataset_url"`
  943. }
  944. func GrampusNotebookShow(ctx *context.Context) {
  945. ctx.Data["PageIsCloudBrain"] = true
  946. var task *models.Cloudbrain
  947. task, err := models.GetCloudbrainByIDWithDeleted(ctx.Params(":id"))
  948. if err != nil {
  949. log.Error("GetCloudbrainByID failed:" + err.Error())
  950. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  951. return
  952. }
  953. task.ContainerIp = ""
  954. if task.DeletedAt.IsZero() && cloudbrainTask.IsTaskNotStop(task) { //normal record
  955. result, err := grampus.GetNotebookJob(task.JobID)
  956. if err != nil {
  957. log.Error("GetJob failed:" + err.Error())
  958. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  959. return
  960. }
  961. if result != nil {
  962. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  963. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  964. }
  965. oldStatus := task.Status
  966. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  967. if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
  968. task.Duration = result.JobInfo.RunSec
  969. if task.Duration < 0 {
  970. task.Duration = 0
  971. }
  972. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  973. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  974. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  975. }
  976. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  977. task.EndTime = task.StartTime.Add(task.Duration)
  978. }
  979. task.CorrectCreateUnix()
  980. if oldStatus != task.Status {
  981. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  982. if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
  983. if len(result.JobInfo.Tasks[0].CenterID) == 1 {
  984. urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
  985. }
  986. }
  987. }
  988. }
  989. err = models.UpdateJob(task)
  990. if err != nil {
  991. log.Error("UpdateJob failed:" + err.Error())
  992. }
  993. }
  994. }
  995. if len(task.Parameters) > 0 {
  996. var parameters models.Parameters
  997. err := json.Unmarshal([]byte(task.Parameters), &parameters)
  998. if err != nil {
  999. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1000. ctx.ServerError("system error", err)
  1001. return
  1002. }
  1003. if len(parameters.Parameter) > 0 {
  1004. paramTemp := ""
  1005. for _, Parameter := range parameters.Parameter {
  1006. param := Parameter.Label + " = " + Parameter.Value + "; "
  1007. paramTemp = paramTemp + param
  1008. }
  1009. task.Parameters = paramTemp[:len(paramTemp)-2]
  1010. } else {
  1011. task.Parameters = ""
  1012. }
  1013. }
  1014. user, err := models.GetUserByID(task.UserID)
  1015. if err == nil {
  1016. task.User = user
  1017. }
  1018. prepareSpec4Show(ctx, task)
  1019. ctx.Data["task"] = task
  1020. ctx.Data["datasetDownload"] = getDatasetDownloadInfo(ctx, task)
  1021. ctx.Data["modelDownload"] = getModelDownloadInfo(ctx, task)
  1022. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  1023. ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
  1024. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  1025. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  1026. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  1027. ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
  1028. }
  1029. func getDatasetDownloadInfo(ctx *context.Context, task *models.Cloudbrain) []*models.DatasetDownload {
  1030. datasetDownload := make([]*models.DatasetDownload, 0)
  1031. if ctx.IsSigned {
  1032. if task.Uuid != "" && task.UserID == ctx.User.ID {
  1033. if task.IsGPUTask() {
  1034. return GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1035. } else {
  1036. datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1037. datasetObsUrlList := make([]NotebookDataset, 0)
  1038. _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
  1039. for _, datasetInfo := range datasetDownload {
  1040. for _, datasetObs := range datasetObsUrlList {
  1041. log.Info("datasetObsUrl:" + datasetObs.DatasetUrl + "datasetName:" + datasetInfo.DatasetName)
  1042. if strings.Contains(datasetObs.DatasetUrl, datasetInfo.DatasetName) {
  1043. datasetInfo.DatasetDownloadLink = datasetObs.DatasetUrl
  1044. break
  1045. }
  1046. }
  1047. }
  1048. }
  1049. }
  1050. }
  1051. return datasetDownload
  1052. }
  1053. func getModelDownloadInfo(ctx *context.Context, task *models.Cloudbrain) *models.ModelDownload {
  1054. var modelDownload models.ModelDownload
  1055. if ctx.IsSigned {
  1056. if task.ModelName != "" && task.UserID == ctx.User.ID {
  1057. if task.IsNPUTask() {
  1058. modelDownload = models.ModelDownload{
  1059. Name: task.CkptName,
  1060. DownloadLink: "",
  1061. IsDelete: false,
  1062. }
  1063. if !HasModelFile(task) {
  1064. modelDownload.IsDelete = true
  1065. }
  1066. datasetObsUrlList := make([]NotebookDataset, 0)
  1067. _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
  1068. for _, datasetObs := range datasetObsUrlList {
  1069. if strings.Contains(datasetObs.DatasetUrl, task.CkptName) {
  1070. modelDownload.DownloadLink = datasetObs.DatasetUrl
  1071. break
  1072. }
  1073. }
  1074. }
  1075. }
  1076. }
  1077. return &modelDownload
  1078. }
  1079. func GrampusTrainJobShow(ctx *context.Context) {
  1080. ctx.Data["PageIsCloudBrain"] = true
  1081. var task *models.Cloudbrain
  1082. task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid"))
  1083. if err != nil {
  1084. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  1085. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1086. return
  1087. }
  1088. task.ContainerIp = ""
  1089. task.User, _ = models.GetUserByID(task.UserID)
  1090. if task.DeletedAt.IsZero() { //normal record
  1091. result, err := grampus.GetJob(task.JobID)
  1092. if err != nil {
  1093. log.Error("GetJob failed:" + err.Error())
  1094. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1095. return
  1096. }
  1097. if result != nil {
  1098. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  1099. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  1100. }
  1101. oldStatus := task.Status
  1102. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  1103. if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
  1104. task.Duration = result.JobInfo.RunSec
  1105. if task.Duration < 0 {
  1106. task.Duration = 0
  1107. }
  1108. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  1109. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  1110. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  1111. }
  1112. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  1113. task.EndTime = task.StartTime.Add(task.Duration)
  1114. }
  1115. task.CorrectCreateUnix()
  1116. if oldStatus != task.Status {
  1117. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  1118. if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
  1119. if len(result.JobInfo.Tasks[0].CenterID) == 1 {
  1120. urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
  1121. }
  1122. }
  1123. }
  1124. }
  1125. err = models.UpdateJob(task)
  1126. if err != nil {
  1127. log.Error("UpdateJob failed:" + err.Error())
  1128. }
  1129. }
  1130. }
  1131. if len(task.Parameters) > 0 {
  1132. var parameters models.Parameters
  1133. err := json.Unmarshal([]byte(task.Parameters), &parameters)
  1134. if err != nil {
  1135. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1136. ctx.ServerError("system error", err)
  1137. return
  1138. }
  1139. if len(parameters.Parameter) > 0 {
  1140. paramTemp := ""
  1141. for _, Parameter := range parameters.Parameter {
  1142. param := Parameter.Label + " = " + Parameter.Value + "; "
  1143. paramTemp = paramTemp + param
  1144. }
  1145. task.Parameters = paramTemp[:len(paramTemp)-2]
  1146. } else {
  1147. task.Parameters = ""
  1148. }
  1149. }
  1150. taskList := make([]*models.Cloudbrain, 0)
  1151. taskList = append(taskList, task)
  1152. prepareSpec4Show(ctx, task)
  1153. ctx.Data["version_list_task"] = taskList
  1154. ctx.Data["datasetDownload"] = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1155. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  1156. ctx.Data["displayJobName"] = task.DisplayJobName
  1157. ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
  1158. ctx.HTML(http.StatusOK, tplGrampusTrainJobShow)
  1159. }
  1160. func GrampusDownloadLog(ctx *context.Context) {
  1161. jobID := ctx.Params(":jobid")
  1162. job, err := models.GetCloudbrainByJobID(jobID)
  1163. if err != nil {
  1164. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1165. ctx.ServerError(err.Error(), err)
  1166. return
  1167. }
  1168. content, err := grampus.GetTrainJobLog(job.JobID)
  1169. if err != nil {
  1170. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1171. content = ""
  1172. }
  1173. fileName := job.JobName + "-log.txt"
  1174. ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+fileName)
  1175. ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
  1176. var b []byte = []byte(content)
  1177. ctx.Resp.Write(b)
  1178. }
  1179. func GrampusGetLog(ctx *context.Context) {
  1180. jobID := ctx.Params(":jobid")
  1181. job, err := models.GetCloudbrainByJobID(jobID)
  1182. if err != nil {
  1183. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1184. ctx.ServerError(err.Error(), err)
  1185. return
  1186. }
  1187. content, err := grampus.GetTrainJobLog(job.JobID)
  1188. if err != nil {
  1189. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1190. ctx.JSON(http.StatusOK, map[string]interface{}{
  1191. "JobName": job.JobName,
  1192. "Content": "",
  1193. "CanLogDownload": false,
  1194. })
  1195. return
  1196. }
  1197. canLogDownload := err == nil && job.IsUserHasRight(ctx.User)
  1198. ctx.JSON(http.StatusOK, map[string]interface{}{
  1199. "JobName": job.JobName,
  1200. "Content": content,
  1201. "CanLogDownload": canLogDownload,
  1202. })
  1203. return
  1204. }
  1205. func GrampusMetrics(ctx *context.Context) {
  1206. jobID := ctx.Params(":jobid")
  1207. job, err := models.GetCloudbrainByJobID(jobID)
  1208. if err != nil {
  1209. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1210. ctx.ServerError(err.Error(), err)
  1211. return
  1212. }
  1213. result, err := grampus.GetGrampusMetrics(job.JobID)
  1214. if err != nil {
  1215. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1216. }
  1217. ctx.JSON(http.StatusOK, map[string]interface{}{
  1218. "JobID": jobID,
  1219. "Interval": result.Interval,
  1220. "MetricsInfo": result.MetricsInfo,
  1221. })
  1222. return
  1223. }
  1224. func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName, modelRemoteObsUrl string) (string, error) {
  1225. var command string
  1226. //prepare
  1227. workDir := grampus.NpuWorkDir
  1228. if processorType == grampus.ProcessorTypeNPU {
  1229. command += "pwd;cd " + workDir + grampus.CommandPrepareScriptNpu
  1230. } else if processorType == grampus.ProcessorTypeGPU {
  1231. workDir = grampus.GpuWorkDir
  1232. command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScriptGpu, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject)
  1233. }
  1234. //download code & dataset
  1235. if processorType == grampus.ProcessorTypeNPU {
  1236. //no need to download code & dataset by internet
  1237. } else if processorType == grampus.ProcessorTypeGPU {
  1238. commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
  1239. commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
  1240. command += commandDownload
  1241. }
  1242. //unzip code & dataset
  1243. if processorType == grampus.ProcessorTypeNPU {
  1244. //no need to process
  1245. } else if processorType == grampus.ProcessorTypeGPU {
  1246. unZipDatasetCommand := cloudbrainTask.GenerateDatasetUnzipCommand(datasetName)
  1247. commandUnzip := "cd " + workDir + "code;unzip -q master.zip;rm -f master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
  1248. command += commandUnzip
  1249. }
  1250. command += "echo \"unzip finished;start to exec code;\";"
  1251. // set export
  1252. var commandExport string
  1253. if processorType == grampus.ProcessorTypeNPU {
  1254. commandExport = "export bucket=" + setting.Bucket + " && export remote_path=" + outputRemotePath + ";"
  1255. } else if processorType == grampus.ProcessorTypeGPU {
  1256. commandExport = "export env=" + setting.Grampus.Env + " && export remote_path=" + outputRemotePath + ";"
  1257. }
  1258. command += commandExport
  1259. //exec code
  1260. var parameters models.Parameters
  1261. var paramCode string
  1262. if len(paramSrc) != 0 {
  1263. err := json.Unmarshal([]byte(paramSrc), &parameters)
  1264. if err != nil {
  1265. log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err)
  1266. return command, err
  1267. }
  1268. for _, parameter := range parameters.Parameter {
  1269. paramCode += " --" + parameter.Label + "=" + parameter.Value
  1270. }
  1271. }
  1272. var commandCode string
  1273. if processorType == grampus.ProcessorTypeNPU {
  1274. paramCode += " --model_url=" + modelRemoteObsUrl
  1275. commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py " + grampus.NpuLocalLogUrl + paramCode + ";"
  1276. } else if processorType == grampus.ProcessorTypeGPU {
  1277. if pretrainModelFileName != "" {
  1278. paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName
  1279. }
  1280. commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";"
  1281. }
  1282. command += commandCode
  1283. //get exec result
  1284. commandGetRes := "result=$?;"
  1285. command += commandGetRes
  1286. //upload models
  1287. if processorType == grampus.ProcessorTypeNPU {
  1288. // no need to upload
  1289. } else if processorType == grampus.ProcessorTypeGPU {
  1290. commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;"
  1291. command += commandUpload
  1292. }
  1293. //check exec result
  1294. commandCheckRes := "bash -c \"[[ $result -eq 0 ]] && exit 0 || exit -1\""
  1295. command += commandCheckRes
  1296. return command, nil
  1297. }
  1298. func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string {
  1299. commandDownloadTemp := commandDownload
  1300. if pretrainModelPath != "" {
  1301. commandDownloadTemp += " '" + pretrainModelPath + "' '" + pretrainModelFileName + "'"
  1302. }
  1303. commandDownloadTemp += ";"
  1304. return commandDownloadTemp
  1305. }
  1306. func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
  1307. archiveType := git.ZIP
  1308. archivePath := codePath
  1309. if !com.IsDir(archivePath) {
  1310. if err := os.MkdirAll(archivePath, os.ModePerm); err != nil {
  1311. log.Error("MkdirAll failed:" + err.Error())
  1312. return err
  1313. }
  1314. }
  1315. // Get corresponding commit.
  1316. var (
  1317. commit *git.Commit
  1318. err error
  1319. )
  1320. gitRepo := ctx.Repo.GitRepo
  1321. if err != nil {
  1322. log.Error("OpenRepository failed:" + err.Error())
  1323. return err
  1324. }
  1325. if gitRepo.IsBranchExist(branchName) {
  1326. commit, err = gitRepo.GetBranchCommit(branchName)
  1327. if err != nil {
  1328. log.Error("GetBranchCommit failed:" + err.Error())
  1329. return err
  1330. }
  1331. } else {
  1332. log.Error("the branch is not exist: " + branchName)
  1333. return fmt.Errorf("The branch does not exist.")
  1334. }
  1335. archivePath = path.Join(archivePath, grampus.CodeArchiveName)
  1336. if !com.IsFile(archivePath) {
  1337. if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{
  1338. Format: archiveType,
  1339. Prefix: setting.Repository.PrefixArchiveFiles,
  1340. }); err != nil {
  1341. log.Error("CreateArchive failed:" + err.Error())
  1342. return err
  1343. }
  1344. }
  1345. return nil
  1346. }
  1347. func HandleTaskWithAiCenter(ctx *context.Context) {
  1348. log.Info("HandleTaskWithAiCenter start")
  1349. updateCounts := 0
  1350. cloudBrains, err := models.GetC2NetWithAiCenterWrongJob()
  1351. if err != nil {
  1352. log.Error("GetC2NetWithAiCenterWrongJob failed:" + err.Error())
  1353. return
  1354. }
  1355. if len(cloudBrains) == 0 {
  1356. log.Info("HandleC2NetWithAiCenterWrongJob:no task need handle")
  1357. return
  1358. }
  1359. cloudBrainCounts := len(cloudBrains)
  1360. for _, task := range cloudBrains {
  1361. result, err := grampus.GetJob(task.JobID)
  1362. if err != nil {
  1363. log.Error("GetJob failed:" + err.Error())
  1364. continue
  1365. }
  1366. if len(result.JobInfo.Tasks) != 0 {
  1367. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  1368. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  1369. }
  1370. err = models.UpdateJob(task)
  1371. if err != nil {
  1372. log.Error("UpdateJob failed:" + err.Error())
  1373. }
  1374. updateCounts++
  1375. }
  1376. }
  1377. r := make(map[string]interface{}, 0)
  1378. r["cloudBrainCounts"] = cloudBrainCounts
  1379. r["updateCounts"] = updateCounts
  1380. ctx.JSON(http.StatusOK, response.SuccessWithData(r))
  1381. }
  1382. func GrampusNotebookDebug(ctx *context.Context) {
  1383. result, err := grampus.GetNotebookJob(ctx.Cloudbrain.JobID)
  1384. if err != nil {
  1385. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  1386. return
  1387. }
  1388. if len(result.JobInfo.Tasks) > 0 {
  1389. ctx.Redirect(result.JobInfo.Tasks[0].Url + "?token=" + result.JobInfo.Tasks[0].Token)
  1390. return
  1391. }
  1392. ctx.NotFound("Can not find the job.", nil)
  1393. }
  1394. func GrampusNotebookRestart(ctx *context.Context) {
  1395. var id = ctx.Params(":id")
  1396. var resultCode = "-1"
  1397. var errorMsg = ""
  1398. var status = ""
  1399. var spec *models.Specification
  1400. task := ctx.Cloudbrain
  1401. if ctx.Written() {
  1402. return
  1403. }
  1404. for {
  1405. if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
  1406. log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
  1407. errorMsg = "the job is not stopped"
  1408. break
  1409. }
  1410. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), task.ComputeResource)
  1411. if err != nil {
  1412. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1413. errorMsg = "system error"
  1414. break
  1415. } else {
  1416. if count >= 1 {
  1417. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  1418. resultCode = "2"
  1419. errorMsg = ctx.Tr("repo.cloudbrain.morethanonejob")
  1420. break
  1421. }
  1422. }
  1423. oldSpec, err := resource.GetCloudbrainSpec(task.ID)
  1424. if err != nil || oldSpec == nil {
  1425. log.Error("NotebookManage GetCloudbrainSpec error.%v", err)
  1426. errorMsg = "Resource specification not available"
  1427. break
  1428. }
  1429. computeSourceSimple := models.GPU
  1430. action := models.ActionCreateGrampusGPUDebugTask
  1431. if task.ComputeResource == models.NPUResource {
  1432. computeSourceSimple = models.NPU
  1433. action = models.ActionCreateGrampusNPUDebugTask
  1434. }
  1435. spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{
  1436. JobType: models.JobType(task.JobType),
  1437. ComputeResource: computeSourceSimple,
  1438. Cluster: models.C2NetCluster,
  1439. })
  1440. if err != nil || spec == nil {
  1441. log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID)
  1442. errorMsg = "Resource specification not support any more"
  1443. break
  1444. }
  1445. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  1446. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  1447. errorMsg = ctx.Tr("points.insufficient_points_balance")
  1448. break
  1449. }
  1450. if task.IsGPUTask() {
  1451. if _, err := os.Stat(getOldJobPath(task)); err != nil {
  1452. log.Error("Can not find job minio path", err)
  1453. resultCode = "-1"
  1454. errorMsg = ctx.Tr("cloudbrain.result_cleared")
  1455. break
  1456. }
  1457. }
  1458. if !HasModelFile(task) { //使用预训练模型训练
  1459. errorMsg = ctx.Tr("repo.debug.manage.model_not_exist")
  1460. break
  1461. }
  1462. if hasDatasetDeleted(task) {
  1463. errorMsg = ctx.Tr("repo.debug.manage.dataset_not_exist")
  1464. break
  1465. }
  1466. createTime := timeutil.TimeStampNow()
  1467. res, err := grampus.RestartNotebookJob(task.JobID)
  1468. if err != nil {
  1469. log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"])
  1470. errorMsg = ctx.Tr("repo.debug_again_fail")
  1471. break
  1472. }
  1473. if res.GrampusResult.ErrorCode != 0 || res.NewId == "" {
  1474. log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg)
  1475. errorMsg = ctx.Tr("repo.debug_again_fail")
  1476. if res.GrampusResult.ErrorCode == 5005 {
  1477. errorMsg = ctx.Tr("repo.debug_again_fail_forever")
  1478. }
  1479. break
  1480. }
  1481. newTask := &models.Cloudbrain{
  1482. Status: res.Status,
  1483. UserID: task.UserID,
  1484. RepoID: task.RepoID,
  1485. JobID: res.NewId,
  1486. JobName: task.JobName,
  1487. DisplayJobName: task.DisplayJobName,
  1488. JobType: task.JobType,
  1489. Type: task.Type,
  1490. Uuid: task.Uuid,
  1491. Image: task.Image,
  1492. ImageID: task.ImageID,
  1493. EngineID: task.EngineID,
  1494. CommitID: task.CommitID,
  1495. EngineName: task.EngineName,
  1496. IsLatestVersion: "1",
  1497. BranchName: task.BranchName,
  1498. DatasetName: task.DatasetName,
  1499. ComputeResource: task.ComputeResource,
  1500. Description: task.Description,
  1501. CreatedUnix: createTime,
  1502. UpdatedUnix: createTime,
  1503. Spec: spec,
  1504. ModelName: task.ModelName,
  1505. ModelVersion: task.ModelVersion,
  1506. LabelName: task.LabelName,
  1507. PreTrainModelUrl: task.PreTrainModelUrl,
  1508. CkptName: task.CkptName,
  1509. WorkServerNumber: 1,
  1510. }
  1511. err = models.RestartCloudbrain(task, newTask)
  1512. if err != nil {
  1513. log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  1514. errorMsg = "system error"
  1515. break
  1516. }
  1517. id = strconv.FormatInt(newTask.ID, 10)
  1518. status = res.Status
  1519. resultCode = "0"
  1520. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, id, newTask.DisplayJobName, action)
  1521. break
  1522. }
  1523. ctx.JSON(200, map[string]string{
  1524. "result_code": resultCode,
  1525. "error_msg": errorMsg,
  1526. "status": status,
  1527. "id": id,
  1528. })
  1529. }