You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

grampus.go 61 kB

3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822
  1. package repo
  2. import (
  3. "code.gitea.io/gitea/services/lock"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io/ioutil"
  8. "net/http"
  9. "os"
  10. "path"
  11. "strconv"
  12. "strings"
  13. "code.gitea.io/gitea/modules/urfs_client/urchin"
  14. "code.gitea.io/gitea/routers/response"
  15. "code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
  16. "code.gitea.io/gitea/modules/dataset"
  17. "code.gitea.io/gitea/services/cloudbrain/resource"
  18. "code.gitea.io/gitea/services/reward/point/account"
  19. "code.gitea.io/gitea/modules/auth"
  20. "code.gitea.io/gitea/modules/git"
  21. "code.gitea.io/gitea/modules/grampus"
  22. "code.gitea.io/gitea/modules/modelarts"
  23. "code.gitea.io/gitea/modules/notification"
  24. "code.gitea.io/gitea/modules/timeutil"
  25. "code.gitea.io/gitea/modules/util"
  26. "github.com/unknwon/com"
  27. "code.gitea.io/gitea/models"
  28. "code.gitea.io/gitea/modules/base"
  29. "code.gitea.io/gitea/modules/cloudbrain"
  30. "code.gitea.io/gitea/modules/context"
  31. "code.gitea.io/gitea/modules/log"
  32. "code.gitea.io/gitea/modules/setting"
  33. cloudbrainService "code.gitea.io/gitea/services/cloudbrain"
  34. )
  35. const (
  36. tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show"
  37. tplGrampusNotebookShow base.TplName = "repo/grampus/notebook/show"
  38. //GPU
  39. tplGrampusNotebookGPUNew base.TplName = "repo/grampus/notebook/gpu/new"
  40. tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"
  41. //NPU
  42. tplGrampusNotebookNPUNew base.TplName = "repo/grampus/notebook/npu/new"
  43. tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
  44. //GCU
  45. tplGrampusNotebookGCUNew base.TplName = "repo/grampus/notebook/gcu/new"
  46. )
  47. func GrampusNotebookNew(ctx *context.Context) {
  48. ctx.Data["IsCreate"] = true
  49. notebookType := ctx.QueryInt("type")
  50. processType := grampus.ProcessorTypeGPU
  51. if notebookType == 1 {
  52. processType = grampus.ProcessorTypeNPU
  53. } else if notebookType == 2 {
  54. processType = grampus.ProcessorTypeGCU
  55. }
  56. err := grampusNotebookNewDataPrepare(ctx, processType)
  57. if err != nil {
  58. ctx.ServerError("get new notebook-job info failed", err)
  59. return
  60. }
  61. if processType == grampus.ProcessorTypeGPU {
  62. ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew)
  63. } else if processType == grampus.ProcessorTypeNPU {
  64. ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew)
  65. } else if processType == grampus.ProcessorTypeGCU {
  66. ctx.HTML(http.StatusOK, tplGrampusNotebookGCUNew)
  67. }
  68. }
  69. func GrampusTrainJobGPUNew(ctx *context.Context) {
  70. ctx.Data["IsCreate"] = true
  71. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  72. if err != nil {
  73. ctx.ServerError("get new train-job info failed", err)
  74. return
  75. }
  76. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  77. }
  78. func GrampusTrainJobNPUNew(ctx *context.Context) {
  79. ctx.Data["IsCreate"] = true
  80. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  81. if err != nil {
  82. ctx.ServerError("get new train-job info failed", err)
  83. return
  84. }
  85. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  86. }
  87. func GrampusNotebookCreate(ctx *context.Context, form auth.CreateGrampusNotebookForm) {
  88. ctx.Data["IsCreate"] = true
  89. displayJobName := form.DisplayJobName
  90. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  91. uuid := form.Attachment
  92. description := form.Description
  93. repo := ctx.Repo.Repository
  94. branchName := form.BranchName
  95. image := strings.TrimSpace(form.Image)
  96. codeStoragePath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  97. tpl := tplGrampusNotebookGPUNew
  98. processType := grampus.ProcessorTypeGPU
  99. computeSource := models.GPUResource
  100. computeSourceSimple := models.GPU
  101. if form.Type == 1 {
  102. tpl = tplGrampusNotebookNPUNew
  103. processType = grampus.ProcessorTypeNPU
  104. computeSource = models.NPUResource
  105. computeSourceSimple = models.NPU
  106. codeStoragePath = grampus.JobPath + jobName + modelarts.CodePath
  107. } else if form.Type == 2 {
  108. tpl = tplGrampusNotebookGCUNew
  109. processType = grampus.ProcessorTypeGCU
  110. computeSource = models.GCUResource
  111. computeSourceSimple = models.GCU
  112. codeStoragePath = setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  113. }
  114. lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: displayJobName, JobType: string(models.JobTypeDebug)}, User: ctx.User})
  115. defer func() {
  116. if lockOperator != nil {
  117. lockOperator.Unlock()
  118. }
  119. }()
  120. if errMsg != "" {
  121. log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
  122. grampusNotebookNewDataPrepare(ctx, processType)
  123. ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
  124. return
  125. }
  126. if !jobNamePattern.MatchString(displayJobName) {
  127. grampusNotebookNewDataPrepare(ctx, processType)
  128. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  129. return
  130. }
  131. //check count limit
  132. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeSource)
  133. if err != nil {
  134. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  135. grampusNotebookNewDataPrepare(ctx, processType)
  136. ctx.RenderWithErr("system error", tpl, &form)
  137. return
  138. } else {
  139. if count >= 1 {
  140. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  141. grampusNotebookNewDataPrepare(ctx, processType)
  142. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  143. return
  144. }
  145. }
  146. //check whether the task name in the project is duplicated
  147. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName)
  148. if err == nil {
  149. if len(tasks) != 0 {
  150. log.Error("the job name did already exist", ctx.Data["MsgID"])
  151. grampusNotebookNewDataPrepare(ctx, processType)
  152. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  153. return
  154. }
  155. } else {
  156. if !models.IsErrJobNotExist(err) {
  157. log.Error("system error, %v", err, ctx.Data["MsgID"])
  158. grampusNotebookNewDataPrepare(ctx, processType)
  159. ctx.RenderWithErr("system error", tpl, &form)
  160. return
  161. }
  162. }
  163. //check specification
  164. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  165. JobType: models.JobTypeDebug,
  166. ComputeResource: computeSourceSimple,
  167. Cluster: models.C2NetCluster,
  168. })
  169. if err != nil || spec == nil {
  170. grampusNotebookNewDataPrepare(ctx, processType)
  171. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  172. return
  173. }
  174. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  175. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  176. grampusNotebookNewDataPrepare(ctx, processType)
  177. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
  178. return
  179. }
  180. var datasetInfos map[string]models.DatasetInfo
  181. var datasetNames string
  182. //var
  183. if uuid != "" {
  184. datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid, computeSourceSimple)
  185. if err != nil {
  186. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  187. grampusNotebookNewDataPrepare(ctx, processType)
  188. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  189. return
  190. }
  191. uuidArray := strings.Split(uuid, ";")
  192. if datasetInfos == nil || len(datasetInfos) < len(uuidArray) {
  193. grampusNotebookNewDataPrepare(ctx, processType)
  194. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.partial_datasets_not_available"), tpl, &form)
  195. return
  196. }
  197. }
  198. //prepare code and out path
  199. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  200. _, err = ioutil.ReadDir(codeLocalPath)
  201. if err == nil {
  202. os.RemoveAll(codeLocalPath)
  203. }
  204. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  205. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  206. grampusNotebookNewDataPrepare(ctx, processType)
  207. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  208. return
  209. }
  210. if processType == grampus.ProcessorTypeGPU || processType == grampus.ProcessorTypeGCU {
  211. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  212. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  213. grampusNotebookNewDataPrepare(ctx, processType)
  214. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  215. return
  216. }
  217. } else {
  218. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  219. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  220. grampusNotebookNewDataPrepare(ctx, processType)
  221. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  222. return
  223. }
  224. }
  225. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  226. req := &grampus.GenerateNotebookJobReq{
  227. JobName: jobName,
  228. DisplayJobName: displayJobName,
  229. ComputeResource: computeSource,
  230. ProcessType: processType,
  231. ImageUrl: image,
  232. ImageId: form.ImageID,
  233. Description: description,
  234. Uuid: uuid,
  235. CommitID: commitID,
  236. BranchName: branchName,
  237. DatasetNames: datasetNames,
  238. DatasetInfos: datasetInfos,
  239. Spec: spec,
  240. CodeStoragePath: codeStoragePath,
  241. CodeName: strings.ToLower(repo.Name),
  242. }
  243. if form.ModelName != "" { //使用预训练模型训练
  244. m, err := models.QueryModelByPath(form.PreTrainModelUrl)
  245. if err != nil {
  246. log.Error("Can not find model", err)
  247. grampusNotebookNewDataPrepare(ctx, processType)
  248. ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_not_exist"), tpl, &form)
  249. return
  250. }
  251. if !cloudbrainTask.IsModelFileExists(m, form.CkptName) {
  252. log.Error("model file not exist.name = %s", form.CkptName)
  253. grampusNotebookNewDataPrepare(ctx, processType)
  254. ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_file_not_exist"), tpl, &form)
  255. return
  256. }
  257. req.ModelName = form.ModelName
  258. req.LabelName = form.LabelName
  259. req.CkptName = form.CkptName
  260. req.ModelVersion = form.ModelVersion
  261. req.PreTrainModelUrl = form.PreTrainModelUrl
  262. req.PreTrainModelPath = getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  263. req.ModelStorageType = m.Type
  264. }
  265. _, err = grampus.GenerateNotebookJob(ctx, req)
  266. if err != nil {
  267. log.Error("GenerateNotebookJob failed:%v", err.Error(), ctx.Data["MsgID"])
  268. grampusTrainJobNewDataPrepare(ctx, processType)
  269. ctx.RenderWithErr(err.Error(), tpl, &form)
  270. return
  271. }
  272. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  273. }
  274. func grampusNotebookNewDataPrepare(ctx *context.Context, processType string) error {
  275. ctx.Data["PageIsCloudBrain"] = true
  276. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  277. ctx.Data["display_job_name"] = displayJobName
  278. //get valid images
  279. if processType == grampus.ProcessorTypeNPU || processType == grampus.ProcessorTypeGCU {
  280. images, err := grampus.GetImages(processType, string(models.JobTypeDebug))
  281. if err != nil {
  282. log.Error("GetImages failed:", err.Error())
  283. } else {
  284. ctx.Data["images"] = images.Infos
  285. }
  286. }
  287. //prepare available specs
  288. computeResourceSimple := models.GPU
  289. datasetType := models.TypeCloudBrainOne
  290. computeResource := models.GPUResource
  291. if processType == grampus.ProcessorTypeNPU {
  292. computeResourceSimple = models.NPU
  293. datasetType = models.TypeCloudBrainTwo
  294. computeResource = models.NPUResource
  295. } else if processType == grampus.ProcessorTypeGCU {
  296. computeResourceSimple = models.GCU
  297. datasetType = models.TypeCloudBrainAll
  298. computeResource = models.GCUResource
  299. }
  300. prepareGrampusSpecs(ctx, computeResourceSimple, models.JobTypeDebug)
  301. //get branches
  302. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  303. if err != nil {
  304. log.Error("GetBranches error:", err.Error())
  305. } else {
  306. ctx.Data["branches"] = branches
  307. }
  308. ctx.Data["branchName"] = ctx.Repo.BranchName
  309. ctx.Data["datasetType"] = datasetType
  310. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, computeResource, models.JobTypeDebug)
  311. ctx.Data["WaitCount"] = waitCount
  312. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeResource)
  313. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  314. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  315. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  316. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  317. return nil
  318. }
  319. func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error {
  320. ctx.Data["PageIsCloudBrain"] = true
  321. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  322. ctx.Data["display_job_name"] = displayJobName
  323. //get valid images
  324. if processType == grampus.ProcessorTypeNPU {
  325. images, err := grampus.GetImages(processType, string(models.JobTypeTrain))
  326. if err != nil {
  327. log.Error("GetImages failed:", err.Error())
  328. } else {
  329. ctx.Data["images"] = images.Infos
  330. }
  331. }
  332. //prepare available specs
  333. if processType == grampus.ProcessorTypeNPU {
  334. prepareGrampusSpecs(ctx, models.NPU)
  335. } else if processType == grampus.ProcessorTypeGPU {
  336. prepareGrampusSpecs(ctx, models.GPU)
  337. }
  338. //get branches
  339. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  340. if err != nil {
  341. log.Error("GetBranches error:", err.Error())
  342. } else {
  343. ctx.Data["branches"] = branches
  344. }
  345. ctx.Data["branchName"] = ctx.Repo.BranchName
  346. if processType == grampus.ProcessorTypeGPU {
  347. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  348. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.GPUResource, models.JobTypeTrain)
  349. ctx.Data["WaitCount"] = waitCount
  350. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
  351. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  352. } else if processType == grampus.ProcessorTypeNPU {
  353. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  354. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.NPUResource, models.JobTypeTrain)
  355. ctx.Data["WaitCount"] = waitCount
  356. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
  357. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  358. }
  359. if ctx.Cloudbrain != nil {
  360. uuids, datasetNames := dataset.GetFilterDeletedAttachments(ctx.Cloudbrain.Uuid)
  361. ctx.Data["attachment"] = uuids
  362. ctx.Data["boot_file"] = ctx.Cloudbrain.BootFile
  363. ctx.Data["image_id"] = ctx.Cloudbrain.ImageID
  364. ctx.Data["run_para_list"] = ctx.Cloudbrain.Parameters
  365. ctx.Data["description"] = ctx.Cloudbrain.Description
  366. ctx.Data["branch_name"] = ctx.Cloudbrain.BranchName
  367. ctx.Data["engine_name"] = ctx.Cloudbrain.EngineName
  368. ctx.Data["work_server_number"] = ctx.Cloudbrain.WorkServerNumber
  369. if ctx.Cloudbrain.Image != "" {
  370. ctx.Data["image"] = ctx.Cloudbrain.Image
  371. } else {
  372. ctx.Data["image"] = ctx.Cloudbrain.EngineName
  373. }
  374. ctx.Data["dataset_name"] = datasetNames
  375. ctx.Data["model_name"] = ctx.Cloudbrain.ModelName
  376. ctx.Data["model_version"] = ctx.Cloudbrain.ModelVersion
  377. ctx.Data["ckpt_name"] = ctx.Cloudbrain.CkptName
  378. ctx.Data["label_names"] = ctx.Cloudbrain.LabelName
  379. ctx.Data["pre_train_model_url"] = ctx.Cloudbrain.PreTrainModelUrl
  380. spec, _ := resource.GetCloudbrainSpec(ctx.Cloudbrain.ID)
  381. if spec != nil {
  382. ctx.Data["spec_id"] = spec.ID
  383. }
  384. }
  385. return nil
  386. }
  387. func GrampusTrainJobVersionNew(ctx *context.Context) {
  388. task := ctx.Cloudbrain
  389. ctx.Data["IsCreate"] = false
  390. if task.ComputeResource == models.GPUResource {
  391. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  392. if err != nil {
  393. ctx.ServerError("get new train-job version info failed", err)
  394. return
  395. }
  396. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  397. } else if task.ComputeResource == models.NPUResource {
  398. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  399. if err != nil {
  400. ctx.ServerError("get new train-job version info failed", err)
  401. return
  402. }
  403. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  404. }
  405. }
  406. func prepareGrampusSpecs(ctx *context.Context, computeResource string, jobType ...models.JobType) {
  407. tempJobType := models.JobTypeTrain
  408. if len(jobType) > 0 {
  409. tempJobType = jobType[0]
  410. }
  411. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  412. JobType: tempJobType,
  413. ComputeResource: computeResource,
  414. Cluster: models.C2NetCluster,
  415. })
  416. ctx.Data["Specs"] = noteBookSpecs
  417. }
  418. func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error {
  419. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  420. log.Error("the boot file(%s) must be a python file", form.BootFile)
  421. return errors.New("启动文件必须是python文件")
  422. }
  423. if form.BranchName == "" {
  424. log.Error("the branch must not be null!", form.BranchName)
  425. return errors.New("代码分支不能为空!")
  426. }
  427. return nil
  428. }
  429. func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  430. ctx.Data["IsCreate"] = true
  431. grampusTrainJobGpuCreate(ctx, form)
  432. }
  433. func grampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  434. displayJobName := form.DisplayJobName
  435. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  436. uuid := form.Attachment
  437. description := form.Description
  438. bootFile := strings.TrimSpace(form.BootFile)
  439. params := form.Params
  440. repo := ctx.Repo.Repository
  441. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  442. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  443. branchName := form.BranchName
  444. image := strings.TrimSpace(form.Image)
  445. tpl := tplGrampusTrainJobGPUNew
  446. if !jobNamePattern.MatchString(displayJobName) {
  447. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  448. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  449. return
  450. }
  451. lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: displayJobName, JobType: string(models.JobTypeTrain)}, User: ctx.User})
  452. defer func() {
  453. if lockOperator != nil {
  454. lockOperator.Unlock()
  455. }
  456. }()
  457. if errMsg != "" {
  458. log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
  459. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  460. ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
  461. return
  462. }
  463. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  464. if err != nil || !bootFileExist {
  465. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  466. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  467. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
  468. return
  469. }
  470. //check count limit
  471. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
  472. if err != nil {
  473. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  474. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  475. ctx.RenderWithErr("system error", tpl, &form)
  476. return
  477. } else {
  478. if count >= 1 {
  479. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  480. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  481. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  482. return
  483. }
  484. }
  485. //check param
  486. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  487. log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"])
  488. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  489. ctx.RenderWithErr(err.Error(), tpl, &form)
  490. return
  491. }
  492. //check whether the task name in the project is duplicated
  493. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  494. if err == nil {
  495. if len(tasks) != 0 {
  496. log.Error("the job name did already exist", ctx.Data["MsgID"])
  497. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  498. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  499. return
  500. }
  501. } else {
  502. if !models.IsErrJobNotExist(err) {
  503. log.Error("system error, %v", err, ctx.Data["MsgID"])
  504. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  505. ctx.RenderWithErr("system error", tpl, &form)
  506. return
  507. }
  508. }
  509. //check specification
  510. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  511. JobType: models.JobTypeTrain,
  512. ComputeResource: models.GPU,
  513. Cluster: models.C2NetCluster,
  514. })
  515. if err != nil || spec == nil {
  516. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  517. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  518. return
  519. }
  520. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  521. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  522. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  523. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobGPUNew, &form)
  524. return
  525. }
  526. //check dataset
  527. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
  528. if err != nil {
  529. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  530. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  531. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  532. return
  533. }
  534. //prepare code and out path
  535. _, err = ioutil.ReadDir(codeLocalPath)
  536. if err == nil {
  537. os.RemoveAll(codeLocalPath)
  538. }
  539. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  540. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  541. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  542. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  543. return
  544. }
  545. //todo: upload code (send to file_server todo this work?)
  546. //upload code
  547. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  548. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  549. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  550. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  551. return
  552. }
  553. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  554. if err := mkModelPath(modelPath); err != nil {
  555. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  556. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  557. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  558. return
  559. }
  560. //init model readme
  561. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  562. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  563. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  564. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  565. return
  566. }
  567. var datasetRemotePath, allFileName string
  568. for _, datasetInfo := range datasetInfos {
  569. if datasetRemotePath == "" {
  570. datasetRemotePath = datasetInfo.DataLocalPath
  571. allFileName = datasetInfo.FullName
  572. } else {
  573. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  574. allFileName = allFileName + ";" + datasetInfo.FullName
  575. }
  576. }
  577. //prepare command
  578. preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  579. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, form.CkptName, "")
  580. if err != nil {
  581. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  582. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  583. ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
  584. return
  585. }
  586. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  587. req := &grampus.GenerateTrainJobReq{
  588. JobName: jobName,
  589. DisplayJobName: displayJobName,
  590. ComputeResource: models.GPUResource,
  591. ProcessType: grampus.ProcessorTypeGPU,
  592. Command: command,
  593. ImageUrl: image,
  594. Description: description,
  595. BootFile: bootFile,
  596. Uuid: uuid,
  597. CommitID: commitID,
  598. BranchName: branchName,
  599. Params: form.Params,
  600. EngineName: image,
  601. DatasetNames: datasetNames,
  602. DatasetInfos: datasetInfos,
  603. IsLatestVersion: modelarts.IsLatestVersion,
  604. VersionCount: modelarts.VersionCountOne,
  605. WorkServerNumber: 1,
  606. Spec: spec,
  607. }
  608. if form.ModelName != "" { //使用预训练模型训练
  609. req.ModelName = form.ModelName
  610. req.LabelName = form.LabelName
  611. req.CkptName = form.CkptName
  612. req.ModelVersion = form.ModelVersion
  613. req.PreTrainModelUrl = form.PreTrainModelUrl
  614. }
  615. _, err = grampus.GenerateTrainJob(ctx, req)
  616. if err != nil {
  617. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  618. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  619. ctx.RenderWithErr(err.Error(), tpl, &form)
  620. return
  621. }
  622. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  623. }
  624. func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
  625. index := strings.Index(pretrainModelDir, "/")
  626. if index > 0 {
  627. filterBucket := pretrainModelDir[index+1:]
  628. return filterBucket + fileName
  629. } else {
  630. return ""
  631. }
  632. }
  633. func GrampusTrainJobVersionCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  634. ctx.Data["IsCreate"] = false
  635. computeResource := ctx.Query("compute_resource")
  636. if computeResource == models.GPUResource {
  637. grampusTrainJobGpuCreate(ctx, form)
  638. } else if computeResource == models.NPUResource {
  639. grampusTrainJobNpuCreate(ctx, form)
  640. } else {
  641. ctx.ServerError("resource error", errors.New("compute resource is not support"))
  642. return
  643. }
  644. }
  645. func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  646. ctx.Data["IsCreate"] = true
  647. grampusTrainJobNpuCreate(ctx, form)
  648. }
  649. func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  650. displayJobName := form.DisplayJobName
  651. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  652. uuid := form.Attachment
  653. description := form.Description
  654. bootFile := strings.TrimSpace(form.BootFile)
  655. params := form.Params
  656. repo := ctx.Repo.Repository
  657. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  658. codeObsPath := grampus.JobPath + jobName + modelarts.CodePath
  659. //dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  660. branchName := form.BranchName
  661. isLatestVersion := modelarts.IsLatestVersion
  662. versionCount := modelarts.VersionCountOne
  663. engineName := form.EngineName
  664. tpl := tplGrampusTrainJobNPUNew
  665. if !jobNamePattern.MatchString(displayJobName) {
  666. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  667. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  668. return
  669. }
  670. lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: displayJobName, JobType: string(models.JobTypeTrain)}, User: ctx.User})
  671. defer func() {
  672. if lockOperator != nil {
  673. lockOperator.Unlock()
  674. }
  675. }()
  676. if errMsg != "" {
  677. log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
  678. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  679. ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
  680. return
  681. }
  682. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  683. if err != nil || !bootFileExist {
  684. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  685. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  686. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
  687. return
  688. }
  689. //check count limit
  690. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
  691. if err != nil {
  692. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  693. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  694. ctx.RenderWithErr("system error", tpl, &form)
  695. return
  696. } else {
  697. if count >= 1 {
  698. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  699. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  700. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  701. return
  702. }
  703. }
  704. //check param
  705. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  706. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  707. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  708. ctx.RenderWithErr(err.Error(), tpl, &form)
  709. return
  710. }
  711. //check whether the task name in the project is duplicated
  712. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  713. if err == nil {
  714. if len(tasks) != 0 {
  715. log.Error("the job name did already exist", ctx.Data["MsgID"])
  716. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  717. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  718. return
  719. }
  720. } else {
  721. if !models.IsErrJobNotExist(err) {
  722. log.Error("system error, %v", err, ctx.Data["MsgID"])
  723. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  724. ctx.RenderWithErr("system error", tpl, &form)
  725. return
  726. }
  727. }
  728. //check specification
  729. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  730. JobType: models.JobTypeTrain,
  731. ComputeResource: models.NPU,
  732. Cluster: models.C2NetCluster,
  733. })
  734. if err != nil || spec == nil {
  735. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  736. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  737. return
  738. }
  739. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  740. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  741. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  742. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobNPUNew, &form)
  743. return
  744. }
  745. //check dataset
  746. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.NPU)
  747. if err != nil {
  748. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  749. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  750. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  751. return
  752. }
  753. //prepare code and out path
  754. _, err = ioutil.ReadDir(codeLocalPath)
  755. if err == nil {
  756. os.RemoveAll(codeLocalPath)
  757. }
  758. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  759. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  760. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  761. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  762. return
  763. }
  764. //todo: upload code (send to file_server todo this work?)
  765. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
  766. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  767. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  768. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  769. return
  770. }
  771. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  772. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  773. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  774. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  775. return
  776. }
  777. var datasetRemotePath, allFileName string
  778. for _, datasetInfo := range datasetInfos {
  779. if datasetRemotePath == "" {
  780. datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  781. allFileName = datasetInfo.FullName
  782. } else {
  783. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  784. allFileName = allFileName + ";" + datasetInfo.FullName
  785. }
  786. }
  787. //prepare command
  788. preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  789. command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, form.CkptName, grampus.GetNpuModelRemoteObsUrl(jobName))
  790. if err != nil {
  791. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  792. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  793. ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
  794. return
  795. }
  796. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  797. req := &grampus.GenerateTrainJobReq{
  798. JobName: jobName,
  799. DisplayJobName: displayJobName,
  800. ComputeResource: models.NPUResource,
  801. ProcessType: grampus.ProcessorTypeNPU,
  802. Command: command,
  803. ImageId: form.ImageID,
  804. Description: description,
  805. CodeObsPath: codeObsPath,
  806. BootFileUrl: codeObsPath + bootFile,
  807. BootFile: bootFile,
  808. WorkServerNumber: form.WorkServerNumber,
  809. Uuid: uuid,
  810. CommitID: commitID,
  811. IsLatestVersion: isLatestVersion,
  812. BranchName: branchName,
  813. Params: form.Params,
  814. EngineName: engineName,
  815. VersionCount: versionCount,
  816. TotalVersionCount: modelarts.TotalVersionCount,
  817. DatasetNames: datasetNames,
  818. DatasetInfos: datasetInfos,
  819. Spec: spec,
  820. CodeName: strings.ToLower(repo.Name),
  821. }
  822. if form.ModelName != "" { //使用预训练模型训练
  823. req.ModelName = form.ModelName
  824. req.LabelName = form.LabelName
  825. req.CkptName = form.CkptName
  826. req.ModelVersion = form.ModelVersion
  827. req.PreTrainModelUrl = form.PreTrainModelUrl
  828. req.PreTrainModelPath = preTrainModelPath
  829. }
  830. _, err = grampus.GenerateTrainJob(ctx, req)
  831. if err != nil {
  832. log.Error("GenerateTrainJob failed:%v", err.Error())
  833. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  834. ctx.RenderWithErr(err.Error(), tpl, &form)
  835. return
  836. }
  837. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  838. }
  839. func GetGrampusNotebook(ctx *context.APIContext) {
  840. var (
  841. err error
  842. )
  843. ID := ctx.Params(":id")
  844. job, err := models.GetCloudbrainByID(ID)
  845. if err != nil {
  846. ctx.NotFound("", err)
  847. log.Error("GetCloudbrainByID failed:", err)
  848. return
  849. }
  850. jobAfter, err := cloudbrainTask.SyncGrampusNotebookStatus(job)
  851. aiCenterName := cloudbrainService.GetAiCenterShow(jobAfter.AiCenter, ctx.Context)
  852. if err != nil {
  853. ctx.NotFound(err)
  854. log.Error("Sync cloud brain one status failed:", err)
  855. return
  856. }
  857. ctx.JSON(http.StatusOK, map[string]interface{}{
  858. "ID": ID,
  859. "JobName": jobAfter.JobName,
  860. "JobStatus": jobAfter.Status,
  861. "AiCenter": aiCenterName,
  862. "CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"),
  863. "CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"),
  864. "JobDuration": jobAfter.TrainJobDuration,
  865. })
  866. }
  867. func GrampusStopJob(ctx *context.Context) {
  868. var ID = ctx.Params(":id")
  869. var resultCode = "0"
  870. var errorMsg = ""
  871. var status = ""
  872. task := ctx.Cloudbrain
  873. for {
  874. if task.Status == models.GrampusStatusStopped || task.Status == models.GrampusStatusFailed || task.Status == models.GrampusStatusSucceeded {
  875. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  876. resultCode = "-1"
  877. errorMsg = ctx.Tr("cloudbrain.Already_stopped")
  878. break
  879. }
  880. res, err := grampus.StopJob(task.JobID, task.JobType)
  881. if err != nil {
  882. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  883. resultCode = strconv.Itoa(res.ErrorCode)
  884. errorMsg = ctx.Tr("cloudbrain.Stopped_failed")
  885. break
  886. }
  887. oldStatus := task.Status
  888. task.Status = getStopJobResponseStatus(res)
  889. if task.EndTime == 0 {
  890. task.EndTime = timeutil.TimeStampNow()
  891. }
  892. task.ComputeAndSetDuration()
  893. if oldStatus != task.Status {
  894. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  895. }
  896. err = models.UpdateJob(task)
  897. if err != nil {
  898. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  899. resultCode = "-1"
  900. errorMsg = "system error"
  901. break
  902. }
  903. status = task.Status
  904. break
  905. }
  906. ctx.JSON(200, map[string]interface{}{
  907. "result_code": resultCode,
  908. "error_msg": errorMsg,
  909. "status": status,
  910. "id": ID,
  911. "StatusOK": 0,
  912. })
  913. }
  914. func getStopJobResponseStatus(res *models.GrampusStopJobResponse) string {
  915. newStatus := models.GrampusStatusStopping
  916. if res.Status != "" {
  917. newStatus = grampus.TransTrainJobStatus(res.Status)
  918. }
  919. return newStatus
  920. }
  921. func GrampusNotebookDel(ctx *context.Context) {
  922. var listType = ctx.Query("listType")
  923. if err := deleteGrampusJob(ctx); err != nil {
  924. log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
  925. ctx.ServerError(err.Error(), err)
  926. return
  927. }
  928. var isAdminPage = ctx.Query("isadminpage")
  929. var isHomePage = ctx.Query("ishomepage")
  930. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  931. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  932. } else if isHomePage == "true" {
  933. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  934. } else {
  935. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
  936. }
  937. }
  938. func GrampusTrainJobDel(ctx *context.Context) {
  939. var listType = ctx.Query("listType")
  940. if err := deleteGrampusJob(ctx); err != nil {
  941. log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
  942. ctx.ServerError(err.Error(), err)
  943. return
  944. }
  945. var isAdminPage = ctx.Query("isadminpage")
  946. var isHomePage = ctx.Query("ishomepage")
  947. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  948. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  949. } else if isHomePage == "true" {
  950. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  951. } else {
  952. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  953. }
  954. }
  955. func deleteGrampusJob(ctx *context.Context) error {
  956. task := ctx.Cloudbrain
  957. if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
  958. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  959. return errors.New(ctx.Tr("cloudbrain.Not_Stopped"))
  960. }
  961. err := models.DeleteJob(task)
  962. if err != nil {
  963. log.Error("DeleteJob failed: %v", err, ctx.Data["msgID"])
  964. return err
  965. }
  966. storageType := models.TypeCloudBrainOne
  967. if task.ComputeResource == models.NPUResource {
  968. storageType = models.TypeCloudBrainTwo
  969. }
  970. DeleteCloudbrainJobStorage(task.JobName, storageType)
  971. return nil
  972. }
  973. type NotebookDataset struct {
  974. DatasetUrl string `json:"dataset_url"`
  975. }
  976. func GrampusNotebookShow(ctx *context.Context) {
  977. ctx.Data["PageIsCloudBrain"] = true
  978. var task *models.Cloudbrain
  979. task, err := models.GetCloudbrainByIDWithDeleted(ctx.Params(":id"))
  980. if err != nil {
  981. log.Error("GetCloudbrainByID failed:" + err.Error())
  982. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  983. return
  984. }
  985. task.ContainerIp = ""
  986. if task.DeletedAt.IsZero() && cloudbrainTask.IsTaskNotStop(task) { //normal record
  987. result, err := grampus.GetNotebookJob(task.JobID)
  988. if err != nil {
  989. log.Error("GetJob failed:" + err.Error())
  990. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  991. return
  992. }
  993. if result != nil {
  994. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  995. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  996. }
  997. oldStatus := task.Status
  998. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  999. if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
  1000. task.Duration = result.JobInfo.RunSec
  1001. if task.Duration < 0 {
  1002. task.Duration = 0
  1003. }
  1004. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  1005. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  1006. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  1007. }
  1008. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  1009. task.EndTime = task.StartTime.Add(task.Duration)
  1010. }
  1011. task.CorrectCreateUnix()
  1012. if oldStatus != task.Status {
  1013. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  1014. if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
  1015. if len(result.JobInfo.Tasks[0].CenterID) == 1 {
  1016. urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
  1017. }
  1018. }
  1019. }
  1020. }
  1021. err = models.UpdateJob(task)
  1022. if err != nil {
  1023. log.Error("UpdateJob failed:" + err.Error())
  1024. }
  1025. }
  1026. }
  1027. if len(task.Parameters) > 0 {
  1028. var parameters models.Parameters
  1029. err := json.Unmarshal([]byte(task.Parameters), &parameters)
  1030. if err != nil {
  1031. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1032. ctx.ServerError("system error", err)
  1033. return
  1034. }
  1035. if len(parameters.Parameter) > 0 {
  1036. paramTemp := ""
  1037. for _, Parameter := range parameters.Parameter {
  1038. param := Parameter.Label + " = " + Parameter.Value + "; "
  1039. paramTemp = paramTemp + param
  1040. }
  1041. task.Parameters = paramTemp[:len(paramTemp)-2]
  1042. } else {
  1043. task.Parameters = ""
  1044. }
  1045. }
  1046. user, err := models.GetUserByID(task.UserID)
  1047. if err == nil {
  1048. task.User = user
  1049. }
  1050. prepareSpec4Show(ctx, task)
  1051. ctx.Data["task"] = task
  1052. ctx.Data["datasetDownload"] = getDatasetDownloadInfo(ctx, task)
  1053. ctx.Data["modelDownload"] = getModelDownloadInfo(ctx, task)
  1054. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  1055. ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
  1056. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  1057. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  1058. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  1059. ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
  1060. }
  1061. func getDatasetDownloadInfo(ctx *context.Context, task *models.Cloudbrain) []*models.DatasetDownload {
  1062. datasetDownload := make([]*models.DatasetDownload, 0)
  1063. if ctx.IsSigned {
  1064. if task.Uuid != "" && task.UserID == ctx.User.ID {
  1065. if task.IsGPUTask() {
  1066. return GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1067. } else {
  1068. datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1069. datasetObsUrlList := make([]NotebookDataset, 0)
  1070. _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
  1071. for _, datasetInfo := range datasetDownload {
  1072. for _, datasetObs := range datasetObsUrlList {
  1073. log.Info("datasetObsUrl:" + datasetObs.DatasetUrl + "datasetName:" + datasetInfo.DatasetName)
  1074. if strings.Contains(datasetObs.DatasetUrl, datasetInfo.DatasetName) {
  1075. datasetInfo.DatasetDownloadLink = datasetObs.DatasetUrl
  1076. break
  1077. }
  1078. }
  1079. }
  1080. }
  1081. }
  1082. }
  1083. return datasetDownload
  1084. }
  1085. func getModelDownloadInfo(ctx *context.Context, task *models.Cloudbrain) *models.ModelDownload {
  1086. var modelDownload models.ModelDownload
  1087. if ctx.IsSigned {
  1088. if task.ModelName != "" && task.UserID == ctx.User.ID {
  1089. if task.IsNPUTask() {
  1090. modelDownload = models.ModelDownload{
  1091. Name: task.CkptName,
  1092. DownloadLink: "",
  1093. IsDelete: false,
  1094. }
  1095. if !HasModelFile(task) {
  1096. modelDownload.IsDelete = true
  1097. }
  1098. datasetObsUrlList := make([]NotebookDataset, 0)
  1099. _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
  1100. for _, datasetObs := range datasetObsUrlList {
  1101. if strings.Contains(datasetObs.DatasetUrl, task.CkptName) {
  1102. modelDownload.DownloadLink = datasetObs.DatasetUrl
  1103. break
  1104. }
  1105. }
  1106. }
  1107. }
  1108. }
  1109. return &modelDownload
  1110. }
  1111. func GrampusTrainJobShow(ctx *context.Context) {
  1112. ctx.Data["PageIsCloudBrain"] = true
  1113. var task *models.Cloudbrain
  1114. task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid"))
  1115. if err != nil {
  1116. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  1117. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1118. return
  1119. }
  1120. task.ContainerIp = ""
  1121. task.User, _ = models.GetUserByID(task.UserID)
  1122. if task.DeletedAt.IsZero() { //normal record
  1123. result, err := grampus.GetJob(task.JobID)
  1124. if err != nil {
  1125. log.Error("GetJob failed:" + err.Error())
  1126. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1127. return
  1128. }
  1129. if result != nil {
  1130. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  1131. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  1132. }
  1133. oldStatus := task.Status
  1134. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  1135. if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
  1136. task.Duration = result.JobInfo.RunSec
  1137. if task.Duration < 0 {
  1138. task.Duration = 0
  1139. }
  1140. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  1141. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  1142. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  1143. }
  1144. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  1145. task.EndTime = task.StartTime.Add(task.Duration)
  1146. }
  1147. task.CorrectCreateUnix()
  1148. if oldStatus != task.Status {
  1149. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  1150. if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
  1151. if len(result.JobInfo.Tasks[0].CenterID) == 1 {
  1152. urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
  1153. }
  1154. }
  1155. }
  1156. }
  1157. err = models.UpdateJob(task)
  1158. if err != nil {
  1159. log.Error("UpdateJob failed:" + err.Error())
  1160. }
  1161. }
  1162. }
  1163. if len(task.Parameters) > 0 {
  1164. var parameters models.Parameters
  1165. err := json.Unmarshal([]byte(task.Parameters), &parameters)
  1166. if err != nil {
  1167. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1168. ctx.ServerError("system error", err)
  1169. return
  1170. }
  1171. if len(parameters.Parameter) > 0 {
  1172. paramTemp := ""
  1173. for _, Parameter := range parameters.Parameter {
  1174. param := Parameter.Label + " = " + Parameter.Value + "; "
  1175. paramTemp = paramTemp + param
  1176. }
  1177. task.Parameters = paramTemp[:len(paramTemp)-2]
  1178. } else {
  1179. task.Parameters = ""
  1180. }
  1181. }
  1182. taskList := make([]*models.Cloudbrain, 0)
  1183. taskList = append(taskList, task)
  1184. prepareSpec4Show(ctx, task)
  1185. ctx.Data["version_list_task"] = taskList
  1186. ctx.Data["datasetDownload"] = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1187. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  1188. ctx.Data["displayJobName"] = task.DisplayJobName
  1189. ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
  1190. ctx.HTML(http.StatusOK, tplGrampusTrainJobShow)
  1191. }
  1192. func GrampusDownloadLog(ctx *context.Context) {
  1193. jobID := ctx.Params(":jobid")
  1194. job, err := models.GetCloudbrainByJobID(jobID)
  1195. if err != nil {
  1196. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1197. ctx.ServerError(err.Error(), err)
  1198. return
  1199. }
  1200. content, err := grampus.GetTrainJobLog(job.JobID)
  1201. if err != nil {
  1202. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1203. content = ""
  1204. }
  1205. fileName := job.JobName + "-log.txt"
  1206. ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+fileName)
  1207. ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
  1208. var b []byte = []byte(content)
  1209. ctx.Resp.Write(b)
  1210. }
  1211. func GrampusGetLog(ctx *context.Context) {
  1212. jobID := ctx.Params(":jobid")
  1213. job, err := models.GetCloudbrainByJobID(jobID)
  1214. if err != nil {
  1215. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1216. ctx.ServerError(err.Error(), err)
  1217. return
  1218. }
  1219. content, err := grampus.GetTrainJobLog(job.JobID)
  1220. if err != nil {
  1221. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1222. ctx.JSON(http.StatusOK, map[string]interface{}{
  1223. "JobName": job.JobName,
  1224. "Content": "",
  1225. "CanLogDownload": false,
  1226. })
  1227. return
  1228. }
  1229. result, err := grampus.GetJob(jobID)
  1230. if err != nil {
  1231. log.Error("GetJob(%s) failed:%v", job.JobName, err)
  1232. ctx.JSON(http.StatusOK, map[string]interface{}{
  1233. "JobName": job.JobName,
  1234. "Content": content,
  1235. "CanLogDownload": false,
  1236. })
  1237. return
  1238. }
  1239. if result != nil {
  1240. job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  1241. if job.Status == models.GrampusStatusFailed {
  1242. content = content + "\n" + result.ExitDiagnostics
  1243. }
  1244. }
  1245. canLogDownload := err == nil && job.IsUserHasRight(ctx.User)
  1246. ctx.JSON(http.StatusOK, map[string]interface{}{
  1247. "JobName": job.JobName,
  1248. "Content": content,
  1249. "CanLogDownload": canLogDownload,
  1250. })
  1251. return
  1252. }
  1253. func GrampusMetrics(ctx *context.Context) {
  1254. jobID := ctx.Params(":jobid")
  1255. job, err := models.GetCloudbrainByJobID(jobID)
  1256. if err != nil {
  1257. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1258. ctx.ServerError(err.Error(), err)
  1259. return
  1260. }
  1261. result, err := grampus.GetGrampusMetrics(job.JobID)
  1262. if err != nil {
  1263. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1264. }
  1265. ctx.JSON(http.StatusOK, map[string]interface{}{
  1266. "JobID": jobID,
  1267. "Interval": result.Interval,
  1268. "MetricsInfo": result.MetricsInfo,
  1269. })
  1270. return
  1271. }
  1272. func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName, modelRemoteObsUrl string) (string, error) {
  1273. var command string
  1274. //prepare
  1275. workDir := grampus.NpuWorkDir
  1276. if processorType == grampus.ProcessorTypeNPU {
  1277. command += "pwd;cd " + workDir + grampus.CommandPrepareScriptNpu
  1278. } else if processorType == grampus.ProcessorTypeGPU {
  1279. workDir = grampus.GpuWorkDir
  1280. command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScriptGpu, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject)
  1281. }
  1282. //download code & dataset
  1283. if processorType == grampus.ProcessorTypeNPU {
  1284. //no need to download code & dataset by internet
  1285. } else if processorType == grampus.ProcessorTypeGPU {
  1286. commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
  1287. commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
  1288. command += commandDownload
  1289. }
  1290. //unzip code & dataset
  1291. if processorType == grampus.ProcessorTypeNPU {
  1292. //no need to process
  1293. } else if processorType == grampus.ProcessorTypeGPU {
  1294. unZipDatasetCommand := cloudbrainTask.GenerateDatasetUnzipCommand(datasetName)
  1295. commandUnzip := "cd " + workDir + "code;unzip -q master.zip;rm -f master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
  1296. command += commandUnzip
  1297. }
  1298. command += "echo \"unzip finished;start to exec code;\";"
  1299. // set export
  1300. var commandExport string
  1301. if processorType == grampus.ProcessorTypeNPU {
  1302. commandExport = "export bucket=" + setting.Bucket + " && export remote_path=" + outputRemotePath + ";"
  1303. } else if processorType == grampus.ProcessorTypeGPU {
  1304. commandExport = "export env=" + setting.Grampus.Env + " && export remote_path=" + outputRemotePath + ";"
  1305. }
  1306. command += commandExport
  1307. //exec code
  1308. var parameters models.Parameters
  1309. var paramCode string
  1310. if len(paramSrc) != 0 {
  1311. err := json.Unmarshal([]byte(paramSrc), &parameters)
  1312. if err != nil {
  1313. log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err)
  1314. return command, err
  1315. }
  1316. for _, parameter := range parameters.Parameter {
  1317. paramCode += " --" + parameter.Label + "=" + parameter.Value
  1318. }
  1319. }
  1320. var commandCode string
  1321. if processorType == grampus.ProcessorTypeNPU {
  1322. paramCode += " --model_url=" + modelRemoteObsUrl
  1323. commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py " + grampus.NpuLocalLogUrl + paramCode + ";"
  1324. } else if processorType == grampus.ProcessorTypeGPU {
  1325. if pretrainModelFileName != "" {
  1326. paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName
  1327. }
  1328. commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";"
  1329. }
  1330. command += commandCode
  1331. //get exec result
  1332. commandGetRes := "result=$?;"
  1333. command += commandGetRes
  1334. //upload models
  1335. if processorType == grampus.ProcessorTypeNPU {
  1336. // no need to upload
  1337. } else if processorType == grampus.ProcessorTypeGPU {
  1338. commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;"
  1339. command += commandUpload
  1340. }
  1341. //check exec result
  1342. commandCheckRes := "bash -c \"[[ $result -eq 0 ]] && exit 0 || exit -1\""
  1343. command += commandCheckRes
  1344. return command, nil
  1345. }
  1346. func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string {
  1347. commandDownloadTemp := commandDownload
  1348. if pretrainModelPath != "" {
  1349. commandDownloadTemp += " '" + pretrainModelPath + "' '" + pretrainModelFileName + "'"
  1350. }
  1351. commandDownloadTemp += ";"
  1352. return commandDownloadTemp
  1353. }
  1354. func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
  1355. archiveType := git.ZIP
  1356. archivePath := codePath
  1357. if !com.IsDir(archivePath) {
  1358. if err := os.MkdirAll(archivePath, os.ModePerm); err != nil {
  1359. log.Error("MkdirAll failed:" + err.Error())
  1360. return err
  1361. }
  1362. }
  1363. // Get corresponding commit.
  1364. var (
  1365. commit *git.Commit
  1366. err error
  1367. )
  1368. gitRepo := ctx.Repo.GitRepo
  1369. if err != nil {
  1370. log.Error("OpenRepository failed:" + err.Error())
  1371. return err
  1372. }
  1373. if gitRepo.IsBranchExist(branchName) {
  1374. commit, err = gitRepo.GetBranchCommit(branchName)
  1375. if err != nil {
  1376. log.Error("GetBranchCommit failed:" + err.Error())
  1377. return err
  1378. }
  1379. } else {
  1380. log.Error("the branch is not exist: " + branchName)
  1381. return fmt.Errorf("The branch does not exist.")
  1382. }
  1383. archivePath = path.Join(archivePath, grampus.CodeArchiveName)
  1384. if !com.IsFile(archivePath) {
  1385. if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{
  1386. Format: archiveType,
  1387. Prefix: setting.Repository.PrefixArchiveFiles,
  1388. }); err != nil {
  1389. log.Error("CreateArchive failed:" + err.Error())
  1390. return err
  1391. }
  1392. }
  1393. return nil
  1394. }
  1395. func HandleTaskWithAiCenter(ctx *context.Context) {
  1396. log.Info("HandleTaskWithAiCenter start")
  1397. updateCounts := 0
  1398. cloudBrains, err := models.GetC2NetWithAiCenterWrongJob()
  1399. if err != nil {
  1400. log.Error("GetC2NetWithAiCenterWrongJob failed:" + err.Error())
  1401. return
  1402. }
  1403. if len(cloudBrains) == 0 {
  1404. log.Info("HandleC2NetWithAiCenterWrongJob:no task need handle")
  1405. return
  1406. }
  1407. cloudBrainCounts := len(cloudBrains)
  1408. for _, task := range cloudBrains {
  1409. result, err := grampus.GetJob(task.JobID)
  1410. if err != nil {
  1411. log.Error("GetJob failed:" + err.Error())
  1412. continue
  1413. }
  1414. if len(result.JobInfo.Tasks) != 0 {
  1415. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  1416. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  1417. }
  1418. err = models.UpdateJob(task)
  1419. if err != nil {
  1420. log.Error("UpdateJob failed:" + err.Error())
  1421. }
  1422. updateCounts++
  1423. }
  1424. }
  1425. r := make(map[string]interface{}, 0)
  1426. r["cloudBrainCounts"] = cloudBrainCounts
  1427. r["updateCounts"] = updateCounts
  1428. ctx.JSON(http.StatusOK, response.SuccessWithData(r))
  1429. }
  1430. func GrampusNotebookDebug(ctx *context.Context) {
  1431. result, err := grampus.GetNotebookJob(ctx.Cloudbrain.JobID)
  1432. if err != nil {
  1433. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  1434. return
  1435. }
  1436. if len(result.JobInfo.Tasks) > 0 {
  1437. ctx.Redirect(result.JobInfo.Tasks[0].Url + "?token=" + result.JobInfo.Tasks[0].Token)
  1438. return
  1439. }
  1440. ctx.NotFound("Can not find the job.", nil)
  1441. }
  1442. func GrampusNotebookRestart(ctx *context.Context) {
  1443. var id = ctx.Params(":id")
  1444. var resultCode = "-1"
  1445. var errorMsg = ""
  1446. var status = ""
  1447. var spec *models.Specification
  1448. task := ctx.Cloudbrain
  1449. if ctx.Written() {
  1450. return
  1451. }
  1452. lockOperator, errMsg := cloudbrainService.Lock4CloudbrainRestart(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{JobType: task.JobType}, User: ctx.User})
  1453. defer func() {
  1454. if lockOperator != nil {
  1455. lockOperator.Unlock()
  1456. }
  1457. }()
  1458. if errMsg != "" {
  1459. log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
  1460. errorMsg = ctx.Tr(errMsg)
  1461. }
  1462. for {
  1463. if errorMsg != "" {
  1464. break
  1465. }
  1466. if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
  1467. log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
  1468. errorMsg = "the job is not stopped"
  1469. break
  1470. }
  1471. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), task.ComputeResource)
  1472. if err != nil {
  1473. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1474. errorMsg = "system error"
  1475. break
  1476. } else {
  1477. if count >= 1 {
  1478. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  1479. resultCode = "2"
  1480. errorMsg = ctx.Tr("repo.cloudbrain.morethanonejob")
  1481. break
  1482. }
  1483. }
  1484. oldSpec, err := resource.GetCloudbrainSpec(task.ID)
  1485. if err != nil || oldSpec == nil {
  1486. log.Error("NotebookManage GetCloudbrainSpec error.%v", err)
  1487. errorMsg = "Resource specification not available"
  1488. break
  1489. }
  1490. computeSourceSimple := models.GPU
  1491. action := models.ActionCreateGrampusGPUDebugTask
  1492. if task.ComputeResource == models.NPUResource {
  1493. computeSourceSimple = models.NPU
  1494. action = models.ActionCreateGrampusNPUDebugTask
  1495. } else if task.ComputeResource == models.GCUResource {
  1496. computeSourceSimple = models.GCU
  1497. action = models.ActionCreateGrampusGCUDebugTask
  1498. }
  1499. spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{
  1500. JobType: models.JobType(task.JobType),
  1501. ComputeResource: computeSourceSimple,
  1502. Cluster: models.C2NetCluster,
  1503. })
  1504. if err != nil || spec == nil {
  1505. log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID)
  1506. errorMsg = "Resource specification not support any more"
  1507. break
  1508. }
  1509. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  1510. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  1511. errorMsg = ctx.Tr("points.insufficient_points_balance")
  1512. break
  1513. }
  1514. if task.IsGPUTask() || task.IsGCUTask() {
  1515. if _, err := os.Stat(getOldJobPath(task)); err != nil {
  1516. log.Error("Can not find job minio path", err)
  1517. resultCode = "-1"
  1518. errorMsg = ctx.Tr("cloudbrain.result_cleared")
  1519. break
  1520. }
  1521. }
  1522. if !HasModelFile(task) { //使用预训练模型训练
  1523. errorMsg = ctx.Tr("repo.debug.manage.model_not_exist")
  1524. break
  1525. }
  1526. if hasDatasetDeleted(task) {
  1527. errorMsg = ctx.Tr("repo.debug.manage.dataset_not_exist")
  1528. break
  1529. }
  1530. createTime := timeutil.TimeStampNow()
  1531. res, err := grampus.RestartNotebookJob(task.JobID)
  1532. if err != nil {
  1533. log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"])
  1534. errorMsg = ctx.Tr("repo.debug_again_fail")
  1535. break
  1536. }
  1537. if res.GrampusResult.ErrorCode != 0 || res.NewId == "" {
  1538. log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg)
  1539. errorMsg = ctx.Tr("repo.debug_again_fail")
  1540. if res.GrampusResult.ErrorCode == 5005 {
  1541. errorMsg = ctx.Tr("repo.debug_again_fail_forever")
  1542. }
  1543. break
  1544. }
  1545. newTask := &models.Cloudbrain{
  1546. Status: res.Status,
  1547. UserID: task.UserID,
  1548. RepoID: task.RepoID,
  1549. JobID: res.NewId,
  1550. JobName: task.JobName,
  1551. DisplayJobName: task.DisplayJobName,
  1552. JobType: task.JobType,
  1553. Type: task.Type,
  1554. Uuid: task.Uuid,
  1555. Image: task.Image,
  1556. ImageID: task.ImageID,
  1557. EngineID: task.EngineID,
  1558. CommitID: task.CommitID,
  1559. EngineName: task.EngineName,
  1560. IsLatestVersion: "1",
  1561. BranchName: task.BranchName,
  1562. DatasetName: task.DatasetName,
  1563. ComputeResource: task.ComputeResource,
  1564. Description: task.Description,
  1565. CreatedUnix: createTime,
  1566. UpdatedUnix: createTime,
  1567. Spec: spec,
  1568. ModelName: task.ModelName,
  1569. ModelVersion: task.ModelVersion,
  1570. LabelName: task.LabelName,
  1571. PreTrainModelUrl: task.PreTrainModelUrl,
  1572. CkptName: task.CkptName,
  1573. WorkServerNumber: 1,
  1574. }
  1575. err = models.RestartCloudbrain(task, newTask)
  1576. if err != nil {
  1577. log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  1578. errorMsg = "system error"
  1579. break
  1580. }
  1581. id = strconv.FormatInt(newTask.ID, 10)
  1582. status = res.Status
  1583. resultCode = "0"
  1584. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, id, newTask.DisplayJobName, action)
  1585. break
  1586. }
  1587. ctx.JSON(200, map[string]string{
  1588. "result_code": resultCode,
  1589. "error_msg": errorMsg,
  1590. "status": status,
  1591. "id": id,
  1592. })
  1593. }