You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

modelarts.go 47 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476
  1. package repo
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "io"
  6. "net/http"
  7. "os"
  8. "path"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "code.gitea.io/gitea/modules/cloudbrain"
  13. "code.gitea.io/gitea/models"
  14. "code.gitea.io/gitea/modules/auth"
  15. "code.gitea.io/gitea/modules/base"
  16. "code.gitea.io/gitea/modules/context"
  17. "code.gitea.io/gitea/modules/git"
  18. "code.gitea.io/gitea/modules/log"
  19. "code.gitea.io/gitea/modules/modelarts"
  20. "code.gitea.io/gitea/modules/obs"
  21. "code.gitea.io/gitea/modules/setting"
  22. "code.gitea.io/gitea/modules/storage"
  23. "github.com/unknwon/com"
  24. )
  25. const (
  26. tplDebugJobIndex base.TplName = "repo/debugjob/index"
  27. tplModelArtsNotebookIndex base.TplName = "repo/modelarts/notebook/index"
  28. tplModelArtsNotebookNew base.TplName = "repo/modelarts/notebook/new"
  29. tplModelArtsNotebookShow base.TplName = "repo/modelarts/notebook/show"
  30. tplModelArtsTrainJobIndex base.TplName = "repo/modelarts/trainjob/index"
  31. tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new"
  32. tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show"
  33. tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new"
  34. )
  35. func DebugJobIndex(ctx *context.Context) {
  36. debugListType := ctx.Query("debugListType")
  37. MustEnableCloudbrain(ctx)
  38. repo := ctx.Repo.Repository
  39. page := ctx.QueryInt("page")
  40. if page <= 0 {
  41. page = 1
  42. }
  43. debugType := modelarts.DebugType
  44. jobType := string(models.JobTypeDebug)
  45. if debugListType == modelarts.GPUResource {
  46. debugType = models.TypeCloudBrainOne
  47. jobType = ""
  48. }
  49. if debugListType == modelarts.NPUResource {
  50. debugType = models.TypeCloudBrainTwo
  51. }
  52. ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  53. ListOptions: models.ListOptions{
  54. Page: page,
  55. PageSize: setting.UI.IssuePagingNum,
  56. },
  57. RepoID: repo.ID,
  58. Type: debugType,
  59. JobType: jobType,
  60. })
  61. if err != nil {
  62. ctx.ServerError("Get debugjob faild:", err)
  63. return
  64. }
  65. for i, task := range ciTasks {
  66. if task.Cloudbrain.Type == models.TypeCloudBrainOne {
  67. ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
  68. ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  69. ciTasks[i].Cloudbrain.ComputeResource = modelarts.GPUResource
  70. }
  71. if task.Cloudbrain.Type == models.TypeCloudBrainTwo {
  72. ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
  73. ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  74. ciTasks[i].Cloudbrain.ComputeResource = modelarts.NPUResource
  75. }
  76. }
  77. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  78. pager.SetDefaultParams(ctx)
  79. ctx.Data["Page"] = pager
  80. ctx.Data["PageIsCloudBrain"] = true
  81. ctx.Data["Tasks"] = ciTasks
  82. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  83. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  84. ctx.HTML(200, tplDebugJobIndex)
  85. }
  86. // MustEnableDataset check if repository enable internal cb
  87. func MustEnableModelArts(ctx *context.Context) {
  88. if !ctx.Repo.CanRead(models.UnitTypeCloudBrain) {
  89. ctx.NotFound("MustEnableCloudbrain", nil)
  90. return
  91. }
  92. }
  93. func NotebookNew(ctx *context.Context) {
  94. ctx.Data["PageIsCloudBrain"] = true
  95. t := time.Now()
  96. var jobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  97. ctx.Data["job_name"] = jobName
  98. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  99. if err != nil {
  100. ctx.ServerError("GetAllUserAttachments failed:", err)
  101. return
  102. }
  103. ctx.Data["attachments"] = attachs
  104. ctx.Data["dataset_path"] = modelarts.DataSetMountPath
  105. ctx.Data["env"] = modelarts.NotebookEnv
  106. ctx.Data["notebook_type"] = modelarts.NotebookType
  107. if modelarts.FlavorInfos == nil {
  108. json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos)
  109. }
  110. ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo
  111. ctx.HTML(200, tplModelArtsNotebookNew)
  112. }
  113. func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
  114. ctx.Data["PageIsNotebook"] = true
  115. jobName := form.JobName
  116. uuid := form.Attachment
  117. description := form.Description
  118. flavor := form.Flavor
  119. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  120. if err != nil {
  121. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  122. cloudBrainNewDataPrepare(ctx)
  123. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  124. return
  125. } else {
  126. if count >= 1 {
  127. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  128. cloudBrainNewDataPrepare(ctx)
  129. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form)
  130. return
  131. }
  132. }
  133. err = modelarts.GenerateTask(ctx, jobName, uuid, description, flavor)
  134. if err != nil {
  135. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
  136. return
  137. }
  138. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob")
  139. }
  140. func NotebookShow(ctx *context.Context) {
  141. ctx.Data["PageIsCloudBrain"] = true
  142. var jobID = ctx.Params(":jobid")
  143. task, err := models.GetCloudbrainByJobID(jobID)
  144. if err != nil {
  145. ctx.Data["error"] = err.Error()
  146. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  147. return
  148. }
  149. result, err := modelarts.GetJob(jobID)
  150. if err != nil {
  151. ctx.Data["error"] = err.Error()
  152. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  153. return
  154. }
  155. if result != nil {
  156. task.Status = result.Status
  157. err = models.UpdateJob(task)
  158. if err != nil {
  159. ctx.Data["error"] = err.Error()
  160. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  161. return
  162. }
  163. createTime, _ := com.StrTo(result.CreationTimestamp).Int64()
  164. result.CreateTime = time.Unix(int64(createTime/1000), 0).Format("2006-01-02 15:04:05")
  165. endTime, _ := com.StrTo(result.LatestUpdateTimestamp).Int64()
  166. result.LatestUpdateTime = time.Unix(int64(endTime/1000), 0).Format("2006-01-02 15:04:05")
  167. result.QueuingInfo.BeginTime = time.Unix(int64(result.QueuingInfo.BeginTimestamp/1000), 0).Format("2006-01-02 15:04:05")
  168. result.QueuingInfo.EndTime = time.Unix(int64(result.QueuingInfo.EndTimestamp/1000), 0).Format("2006-01-02 15:04:05")
  169. }
  170. ctx.Data["task"] = task
  171. ctx.Data["jobID"] = jobID
  172. ctx.Data["result"] = result
  173. ctx.HTML(200, tplModelArtsNotebookShow)
  174. }
  175. func NotebookDebug(ctx *context.Context) {
  176. var jobID = ctx.Params(":jobid")
  177. _, err := models.GetCloudbrainByJobID(jobID)
  178. if err != nil {
  179. ctx.ServerError("GetCloudbrainByJobID failed", err)
  180. return
  181. }
  182. result, err := modelarts.GetJob(jobID)
  183. if err != nil {
  184. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  185. return
  186. }
  187. res, err := modelarts.GetJobToken(jobID)
  188. if err != nil {
  189. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  190. return
  191. }
  192. urls := strings.Split(result.Spec.Annotations.Url, "/")
  193. urlPrefix := result.Spec.Annotations.TargetDomain
  194. for i, url := range urls {
  195. if i > 2 {
  196. urlPrefix += "/" + url
  197. }
  198. }
  199. debugUrl := urlPrefix + "?token=" + res.Token
  200. ctx.Redirect(debugUrl)
  201. }
  202. func NotebookStop(ctx *context.Context) {
  203. var jobID = ctx.Params(":jobid")
  204. log.Info(jobID)
  205. task, err := models.GetCloudbrainByJobID(jobID)
  206. if err != nil {
  207. ctx.ServerError("GetCloudbrainByJobID failed", err)
  208. return
  209. }
  210. if task.Status != string(models.JobRunning) {
  211. log.Error("the job(%s) is not running", task.JobName)
  212. ctx.ServerError("the job is not running", errors.New("the job is not running"))
  213. return
  214. }
  215. param := models.NotebookAction{
  216. Action: models.ActionStop,
  217. }
  218. res, err := modelarts.StopJob(jobID, param)
  219. if err != nil {
  220. log.Error("StopJob(%s) failed:%v", task.JobName, err.Error())
  221. ctx.ServerError("StopJob failed", err)
  222. return
  223. }
  224. task.Status = res.CurrentStatus
  225. err = models.UpdateJob(task)
  226. if err != nil {
  227. ctx.ServerError("UpdateJob failed", err)
  228. return
  229. }
  230. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob")
  231. }
  232. func NotebookDel(ctx *context.Context) {
  233. var jobID = ctx.Params(":jobid")
  234. task, err := models.GetCloudbrainByJobID(jobID)
  235. if err != nil {
  236. ctx.ServerError("GetCloudbrainByJobID failed", err)
  237. return
  238. }
  239. if task.Status != string(models.JobStopped) {
  240. log.Error("the job(%s) has not been stopped", task.JobName)
  241. ctx.ServerError("the job has not been stopped", errors.New("the job has not been stopped"))
  242. return
  243. }
  244. _, err = modelarts.DelNotebook(jobID)
  245. if err != nil {
  246. log.Error("DelJob(%s) failed:%v", task.JobName, err.Error())
  247. ctx.ServerError("DelJob failed", err)
  248. return
  249. }
  250. err = models.DeleteJob(task)
  251. if err != nil {
  252. ctx.ServerError("DeleteJob failed", err)
  253. return
  254. }
  255. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob")
  256. }
  257. func TrainJobIndex(ctx *context.Context) {
  258. MustEnableModelArts(ctx)
  259. repo := ctx.Repo.Repository
  260. page := ctx.QueryInt("page")
  261. if page <= 0 {
  262. page = 1
  263. }
  264. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  265. ListOptions: models.ListOptions{
  266. Page: page,
  267. PageSize: setting.UI.IssuePagingNum,
  268. },
  269. RepoID: repo.ID,
  270. Type: models.TypeCloudBrainTwo,
  271. JobType: string(models.JobTypeTrain),
  272. IsLatestVersion: modelarts.IsLatestVersion,
  273. })
  274. if err != nil {
  275. ctx.ServerError("Cloudbrain", err)
  276. return
  277. }
  278. for i, task := range tasks {
  279. tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  280. tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  281. }
  282. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  283. pager.SetDefaultParams(ctx)
  284. ctx.Data["Page"] = pager
  285. ctx.Data["PageIsCloudBrain"] = true
  286. ctx.Data["Tasks"] = tasks
  287. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  288. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  289. ctx.HTML(200, tplModelArtsTrainJobIndex)
  290. }
  291. func TrainJobNew(ctx *context.Context) {
  292. err := trainJobNewDataPrepare(ctx)
  293. if err != nil {
  294. ctx.ServerError("get new train-job info failed", err)
  295. return
  296. }
  297. ctx.HTML(200, tplModelArtsTrainJobNew)
  298. }
  299. func trainJobNewDataPrepare(ctx *context.Context) error {
  300. ctx.Data["PageIsCloudBrain"] = true
  301. //can, err := canUserCreateTrainJob(ctx.User.ID)
  302. //if err != nil {
  303. // ctx.ServerError("canUserCreateTrainJob", err)
  304. // return
  305. //}
  306. //
  307. //if !can {
  308. // log.Error("the user can not create train-job")
  309. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  310. // return
  311. //}
  312. t := time.Now()
  313. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  314. ctx.Data["job_name"] = jobName
  315. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  316. if err != nil {
  317. ctx.ServerError("GetAllUserAttachments failed:", err)
  318. return err
  319. }
  320. ctx.Data["attachments"] = attachs
  321. var resourcePools modelarts.ResourcePool
  322. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  323. ctx.ServerError("json.Unmarshal failed:", err)
  324. return err
  325. }
  326. ctx.Data["resource_pools"] = resourcePools.Info
  327. var engines modelarts.Engine
  328. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  329. ctx.ServerError("json.Unmarshal failed:", err)
  330. return err
  331. }
  332. ctx.Data["engines"] = engines.Info
  333. var versionInfos modelarts.VersionInfo
  334. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  335. ctx.ServerError("json.Unmarshal failed:", err)
  336. return err
  337. }
  338. ctx.Data["engine_versions"] = versionInfos.Version
  339. var flavorInfos modelarts.Flavor
  340. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  341. ctx.ServerError("json.Unmarshal failed:", err)
  342. return err
  343. }
  344. ctx.Data["flavor_infos"] = flavorInfos.Info
  345. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  346. ctx.Data["train_url"] = outputObsPath
  347. ctx.Data["params"] = ""
  348. ctx.Data["branchName"] = ctx.Repo.BranchName
  349. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  350. if err != nil {
  351. ctx.ServerError("getConfigList failed:", err)
  352. return err
  353. }
  354. ctx.Data["config_list"] = configList.ParaConfigs
  355. return nil
  356. }
  357. func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
  358. ctx.Data["PageIsCloudBrain"] = true
  359. //can, err := canUserCreateTrainJob(ctx.User.ID)
  360. //if err != nil {
  361. // ctx.ServerError("canUserCreateTrainJob", err)
  362. // return
  363. //}
  364. //
  365. //if !can {
  366. // log.Error("the user can not create train-job")
  367. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  368. // return
  369. //}
  370. t := time.Now()
  371. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  372. ctx.Data["job_name"] = jobName
  373. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  374. if err != nil {
  375. ctx.ServerError("GetAllUserAttachments failed:", err)
  376. return err
  377. }
  378. ctx.Data["attachments"] = attachs
  379. var resourcePools modelarts.ResourcePool
  380. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  381. ctx.ServerError("json.Unmarshal failed:", err)
  382. return err
  383. }
  384. ctx.Data["resource_pools"] = resourcePools.Info
  385. var engines modelarts.Engine
  386. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  387. ctx.ServerError("json.Unmarshal failed:", err)
  388. return err
  389. }
  390. ctx.Data["engines"] = engines.Info
  391. var versionInfos modelarts.VersionInfo
  392. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  393. ctx.ServerError("json.Unmarshal failed:", err)
  394. return err
  395. }
  396. ctx.Data["engine_versions"] = versionInfos.Version
  397. var flavorInfos modelarts.Flavor
  398. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  399. ctx.ServerError("json.Unmarshal failed:", err)
  400. return err
  401. }
  402. ctx.Data["flavor_infos"] = flavorInfos.Info
  403. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  404. ctx.Data["train_url"] = outputObsPath
  405. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  406. if err != nil {
  407. ctx.ServerError("getConfigList failed:", err)
  408. return err
  409. }
  410. var Parameters modelarts.Parameters
  411. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  412. ctx.ServerError("json.Unmarshal failed:", err)
  413. return err
  414. }
  415. ctx.Data["params"] = Parameters.Parameter
  416. ctx.Data["config_list"] = configList.ParaConfigs
  417. ctx.Data["bootFile"] = form.BootFile
  418. ctx.Data["uuid"] = form.Attachment
  419. ctx.Data["branch_name"] = form.BranchName
  420. return nil
  421. }
  422. func TrainJobNewVersion(ctx *context.Context) {
  423. err := trainJobNewVersionDataPrepare(ctx)
  424. if err != nil {
  425. ctx.ServerError("get new train-job info failed", err)
  426. return
  427. }
  428. ctx.HTML(200, tplModelArtsTrainJobVersionNew)
  429. }
  430. func trainJobNewVersionDataPrepare(ctx *context.Context) error {
  431. ctx.Data["PageIsCloudBrain"] = true
  432. var jobID = ctx.Params(":jobid")
  433. var versionName = ctx.Query("version_name")
  434. // canNewJob, err := canUserCreateTrainJobVersion(ctx, jobID, versionName)
  435. // if err != nil {
  436. // ctx.ServerError("canNewJob can info failed", err)
  437. // return err
  438. // }
  439. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  440. if err != nil {
  441. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  442. return err
  443. }
  444. t := time.Now()
  445. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  446. ctx.Data["job_name"] = task.JobName
  447. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  448. if err != nil {
  449. ctx.ServerError("GetAllUserAttachments failed:", err)
  450. return err
  451. }
  452. ctx.Data["attachments"] = attachs
  453. var resourcePools modelarts.ResourcePool
  454. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  455. ctx.ServerError("json.Unmarshal failed:", err)
  456. return err
  457. }
  458. ctx.Data["resource_pools"] = resourcePools.Info
  459. var engines modelarts.Engine
  460. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  461. ctx.ServerError("json.Unmarshal failed:", err)
  462. return err
  463. }
  464. ctx.Data["engines"] = engines.Info
  465. var versionInfos modelarts.VersionInfo
  466. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  467. ctx.ServerError("json.Unmarshal failed:", err)
  468. return err
  469. }
  470. ctx.Data["engine_versions"] = versionInfos.Version
  471. var flavorInfos modelarts.Flavor
  472. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  473. ctx.ServerError("json.Unmarshal failed:", err)
  474. return err
  475. }
  476. ctx.Data["flavor_infos"] = flavorInfos.Info
  477. var Parameters modelarts.Parameters
  478. if err = json.Unmarshal([]byte(task.Parameters), &Parameters); err != nil {
  479. ctx.ServerError("json.Unmarshal failed:", err)
  480. return err
  481. }
  482. ctx.Data["params"] = Parameters.Parameter
  483. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  484. ctx.Data["train_url"] = outputObsPath
  485. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  486. if err != nil {
  487. ctx.ServerError("GetBranches error:", err)
  488. return err
  489. }
  490. ctx.Data["branches"] = branches
  491. ctx.Data["branch_name"] = task.BranchName
  492. ctx.Data["description"] = task.Description
  493. ctx.Data["boot_file"] = task.BootFile
  494. ctx.Data["dataset_name"] = task.DatasetName
  495. ctx.Data["work_server_number"] = task.WorkServerNumber
  496. ctx.Data["flavor_name"] = task.FlavorName
  497. ctx.Data["engine_name"] = task.EngineName
  498. ctx.Data["uuid"] = task.Uuid
  499. ctx.Data["flavor_code"] = task.FlavorCode
  500. ctx.Data["engine_id"] = task.EngineID
  501. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  502. if err != nil {
  503. ctx.ServerError("getConfigList failed:", err)
  504. return err
  505. }
  506. ctx.Data["config_list"] = configList.ParaConfigs
  507. return nil
  508. }
  509. func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
  510. ctx.Data["PageIsCloudBrain"] = true
  511. var jobID = ctx.Params(":jobid")
  512. // var versionName = ctx.Params(":version-name")
  513. var versionName = ctx.Query("version_name")
  514. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  515. if err != nil {
  516. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  517. return err
  518. }
  519. t := time.Now()
  520. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  521. ctx.Data["job_name"] = task.JobName
  522. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  523. if err != nil {
  524. ctx.ServerError("GetAllUserAttachments failed:", err)
  525. return err
  526. }
  527. ctx.Data["attachments"] = attachs
  528. var resourcePools modelarts.ResourcePool
  529. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  530. ctx.ServerError("json.Unmarshal failed:", err)
  531. return err
  532. }
  533. ctx.Data["resource_pools"] = resourcePools.Info
  534. var engines modelarts.Engine
  535. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  536. ctx.ServerError("json.Unmarshal failed:", err)
  537. return err
  538. }
  539. ctx.Data["engines"] = engines.Info
  540. var versionInfos modelarts.VersionInfo
  541. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  542. ctx.ServerError("json.Unmarshal failed:", err)
  543. return err
  544. }
  545. ctx.Data["engine_versions"] = versionInfos.Version
  546. var flavorInfos modelarts.Flavor
  547. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  548. ctx.ServerError("json.Unmarshal failed:", err)
  549. return err
  550. }
  551. ctx.Data["flavor_infos"] = flavorInfos.Info
  552. var Parameters modelarts.Parameters
  553. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  554. ctx.ServerError("json.Unmarshal failed:", err)
  555. return err
  556. }
  557. ctx.Data["params"] = Parameters.Parameter
  558. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  559. ctx.Data["train_url"] = outputObsPath
  560. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  561. if err != nil {
  562. ctx.ServerError("GetBranches error:", err)
  563. return err
  564. }
  565. ctx.Data["branches"] = branches
  566. ctx.Data["description"] = form.Description
  567. ctx.Data["dataset_name"] = task.DatasetName
  568. ctx.Data["work_server_number"] = form.WorkServerNumber
  569. ctx.Data["flavor_name"] = form.FlavorName
  570. ctx.Data["engine_name"] = form.EngineName
  571. ctx.Data["flavor_code"] = task.FlavorCode
  572. ctx.Data["engine_id"] = task.EngineID
  573. ctx.Data["version_name"] = form.VersionName
  574. ctx.Data["bootFile"] = form.BootFile
  575. ctx.Data["uuid"] = form.Attachment
  576. ctx.Data["branch_name"] = form.BranchName
  577. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  578. if err != nil {
  579. ctx.ServerError("getConfigList failed:", err)
  580. return err
  581. }
  582. ctx.Data["config_list"] = configList.ParaConfigs
  583. return nil
  584. }
  585. func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  586. ctx.Data["PageIsTrainJob"] = true
  587. VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(modelarts.TotalVersionCount)
  588. jobName := form.JobName
  589. uuid := form.Attachment
  590. description := form.Description
  591. workServerNumber := form.WorkServerNumber
  592. engineID := form.EngineID
  593. bootFile := form.BootFile
  594. flavorCode := form.Flavor
  595. params := form.Params
  596. poolID := form.PoolID
  597. isSaveParam := form.IsSaveParam
  598. repo := ctx.Repo.Repository
  599. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  600. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  601. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  602. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  603. dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  604. branch_name := form.BranchName
  605. isLatestVersion := modelarts.IsLatestVersion
  606. FlavorName := form.FlavorName
  607. VersionCount := modelarts.VersionCount
  608. EngineName := form.EngineName
  609. count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
  610. if err != nil {
  611. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  612. trainJobErrorNewDataPrepare(ctx, form)
  613. ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
  614. return
  615. } else {
  616. if count >= 1 {
  617. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  618. trainJobErrorNewDataPrepare(ctx, form)
  619. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form)
  620. return
  621. }
  622. }
  623. if err := paramCheckCreateTrainJob(form); err != nil {
  624. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  625. trainJobErrorNewDataPrepare(ctx, form)
  626. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  627. return
  628. }
  629. // attach, err := models.GetAttachmentByUUID(uuid)
  630. // if err != nil {
  631. // log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
  632. // return
  633. // }
  634. //todo: del the codeLocalPath
  635. // _, err := ioutil.ReadDir(codeLocalPath)
  636. // if err == nil {
  637. // os.RemoveAll(codeLocalPath)
  638. // }
  639. os.RemoveAll(codeLocalPath)
  640. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  641. commitID, _ := gitRepo.GetBranchCommitID(branch_name)
  642. if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
  643. Branch: branch_name,
  644. }); err != nil {
  645. log.Error("创建任务失败,服务器超时!: %s (%v)", repo.FullName(), err)
  646. trainJobErrorNewDataPrepare(ctx, form)
  647. ctx.RenderWithErr("创建任务失败,服务器超时!", tplModelArtsTrainJobNew, &form)
  648. return
  649. }
  650. //todo: upload code (send to file_server todo this work?)
  651. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  652. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  653. trainJobErrorNewDataPrepare(ctx, form)
  654. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
  655. return
  656. }
  657. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  658. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  659. trainJobErrorNewDataPrepare(ctx, form)
  660. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
  661. return
  662. }
  663. // parentDir := VersionOutputPath + "/"
  664. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  665. // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  666. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  667. trainJobErrorNewDataPrepare(ctx, form)
  668. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
  669. return
  670. }
  671. //todo: del local code?
  672. var parameters models.Parameters
  673. param := make([]models.Parameter, 0)
  674. param = append(param, models.Parameter{
  675. Label: modelarts.TrainUrl,
  676. Value: outputObsPath,
  677. }, models.Parameter{
  678. Label: modelarts.DataUrl,
  679. Value: dataPath,
  680. })
  681. if len(params) != 0 {
  682. err := json.Unmarshal([]byte(params), &parameters)
  683. if err != nil {
  684. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  685. trainJobErrorNewDataPrepare(ctx, form)
  686. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
  687. return
  688. }
  689. for _, parameter := range parameters.Parameter {
  690. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  691. param = append(param, models.Parameter{
  692. Label: parameter.Label,
  693. Value: parameter.Value,
  694. })
  695. }
  696. }
  697. }
  698. //save param config
  699. if isSaveParam == "on" {
  700. if form.ParameterTemplateName == "" {
  701. log.Error("ParameterTemplateName is empty")
  702. trainJobNewDataPrepare(ctx)
  703. ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
  704. return
  705. }
  706. _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  707. ConfigName: form.ParameterTemplateName,
  708. Description: form.PrameterDescription,
  709. DataUrl: dataPath,
  710. AppUrl: codeObsPath,
  711. BootFileUrl: codeObsPath + bootFile,
  712. TrainUrl: outputObsPath,
  713. Flavor: models.Flavor{
  714. Code: flavorCode,
  715. },
  716. WorkServerNum: workServerNumber,
  717. EngineID: int64(engineID),
  718. LogUrl: logObsPath,
  719. PoolID: poolID,
  720. Parameter: param,
  721. })
  722. if err != nil {
  723. log.Error("Failed to CreateTrainJobConfig: %v", err)
  724. trainJobErrorNewDataPrepare(ctx, form)
  725. ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
  726. return
  727. }
  728. }
  729. req := &modelarts.GenerateTrainJobReq{
  730. JobName: jobName,
  731. DataUrl: dataPath,
  732. Description: description,
  733. CodeObsPath: codeObsPath,
  734. BootFileUrl: codeObsPath + bootFile,
  735. BootFile: bootFile,
  736. TrainUrl: outputObsPath,
  737. FlavorCode: flavorCode,
  738. WorkServerNumber: workServerNumber,
  739. EngineID: int64(engineID),
  740. LogUrl: logObsPath,
  741. PoolID: poolID,
  742. Uuid: uuid,
  743. Parameters: parameters.Parameter,
  744. CommitID: commitID,
  745. IsLatestVersion: isLatestVersion,
  746. BranchName: branch_name,
  747. Params: form.Params,
  748. FlavorName: FlavorName,
  749. EngineName: EngineName,
  750. VersionCount: VersionCount,
  751. TotalVersionCount: modelarts.TotalVersionCount,
  752. }
  753. //将params转换Parameters.Parameter,出错时返回给前端
  754. var Parameters modelarts.Parameters
  755. if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
  756. ctx.ServerError("json.Unmarshal failed:", err)
  757. return
  758. }
  759. err = modelarts.GenerateTrainJob(ctx, req)
  760. if err != nil {
  761. log.Error("GenerateTrainJob failed:%v", err.Error())
  762. trainJobErrorNewDataPrepare(ctx, form)
  763. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  764. return
  765. }
  766. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  767. }
  768. func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  769. ctx.Data["PageIsTrainJob"] = true
  770. var jobID = ctx.Params(":jobid")
  771. count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
  772. if err != nil {
  773. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  774. versionErrorDataPrepare(ctx, form)
  775. ctx.RenderWithErr("system error", tplModelArtsTrainJobVersionNew, &form)
  776. return
  777. } else {
  778. if count >= 1 {
  779. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  780. versionErrorDataPrepare(ctx, form)
  781. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobVersionNew, &form)
  782. return
  783. }
  784. }
  785. latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(jobID, modelarts.IsLatestVersion)
  786. if err != nil {
  787. ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
  788. return
  789. }
  790. VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(latestTask.TotalVersionCount + 1)
  791. jobName := form.JobName
  792. uuid := form.Attachment
  793. description := form.Description
  794. workServerNumber := form.WorkServerNumber
  795. engineID := form.EngineID
  796. bootFile := form.BootFile
  797. flavorCode := form.Flavor
  798. params := form.Params
  799. poolID := form.PoolID
  800. isSaveParam := form.IsSaveParam
  801. repo := ctx.Repo.Repository
  802. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  803. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
  804. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  805. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  806. dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  807. branch_name := form.BranchName
  808. PreVersionName := form.VersionName
  809. FlavorName := form.FlavorName
  810. EngineName := form.EngineName
  811. isLatestVersion := modelarts.IsLatestVersion
  812. //判断权限
  813. canNewJob, _ := canUserCreateTrainJobVersion(ctx, latestTask.UserID)
  814. if !canNewJob {
  815. ctx.RenderWithErr("user cann't new trainjob", tplModelArtsTrainJobVersionNew, &form)
  816. return
  817. }
  818. if err := paramCheckCreateTrainJob(form); err != nil {
  819. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  820. versionErrorDataPrepare(ctx, form)
  821. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  822. return
  823. }
  824. // attach, err := models.GetAttachmentByUUID(uuid)
  825. // if err != nil {
  826. // log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
  827. // return
  828. // }
  829. //todo: del the codeLocalPath
  830. // _, err = ioutil.ReadDir(codeLocalPath)
  831. // if err == nil {
  832. // os.RemoveAll(codeLocalPath)
  833. // }
  834. os.RemoveAll(codeLocalPath)
  835. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  836. commitID, _ := gitRepo.GetBranchCommitID(branch_name)
  837. if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
  838. Branch: branch_name,
  839. }); err != nil {
  840. log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
  841. versionErrorDataPrepare(ctx, form)
  842. ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form)
  843. return
  844. }
  845. //todo: upload code (send to file_server todo this work?)
  846. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  847. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  848. versionErrorDataPrepare(ctx, form)
  849. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
  850. return
  851. }
  852. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  853. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  854. versionErrorDataPrepare(ctx, form)
  855. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
  856. return
  857. }
  858. parentDir := VersionOutputPath + "/"
  859. // parentDir := ""
  860. // if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  861. if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  862. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  863. versionErrorDataPrepare(ctx, form)
  864. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form)
  865. return
  866. }
  867. //todo: del local code?
  868. var parameters models.Parameters
  869. param := make([]models.Parameter, 0)
  870. param = append(param, models.Parameter{
  871. Label: modelarts.TrainUrl,
  872. Value: outputObsPath,
  873. }, models.Parameter{
  874. Label: modelarts.DataUrl,
  875. Value: dataPath,
  876. })
  877. if len(params) != 0 {
  878. err := json.Unmarshal([]byte(params), &parameters)
  879. if err != nil {
  880. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  881. versionErrorDataPrepare(ctx, form)
  882. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
  883. return
  884. }
  885. for _, parameter := range parameters.Parameter {
  886. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  887. param = append(param, models.Parameter{
  888. Label: parameter.Label,
  889. Value: parameter.Value,
  890. })
  891. }
  892. }
  893. }
  894. //save param config
  895. if isSaveParam == "on" {
  896. if form.ParameterTemplateName == "" {
  897. log.Error("ParameterTemplateName is empty")
  898. versionErrorDataPrepare(ctx, form)
  899. ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
  900. return
  901. }
  902. _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  903. ConfigName: form.ParameterTemplateName,
  904. Description: form.PrameterDescription,
  905. DataUrl: dataPath,
  906. AppUrl: codeObsPath,
  907. BootFileUrl: codeObsPath + bootFile,
  908. TrainUrl: outputObsPath,
  909. Flavor: models.Flavor{
  910. Code: flavorCode,
  911. },
  912. WorkServerNum: workServerNumber,
  913. EngineID: int64(engineID),
  914. LogUrl: logObsPath,
  915. PoolID: poolID,
  916. Parameter: parameters.Parameter,
  917. })
  918. if err != nil {
  919. log.Error("Failed to CreateTrainJobConfig: %v", err)
  920. versionErrorDataPrepare(ctx, form)
  921. ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  922. return
  923. }
  924. }
  925. if err != nil {
  926. log.Error("getFlavorNameByEngineID(%s) failed:%v", engineID, err.Error())
  927. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  928. return
  929. }
  930. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, PreVersionName)
  931. if err != nil {
  932. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  933. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  934. return
  935. }
  936. req := &modelarts.GenerateTrainJobReq{
  937. JobName: task.JobName,
  938. DataUrl: dataPath,
  939. Description: description,
  940. CodeObsPath: codeObsPath,
  941. BootFileUrl: codeObsPath + bootFile,
  942. BootFile: bootFile,
  943. TrainUrl: outputObsPath,
  944. FlavorCode: flavorCode,
  945. WorkServerNumber: workServerNumber,
  946. IsLatestVersion: isLatestVersion,
  947. EngineID: int64(engineID),
  948. LogUrl: logObsPath,
  949. PoolID: poolID,
  950. Uuid: uuid,
  951. Params: form.Params,
  952. Parameters: parameters.Parameter,
  953. PreVersionId: task.VersionID,
  954. CommitID: commitID,
  955. BranchName: branch_name,
  956. FlavorName: FlavorName,
  957. EngineName: EngineName,
  958. PreVersionName: PreVersionName,
  959. TotalVersionCount: latestTask.TotalVersionCount + 1,
  960. }
  961. err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
  962. if err != nil {
  963. log.Error("GenerateTrainJob failed:%v", err.Error())
  964. versionErrorDataPrepare(ctx, form)
  965. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  966. return
  967. }
  968. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job/" + jobID)
  969. // ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  970. }
  971. // readDir reads the directory named by dirname and returns
  972. // a list of directory entries sorted by filename.
  973. func readDir(dirname string) ([]os.FileInfo, error) {
  974. f, err := os.Open(dirname)
  975. if err != nil {
  976. return nil, err
  977. }
  978. list, err := f.Readdir(100)
  979. f.Close()
  980. if err != nil {
  981. //todo: can not upload empty folder
  982. if err == io.EOF {
  983. return nil, nil
  984. }
  985. return nil, err
  986. }
  987. //sort.Slice(list, func(i, j int) bool { return list[i].Name() < list[j].Name() })
  988. return list, nil
  989. }
  990. func uploadCodeToObs(codePath, jobName, parentDir string) error {
  991. files, err := readDir(codePath)
  992. if err != nil {
  993. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  994. return err
  995. }
  996. for _, file := range files {
  997. if file.IsDir() {
  998. input := &obs.PutObjectInput{}
  999. input.Bucket = setting.Bucket
  1000. input.Key = parentDir + file.Name() + "/"
  1001. _, err = storage.ObsCli.PutObject(input)
  1002. if err != nil {
  1003. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1004. return err
  1005. }
  1006. if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  1007. log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
  1008. return err
  1009. }
  1010. } else {
  1011. input := &obs.PutFileInput{}
  1012. input.Bucket = setting.Bucket
  1013. input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
  1014. input.SourceFile = codePath + file.Name()
  1015. _, err = storage.ObsCli.PutFile(input)
  1016. if err != nil {
  1017. log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
  1018. return err
  1019. }
  1020. }
  1021. }
  1022. return nil
  1023. }
  1024. func obsMkdir(dir string) error {
  1025. input := &obs.PutObjectInput{}
  1026. input.Bucket = setting.Bucket
  1027. input.Key = dir
  1028. _, err := storage.ObsCli.PutObject(input)
  1029. if err != nil {
  1030. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1031. return err
  1032. }
  1033. return nil
  1034. }
  1035. func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
  1036. if !strings.HasSuffix(form.BootFile, ".py") {
  1037. log.Error("the boot file(%s) must be a python file", form.BootFile)
  1038. return errors.New("启动文件必须是python文件")
  1039. }
  1040. if form.WorkServerNumber > 25 || form.WorkServerNumber < 1 {
  1041. log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber)
  1042. return errors.New("计算节点数必须在1-25之间")
  1043. }
  1044. return nil
  1045. }
  1046. func TrainJobShow(ctx *context.Context) {
  1047. ctx.Data["PageIsCloudBrain"] = true
  1048. var jobID = ctx.Params(":jobid")
  1049. repo := ctx.Repo.Repository
  1050. page := ctx.QueryInt("page")
  1051. if page <= 0 {
  1052. page = 1
  1053. }
  1054. VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1055. ListOptions: models.ListOptions{
  1056. Page: page,
  1057. PageSize: setting.UI.IssuePagingNum,
  1058. },
  1059. RepoID: repo.ID,
  1060. Type: models.TypeCloudBrainTwo,
  1061. JobType: string(models.JobTypeTrain),
  1062. JobID: jobID,
  1063. })
  1064. if err != nil {
  1065. log.Error("GetVersionListTasks(%s) failed:%v", jobID, err.Error())
  1066. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1067. return
  1068. }
  1069. //设置权限
  1070. canNewJob, err := canUserCreateTrainJobVersion(ctx, VersionListTasks[0].UserID)
  1071. if err != nil {
  1072. ctx.ServerError("canNewJob failed", err)
  1073. return
  1074. }
  1075. ctx.Data["canNewJob"] = canNewJob
  1076. //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
  1077. for i, _ := range VersionListTasks {
  1078. var parameters models.Parameters
  1079. err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), &parameters)
  1080. if err != nil {
  1081. log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err)
  1082. trainJobNewDataPrepare(ctx)
  1083. return
  1084. }
  1085. if len(parameters.Parameter) > 0 {
  1086. paramTemp := ""
  1087. for _, Parameter := range parameters.Parameter {
  1088. param := Parameter.Label + " = " + Parameter.Value + "; "
  1089. paramTemp = paramTemp + param
  1090. }
  1091. VersionListTasks[i].Parameters = paramTemp[:len(paramTemp)-2]
  1092. } else {
  1093. VersionListTasks[i].Parameters = ""
  1094. }
  1095. }
  1096. pager := context.NewPagination(VersionListCount, setting.UI.IssuePagingNum, page, 5)
  1097. pager.SetDefaultParams(ctx)
  1098. ctx.Data["Page"] = pager
  1099. ctx.Data["jobID"] = jobID
  1100. ctx.Data["jobName"] = VersionListTasks[0].JobName
  1101. ctx.Data["version_list_task"] = VersionListTasks
  1102. ctx.Data["version_list_count"] = VersionListCount
  1103. ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1104. }
  1105. func TrainJobGetLog(ctx *context.Context) {
  1106. ctx.Data["PageIsTrainJob"] = true
  1107. var jobID = ctx.Params(":jobid")
  1108. var logFileName = ctx.Query("file_name")
  1109. var baseLine = ctx.Query("base_line")
  1110. var order = ctx.Query("order")
  1111. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  1112. log.Error("order(%s) check failed", order)
  1113. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  1114. return
  1115. }
  1116. task, err := models.GetCloudbrainByJobID(jobID)
  1117. if err != nil {
  1118. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1119. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1120. return
  1121. }
  1122. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  1123. if err != nil {
  1124. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1125. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1126. return
  1127. }
  1128. ctx.Data["log"] = result
  1129. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1130. }
  1131. func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
  1132. task, err := models.GetCloudbrainByJobID(jobID)
  1133. if err != nil {
  1134. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1135. return nil, nil, err
  1136. }
  1137. resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
  1138. if err != nil {
  1139. log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
  1140. return nil, nil, err
  1141. }
  1142. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, modelarts.Lines)
  1143. if err != nil {
  1144. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1145. return nil, nil, err
  1146. }
  1147. return resultLogFile, result, err
  1148. }
  1149. func TrainJobDel(ctx *context.Context) {
  1150. var jobID = ctx.Params(":jobid")
  1151. repo := ctx.Repo.Repository
  1152. VersionListTasks, _, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1153. RepoID: repo.ID,
  1154. Type: models.TypeCloudBrainTwo,
  1155. JobType: string(models.JobTypeTrain),
  1156. JobID: jobID,
  1157. })
  1158. if err != nil {
  1159. ctx.ServerError("get VersionListTasks failed", err)
  1160. return
  1161. }
  1162. //删除modelarts上的任务记录
  1163. _, err = modelarts.DelTrainJob(jobID)
  1164. if err != nil {
  1165. log.Error("DelTrainJob(%s) failed:%v", jobID, err.Error())
  1166. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1167. return
  1168. }
  1169. //删除数据库Cloudbrain表的记录
  1170. for _, task := range VersionListTasks {
  1171. err = models.DeleteJob(&task.Cloudbrain)
  1172. if err != nil {
  1173. ctx.ServerError("DeleteJob failed", err)
  1174. return
  1175. }
  1176. }
  1177. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  1178. }
  1179. func TrainJobStop(ctx *context.Context) {
  1180. var jobID = ctx.Params(":jobid")
  1181. task, err := models.GetCloudbrainByJobID(jobID)
  1182. if err != nil {
  1183. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  1184. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1185. return
  1186. }
  1187. _, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
  1188. if err != nil {
  1189. log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
  1190. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1191. return
  1192. }
  1193. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  1194. }
  1195. func canUserCreateTrainJob(uid int64) (bool, error) {
  1196. org, err := models.GetOrgByName(setting.AllowedOrg)
  1197. if err != nil {
  1198. log.Error("get allowed org failed: ", setting.AllowedOrg)
  1199. return false, err
  1200. }
  1201. return org.IsOrgMember(uid)
  1202. }
  1203. func canUserCreateTrainJobVersion(ctx *context.Context, userID int64) (bool, error) {
  1204. if ctx == nil || ctx.User == nil {
  1205. log.Error("user unlogin!")
  1206. return false, nil
  1207. }
  1208. if userID == ctx.User.ID || ctx.User.IsAdmin {
  1209. return true, nil
  1210. } else {
  1211. log.Error("Only user itself and admin can new trainjob!")
  1212. return false, nil
  1213. }
  1214. }
  1215. func TrainJobGetConfigList(ctx *context.Context) {
  1216. ctx.Data["PageIsTrainJob"] = true
  1217. var jobID = ctx.Params(":jobid")
  1218. var logFileName = ctx.Query("file_name")
  1219. var baseLine = ctx.Query("base_line")
  1220. var order = ctx.Query("order")
  1221. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  1222. log.Error("order(%s) check failed", order)
  1223. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  1224. return
  1225. }
  1226. task, err := models.GetCloudbrainByJobID(jobID)
  1227. if err != nil {
  1228. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1229. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1230. return
  1231. }
  1232. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  1233. if err != nil {
  1234. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1235. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1236. return
  1237. }
  1238. ctx.Data["log"] = result
  1239. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1240. }
  1241. func getConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
  1242. var result models.GetConfigListResult
  1243. list, err := modelarts.GetConfigList(perPage, page, sortBy, order, searchContent, configType)
  1244. if err != nil {
  1245. log.Error("GetConfigList failed:", err)
  1246. return &result, err
  1247. }
  1248. for _, config := range list.ParaConfigs {
  1249. paraConfig, err := modelarts.GetParaConfig(config.ConfigName, configType)
  1250. if err != nil {
  1251. log.Error("GetParaConfig failed:", err)
  1252. return &result, err
  1253. }
  1254. config.Result = paraConfig
  1255. }
  1256. return list, nil
  1257. }
  1258. func ModelDownload(ctx *context.Context) {
  1259. var (
  1260. err error
  1261. )
  1262. var jobID = ctx.Params(":jobid")
  1263. versionName := ctx.Query("version_name")
  1264. parentDir := ctx.Query("parent_dir")
  1265. fileName := ctx.Query("file_name")
  1266. log.Info("DownloadSingleModelFile start.")
  1267. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  1268. if err != nil {
  1269. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  1270. return
  1271. }
  1272. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName, parentDir, fileName), "/")
  1273. log.Info("Download path is:%s", path)
  1274. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
  1275. if err != nil {
  1276. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  1277. ctx.ServerError("GetObsCreateSignedUrl", err)
  1278. return
  1279. }
  1280. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  1281. }