You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

grampus.go 17 kB

3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
  1. package grampus
  2. import (
  3. "fmt"
  4. "strconv"
  5. "strings"
  6. "code.gitea.io/gitea/models"
  7. "code.gitea.io/gitea/modules/cloudbrain"
  8. "code.gitea.io/gitea/modules/context"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/notification"
  11. "code.gitea.io/gitea/modules/setting"
  12. "code.gitea.io/gitea/modules/timeutil"
  13. )
  14. const (
  15. JobPath = "job/"
  16. ProcessorTypeNPU = "npu.huawei.com/NPU"
  17. ProcessorTypeGPU = "nvidia.com/gpu"
  18. ProcessorTypeGCU = "enflame-tech.com/gcu"
  19. GpuWorkDir = "/tmp/"
  20. NpuWorkDir = "/cache/"
  21. NpuLocalLogUrl = "/tmp/train.log"
  22. CommandPrepareScriptNpu = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;"
  23. CodeArchiveName = "master.zip"
  24. BucketRemote = "grampus"
  25. RemoteModelPath = "/output/" + models.ModelSuffix
  26. autoStopDurationMs = 4 * 60 * 60 * 1000
  27. CommandGpuDebug = "mkdir -p /dataset;%s! [ -x \"$(command -v jupyter)\" ] && pip install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;jupyter lab --ServerApp.shutdown_no_activity_timeout=%s --TerminalManager.cull_inactive_timeout=%s --TerminalManager.cull_interval=%s --MappingKernelManager.cull_idle_timeout=%s --MappingKernelManager.cull_interval=%s --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir='/code' --port=$OCTOPUS_NOTEBOOK_PORT --LabApp.token='' --LabApp.allow_origin='*' --LabApp.base_url=$OCTOPUS_NOTEBOOK_BASE_URL;"
  28. )
  29. var (
  30. poolInfos *models.PoolInfos
  31. FlavorInfos *setting.StFlavorInfos
  32. ImageInfos *setting.StImageInfosModelArts
  33. SpecialPools *models.SpecialPools
  34. CommandPrepareScriptGpu = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/%s/archive/master.zip;" +
  35. "echo \"finish loading script\";unzip -q master.zip;cd %s;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;"
  36. )
  37. type GenerateTrainJobReq struct {
  38. JobName string
  39. Command string
  40. ImageUrl string //与image_id二选一,都有的情况下优先image_url
  41. ImageId string
  42. DisplayJobName string
  43. Uuid string
  44. Description string
  45. CodeObsPath string
  46. BootFile string
  47. BootFileUrl string
  48. DataUrl string
  49. TrainUrl string
  50. WorkServerNumber int
  51. EngineID int64
  52. CommitID string
  53. IsLatestVersion string
  54. BranchName string
  55. PreVersionId int64
  56. PreVersionName string
  57. VersionCount int
  58. EngineName string
  59. TotalVersionCount int
  60. ComputeResource string
  61. ProcessType string
  62. DatasetNames string
  63. DatasetInfos map[string]models.DatasetInfo
  64. Params string
  65. ModelName string
  66. LabelName string
  67. CkptName string
  68. ModelVersion string
  69. PreTrainModelPath string
  70. PreTrainModelUrl string
  71. Spec *models.Specification
  72. CodeName string
  73. }
  74. type GenerateNotebookJobReq struct {
  75. JobName string
  76. Command string
  77. ImageUrl string
  78. ImageId string
  79. DisplayJobName string
  80. Uuid string
  81. Description string
  82. CodeStoragePath string
  83. CommitID string
  84. BranchName string
  85. ComputeResource string
  86. ProcessType string
  87. DatasetNames string
  88. DatasetInfos map[string]models.DatasetInfo
  89. ModelName string
  90. LabelName string
  91. CkptName string
  92. ModelVersion string
  93. PreTrainModelPath string
  94. PreTrainModelUrl string
  95. Spec *models.Specification
  96. CodeName string
  97. ModelPath string //参考启智GPU调试, 挂载/model目录用户的模型可以输出到这个目录
  98. ModelStorageType int
  99. }
  100. func getEndPoint() string {
  101. index := strings.Index(setting.Endpoint, "//")
  102. endpoint := setting.Endpoint[index+2:]
  103. return endpoint
  104. }
  105. func getDatasetGrampus(datasetInfos map[string]models.DatasetInfo) []models.GrampusDataset {
  106. var datasetGrampus []models.GrampusDataset
  107. endPoint := getEndPoint()
  108. for _, datasetInfo := range datasetInfos {
  109. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  110. Name: datasetInfo.FullName,
  111. Bucket: setting.Bucket,
  112. EndPoint: endPoint,
  113. ObjectKey: datasetInfo.DataLocalPath + datasetInfo.FullName,
  114. })
  115. }
  116. return datasetGrampus
  117. }
  118. func getDatasetGPUGrampus(datasetInfos map[string]models.DatasetInfo) ([]models.GrampusDataset, string) {
  119. var datasetGrampus []models.GrampusDataset
  120. var command = ""
  121. for uuid, datasetInfo := range datasetInfos {
  122. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  123. Name: datasetInfo.FullName,
  124. Bucket: setting.Attachment.Minio.Bucket,
  125. EndPoint: setting.Attachment.Minio.Endpoint,
  126. ObjectKey: datasetInfo.DataLocalPath,
  127. ReadOnly: true,
  128. ContainerPath: "/dataset1/" + datasetInfo.Name,
  129. })
  130. command += "cp /dataset1/'" + datasetInfo.Name + "'/" + uuid + " /dataset/'" + datasetInfo.FullName + "';"
  131. }
  132. return datasetGrampus, command
  133. }
  134. func getDatasetGCUGrampus(datasetInfos map[string]models.DatasetInfo) ([]models.GrampusDataset, string) {
  135. var datasetGrampus []models.GrampusDataset
  136. var command = ""
  137. obsEndPoint := getEndPoint()
  138. for uuid, datasetInfo := range datasetInfos {
  139. if datasetInfo.Type == models.TypeCloudBrainOne {
  140. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  141. Name: datasetInfo.FullName,
  142. Bucket: setting.Attachment.Minio.Bucket,
  143. EndPoint: setting.Attachment.Minio.Endpoint,
  144. ObjectKey: datasetInfo.DataLocalPath,
  145. ReadOnly: true,
  146. ContainerPath: "/dataset1/" + datasetInfo.Name,
  147. })
  148. command += "cp /dataset1/'" + datasetInfo.Name + "'/" + uuid + " /dataset/'" + datasetInfo.FullName + "';"
  149. } else {
  150. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  151. Name: datasetInfo.FullName,
  152. Bucket: setting.Bucket,
  153. EndPoint: obsEndPoint,
  154. ObjectKey: datasetInfo.DataLocalPath + datasetInfo.FullName,
  155. ContainerPath: "/dataset/" + datasetInfo.Name,
  156. })
  157. }
  158. }
  159. return datasetGrampus, command
  160. }
  161. func GenerateNotebookJob(ctx *context.Context, req *GenerateNotebookJobReq) (jobId string, err error) {
  162. createTime := timeutil.TimeStampNow()
  163. var datasetGrampus []models.GrampusDataset
  164. var codeGrampus models.GrampusDataset
  165. var cpCommand string
  166. imageUrl := req.ImageUrl
  167. if ProcessorTypeNPU == req.ProcessType {
  168. datasetGrampus = getDatasetGrampus(req.DatasetInfos)
  169. if len(req.ModelName) != 0 {
  170. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  171. Name: req.ModelName,
  172. Bucket: setting.Bucket,
  173. EndPoint: getEndPoint(),
  174. ReadOnly: true,
  175. ObjectKey: req.PreTrainModelPath,
  176. })
  177. }
  178. codeGrampus = models.GrampusDataset{
  179. Name: req.CodeName,
  180. Bucket: setting.Bucket,
  181. EndPoint: getEndPoint(),
  182. ObjectKey: req.CodeStoragePath + cloudbrain.DefaultBranchName + ".zip",
  183. ReadOnly: false,
  184. }
  185. imageUrl = ""
  186. req.Command = ""
  187. } else {
  188. if ProcessorTypeGCU == req.ProcessType {
  189. datasetGrampus, cpCommand = getDatasetGCUGrampus(req.DatasetInfos)
  190. } else {
  191. datasetGrampus, cpCommand = getDatasetGPUGrampus(req.DatasetInfos)
  192. }
  193. if len(req.ModelName) != 0 {
  194. if req.ModelStorageType == models.TypeCloudBrainOne {
  195. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  196. Name: req.ModelName,
  197. Bucket: setting.Attachment.Minio.Bucket,
  198. EndPoint: setting.Attachment.Minio.Endpoint,
  199. ObjectKey: req.PreTrainModelPath,
  200. ReadOnly: true,
  201. ContainerPath: cloudbrain.PretrainModelMountPath,
  202. })
  203. } else {
  204. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  205. Name: req.ModelName,
  206. Bucket: setting.Bucket,
  207. EndPoint: getEndPoint(),
  208. ReadOnly: true,
  209. ObjectKey: req.PreTrainModelPath,
  210. ContainerPath: cloudbrain.PretrainModelMountPath,
  211. })
  212. }
  213. }
  214. codeArchiveName := cloudbrain.DefaultBranchName + ".zip"
  215. codeGrampus = models.GrampusDataset{
  216. Name: req.CodeName,
  217. Bucket: setting.Attachment.Minio.Bucket,
  218. EndPoint: setting.Attachment.Minio.Endpoint,
  219. ObjectKey: req.CodeStoragePath + codeArchiveName,
  220. ReadOnly: false,
  221. ContainerPath: cloudbrain.CodeMountPath,
  222. }
  223. if ProcessorTypeGCU == req.ProcessType {
  224. imageUrl = ""
  225. }
  226. req.Command = fmt.Sprintf(CommandGpuDebug, cpCommand, setting.CullIdleTimeout, setting.CullIdleTimeout, setting.CullInterval, setting.CullIdleTimeout, setting.CullInterval)
  227. log.Info("debug command:" + req.Command)
  228. }
  229. jobResult, err := createNotebookJob(models.CreateGrampusNotebookRequest{
  230. Name: req.JobName,
  231. Tasks: []models.GrampusNotebookTask{
  232. {
  233. Name: req.JobName,
  234. ResourceSpecId: req.Spec.SourceSpecId,
  235. ImageId: req.ImageId,
  236. ImageUrl: imageUrl,
  237. Datasets: datasetGrampus,
  238. Code: codeGrampus,
  239. AutoStopDuration: autoStopDurationMs,
  240. Capacity: setting.Capacity,
  241. Command: req.Command,
  242. CenterID: req.Spec.GetAvailableCenterIds(ctx.User.ID),
  243. },
  244. },
  245. })
  246. if err != nil {
  247. log.Error("createNotebookJob failed: %v", err.Error())
  248. return "", err
  249. }
  250. jobID := jobResult.JobInfo.JobID
  251. err = models.CreateCloudbrain(&models.Cloudbrain{
  252. Status: TransTrainJobStatus(jobResult.JobInfo.Status),
  253. UserID: ctx.User.ID,
  254. RepoID: ctx.Repo.Repository.ID,
  255. JobID: jobID,
  256. JobName: req.JobName,
  257. DisplayJobName: req.DisplayJobName,
  258. JobType: string(models.JobTypeDebug),
  259. Type: models.TypeC2Net,
  260. Uuid: req.Uuid,
  261. DatasetName: req.DatasetNames,
  262. CommitID: req.CommitID,
  263. IsLatestVersion: "1",
  264. ComputeResource: req.ComputeResource,
  265. ImageID: req.ImageId,
  266. BranchName: req.BranchName,
  267. Description: req.Description,
  268. WorkServerNumber: 1,
  269. EngineName: req.ImageUrl,
  270. CreatedUnix: createTime,
  271. UpdatedUnix: createTime,
  272. Spec: req.Spec,
  273. ModelName: req.ModelName,
  274. ModelVersion: req.ModelVersion,
  275. LabelName: req.LabelName,
  276. PreTrainModelUrl: req.PreTrainModelUrl,
  277. CkptName: req.CkptName,
  278. })
  279. if err != nil {
  280. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
  281. return "", err
  282. }
  283. var actionType models.ActionType
  284. if req.ComputeResource == models.NPUResource {
  285. actionType = models.ActionCreateGrampusNPUDebugTask
  286. } else if req.ComputeResource == models.GPUResource {
  287. actionType = models.ActionCreateGrampusGPUDebugTask
  288. } else if req.ComputeResource == models.GCUResource {
  289. actionType = models.ActionCreateGrampusGCUDebugTask
  290. }
  291. task, err := models.GetCloudbrainByJobID(jobID)
  292. if err != nil {
  293. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  294. return "", err
  295. }
  296. stringId := strconv.FormatInt(task.ID, 10)
  297. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, req.DisplayJobName, actionType)
  298. return jobID, nil
  299. }
  300. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  301. createTime := timeutil.TimeStampNow()
  302. var datasetGrampus, modelGrampus []models.GrampusDataset
  303. var codeGrampus models.GrampusDataset
  304. if ProcessorTypeNPU == req.ProcessType {
  305. datasetGrampus = getDatasetGrampus(req.DatasetInfos)
  306. if len(req.ModelName) != 0 {
  307. modelGrampus = []models.GrampusDataset{
  308. {
  309. Name: req.ModelName,
  310. Bucket: setting.Bucket,
  311. EndPoint: getEndPoint(),
  312. ObjectKey: req.PreTrainModelPath,
  313. },
  314. }
  315. }
  316. codeGrampus = models.GrampusDataset{
  317. Name: req.CodeName,
  318. Bucket: setting.Bucket,
  319. EndPoint: getEndPoint(),
  320. ObjectKey: req.CodeObsPath + cloudbrain.DefaultBranchName + ".zip",
  321. }
  322. }
  323. jobResult, err := createJob(models.CreateGrampusJobRequest{
  324. Name: req.JobName,
  325. Tasks: []models.GrampusTasks{
  326. {
  327. Name: req.JobName,
  328. Command: req.Command,
  329. ResourceSpecId: req.Spec.SourceSpecId,
  330. ImageId: req.ImageId,
  331. ImageUrl: req.ImageUrl,
  332. CenterID: req.Spec.GetAvailableCenterIds(ctx.User.ID),
  333. ReplicaNum: 1,
  334. Datasets: datasetGrampus,
  335. Models: modelGrampus,
  336. Code: codeGrampus,
  337. BootFile: req.BootFile,
  338. },
  339. },
  340. })
  341. if err != nil {
  342. log.Error("createJob failed: %v", err.Error())
  343. return "", err
  344. }
  345. jobID := jobResult.JobInfo.JobID
  346. err = models.CreateCloudbrain(&models.Cloudbrain{
  347. Status: TransTrainJobStatus(jobResult.JobInfo.Status),
  348. UserID: ctx.User.ID,
  349. RepoID: ctx.Repo.Repository.ID,
  350. JobID: jobID,
  351. JobName: req.JobName,
  352. DisplayJobName: req.DisplayJobName,
  353. JobType: string(models.JobTypeTrain),
  354. Type: models.TypeC2Net,
  355. Uuid: req.Uuid,
  356. DatasetName: req.DatasetNames,
  357. CommitID: req.CommitID,
  358. IsLatestVersion: req.IsLatestVersion,
  359. ComputeResource: req.ComputeResource,
  360. ImageID: req.ImageId,
  361. TrainUrl: req.TrainUrl,
  362. BranchName: req.BranchName,
  363. Parameters: req.Params,
  364. BootFile: req.BootFile,
  365. DataUrl: req.DataUrl,
  366. Description: req.Description,
  367. WorkServerNumber: req.WorkServerNumber,
  368. EngineName: req.EngineName,
  369. VersionCount: req.VersionCount,
  370. TotalVersionCount: req.TotalVersionCount,
  371. CreatedUnix: createTime,
  372. UpdatedUnix: createTime,
  373. Spec: req.Spec,
  374. ModelName: req.ModelName,
  375. ModelVersion: req.ModelVersion,
  376. LabelName: req.LabelName,
  377. PreTrainModelUrl: req.PreTrainModelUrl,
  378. CkptName: req.CkptName,
  379. })
  380. if err != nil {
  381. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
  382. return "", err
  383. }
  384. var actionType models.ActionType
  385. if req.ComputeResource == models.NPUResource {
  386. actionType = models.ActionCreateGrampusNPUTrainTask
  387. } else if req.ComputeResource == models.GPUResource {
  388. actionType = models.ActionCreateGrampusGPUTrainTask
  389. }
  390. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType)
  391. return jobID, nil
  392. }
  393. func getCentersParamter(ctx *context.Context, req *GenerateTrainJobReq) ([]string, []string) {
  394. var centerID []string
  395. var centerName []string
  396. includeCenters := make(map[string]string)
  397. excludeCenters := make(map[string]string)
  398. if SpecialPools != nil {
  399. for _, pool := range SpecialPools.Pools {
  400. if !pool.IsExclusive && strings.Contains(req.ComputeResource, pool.Type) {
  401. org, _ := models.GetOrgByName(pool.Org)
  402. if org != nil {
  403. isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID)
  404. if isOrgMember {
  405. for _, info := range pool.Pool {
  406. includeCenters[info.Queue] = info.Value
  407. }
  408. } else {
  409. for _, info := range pool.Pool {
  410. excludeCenters[info.Queue] = info.Value
  411. }
  412. }
  413. }
  414. }
  415. }
  416. }
  417. if len(includeCenters) > 0 {
  418. //如果有专属资源池,根据专属资源池指定智算中心
  419. for k, v := range includeCenters {
  420. centerID = append(centerID, k)
  421. centerName = append(centerName, v)
  422. }
  423. } else if len(excludeCenters) > 0 {
  424. //否则,有要排除的中心,先获取所有中心,删除其中的排除中心,得到指定的智算中心
  425. allCenters := make(map[string]string)
  426. specs, err := GetResourceSpecs(req.ProcessType)
  427. if err == nil {
  428. for _, info := range specs.Infos {
  429. for _, center := range info.Centers {
  430. allCenters[center.ID] = center.Name
  431. }
  432. }
  433. }
  434. for k, _ := range excludeCenters {
  435. delete(allCenters, k)
  436. }
  437. for k, v := range allCenters {
  438. centerID = append(centerID, k)
  439. centerName = append(centerName, v)
  440. }
  441. }
  442. return centerID, centerName
  443. }
  444. func TransTrainJobStatus(status string) string {
  445. if status == models.GrampusStatusPending {
  446. status = models.GrampusStatusWaiting
  447. }
  448. return strings.ToUpper(status)
  449. }
  450. func GetNpuModelRemoteObsUrl(jobName string) string {
  451. return "s3:///" + BucketRemote + "/" + GetNpuModelObjectKey(jobName)
  452. }
  453. func GetNpuModelObjectKey(jobName string) string {
  454. return setting.CodePathPrefix + jobName + RemoteModelPath
  455. }
  456. func GetRemoteEndPoint(aiCenterID string) string {
  457. var endPoint string
  458. for _, info := range setting.CenterInfos.Info {
  459. if info.CenterID == aiCenterID {
  460. endPoint = info.Endpoint
  461. break
  462. }
  463. }
  464. return endPoint
  465. }
  466. func GetCenterProxy(aiCenterID string) string {
  467. var proxy string
  468. for _, info := range setting.CenterInfos.Info {
  469. if info.CenterID == aiCenterID {
  470. proxy = info.StorageProxyServer
  471. break
  472. }
  473. }
  474. return proxy
  475. }