You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

grampus.go 17 kB

3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. package grampus
  2. import (
  3. "fmt"
  4. "strconv"
  5. "strings"
  6. "code.gitea.io/gitea/models"
  7. "code.gitea.io/gitea/modules/cloudbrain"
  8. "code.gitea.io/gitea/modules/context"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/notification"
  11. "code.gitea.io/gitea/modules/setting"
  12. "code.gitea.io/gitea/modules/timeutil"
  13. )
  14. const (
  15. JobPath = "job/"
  16. ProcessorTypeNPU = "npu.huawei.com/NPU"
  17. ProcessorTypeGPU = "nvidia.com/gpu"
  18. ProcessorTypeGCU = "enflame-tech.com/gcu"
  19. GpuWorkDir = "/tmp/"
  20. NpuWorkDir = "/cache/"
  21. NpuLocalLogUrl = "/tmp/train.log"
  22. CommandPrepareScriptNpu = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;"
  23. CodeArchiveName = "master.zip"
  24. BucketRemote = "grampus"
  25. RemoteModelPath = "/output/" + models.ModelSuffix
  26. autoStopDurationMs = 4 * 60 * 60 * 1000
  27. CommandGpuDebug = "mkdir -p /dataset;%s! [ -x \"$(command -v jupyter)\" ] && pip install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;jupyter lab --ServerApp.shutdown_no_activity_timeout=%s --TerminalManager.cull_inactive_timeout=%s --TerminalManager.cull_interval=%s --MappingKernelManager.cull_idle_timeout=%s --MappingKernelManager.cull_interval=%s --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir='/code' --port=$OCTOPUS_NOTEBOOK_PORT --LabApp.token='' --LabApp.allow_origin='*' --LabApp.base_url=$OCTOPUS_NOTEBOOK_BASE_URL;"
  28. CommandGrampusDebug = "unzip -d %s %s;rm %s;" + CommandGpuDebug
  29. )
  30. var (
  31. poolInfos *models.PoolInfos
  32. FlavorInfos *setting.StFlavorInfos
  33. ImageInfos *setting.StImageInfosModelArts
  34. SpecialPools *models.SpecialPools
  35. CommandPrepareScriptGpu = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/%s/archive/master.zip;" +
  36. "echo \"finish loading script\";unzip -q master.zip;cd %s;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;"
  37. )
  38. type GenerateTrainJobReq struct {
  39. JobName string
  40. Command string
  41. ImageUrl string //与image_id二选一,都有的情况下优先image_url
  42. ImageId string
  43. DisplayJobName string
  44. Uuid string
  45. Description string
  46. CodeObsPath string
  47. BootFile string
  48. BootFileUrl string
  49. DataUrl string
  50. TrainUrl string
  51. WorkServerNumber int
  52. EngineID int64
  53. CommitID string
  54. IsLatestVersion string
  55. BranchName string
  56. PreVersionId int64
  57. PreVersionName string
  58. VersionCount int
  59. EngineName string
  60. TotalVersionCount int
  61. ComputeResource string
  62. ProcessType string
  63. DatasetNames string
  64. DatasetInfos map[string]models.DatasetInfo
  65. Params string
  66. ModelName string
  67. LabelName string
  68. CkptName string
  69. ModelVersion string
  70. PreTrainModelPath string
  71. PreTrainModelUrl string
  72. Spec *models.Specification
  73. CodeName string
  74. }
  75. type GenerateNotebookJobReq struct {
  76. JobName string
  77. Command string
  78. ImageUrl string
  79. ImageId string
  80. DisplayJobName string
  81. Uuid string
  82. Description string
  83. CodeStoragePath string
  84. CommitID string
  85. BranchName string
  86. ComputeResource string
  87. ProcessType string
  88. DatasetNames string
  89. DatasetInfos map[string]models.DatasetInfo
  90. ModelName string
  91. LabelName string
  92. CkptName string
  93. ModelVersion string
  94. PreTrainModelPath string
  95. PreTrainModelUrl string
  96. Spec *models.Specification
  97. CodeName string
  98. ModelPath string //参考启智GPU调试, 挂载/model目录用户的模型可以输出到这个目录
  99. ModelStorageType int
  100. }
  101. func getEndPoint() string {
  102. index := strings.Index(setting.Endpoint, "//")
  103. endpoint := setting.Endpoint[index+2:]
  104. return endpoint
  105. }
  106. func getDatasetGrampus(datasetInfos map[string]models.DatasetInfo) []models.GrampusDataset {
  107. var datasetGrampus []models.GrampusDataset
  108. endPoint := getEndPoint()
  109. for _, datasetInfo := range datasetInfos {
  110. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  111. Name: datasetInfo.FullName,
  112. Bucket: setting.Bucket,
  113. EndPoint: endPoint,
  114. ObjectKey: datasetInfo.DataLocalPath + datasetInfo.FullName,
  115. })
  116. }
  117. return datasetGrampus
  118. }
  119. func getDatasetGPUGrampus(datasetInfos map[string]models.DatasetInfo) ([]models.GrampusDataset, string) {
  120. var datasetGrampus []models.GrampusDataset
  121. var command = ""
  122. for uuid, datasetInfo := range datasetInfos {
  123. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  124. Name: datasetInfo.FullName,
  125. Bucket: setting.Attachment.Minio.Bucket,
  126. EndPoint: setting.Attachment.Minio.Endpoint,
  127. ObjectKey: datasetInfo.DataLocalPath,
  128. ReadOnly: true,
  129. ContainerPath: "/dataset1/" + datasetInfo.Name,
  130. })
  131. command += "cp /dataset1/'" + datasetInfo.Name + "'/" + uuid + " /dataset/'" + datasetInfo.FullName + "';"
  132. }
  133. return datasetGrampus, command
  134. }
  135. func getDatasetGCUGrampus(datasetInfos map[string]models.DatasetInfo) ([]models.GrampusDataset, string) {
  136. var datasetGrampus []models.GrampusDataset
  137. var command = ""
  138. obsEndPoint := getEndPoint()
  139. for uuid, datasetInfo := range datasetInfos {
  140. if datasetInfo.Type == models.TypeCloudBrainOne {
  141. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  142. Name: datasetInfo.FullName,
  143. Bucket: setting.Attachment.Minio.Bucket,
  144. EndPoint: setting.Attachment.Minio.Endpoint,
  145. ObjectKey: datasetInfo.DataLocalPath,
  146. ReadOnly: true,
  147. ContainerPath: "/dataset1/" + datasetInfo.Name,
  148. })
  149. command += "cp /dataset1/'" + datasetInfo.Name + "'/" + uuid + " /dataset/'" + datasetInfo.FullName + "';"
  150. } else {
  151. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  152. Name: datasetInfo.FullName,
  153. Bucket: setting.Bucket,
  154. EndPoint: obsEndPoint,
  155. ObjectKey: datasetInfo.DataLocalPath + datasetInfo.FullName,
  156. ContainerPath: "/dataset/" + datasetInfo.Name,
  157. })
  158. }
  159. }
  160. return datasetGrampus, command
  161. }
  162. func GenerateNotebookJob(ctx *context.Context, req *GenerateNotebookJobReq) (jobId string, err error) {
  163. createTime := timeutil.TimeStampNow()
  164. var datasetGrampus []models.GrampusDataset
  165. var codeGrampus models.GrampusDataset
  166. var cpCommand string
  167. imageUrl := req.ImageUrl
  168. if ProcessorTypeNPU == req.ProcessType {
  169. datasetGrampus = getDatasetGrampus(req.DatasetInfos)
  170. if len(req.ModelName) != 0 {
  171. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  172. Name: req.ModelName,
  173. Bucket: setting.Bucket,
  174. EndPoint: getEndPoint(),
  175. ReadOnly: true,
  176. ObjectKey: req.PreTrainModelPath,
  177. })
  178. }
  179. codeGrampus = models.GrampusDataset{
  180. Name: req.CodeName,
  181. Bucket: setting.Bucket,
  182. EndPoint: getEndPoint(),
  183. ObjectKey: req.CodeStoragePath + cloudbrain.DefaultBranchName + ".zip",
  184. ReadOnly: false,
  185. }
  186. imageUrl = ""
  187. req.Command = ""
  188. } else {
  189. if ProcessorTypeGCU == req.ProcessType {
  190. datasetGrampus, cpCommand = getDatasetGCUGrampus(req.DatasetInfos)
  191. } else {
  192. datasetGrampus, cpCommand = getDatasetGPUGrampus(req.DatasetInfos)
  193. }
  194. if len(req.ModelName) != 0 {
  195. if req.ModelStorageType == models.TypeCloudBrainOne {
  196. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  197. Name: req.ModelName,
  198. Bucket: setting.Attachment.Minio.Bucket,
  199. EndPoint: setting.Attachment.Minio.Endpoint,
  200. ObjectKey: req.PreTrainModelPath,
  201. ReadOnly: true,
  202. ContainerPath: cloudbrain.PretrainModelMountPath,
  203. })
  204. } else {
  205. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  206. Name: req.ModelName,
  207. Bucket: setting.Bucket,
  208. EndPoint: getEndPoint(),
  209. ReadOnly: true,
  210. ObjectKey: req.PreTrainModelPath,
  211. ContainerPath: cloudbrain.PretrainModelMountPath,
  212. })
  213. }
  214. }
  215. codeArchiveName := cloudbrain.DefaultBranchName + ".zip"
  216. codeGrampus = models.GrampusDataset{
  217. Name: req.CodeName,
  218. Bucket: setting.Attachment.Minio.Bucket,
  219. EndPoint: setting.Attachment.Minio.Endpoint,
  220. ObjectKey: req.CodeStoragePath + codeArchiveName,
  221. ReadOnly: false,
  222. ContainerPath: cloudbrain.CodeMountPath,
  223. }
  224. if ProcessorTypeGCU == req.ProcessType {
  225. imageUrl = ""
  226. }
  227. codeArchiveContainerPath := cloudbrain.CodeMountPath + "/" + codeArchiveName
  228. req.Command = fmt.Sprintf(CommandGrampusDebug, cloudbrain.CodeMountPath, codeArchiveContainerPath, codeArchiveContainerPath, cpCommand, setting.CullIdleTimeout, setting.CullIdleTimeout, setting.CullInterval, setting.CullIdleTimeout, setting.CullInterval)
  229. log.Info("debug command:" + req.Command)
  230. }
  231. jobResult, err := createNotebookJob(models.CreateGrampusNotebookRequest{
  232. Name: req.JobName,
  233. Tasks: []models.GrampusNotebookTask{
  234. {
  235. Name: req.JobName,
  236. ResourceSpecId: req.Spec.SourceSpecId,
  237. ImageId: req.ImageId,
  238. ImageUrl: imageUrl,
  239. Datasets: datasetGrampus,
  240. Code: codeGrampus,
  241. AutoStopDuration: autoStopDurationMs,
  242. Capacity: setting.Capacity,
  243. Command: req.Command,
  244. CenterID: req.Spec.GetAvailableCenterIds(ctx.User.ID),
  245. },
  246. },
  247. })
  248. if err != nil {
  249. log.Error("createNotebookJob failed: %v", err.Error())
  250. return "", err
  251. }
  252. jobID := jobResult.JobInfo.JobID
  253. err = models.CreateCloudbrain(&models.Cloudbrain{
  254. Status: TransTrainJobStatus(jobResult.JobInfo.Status),
  255. UserID: ctx.User.ID,
  256. RepoID: ctx.Repo.Repository.ID,
  257. JobID: jobID,
  258. JobName: req.JobName,
  259. DisplayJobName: req.DisplayJobName,
  260. JobType: string(models.JobTypeDebug),
  261. Type: models.TypeC2Net,
  262. Uuid: req.Uuid,
  263. DatasetName: req.DatasetNames,
  264. CommitID: req.CommitID,
  265. IsLatestVersion: "1",
  266. ComputeResource: req.ComputeResource,
  267. ImageID: req.ImageId,
  268. BranchName: req.BranchName,
  269. Description: req.Description,
  270. WorkServerNumber: 1,
  271. EngineName: req.ImageUrl,
  272. CreatedUnix: createTime,
  273. UpdatedUnix: createTime,
  274. Spec: req.Spec,
  275. ModelName: req.ModelName,
  276. ModelVersion: req.ModelVersion,
  277. LabelName: req.LabelName,
  278. PreTrainModelUrl: req.PreTrainModelUrl,
  279. CkptName: req.CkptName,
  280. })
  281. if err != nil {
  282. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
  283. return "", err
  284. }
  285. var actionType models.ActionType
  286. if req.ComputeResource == models.NPUResource {
  287. actionType = models.ActionCreateGrampusNPUDebugTask
  288. } else if req.ComputeResource == models.GPUResource {
  289. actionType = models.ActionCreateGrampusGPUDebugTask
  290. } else if req.ComputeResource == models.GCUResource {
  291. actionType = models.ActionCreateGrampusGCUDebugTask
  292. }
  293. task, err := models.GetCloudbrainByJobID(jobID)
  294. if err != nil {
  295. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  296. return "", err
  297. }
  298. stringId := strconv.FormatInt(task.ID, 10)
  299. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, req.DisplayJobName, actionType)
  300. return jobID, nil
  301. }
  302. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  303. createTime := timeutil.TimeStampNow()
  304. var datasetGrampus, modelGrampus []models.GrampusDataset
  305. var codeGrampus models.GrampusDataset
  306. if ProcessorTypeNPU == req.ProcessType {
  307. datasetGrampus = getDatasetGrampus(req.DatasetInfos)
  308. if len(req.ModelName) != 0 {
  309. modelGrampus = []models.GrampusDataset{
  310. {
  311. Name: req.ModelName,
  312. Bucket: setting.Bucket,
  313. EndPoint: getEndPoint(),
  314. ObjectKey: req.PreTrainModelPath,
  315. },
  316. }
  317. }
  318. codeGrampus = models.GrampusDataset{
  319. Name: req.CodeName,
  320. Bucket: setting.Bucket,
  321. EndPoint: getEndPoint(),
  322. ObjectKey: req.CodeObsPath + cloudbrain.DefaultBranchName + ".zip",
  323. }
  324. }
  325. jobResult, err := createJob(models.CreateGrampusJobRequest{
  326. Name: req.JobName,
  327. Tasks: []models.GrampusTasks{
  328. {
  329. Name: req.JobName,
  330. Command: req.Command,
  331. ResourceSpecId: req.Spec.SourceSpecId,
  332. ImageId: req.ImageId,
  333. ImageUrl: req.ImageUrl,
  334. CenterID: req.Spec.GetAvailableCenterIds(ctx.User.ID),
  335. ReplicaNum: 1,
  336. Datasets: datasetGrampus,
  337. Models: modelGrampus,
  338. Code: codeGrampus,
  339. BootFile: req.BootFile,
  340. },
  341. },
  342. })
  343. if err != nil {
  344. log.Error("createJob failed: %v", err.Error())
  345. return "", err
  346. }
  347. jobID := jobResult.JobInfo.JobID
  348. err = models.CreateCloudbrain(&models.Cloudbrain{
  349. Status: TransTrainJobStatus(jobResult.JobInfo.Status),
  350. UserID: ctx.User.ID,
  351. RepoID: ctx.Repo.Repository.ID,
  352. JobID: jobID,
  353. JobName: req.JobName,
  354. DisplayJobName: req.DisplayJobName,
  355. JobType: string(models.JobTypeTrain),
  356. Type: models.TypeC2Net,
  357. Uuid: req.Uuid,
  358. DatasetName: req.DatasetNames,
  359. CommitID: req.CommitID,
  360. IsLatestVersion: req.IsLatestVersion,
  361. ComputeResource: req.ComputeResource,
  362. ImageID: req.ImageId,
  363. TrainUrl: req.TrainUrl,
  364. BranchName: req.BranchName,
  365. Parameters: req.Params,
  366. BootFile: req.BootFile,
  367. DataUrl: req.DataUrl,
  368. Description: req.Description,
  369. WorkServerNumber: req.WorkServerNumber,
  370. EngineName: req.EngineName,
  371. VersionCount: req.VersionCount,
  372. TotalVersionCount: req.TotalVersionCount,
  373. CreatedUnix: createTime,
  374. UpdatedUnix: createTime,
  375. Spec: req.Spec,
  376. ModelName: req.ModelName,
  377. ModelVersion: req.ModelVersion,
  378. LabelName: req.LabelName,
  379. PreTrainModelUrl: req.PreTrainModelUrl,
  380. CkptName: req.CkptName,
  381. })
  382. if err != nil {
  383. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
  384. return "", err
  385. }
  386. var actionType models.ActionType
  387. if req.ComputeResource == models.NPUResource {
  388. actionType = models.ActionCreateGrampusNPUTrainTask
  389. } else if req.ComputeResource == models.GPUResource {
  390. actionType = models.ActionCreateGrampusGPUTrainTask
  391. }
  392. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType)
  393. return jobID, nil
  394. }
  395. func getCentersParamter(ctx *context.Context, req *GenerateTrainJobReq) ([]string, []string) {
  396. var centerID []string
  397. var centerName []string
  398. includeCenters := make(map[string]string)
  399. excludeCenters := make(map[string]string)
  400. if SpecialPools != nil {
  401. for _, pool := range SpecialPools.Pools {
  402. if !pool.IsExclusive && strings.Contains(req.ComputeResource, pool.Type) {
  403. org, _ := models.GetOrgByName(pool.Org)
  404. if org != nil {
  405. isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID)
  406. if isOrgMember {
  407. for _, info := range pool.Pool {
  408. includeCenters[info.Queue] = info.Value
  409. }
  410. } else {
  411. for _, info := range pool.Pool {
  412. excludeCenters[info.Queue] = info.Value
  413. }
  414. }
  415. }
  416. }
  417. }
  418. }
  419. if len(includeCenters) > 0 {
  420. //如果有专属资源池,根据专属资源池指定智算中心
  421. for k, v := range includeCenters {
  422. centerID = append(centerID, k)
  423. centerName = append(centerName, v)
  424. }
  425. } else if len(excludeCenters) > 0 {
  426. //否则,有要排除的中心,先获取所有中心,删除其中的排除中心,得到指定的智算中心
  427. allCenters := make(map[string]string)
  428. specs, err := GetResourceSpecs(req.ProcessType)
  429. if err == nil {
  430. for _, info := range specs.Infos {
  431. for _, center := range info.Centers {
  432. allCenters[center.ID] = center.Name
  433. }
  434. }
  435. }
  436. for k, _ := range excludeCenters {
  437. delete(allCenters, k)
  438. }
  439. for k, v := range allCenters {
  440. centerID = append(centerID, k)
  441. centerName = append(centerName, v)
  442. }
  443. }
  444. return centerID, centerName
  445. }
  446. func TransTrainJobStatus(status string) string {
  447. if status == models.GrampusStatusPending {
  448. status = models.GrampusStatusWaiting
  449. }
  450. return strings.ToUpper(status)
  451. }
  452. func GetNpuModelRemoteObsUrl(jobName string) string {
  453. return "s3:///" + BucketRemote + "/" + GetNpuModelObjectKey(jobName)
  454. }
  455. func GetNpuModelObjectKey(jobName string) string {
  456. return setting.CodePathPrefix + jobName + RemoteModelPath
  457. }
  458. func GetRemoteEndPoint(aiCenterID string) string {
  459. var endPoint string
  460. for _, info := range setting.CenterInfos.Info {
  461. if info.CenterID == aiCenterID {
  462. endPoint = info.Endpoint
  463. break
  464. }
  465. }
  466. return endPoint
  467. }
  468. func GetCenterProxy(aiCenterID string) string {
  469. var proxy string
  470. for _, info := range setting.CenterInfos.Info {
  471. if info.CenterID == aiCenterID {
  472. proxy = info.StorageProxyServer
  473. break
  474. }
  475. }
  476. return proxy
  477. }