You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

common-k8s.yaml 7.8 kB

2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. train-job:
  2. docker-dataset-path: "/dataset"
  3. k8s:
  4. # k8s集群配置文件
  5. kubeconfig: kubeconfig_test
  6. # nfs服务暴露的IP地址 如需测试需修改为合适的地址
  7. nfs: 10.5.26.234
  8. #nfs服务端 共享目录
  9. nfs-root-path: /nfs/
  10. nfs-root-windows-path: "Z:"
  11. # 文件存储服务暴露的IP地址 如需测试需修改为合适的地址
  12. file-store: 10.5.26.234
  13. #文件存储服务端 共享目录
  14. file-store-root-path: /nfs/
  15. file-store-root-windows-path: "Z:"
  16. # 命名空间关键字
  17. namespace: namespace
  18. # k8s ingress-controller 对外grpc port
  19. https-port: 31287
  20. # k8s 模型部署配置
  21. serving:
  22. # 在线服务模型部署后容器域名(k8s ingress域名),解析地址为k8s集群地址,如需测试需修改为合适的域名
  23. host: servingtest.dubhe.club
  24. # tls 证书 crt
  25. tls-crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURYekNDQWtlZ0F3SUJBZ0lKQUp3Z0VCYlhGdk9HTUEwR0NTcUdTSWIzRFFFQkN3VUFNRVl4SVRBZkJnTlYKQkFNTUdDb3VjMlZ5ZG1sdVozUmxjM1F1WkhWaWFHVXVZMngxWWpFaE1COEdBMVVFQ2d3WUtpNXpaWEoyYVc1bgpkR1Z6ZEM1a2RXSm9aUzVqYkhWaU1CNFhEVEl4TURVeE9UQTNNRGN5TTFvWERUTXhNRFV4TnpBM01EY3lNMW93ClJqRWhNQjhHQTFVRUF3d1lLaTV6WlhKMmFXNW5kR1Z6ZEM1a2RXSm9aUzVqYkhWaU1TRXdId1lEVlFRS0RCZ3EKTG5ObGNuWnBibWQwWlhOMExtUjFZbWhsTG1Oc2RXSXdnZ0VpTUEwR0NTcUdTSWIzRFFFQkFRVUFBNElCRHdBdwpnZ0VLQW9JQkFRREwxNDBMUXk2dXFHUi9WckVXeVNUdFppVGwvOEo1eGlHenB5WUJtTldUTzVQSTczTHRKbXdkCmFPMGNCNXhBcy96SXlpYjM3b0loR0FQejZNWlBBZjJEeUMxNXJLb3Via3h2Vkswa3hJY0k2L1dsWFUxMjdZZzQKdUllNzJEbFNIak9tbStVQ2JDWkxtc0VlQVI2S3RwNW9lVUFUQVgwOWEyL012ZkdtNU1URWZFSVplTENoL21rVAppZGFKcnl0NThwYVFLZmpJcVdKQnlnUGpJYlFMUlZnSG5mckIzdlB5b3RmMFBZZWVKVHlEa1lNQnNzblMxS044ClY2Qldvb2dzWUxQcCs2VE1wOXBHS3pBdksxdXJMWkFsa1N0SUw1TkppQWJOZ1lMOXN1UVE3OU9aano2c1ZrbWQKait6aHZHSGUrYk9zdlBrYmtqNS9lbzQyQUdQcW9zdTlBZ01CQUFHalVEQk9NQjBHQTFVZERnUVdCQlFzWDlCegpxVEhNN21xMUJrOFRNWXgzY1ZRaFB6QWZCZ05WSFNNRUdEQVdnQlFzWDlCenFUSE03bXExQms4VE1ZeDNjVlFoClB6QU1CZ05WSFJNRUJUQURBUUgvTUEwR0NTcUdTSWIzRFFFQkN3VUFBNElCQVFCc1BJSUpPMm1qUFNJV1lTaloKQmdjbFJoUTRudEkvdjFWczVjb0xOVzlrZUhqUGlCYlhvVzgzZExqSWh5ZmVYSk93Y3lpRnJkaVlZeVV5SEprTAprTmlxajRxVmZuaGkwcUoweDQyN3MzUG40TkZZRnNJYVM2OHA1WlZpUlJSZGNWY0dIMkNiRUxzYW41ZUhkNExYCnlTUUdDZmVkUVdtMEVyWWxDMGJWSER1d3RCa0NzWElPRmJEb0tSMW5wMDc3YlBrbEpIRS9rUi82QVpJR3hFM28KbDd1SDVRWWRsWldoanY5WFZRM1plVnZLUzVkM1pHYjcxZ0dwSEU2b2hmL1IweVhoS1RadUREYkhjK2lFc1dWbQowa24xenlvTUVHSW5nM0N6cVk4aE1tdHhRVStLekJrSEliVzh0UnkrS0MxZ2ZvNktFa01JVGpjK2pwQU4wZ2VDCm1LWWYKLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo=
  26. # tls 证书 key
  27. tls-key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2Z0lCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktnd2dnU2tBZ0VBQW9JQkFRREwxNDBMUXk2dXFHUi8KVnJFV3lTVHRaaVRsLzhKNXhpR3pweVlCbU5XVE81UEk3M0x0Sm13ZGFPMGNCNXhBcy96SXlpYjM3b0loR0FQego2TVpQQWYyRHlDMTVyS291Ymt4dlZLMGt4SWNJNi9XbFhVMTI3WWc0dUllNzJEbFNIak9tbStVQ2JDWkxtc0VlCkFSNkt0cDVvZVVBVEFYMDlhMi9NdmZHbTVNVEVmRUlaZUxDaC9ta1RpZGFKcnl0NThwYVFLZmpJcVdKQnlnUGoKSWJRTFJWZ0huZnJCM3ZQeW90ZjBQWWVlSlR5RGtZTUJzc25TMUtOOFY2Qldvb2dzWUxQcCs2VE1wOXBHS3pBdgpLMXVyTFpBbGtTdElMNU5KaUFiTmdZTDlzdVFRNzlPWmp6NnNWa21kait6aHZHSGUrYk9zdlBrYmtqNS9lbzQyCkFHUHFvc3U5QWdNQkFBRUNnZ0VCQUtJRG83WDA0dEN0UkpzMVV6RnlFK2lnUyttR201TGdHSDlLOWRsRkdWT1oKZ2IrdUdhRkY3WjUrZkI3UUthLy9ub1lVcmw1VzhwVE5HcTh0THhreTBzV0FRQ3k2UU5VQWs3M1ZCbnozdXdBcgpVeVhvLzUzcjgxNXB4SEJYems5bmM5UVRpalNmc3R4YWx1MTdKRVJLRzZPYjQ0SjNwNHcrclRDRk0rRmJhTTFsCkxuWUh2bGR4ZnRoMmhBWHdaU2dFYXdCNXRBbTY2YnIxbUc4ZW5saEIvbisrY09iYkZWYnk4OUcyUmVMMjJxdHIKa0R0RHVQOGY1NVdxNVB0a25TRE4zaGoweEYydG9aSC80RHpKOWVibVNMVHpUK1pQSjVjK1dlZ3BNdCtVd3YzbwpNNFdGQnFUUUZSR0MyQVJTaXVFTEdtRUJnVk50VDRmRVhRSmFveGJKeWYwQ2dZRUE2MDRJOXlDZUttY2hjSEJvCmRFeFJ2bnNhcGhwbkJqc3pHazZIWVhhTFI0NGpvalRCWm5ZWVhHYklreDdZaytla1RQYjMwRUpzQjlRRmxpVUUKQ25oZXhCS0pjTk5VcTdWdHFGTXBhT0JKWG9Md0cySmVwVmNhN1MzcFo3THZMWGRQR2wvOXFTTW5pMXROV1kxdwpmTGl1Zzlqb3JCeEJaL250NU43MDJFTWpaL2NDZ1lFQTNjVWV2U3U1VmlHaGNPSmFVSlBoUjdnclViMW9EWDF4CnROdDdXNS9pNkFyNDBsY1FZK2RCcll2VEljRllEL3piZnRWNGFnMDhsLzVHamxQK2grWW01QWNRb1c1Z3BaaHgKd1ZhTFFzRTg4ZUdPY2pLakVqd2krTEZRNE5HRmdXdU9pc1hZWk5GTlBvT1JDbHRUbkRMRnFjS0FNY2tBTEgwVwpRT2poTkJqeGhPc0NnWUFRaHRPVVgrTWNBVkJVOEdBMXd1Sis2WENPdUE0Q1h3Z1EwZkVxUkVRMkMyS3ZVdHMrCmtnN1Y5cFloMXluSkFaMEZsdGNDOXBkVjJXdG5CMFNJWTduc05ZMFhzcEFnMjBaUGF6L2VVTnAyVytYM1ZtcGwKWEgvVXBzUGM5N0ZhMVNWbUtkWE1HbDc5cDdVQUZESVJZSHRKWVdPK0t1SGhKcW14eUlNVDZXdEVNd0tCZ0R0ZAo4WVNpbDlLKzNnRGlGMXRLdXh2LzZWalFZM0o1Q2w5b0FmWGRMMWorMXUwMzhXTk5IUC9nVm56S3pWQTZXR2Z6CnJYQjJhcW9sbjYycVBwRVN3NFozZmJRNVlCWDBZVDlvYzQ5RE81VmsxRVV0MlFtZ241d0RtNnNUYTdIaG9SNzEKSjZDVmh5QWRDRTdGYy9SMGd3V1cwOHFBREZQY2lJQ0gxd0dqUzhSUkFvR0JBTklUNW5yTzNwZmNKYTROZWFZeApKMzd5RzFodUJGcWdRRWpVU3pmbHQ3UWwvNjI3LzNWT2lIRjVqSkhES0JqODBEdzh1UjhoQ2VhWDY2WkJqUmxKCk5vOUEvUm43Q2F4YUl2dkNjWG4rUW5oLy9BVlNHRzU4WWI2UUlQNFVqT1RKOW1GUlFFajN0cm9KNnVpL3FCZTQKUkhXSE44WC91Z242dzZUSGRWNktJbSs4Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K
  28. # elasticsearch暴露的服务地址
  29. elasticsearch:
  30. hostlist: ${eshostlist:10.5.26.90:30498}
  31. # 日志采集配置信息
  32. log:
  33. type: _doc
  34. # 过滤源字段
  35. source_field: log,@timestamp,kubernetes.pod_name
  36. # 异步回调
  37. callback:
  38. # boot 单机部署即回调本机实例
  39. url: localhost:${server.port}
  40. token:
  41. # 秘钥
  42. secret-key: 1qaz2wsx
  43. # 过期时间(秒)
  44. expire-seconds: 300
  45. # 展示Pod的CPU使用率,Memory使用量,GPU使用率的grafana地址
  46. pod:
  47. metrics:
  48. grafanaUrl: http://10.5.26.90:30006/d/job/monitor?orgId=1&refresh=5s&kiosk&var-pod=
  49. prometheus:
  50. url: http://10.5.26.90:30003/
  51. query: api/v1/query
  52. query-range: api/v1/query_range
  53. gpu-query-param: sum(DCGM_FI_DEV_GPU_UTIL{pod="pod-name-placeholder"})by(pod,UUID)
  54. gpu-mem-total-query-param: sum(DCGM_FI_DEV_FB_TOTAL_MEGABYTES{pod="pod-name-placeholder"})by(pod,UUID)
  55. gpu-mem-use-query-param: sum(DCGM_FI_DEV_FB_USED{pod="pod-name-placeholder"})by(pod,UUID)
  56. cpu-range-query-param: sum(rate(container_cpu_usage_seconds_total{image!="",pod="pod-name-placeholder"}[1m])) by (pod) / (sum(container_spec_cpu_quota{image!=""}/100000) by (pod)) * 100
  57. mem-range-query-param: sum(container_memory_rss{image!="",pod="pod-name-placeholder"})
  58. gpu-usage-query-param: sum by(Hostname,gpu)(DCGM_FI_DEV_GPU_UTIL{Hostname="node-name-placeholder",pod!=""})
  59. gpu-range-query-param: sum(DCGM_FI_DEV_GPU_UTIL{pod="pod-name-placeholder"}) by (pod,UUID)
  60. gpu-mem-total-range-query-param: sum(DCGM_FI_DEV_FB_TOTAL_MEGABYTES{pod="pod-name-placeholder"}) by (pod,UUID)
  61. gpu-mem-use-range-query-param: sum(DCGM_FI_DEV_FB_USED{pod="pod-name-placeholder"}) by (pod,UUID)
  62. gpu-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:DCGM_FI_DEV_GPU_UTIL:sumn[usage-rate-day])))
  63. cpu-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:container_cpu_user_seconds_total_sumn:raten[usage-rate-day])))
  64. mem-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:CONTAINER_MEMERY_USAGE_BYTES:sumn{namespace=~"namespace.*"}[usage-rate-day])))
  65. gpu-usage-namespace-query-param: max_over_time(namespace:DCGM_FI_DEV_GPU_UTIL:sumn{namespace=~"namespace-placeholder"}[usage-rate-day])
  66. cpu-usage-namespace-query-param: max_over_time(namespace:container_cpu_user_seconds_total_sumn:raten{namespace=~"namespace-placeholder"}[usage-rate-day])
  67. mem-usage-namespace-query-param: max_over_time(namespace:CONTAINER_MEMERY_USAGE_BYTES:sumn{namespace=~"namespace-placeholder"}[usage-rate-day])
  68. nfs-storage-class-name: zjlab-nfs-storage
  69. namespace-limits:
  70. cpu: 10
  71. memory: 32
  72. gpu: 2
  73. # minio配置
  74. minio:
  75. url: http://10.5.26.90:9000/
  76. accessKey: admin
  77. secretKey: 123@abc.com
  78. bucketName: dubhe-cloud-test
  79. presignedUrlExpiryTime: 300
  80. annotation: /annotation/
  81. docker:
  82. remote-api-port: 2375
  83. #配置harbor
  84. harbor:
  85. address: harbor.dubhe.ai
  86. username: admin
  87. password: Harbor12345

一站式算法开发平台、高性能分布式深度学习框架、先进算法模型库、视觉模型炼知平台、数据可视化分析平台等一系列平台及工具,在模型高效分布式训练、数据处理和可视分析、模型炼知和轻量化等技术上形成独特优势,目前已在产学研等各领域近千家单位及个人提供AI应用赋能