You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

common-k8s.yaml 3.2 kB

3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  # Training-job settings.
  # NOTE(review): this stanza was rebuilt from a listing whose indentation had
  # been lost; the nesting below is the only reading in which `train-job` has a
  # value, but confirm it against the consuming application.
  train-job:
    # Dataset path; judging by the key name this is the path seen inside the
    # training container — TODO confirm.
    docker-dataset-path: "/dataset"
  # Kubernetes integration settings: storage endpoints, ingress, model serving,
  # log search, callbacks, monitoring and per-namespace resource limits.
  # NOTE(review): this stanza was rebuilt from a listing whose indentation had
  # been lost. The nesting is inferred from the keys and their comments —
  # verify every property path against the consuming application before use.
  k8s:
    # Kubernetes cluster config file (kubeconfig)
    kubeconfig: kubeconfig_test
    # IP address exposed by the NFS service; change to a suitable address for testing
    nfs: 127.0.0.1
    # Shared directory on the NFS server
    nfs-root-path: /nfs/
    nfs-root-windows-path: "Z:"
    # IP address exposed by the file-storage service; change to a suitable address for testing
    file-store: 127.0.0.1
    # Shared directory on the file-storage server
    file-store-root-path: /nfs/
    file-store-root-windows-path: "Z:"
    # Namespace keyword
    namespace: namespace
    # Kubernetes ingress domain; change to a suitable domain for testing.
    # Note: wildcard DNS for this domain must resolve to the IP of the
    # cluster's master node.
    host: notebooktest.dubhe.club
    # Externally exposed port of the ingress controller
    port: 30865
    # Externally exposed gRPC port of the ingress controller
    # NOTE(review): the key is named `https-port` although the original comment
    # describes a gRPC port — confirm which is intended.
    https-port: 31287
    # Model-serving (deployment) configuration
    serving:
      # Model-serving domain; change to a suitable domain for testing
      host: servingtest.dubhe.club
      # TLS certificate (crt); no value is set here, so it parses as YAML null
      tls-crt:
      # TLS certificate key; no value is set here, so it parses as YAML null
      tls-key:
    # Address of the exposed Elasticsearch service
    elasticsearch:
      # Placeholder: uses `eshostlist` when defined, else the default after the
      # first colon — presumably resolved by the consuming framework; confirm.
      hostlist: ${eshostlist:127.0.0.1:30498}
      # Log collection settings
      log:
        type: _doc
        # Source fields to filter (comma-separated)
        source_field: log,@timestamp,kubernetes.pod_name
    # Asynchronous callback
    callback:
      # With a single-node boot deployment, callbacks go to the local instance
      url: localhost:${server.port}
      token:
        # Secret key
        # NOTE(review): secret committed in plain text — consider supplying it
        # from the environment or a secret store instead.
        secret-key: 1qaz2wsx
        # Expiry time (seconds)
        expire-seconds: 300
    # Grafana URL that displays a Pod's CPU usage, memory usage and GPU utilization
    pod:
      metrics:
        grafanaUrl: http://127.0.0.1:30006/d/job/monitor?orgId=1&refresh=5s&kiosk&var-pod=
    # Prometheus endpoint and query templates; "pod-name-placeholder" appears in
    # each query where a pod name is expected — presumably substituted at
    # runtime; confirm against the caller.
    prometheus:
      url: http://127.0.0.1:30003/
      query: api/v1/query
      query-range: api/v1/query_range
      gpu-query-param: sum(container_accelerator_duty_cycle{pod="pod-name-placeholder"})by(pod,acc_id)
      gpu-mem-total-query-param: sum(container_accelerator_memory_total_bytes{pod="pod-name-placeholder"})by(pod,acc_id)
      gpu-mem-use-query-param: sum(container_accelerator_memory_used_bytes{pod="pod-name-placeholder"})by(pod,acc_id)
      cpu-range-query-param: sum(rate(container_cpu_usage_seconds_total{image!="",pod="pod-name-placeholder"}[1m])) by (pod) / (sum(container_spec_cpu_quota{image!=""}/100000) by (pod)) * 100
      mem-range-query-param: sum(container_memory_rss{image!="",pod="pod-name-placeholder"})
      gpu-range-query-param: sum(container_accelerator_duty_cycle{pod="pod-name-placeholder"}) by (pod,acc_id)
      gpu-mem-total-range-query-param: sum(container_accelerator_memory_total_bytes{pod="pod-name-placeholder"}) by (pod,acc_id)
      gpu-mem-use-range-query-param: sum(container_accelerator_memory_used_bytes{pod="pod-name-placeholder"}) by (pod,acc_id)
    # Name of the NFS-backed StorageClass (per the key name)
    nfs-storage-class-name: zjlab-nfs-storage
    # Per-namespace resource limits
    # NOTE(review): units are not stated here (memory is presumably GiB) — confirm.
    namespace-limits:
      cpu: 10
      memory: 32
      gpu: 2
  # Harbor configuration
  # NOTE(review): this stanza was rebuilt from a listing whose indentation had
  # been lost; `model-name` is assumed to belong to `harbor` — confirm against
  # the consuming application.
  harbor:
    address: harbor.dubhe.ai
    username: admin
    # NOTE(review): credential committed in plain text — consider supplying it
    # from the environment or a secret store instead.
    password: Harbor12345
    model-name: train
  # MinIO configuration
  # NOTE(review): this stanza was rebuilt from a listing whose indentation had
  # been lost; all keys below are assumed to belong to `minio` — confirm
  # against the consuming application.
  minio:
    url: http://127.0.0.1:9000/
    accessKey: admin
    # NOTE(review): credential committed in plain text — consider supplying it
    # from the environment or a secret store instead.
    secretKey: 123@abc.com
    bucketName: dubhe-cloud-test
    # Presigned-URL expiry; unit is presumably seconds — TODO confirm.
    presignedUrlExpiryTime: 300
    annotation: /annotation/
  # Docker settings.
  # NOTE(review): this stanza was rebuilt from a listing whose indentation had
  # been lost; confirm the nesting against the consuming application.
  docker:
    # Port of the Docker remote API (per the key name)
    remote-api-port: 2375

一站式算法开发平台、高性能分布式深度学习框架、先进算法模型库、视觉模型炼知平台、数据可视化分析平台等一系列平台及工具,在模型高效分布式训练、数据处理和可视分析、模型炼知和轻量化等技术上形成独特优势,目前已为产学研等各领域近千家单位及个人提供AI应用赋能。