You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

common-k8s.yaml 2.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  # Shared settings for training jobs, Kubernetes cluster access, log and
  # metrics lookups, the Harbor registry and MinIO object storage.
  #
  # NOTE(review): this text was recovered from a rendering that had flattened
  # all indentation and prefixed every line with a viewer line number. The
  # nesting below is reconstructed from the key names and the comments that
  # accompany them -- confirm it against the property paths the application
  # actually reads before relying on it.
  #
  # NOTE(review): secret-key, harbor.password and minio.secretKey are committed
  # in plain text. Prefer overriding them from the environment or a secret
  # store in real deployments.
  train-job:
    docker-dataset-path: "/dataset"

  k8s:
    # Kubernetes cluster config file
    kubeconfig: kubeconfig_test
    # IP address exposed by the NFS service; change it to a suitable address
    # when testing
    nfs: 127.0.0.1
    # Shared directory on the NFS server
    nfs-root-path: /nfs/
    nfs-root-windows-path: "Z:"
    # IP address exposed by the file-store service; change it to a suitable
    # address when testing
    file-store: 127.0.0.1
    # Shared directory on the file-store server
    file-store-root-path: /nfs/
    file-store-root-windows-path: "Z:"
    # Namespace keyword
    namespace: namespace
    # Kubernetes ingress domain; change it to a suitable domain when testing.
    # Note: wildcard DNS for this domain must resolve to the IP of the
    # cluster's master node.
    host: notebooktest.dubhe.club
    # External port of the Kubernetes ingress controller
    port: 30865
    # External gRPC port of the Kubernetes ingress controller
    https-port: 31287
    # Model deployment (serving) settings
    serving:
      # Model deployment domain; change it to a suitable domain when testing
      host: servingtest.dubhe.club
      # NOTE(review): both TLS entries are left without a value, which plain
      # YAML loads as null. Write "" if the consumer expects an empty string.
      # TLS certificate (crt)
      tls-crt:
      # TLS certificate key
      tls-key:
    # Service address exposed by Elasticsearch
    elasticsearch:
      # Placeholder with a default: "eshostlist", else 127.0.0.1:30498
      hostlist: '${eshostlist:127.0.0.1:30498}'
      # Log collection settings
      log:
        type: _doc
        # Source fields to filter on
        source_field: 'log,@timestamp,kubernetes.pod_name'
    # Asynchronous callback
    callback:
      # In a single-node boot deployment the callback targets the local
      # instance
      url: 'localhost:${server.port}'
      token:
        # Secret key
        secret-key: '1qaz2wsx'
        # Expiry time (seconds)
        expire-seconds: 300
    # Grafana URL that shows a pod's CPU usage, memory usage and GPU usage.
    # The value ends with "var-pod=", presumably so that the caller can append
    # the pod name -- verify against the consumer.
    pod:
      metrics:
        grafanaUrl: 'http://127.0.0.1:30006/d/job/monitor?orgId=1&refresh=5s&kiosk&var-pod='
    prometheus:
      url: 'http://127.0.0.1:30003/'
      query: api/v1/query
      query-range: api/v1/query_range
      # The query templates below contain the literal "pod-name-placeholder";
      # presumably the consumer substitutes the real pod name -- TODO confirm.
      gpu-query-param: 'sum(container_accelerator_duty_cycle{pod="pod-name-placeholder"})by(pod,acc_id)'
      cpu-range-query-param: 'sum(rate(container_cpu_usage_seconds_total{image!="",pod="pod-name-placeholder"}[1m])) by (pod) / (sum(container_spec_cpu_quota{image!=""}/100000) by (pod)) * 100'
      mem-range-query-param: 'sum(container_memory_rss{image!="",pod="pod-name-placeholder"})'
      gpu-range-query-param: 'sum(container_accelerator_duty_cycle{pod="pod-name-placeholder"}) by (pod,acc_id)'
    nfs-storage-class-name: zjlab-nfs-storage
    # NOTE(review): units are not stated here (memory looks like GiB) --
    # confirm with the consumer.
    namespace-limits:
      cpu: 10
      memory: 32
      gpu: 2

  # Harbor settings
  harbor:
    address: harbor.dubhe.ai
    username: admin
    password: 'Harbor12345'
    model-name: train

  # MinIO settings
  minio:
    url: 'http://127.0.0.1:9000/'
    accessKey: admin
    secretKey: '123@abc.com'
    bucketName: dubhe-cloud-test
    presignedUrlExpiryTime: 300
    annotation: /annotation/

一站式算法开发平台、高性能分布式深度学习框架、先进算法模型库、视觉模型炼知平台、数据可视化分析平台等一系列平台及工具,在模型高效分布式训练、数据处理和可视分析、模型炼知和轻量化等技术上形成独特优势,目前已为产学研等各领域近千家单位及个人提供AI应用赋能。