diff --git a/dubhe-server/yaml/common-k8s.yaml b/dubhe-server/yaml/common-k8s.yaml index de86f6c..a90e54b 100644 --- a/dubhe-server/yaml/common-k8s.yaml +++ b/dubhe-server/yaml/common-k8s.yaml @@ -50,9 +50,9 @@ k8s: # 展示Pod的CPU使用率,Memory使用量,GPU使用率的grafana地址 pod: metrics: - grafanaUrl: http://127.0.0.1:30006/d/job/monitor?orgId=1&refresh=5s&kiosk&var-pod= + grafanaUrl: http://grafana.dubhe.ai:30006/d/job/monitor?orgId=1&refresh=5s&kiosk&var-pod= prometheus: - url: http://127.0.0.1:30003/ + url: http://10.5.26.91:30003/ query: api/v1/query query-range: api/v1/query_range gpu-query-param: sum(container_accelerator_duty_cycle{pod="pod-name-placeholder"})by(pod,acc_id) @@ -60,9 +60,16 @@ k8s: gpu-mem-use-query-param: sum(container_accelerator_memory_used_bytes{pod="pod-name-placeholder"})by(pod,acc_id) cpu-range-query-param: sum(rate(container_cpu_usage_seconds_total{image!="",pod="pod-name-placeholder"}[1m])) by (pod) / (sum(container_spec_cpu_quota{image!=""}/100000) by (pod)) * 100 mem-range-query-param: sum(container_memory_rss{image!="",pod="pod-name-placeholder"}) + gpu-usage-query-param: sum by(Hostname,gpu)(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod!=""}) gpu-range-query-param: sum(container_accelerator_duty_cycle{pod="pod-name-placeholder"}) by (pod,acc_id) gpu-mem-total-range-query-param: sum(container_accelerator_memory_total_bytes{pod="pod-name-placeholder"}) by (pod,acc_id) gpu-mem-use-range-query-param: sum(container_accelerator_memory_used_bytes{pod="pod-name-placeholder"}) by (pod,acc_id) + gpu-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:DCGM_FI_PROF_GR_ENGINE_ACTIVE:sumn[usage-rate-day]))) + cpu-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:container_cpu_user_seconds_total_sumn:raten[usage-rate-day]))) + mem-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:CONTAINER_MEMERY_USAGE_BYTES:sumn{namespace=~"namespace.*"}[usage-rate-day]))) + gpu-usage-namespace-query-param: max_over_time(namespace:DCGM_FI_PROF_GR_ENGINE_ACTIVE:sumn{namespace=~"namespace-placeholder"}[usage-rate-day]) + cpu-usage-namespace-query-param: max_over_time(namespace:container_cpu_user_seconds_total_sumn:raten{namespace=~"namespace-placeholder"}[usage-rate-day]) + mem-usage-namespace-query-param: max_over_time(namespace:CONTAINER_MEMERY_USAGE_BYTES:sumn{namespace=~"namespace-placeholder"}[usage-rate-day]) nfs-storage-class-name: zjlab-nfs-storage namespace-limits: cpu: 10