From 7c2b7f702c3df99be0d3f60d92c67a291e8b6d7c Mon Sep 17 00:00:00 2001 From: Andrei Stoicescu <astoicescu@gitlab.com> Date: Thu, 5 Mar 2020 15:06:47 +0200 Subject: [PATCH] Add system metrics block to top of config file --- config/prometheus/common_metrics.yml | 144 +++++++++++++-------------- 1 file changed, 68 insertions(+), 76 deletions(-) diff --git a/config/prometheus/common_metrics.yml b/config/prometheus/common_metrics.yml index aa739614c9df2..85833cc19689f 100644 --- a/config/prometheus/common_metrics.yml +++ b/config/prometheus/common_metrics.yml @@ -1,6 +1,74 @@ dashboard: 'Environment metrics' priority: 1 panel_groups: +- group: System metrics (Kubernetes) + priority: 15 + panels: + - title: "Memory Usage (Total)" + type: "area-chart" + y_label: "Total Memory Used (GB)" + weight: 4 + metrics: + - id: system_metrics_kubernetes_container_memory_total + query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) /1024/1024/1024' + label: Total (GB) + unit: GB + - title: "Core Usage (Total)" + type: "area-chart" + y_label: "Total Cores" + weight: 3 + metrics: + - id: system_metrics_kubernetes_container_cores_total + query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job)' + label: Total (cores) + unit: "cores" + - title: "Memory Usage (Pod average)" + type: "line-chart" + y_label: "Memory Used per Pod (MB)" + weight: 2 + metrics: + - id: system_metrics_kubernetes_container_memory_average + query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024' + label: Pod average (MB) + unit: MB + - title: "Canary: Memory Usage (Pod Average)" + type: "line-chart" + y_label: "Memory Used per Pod (MB)" + weight: 2 + metrics: + - id: system_metrics_kubernetes_container_memory_average_canary + query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024' + label: Pod average (MB) + unit: MB + track: canary + - title: "Core Usage (Pod Average)" + type: "line-chart" + y_label: "Cores per Pod" + weight: 1 + metrics: + - id: system_metrics_kubernetes_container_core_usage + query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))' + label: Pod average (cores) + unit: "cores" + - title: "Canary: Core Usage (Pod Average)" + type: "line-chart" + y_label: "Cores per Pod" + weight: 1 + metrics: + - id: system_metrics_kubernetes_container_core_usage_canary + query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))' + label: Pod average (cores) + unit: "cores" + track: canary + - title: "Knative function invocations" + type: "area-chart" + y_label: "Invocations" + weight: 1 + metrics: + - id: system_metrics_knative_function_invocation_count + query_range: 'sum(ceil(rate(istio_requests_total{destination_service_namespace="%{kube_namespace}", destination_service=~"%{function_name}.*"}[1m])*60))' + label: invocations / minute + unit: requests # NGINX Ingress metrics for pre-0.16.0 versions - group: Response metrics (NGINX Ingress VTS) priority: 10 @@ -150,79 +218,3 @@ panel_groups: query_range: 'sum(rate(nginx_server_requests{code="5xx", %{environment_filter}}[2m]))' label: HTTP Errors unit: "errors / sec" -- group: System metrics (Kubernetes) - priority: 15 - panels: - - title: "Memory Usage (Total)" - type: "area-chart" - y_label: "Total Memory Used (GB)" - y_axis: - format: "gibibytes" - weight: 4 - metrics: - - id: system_metrics_kubernetes_container_memory_total - query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) /1024/1024/1024' - label: Total (GB) - unit: GB - - title: "Core Usage (Total)" - type: "area-chart" - y_label: "Total Cores" - weight: 3 - metrics: - - id: system_metrics_kubernetes_container_cores_total - query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job)' - label: Total (cores) - unit: "cores" - - title: "Memory Usage (Pod average)" - type: "line-chart" - y_label: "Memory Used per Pod (MB)" - y_axis: - format: "mebibytes" - weight: 2 - metrics: - - id: system_metrics_kubernetes_container_memory_average - query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024' - label: Pod average (MB) - unit: MB - - title: "Canary: Memory Usage (Pod Average)" - type: "line-chart" - y_label: "Memory Used per Pod (MB)" - y_axis: - format: "mebibytes" - weight: 2 - metrics: - - id: system_metrics_kubernetes_container_memory_average_canary - query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024' - label: Pod average (MB) - unit: MB - track: canary - - title: "Core Usage (Pod Average)" - type: "line-chart" - y_label: "Cores per Pod" - weight: 1 - metrics: - - id: system_metrics_kubernetes_container_core_usage - query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))' - label: Pod average (cores) - unit: "cores" - - title: "Canary: Core Usage (Pod Average)" - type: "line-chart" - y_label: "Cores per Pod" - weight: 1 - metrics: - - id: system_metrics_kubernetes_container_core_usage_canary - query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))' - label: Pod average (cores) - unit: "cores" - track: canary - - title: "Knative function invocations" - type: "area-chart" - y_label: "Invocations" - y_axis: - precision: 0 - weight: 1 - metrics: - - id: system_metrics_knative_function_invocation_count - query_range: 'sum(ceil(rate(istio_requests_total{destination_service_namespace="%{kube_namespace}", destination_service=~"%{function_name}.*"}[1m])*60))' - label: invocations / minute - unit: requests -- GitLab