From ec7a853eb7de143a0f3198128606d18176b0b67b Mon Sep 17 00:00:00 2001
From: rpereira2 <rpereira@gitlab.com>
Date: Tue, 14 Jul 2020 23:00:24 +0530
Subject: [PATCH] Use the pod and container labels

- K8s 1.14 introduced the pod and container labels and deprecated the
pod_name and container_name labels. K8s 1.16 removes the pod_name and
container_name labels.

- Metrics from K8s versions before 1.14 contain pod_name and
container_name labels. Metrics from K8s 1.14/1.15 contain pod, pod_name,
container and container_name labels. Metrics from K8s 1.16 onwards
contain pod and container labels.

- According to our docs, we need to support K8s 1.12 to 1.16.

- This commit changes existing queries to use pod instead of pod_name
and container instead of container_name. These changed queries should
work on K8s 1.14 onwards.

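- This commit also adds a second query as a fallback, joined to the
first with the PromQL `OR` operator. The second query uses pod_name
instead of pod, and container_name instead of container, so it should
work on K8s 1.12 to 1.15. `OR` only adds right-hand series whose label
sets are absent from the left-hand result, and both queries aggregate
to the same label set, so on K8s 1.14/1.15 (where both queries return
data) the panel should still show a single series.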
---
 .../unreleased/216022-use-pod-label.yml       |  5 ++++
 config/prometheus/common_metrics.yml          | 24 ++++++++++++++-----
 2 files changed, 23 insertions(+), 6 deletions(-)
 create mode 100644 changelogs/unreleased/216022-use-pod-label.yml

diff --git a/changelogs/unreleased/216022-use-pod-label.yml b/changelogs/unreleased/216022-use-pod-label.yml
new file mode 100644
index 0000000000000..76ad5bc8430d6
--- /dev/null
+++ b/changelogs/unreleased/216022-use-pod-label.yml
@@ -0,0 +1,5 @@
+---
+title: 'Fix the default metrics dashboard to work on K8s versions 1.12 to 1.16'
+merge_request: 36863
+author:
+type: fixed
diff --git a/config/prometheus/common_metrics.yml b/config/prometheus/common_metrics.yml
index f0491df3db9ee..d9aaff12a4d24 100644
--- a/config/prometheus/common_metrics.yml
+++ b/config/prometheus/common_metrics.yml
@@ -10,7 +10,9 @@ panel_groups:
     weight: 4
     metrics:
     - id: system_metrics_kubernetes_container_memory_total
-      query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-(.*)",namespace="{{kube_namespace}}"}) by (job)) without (job)  /1024/1024/1024'
+      # Remove the second query (after OR) when we drop support for K8s < 1.14
+      # https://gitlab.com/gitlab-org/gitlab/-/issues/229279
+      query_range: 'avg(sum(container_memory_usage_bytes{container!="POD",pod=~"^{{ci_environment_slug}}-(.*)",namespace="{{kube_namespace}}"}) by (job)) without (job)  /1024/1024/1024     OR      avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-(.*)",namespace="{{kube_namespace}}"}) by (job)) without (job)  /1024/1024/1024'
       label: Total (GB)
       unit: GB
   - title: "Core Usage (Total)"
@@ -19,7 +21,9 @@ panel_groups:
     weight: 3
     metrics:
     - id: system_metrics_kubernetes_container_cores_total
-      query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-(.*)",namespace="{{kube_namespace}}"}[15m])) by (job)) without (job)'
+      # Remove the second query (after OR) when we drop support for K8s < 1.14
+      # https://gitlab.com/gitlab-org/gitlab/-/issues/229279
+      query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container!="POD",pod=~"^{{ci_environment_slug}}-(.*)",namespace="{{kube_namespace}}"}[15m])) by (job)) without (job)     OR      avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-(.*)",namespace="{{kube_namespace}}"}[15m])) by (job)) without (job)'
       label: Total (cores)
       unit: "cores"
   - title: "Memory Usage (Pod average)"
@@ -28,7 +32,9 @@ panel_groups:
     weight: 2
     metrics:
     - id: system_metrics_kubernetes_container_memory_average
-      query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}) without (job)) /1024/1024'
+      # Remove the second query (after OR) when we drop support for K8s < 1.14
+      # https://gitlab.com/gitlab-org/gitlab/-/issues/229279
+      query_range: 'avg(sum(container_memory_usage_bytes{container!="POD",pod=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container!="POD",pod=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}) without (job)) /1024/1024     OR      avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}) without (job)) /1024/1024'
       label: Pod average (MB)
       unit: MB
   - title: "Canary: Memory Usage (Pod Average)"
@@ -37,7 +43,9 @@ panel_groups:
     weight: 2
     metrics:
     - id: system_metrics_kubernetes_container_memory_average_canary
-      query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}) without (job)) /1024/1024'
+      # Remove the second query (after OR) when we drop support for K8s < 1.14
+      # https://gitlab.com/gitlab-org/gitlab/-/issues/229279
+      query_range: 'avg(sum(container_memory_usage_bytes{container!="POD",pod=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container!="POD",pod=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}) without (job)) /1024/1024     OR      avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}) without (job)) /1024/1024'
       label: Pod average (MB)
       unit: MB
       track: canary
@@ -47,7 +55,9 @@ panel_groups:
     weight: 1
     metrics:
     - id: system_metrics_kubernetes_container_core_usage
-      query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}[15m])) by (pod_name))'
+      # Remove the second query (after OR) when we drop support for K8s < 1.14
+      # https://gitlab.com/gitlab-org/gitlab/-/issues/229279
+      query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container!="POD",pod=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container!="POD",pod=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}[15m])) by (pod))     OR      avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="{{kube_namespace}}"}[15m])) by (pod_name))'
       label: Pod average (cores)
       unit: "cores"
   - title: "Canary: Core Usage (Pod Average)"
@@ -56,7 +66,9 @@ panel_groups:
     weight: 1
     metrics:
     - id: system_metrics_kubernetes_container_core_usage_canary
-      query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}[15m])) by (pod_name))'
+      # Remove the second query (after OR) when we drop support for K8s < 1.14
+      # https://gitlab.com/gitlab-org/gitlab/-/issues/229279
+      query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container!="POD",pod=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container!="POD",pod=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}[15m])) by (pod))     OR      avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^{{ci_environment_slug}}-canary-(.*)",namespace="{{kube_namespace}}"}[15m])) by (pod_name))'
       label: Pod average (cores)
       unit: "cores"
       track: canary
-- 
GitLab