From e169f71c4de1b27190498c309cdba25929e1b2bd Mon Sep 17 00:00:00 2001 From: Hordur Freyr Yngvason <hfyngvason@gitlab.com> Date: Mon, 17 Jul 2023 12:03:56 -0400 Subject: [PATCH] Change build queuing metrics FF to ops type This flag has been enabled for a long time on GitLab.com, and the metrics are considered production-ready, but they are undocumented and stuck behind a `:development` flag, making them largely inaccessible to self-managed customers. To address those issues, we are changing this flag to an `:ops` type flag and adding documentation. The flag remains disabled by default to avoid adding unexpected load on customers' Prometheus instances. See https://gitlab.com/gitlab-org/gitlab/-/issues/350888 Changelog: changed --- .../gitlab_ci_builds_queuing_metrics.yml | 2 +- .../monitoring/prometheus/gitlab_metrics.md | 17 +++++ lib/gitlab/ci/queue/metrics.rb | 10 +-- spec/lib/gitlab/ci/queue/metrics_spec.rb | 71 +++++++++++++++++++ 4 files changed, 94 insertions(+), 6 deletions(-) rename config/feature_flags/{development => ops}/gitlab_ci_builds_queuing_metrics.yml (93%) create mode 100644 spec/lib/gitlab/ci/queue/metrics_spec.rb diff --git a/config/feature_flags/development/gitlab_ci_builds_queuing_metrics.yml b/config/feature_flags/ops/gitlab_ci_builds_queuing_metrics.yml similarity index 93% rename from config/feature_flags/development/gitlab_ci_builds_queuing_metrics.yml rename to config/feature_flags/ops/gitlab_ci_builds_queuing_metrics.yml index d23cc852d4915..2838a056063e9 100644 --- a/config/feature_flags/development/gitlab_ci_builds_queuing_metrics.yml +++ b/config/feature_flags/ops/gitlab_ci_builds_queuing_metrics.yml @@ -3,6 +3,6 @@ name: gitlab_ci_builds_queuing_metrics introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/54909 rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/350888 milestone: '13.10' -type: development +type: ops group: group::pipeline execution default_enabled: false diff --git 
a/doc/administration/monitoring/prometheus/gitlab_metrics.md b/doc/administration/monitoring/prometheus/gitlab_metrics.md index 061b1921e4fa1..c4731e46a659b 100644 --- a/doc/administration/monitoring/prometheus/gitlab_metrics.md +++ b/doc/administration/monitoring/prometheus/gitlab_metrics.md @@ -170,6 +170,16 @@ The following metrics are available: | `gitlab_sli_rails_request_apdex_total` | Counter | 14.4 | Total number of request Apdex measurements. For more information, see [Rails request SLIs](../../../development/application_slis/rails_request.md) | `endpoint_id`, `feature_category`, `request_urgency` | | `gitlab_sli_rails_request_apdex_success_total` | Counter | 14.4 | Total number of successful requests that met the target duration for their urgency. Divide by `gitlab_sli_rails_requests_apdex_total` to get a success ratio | `endpoint_id`, `feature_category`, `request_urgency` | | `gitlab_sli_rails_request_error_total` | Counter | 15.7 | Total number of request error measurements. 
For more information, see [Rails request SLIs](../../../development/application_slis/rails_request.md) | `endpoint_id`, `feature_category`, `request_urgency`, `error` | +| `job_register_attempts_failed_total` | Counter | 9.5 | Counts the times a runner fails to register a job | +| `job_register_attempts_total` | Counter | 9.5 | Counts the times a runner tries to register a job | +| `job_queue_duration_seconds` | Histogram | 9.5 | Request handling execution time | +| `gitlab_ci_queue_operations_total` | Counter | 16.3 | Counts all the operations that are happening inside a queue | +| `gitlab_ci_queue_depth_total` | Histogram | 16.3 | Size of a CI/CD builds queue in relation to the operation result | +| `gitlab_ci_queue_size_total` | Histogram | 16.3 | Size of initialized CI/CD builds queue | +| `gitlab_ci_current_queue_size` | Gauge | 16.3 | Current size of initialized CI/CD builds queue | +| `gitlab_ci_queue_iteration_duration_seconds` | Histogram | 16.3 | Time it takes to find a build in CI/CD queue | +| `gitlab_ci_queue_retrieval_duration_seconds` | Histogram | 16.3 | Time it takes to execute a SQL query to retrieve builds queue | +| `gitlab_ci_queue_active_runners_total` | Histogram | 16.3 | The amount of active runners that can process queue in a project | ## Metrics controlled by a feature flag @@ -178,6 +188,13 @@ The following metrics can be controlled by feature flags: | Metric | Feature flag | |:---------------------------------------------------------------|:-------------------------------------------------------------------| | `gitlab_view_rendering_duration_seconds` | `prometheus_metrics_view_instrumentation` | +| `gitlab_ci_queue_depth_total` | `gitlab_ci_builds_queuing_metrics` | +| `gitlab_ci_queue_size` | `gitlab_ci_builds_queuing_metrics` | +| `gitlab_ci_queue_size_total` | `gitlab_ci_builds_queuing_metrics` | +| `gitlab_ci_queue_iteration_duration_seconds` | `gitlab_ci_builds_queuing_metrics` | +| `gitlab_ci_current_queue_size` | 
`gitlab_ci_builds_queuing_metrics` | +| `gitlab_ci_queue_retrieval_duration_seconds` | `gitlab_ci_builds_queuing_metrics` | +| `gitlab_ci_queue_active_runners_total` | `gitlab_ci_builds_queuing_metrics` | ## Praefect metrics diff --git a/lib/gitlab/ci/queue/metrics.rb b/lib/gitlab/ci/queue/metrics.rb index 5cee73238ca63..a18542288c9fc 100644 --- a/lib/gitlab/ci/queue/metrics.rb +++ b/lib/gitlab/ci/queue/metrics.rb @@ -74,7 +74,7 @@ def increment_queue_operation(operation) end def observe_queue_depth(queue, size) - return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics) + return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, type: :ops) if !Rails.env.production? && !QUEUE_DEPTH_HISTOGRAMS.include?(queue) raise ArgumentError, "unknown queue depth label: #{queue}" @@ -84,7 +84,7 @@ def observe_queue_depth(queue, size) end def observe_queue_size(size_proc, runner_type) - return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics) + return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, type: :ops) size = size_proc.call.to_f self.class.queue_size_total.observe({ runner_type: runner_type }, size) @@ -96,7 +96,7 @@ def observe_queue_time(metric, runner_type) result = yield - return result unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics) + return result unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, type: :ops) seconds = ::Gitlab::Metrics::System.monotonic_time - start_time @@ -121,7 +121,7 @@ def self.increment_queue_operation(operation) end def self.observe_active_runners(runners_proc) - return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics) + return unless Feature.enabled?(:gitlab_ci_builds_queuing_metrics, type: :ops) queue_active_runners_total.observe({}, runners_proc.call.to_f) end @@ -133,7 +133,7 @@ def self.increment_runner_tick(runner) def self.failed_attempt_counter strong_memoize(:failed_attempt_counter) do name = :job_register_attempts_failed_total - comment = 'Counts the times a runner 
tries to register a job' + comment = 'Counts the times a runner fails to register a job' Gitlab::Metrics.counter(name, comment) end diff --git a/spec/lib/gitlab/ci/queue/metrics_spec.rb b/spec/lib/gitlab/ci/queue/metrics_spec.rb new file mode 100644 index 0000000000000..2fb4226ba5ad1 --- /dev/null +++ b/spec/lib/gitlab/ci/queue/metrics_spec.rb @@ -0,0 +1,71 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::Ci::Queue::Metrics, feature_category: :continuous_integration do + let(:metrics) { described_class.new(build(:ci_runner)) } + + describe '#observe_queue_depth' do + subject { metrics.observe_queue_depth(:found, 1) } + + it { is_expected.not_to be_nil } + + context 'with feature flag gitlab_ci_builds_queueing_metrics disabled' do + before do + stub_feature_flags(gitlab_ci_builds_queuing_metrics: false) + end + + it { is_expected.to be_nil } + end + end + + describe '#observe_queue_size' do + subject { metrics.observe_queue_size(-> { 0 }, :some_runner_type) } + + it { is_expected.not_to be_nil } + + context 'with feature flag gitlab_ci_builds_queueing_metrics disabled' do + before do + stub_feature_flags(gitlab_ci_builds_queuing_metrics: false) + end + + it { is_expected.to be_nil } + end + end + + describe '#observe_queue_time' do + subject { metrics.observe_queue_time(:process, :some_runner_type) { 1 } } + + specify do + expect(described_class).to receive(:queue_iteration_duration_seconds).and_call_original + + subject + end + + context 'with feature flag gitlab_ci_builds_queueing_metrics disabled' do + before do + stub_feature_flags(gitlab_ci_builds_queuing_metrics: false) + end + + specify do + expect(described_class).not_to receive(:queue_iteration_duration_seconds) + + subject + end + end + + describe '.observe_active_runners' do + subject { described_class.observe_active_runners(-> { 0 }) } + + it { is_expected.not_to be_nil } + + context 'with feature flag gitlab_ci_builds_queueing_metrics disabled' do + before do + 
stub_feature_flags(gitlab_ci_builds_queuing_metrics: false) + end + + it { is_expected.to be_nil } + end + end + end +end -- GitLab