From 1fddbb2778cf62e475d2413c8f34d74d3e08bef3 Mon Sep 17 00:00:00 2001 From: Pedro Pombeiro <noreply@pedro.pombei.ro> Date: Tue, 24 Jan 2023 12:22:24 +0000 Subject: [PATCH] Add cronjob to clean up stale runner machines Changelog: added --- .../runners/stale_machines_cleanup_service.rb | 33 ++++++++++++++ app/workers/all_queues.yml | 9 ++++ ...le_existing_runner_versions_cron_worker.rb | 2 +- .../stale_machines_cleanup_cron_worker.rb | 24 ++++++++++ app/workers/concerns/application_worker.rb | 5 +++ config/gitlab.yml.example | 4 ++ config/initializers/1_settings.rb | 3 ++ .../stale_group_runners_prune_cron_worker.rb | 2 +- spec/factories/ci/runner_machines.rb | 5 +++ spec/models/ci/runner_machine_spec.rb | 9 ++-- .../stale_machines_cleanup_service_spec.rb | 45 +++++++++++++++++++ ...stale_machines_cleanup_cron_worker_spec.rb | 42 +++++++++++++++++ .../concerns/application_worker_spec.rb | 9 ++++ 13 files changed, 186 insertions(+), 6 deletions(-) create mode 100644 app/services/ci/runners/stale_machines_cleanup_service.rb create mode 100644 app/workers/ci/runners/stale_machines_cleanup_cron_worker.rb create mode 100644 spec/services/ci/runners/stale_machines_cleanup_service_spec.rb create mode 100644 spec/workers/ci/runners/stale_machines_cleanup_cron_worker_spec.rb diff --git a/app/services/ci/runners/stale_machines_cleanup_service.rb b/app/services/ci/runners/stale_machines_cleanup_service.rb new file mode 100644 index 0000000000000..3e5706d24a6b9 --- /dev/null +++ b/app/services/ci/runners/stale_machines_cleanup_service.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +module Ci + module Runners + class StaleMachinesCleanupService + MAX_DELETIONS = 1000 + + def execute + ServiceResponse.success(payload: { + # the `stale` relationship can return duplicates, so we don't try to return a precise count here + deleted_machines: delete_stale_runner_machines > 0 + }) + end + + private + + def delete_stale_runner_machines + total_deleted_count = 0 + loop do + sub_batch_limit = [100, MAX_DELETIONS].min + + # delete_all discards part of the `stale` scope query, so we expliclitly wrap it with a SELECT as a workaround + deleted_count = Ci::RunnerMachine.id_in(Ci::RunnerMachine.stale.limit(sub_batch_limit)).delete_all + total_deleted_count += deleted_count + + break if deleted_count == 0 || total_deleted_count >= MAX_DELETIONS + end + + total_deleted_count + end + end + end +end diff --git a/app/workers/all_queues.yml b/app/workers/all_queues.yml index 208d878ebc528..d1078c4bf92ff 100644 --- a/app/workers/all_queues.yml +++ b/app/workers/all_queues.yml @@ -255,6 +255,15 @@ :weight: 1 :idempotent: true :tags: [] +- :name: cronjob:ci_runners_stale_machines_cleanup_cron + :worker_name: Ci::Runners::StaleMachinesCleanupCronWorker + :feature_category: :runner_fleet + :has_external_dependencies: false + :urgency: :low + :resource_boundary: :unknown + :weight: 1 + :idempotent: true + :tags: [] - :name: cronjob:ci_schedule_delete_objects_cron :worker_name: Ci::ScheduleDeleteObjectsCronWorker :feature_category: :continuous_integration diff --git a/app/workers/ci/runners/reconcile_existing_runner_versions_cron_worker.rb b/app/workers/ci/runners/reconcile_existing_runner_versions_cron_worker.rb index 69ab477c80a98..722c513a4bb8c 100644 --- a/app/workers/ci/runners/reconcile_existing_runner_versions_cron_worker.rb +++ b/app/workers/ci/runners/reconcile_existing_runner_versions_cron_worker.rb @@ -30,7 +30,7 @@ def perform(cronjob_scheduled = true) end result = ::Ci::Runners::ReconcileExistingRunnerVersionsService.new.execute - result.payload.each { |key, value| log_extra_metadata_on_done(key, value) } + log_hash_metadata_on_done(result.payload) end end end diff --git a/app/workers/ci/runners/stale_machines_cleanup_cron_worker.rb b/app/workers/ci/runners/stale_machines_cleanup_cron_worker.rb new file mode 100644 index 0000000000000..9a11db33fb613 --- /dev/null +++ b/app/workers/ci/runners/stale_machines_cleanup_cron_worker.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +module Ci + module Runners + class StaleMachinesCleanupCronWorker + include ApplicationWorker + + # This worker does not schedule other workers that require context. + include CronjobQueue # rubocop:disable Scalability/CronWorkerContext + + data_consistency :sticky + feature_category :runner_fleet + urgency :low + + idempotent! + + def perform + result = ::Ci::Runners::StaleMachinesCleanupService.new.execute + log_extra_metadata_on_done(:status, result.status) + log_hash_metadata_on_done(result.payload) + end + end + end +end diff --git a/app/workers/concerns/application_worker.rb b/app/workers/concerns/application_worker.rb index 222d045b0badb..e2e31b0a5bda3 100644 --- a/app/workers/concerns/application_worker.rb +++ b/app/workers/concerns/application_worker.rb @@ -36,6 +36,11 @@ def log_extra_metadata_on_done(key, value) @done_log_extra_metadata[key] = value end + def log_hash_metadata_on_done(hash) + @done_log_extra_metadata ||= {} + hash.each { |key, value| @done_log_extra_metadata[key] = value } + end + def logging_extras return {} unless @done_log_extra_metadata diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index 340864d2dfa2c..7342fe4821a0f 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -578,6 +578,10 @@ production: &base ci_runner_versions_reconciliation_worker: cron: "@daily" + # Periodically clean up stale runner machines. + ci_runners_stale_machines_cleanup_worker: + cron: "36 4 * * *" + # GitLab EE only jobs. These jobs are automatically enabled for an EE # installation, and ignored for a CE installation. ee_cron_jobs: diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index 55c21744ad028..8d0d95e01fe39 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -676,6 +676,9 @@ Settings.cron_jobs['users_migrate_records_to_ghost_user_in_batches_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['users_migrate_records_to_ghost_user_in_batches_worker']['cron'] ||= '*/2 * * * *' Settings.cron_jobs['users_migrate_records_to_ghost_user_in_batches_worker']['job_class'] = 'Users::MigrateRecordsToGhostUserInBatchesWorker' +Settings.cron_jobs['ci_runners_stale_machines_cleanup_worker'] ||= Settingslogic.new({}) +Settings.cron_jobs['ci_runners_stale_machines_cleanup_worker']['cron'] ||= '36 4 * * *' +Settings.cron_jobs['ci_runners_stale_machines_cleanup_worker']['job_class'] = 'Ci::Runners::StaleMachinesCleanupCronWorker' Gitlab.ee do Settings.cron_jobs['analytics_devops_adoption_create_all_snapshots_worker'] ||= Settingslogic.new({}) diff --git a/ee/app/workers/ci/runners/stale_group_runners_prune_cron_worker.rb b/ee/app/workers/ci/runners/stale_group_runners_prune_cron_worker.rb index 4cf1f55f46d91..694c4e52cdeaf 100644 --- a/ee/app/workers/ci/runners/stale_group_runners_prune_cron_worker.rb +++ b/ee/app/workers/ci/runners/stale_group_runners_prune_cron_worker.rb @@ -21,7 +21,7 @@ def perform result = ::Ci::Runners::StaleGroupRunnersPruneService.new.execute(namespace_ids) log_extra_metadata_on_done(:status, result.status) - result.payload.each { |key, value| log_extra_metadata_on_done(key, value) } + log_hash_metadata_on_done(result.payload) end end end diff --git a/spec/factories/ci/runner_machines.rb b/spec/factories/ci/runner_machines.rb index 09bf5d0844e2d..c382ebdcb262c 100644 --- a/spec/factories/ci/runner_machines.rb +++ b/spec/factories/ci/runner_machines.rb @@ -4,5 +4,10 @@ factory :ci_runner_machine, class: 'Ci::RunnerMachine' do runner factory: :ci_runner machine_xid { "r_#{SecureRandom.hex.slice(0, 10)}" } + + trait :stale do + created_at { 1.year.ago } + contacted_at { Ci::RunnerMachine::STALE_TIMEOUT.ago } + end end end diff --git a/spec/models/ci/runner_machine_spec.rb b/spec/models/ci/runner_machine_spec.rb index 9fc35006233e9..c658058c1312a 100644 --- a/spec/models/ci/runner_machine_spec.rb +++ b/spec/models/ci/runner_machine_spec.rb @@ -38,10 +38,11 @@ describe '.stale', :freeze_time do subject { described_class.stale.ids } - let!(:runner_machine1) { create(:ci_runner_machine, created_at: 8.days.ago, contacted_at: 7.days.ago) } - let!(:runner_machine2) { create(:ci_runner_machine, created_at: 7.days.ago, contacted_at: nil) } - let!(:runner_machine3) { create(:ci_runner_machine, created_at: 5.days.ago, contacted_at: nil) } - let!(:runner_machine4) do + let!(:runner_machine1) { create(:ci_runner_machine, :stale) } + let!(:runner_machine2) { create(:ci_runner_machine, :stale, contacted_at: nil) } + let!(:runner_machine3) { create(:ci_runner_machine, created_at: 6.months.ago, contacted_at: Time.current) } + let!(:runner_machine4) { create(:ci_runner_machine, created_at: 5.days.ago) } + let!(:runner_machine5) do create(:ci_runner_machine, created_at: (7.days - 1.second).ago, contacted_at: (7.days - 1.second).ago) end diff --git a/spec/services/ci/runners/stale_machines_cleanup_service_spec.rb b/spec/services/ci/runners/stale_machines_cleanup_service_spec.rb new file mode 100644 index 0000000000000..456dbcebb84ae --- /dev/null +++ b/spec/services/ci/runners/stale_machines_cleanup_service_spec.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Ci::Runners::StaleMachinesCleanupService, feature_category: :runner_fleet do + let(:service) { described_class.new } + let!(:runner_machine3) { create(:ci_runner_machine, created_at: 6.months.ago, contacted_at: Time.current) } + + subject(:response) { service.execute } + + context 'with no stale runner machines' do + it 'does not clean any runner machines and returns :success status' do + expect do + expect(response).to be_success + expect(response.payload).to match({ deleted_machines: false }) + end.not_to change { Ci::RunnerMachine.count }.from(1) + end + end + + context 'with some stale runner machines' do + before do + create(:ci_runner_machine, :stale) + create(:ci_runner_machine, :stale, contacted_at: nil) + end + + it 'only leaves non-stale runners' do + expect(response).to be_success + expect(response.payload).to match({ deleted_machines: true }) + expect(Ci::RunnerMachine.all).to contain_exactly(runner_machine3) + end + + context 'with more stale runners than MAX_DELETIONS' do + before do + stub_const("#{described_class}::MAX_DELETIONS", 1) + end + + it 'only leaves non-stale runners' do + expect do + expect(response).to be_success + expect(response.payload).to match({ deleted_machines: true }) + end.to change { Ci::RunnerMachine.count }.by(-Ci::Runners::StaleMachinesCleanupService::MAX_DELETIONS) + end + end + end +end diff --git a/spec/workers/ci/runners/stale_machines_cleanup_cron_worker_spec.rb b/spec/workers/ci/runners/stale_machines_cleanup_cron_worker_spec.rb new file mode 100644 index 0000000000000..d8f620bc02425 --- /dev/null +++ b/spec/workers/ci/runners/stale_machines_cleanup_cron_worker_spec.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Ci::Runners::StaleMachinesCleanupCronWorker, feature_category: :runner_fleet do + let(:worker) { described_class.new } + + describe '#perform', :freeze_time do + subject(:perform) { worker.perform } + + let!(:runner_machine1) do + create(:ci_runner_machine, created_at: 7.days.ago, contacted_at: 7.days.ago) + end + + let!(:runner_machine2) { create(:ci_runner_machine) } + let!(:runner_machine3) { create(:ci_runner_machine, created_at: 6.days.ago) } + + it_behaves_like 'an idempotent worker' do + it 'delegates to Ci::Runners::StaleMachinesCleanupService' do + expect_next_instance_of(Ci::Runners::StaleMachinesCleanupService) do |service| + expect(service) + .to receive(:execute).and_call_original + end + + perform + + expect(worker.logging_extras).to eq({ + "extra.ci_runners_stale_machines_cleanup_cron_worker.status" => :success, + "extra.ci_runners_stale_machines_cleanup_cron_worker.deleted_machines" => true + }) + end + + it 'cleans up stale runner machines', :aggregate_failures do + expect(Ci::RunnerMachine.stale.count).to eq 1 + + expect { perform }.to change { Ci::RunnerMachine.count }.from(3).to(2) + + expect(Ci::RunnerMachine.all).to match_array [runner_machine2, runner_machine3] + end + end + end +end diff --git a/spec/workers/concerns/application_worker_spec.rb b/spec/workers/concerns/application_worker_spec.rb index 5fde54b98f024..0abb029f1466b 100644 --- a/spec/workers/concerns/application_worker_spec.rb +++ b/spec/workers/concerns/application_worker_spec.rb @@ -103,6 +103,15 @@ def self.name expect(instance.logging_extras).to eq({ 'extra.gitlab_foo_bar_dummy_worker.key1' => "value1", 'extra.gitlab_foo_bar_dummy_worker.key2' => "value2" }) end + it 'returns extra data to be logged that was set from #log_hash_metadata_on_done' do + instance.log_hash_metadata_on_done({ key1: 'value0', key2: 'value1' }) + + expect(instance.logging_extras).to match_array({ + 'extra.gitlab_foo_bar_dummy_worker.key1' => 'value0', + 'extra.gitlab_foo_bar_dummy_worker.key2' => 'value1' + }) + end + context 'when nothing is set' do it 'returns {}' do expect(instance.logging_extras).to eq({}) -- GitLab