From 1fddbb2778cf62e475d2413c8f34d74d3e08bef3 Mon Sep 17 00:00:00 2001
From: Pedro Pombeiro <noreply@pedro.pombei.ro>
Date: Tue, 24 Jan 2023 12:22:24 +0000
Subject: [PATCH] Add cronjob to clean up stale runner machines

Changelog: added
---
 .../runners/stale_machines_cleanup_service.rb | 33 ++++++++++++++
 app/workers/all_queues.yml                    |  9 ++++
 ...le_existing_runner_versions_cron_worker.rb |  2 +-
 .../stale_machines_cleanup_cron_worker.rb     | 24 ++++++++++
 app/workers/concerns/application_worker.rb    |  5 +++
 config/gitlab.yml.example                     |  4 ++
 config/initializers/1_settings.rb             |  3 ++
 .../stale_group_runners_prune_cron_worker.rb  |  2 +-
 spec/factories/ci/runner_machines.rb          |  5 +++
 spec/models/ci/runner_machine_spec.rb         |  9 ++--
 .../stale_machines_cleanup_service_spec.rb    | 45 +++++++++++++++++++
 ...stale_machines_cleanup_cron_worker_spec.rb | 42 +++++++++++++++++
 .../concerns/application_worker_spec.rb       |  9 ++++
 13 files changed, 186 insertions(+), 6 deletions(-)
 create mode 100644 app/services/ci/runners/stale_machines_cleanup_service.rb
 create mode 100644 app/workers/ci/runners/stale_machines_cleanup_cron_worker.rb
 create mode 100644 spec/services/ci/runners/stale_machines_cleanup_service_spec.rb
 create mode 100644 spec/workers/ci/runners/stale_machines_cleanup_cron_worker_spec.rb

diff --git a/app/services/ci/runners/stale_machines_cleanup_service.rb b/app/services/ci/runners/stale_machines_cleanup_service.rb
new file mode 100644
index 0000000000000..3e5706d24a6b9
--- /dev/null
+++ b/app/services/ci/runners/stale_machines_cleanup_service.rb
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+module Ci
+  module Runners
+    class StaleMachinesCleanupService
+      MAX_DELETIONS = 1000
+
+      def execute
+        ServiceResponse.success(payload: {
+          # the `stale` relationship can return duplicates, so we don't try to return a precise count here
+          deleted_machines: delete_stale_runner_machines > 0
+        })
+      end
+
+      private
+
+      def delete_stale_runner_machines
+        total_deleted_count = 0
+        loop do
+          sub_batch_limit = [100, MAX_DELETIONS].min
+
+          # delete_all discards part of the `stale` scope query, so we expliclitly wrap it with a SELECT as a workaround
+          deleted_count = Ci::RunnerMachine.id_in(Ci::RunnerMachine.stale.limit(sub_batch_limit)).delete_all
+          total_deleted_count += deleted_count
+
+          break if deleted_count == 0 || total_deleted_count >= MAX_DELETIONS
+        end
+
+        total_deleted_count
+      end
+    end
+  end
+end
diff --git a/app/workers/all_queues.yml b/app/workers/all_queues.yml
index 208d878ebc528..d1078c4bf92ff 100644
--- a/app/workers/all_queues.yml
+++ b/app/workers/all_queues.yml
@@ -255,6 +255,15 @@
   :weight: 1
   :idempotent: true
   :tags: []
+- :name: cronjob:ci_runners_stale_machines_cleanup_cron
+  :worker_name: Ci::Runners::StaleMachinesCleanupCronWorker
+  :feature_category: :runner_fleet
+  :has_external_dependencies: false
+  :urgency: :low
+  :resource_boundary: :unknown
+  :weight: 1
+  :idempotent: true
+  :tags: []
 - :name: cronjob:ci_schedule_delete_objects_cron
   :worker_name: Ci::ScheduleDeleteObjectsCronWorker
   :feature_category: :continuous_integration
diff --git a/app/workers/ci/runners/reconcile_existing_runner_versions_cron_worker.rb b/app/workers/ci/runners/reconcile_existing_runner_versions_cron_worker.rb
index 69ab477c80a98..722c513a4bb8c 100644
--- a/app/workers/ci/runners/reconcile_existing_runner_versions_cron_worker.rb
+++ b/app/workers/ci/runners/reconcile_existing_runner_versions_cron_worker.rb
@@ -30,7 +30,7 @@ def perform(cronjob_scheduled = true)
         end
 
         result = ::Ci::Runners::ReconcileExistingRunnerVersionsService.new.execute
-        result.payload.each { |key, value| log_extra_metadata_on_done(key, value) }
+        log_hash_metadata_on_done(result.payload)
       end
     end
   end
diff --git a/app/workers/ci/runners/stale_machines_cleanup_cron_worker.rb b/app/workers/ci/runners/stale_machines_cleanup_cron_worker.rb
new file mode 100644
index 0000000000000..9a11db33fb613
--- /dev/null
+++ b/app/workers/ci/runners/stale_machines_cleanup_cron_worker.rb
@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+
+module Ci
+  module Runners
+    class StaleMachinesCleanupCronWorker
+      include ApplicationWorker
+
+      # This worker does not schedule other workers that require context.
+      include CronjobQueue # rubocop:disable Scalability/CronWorkerContext
+
+      data_consistency :sticky
+      feature_category :runner_fleet
+      urgency :low
+
+      idempotent!
+
+      def perform
+        result = ::Ci::Runners::StaleMachinesCleanupService.new.execute
+        log_extra_metadata_on_done(:status, result.status)
+        log_hash_metadata_on_done(result.payload)
+      end
+    end
+  end
+end
diff --git a/app/workers/concerns/application_worker.rb b/app/workers/concerns/application_worker.rb
index 222d045b0badb..e2e31b0a5bda3 100644
--- a/app/workers/concerns/application_worker.rb
+++ b/app/workers/concerns/application_worker.rb
@@ -36,6 +36,11 @@ def log_extra_metadata_on_done(key, value)
       @done_log_extra_metadata[key] = value
     end
 
+    def log_hash_metadata_on_done(hash)
+      @done_log_extra_metadata ||= {}
+      hash.each { |key, value| @done_log_extra_metadata[key] = value }
+    end
+
     def logging_extras
       return {} unless @done_log_extra_metadata
 
diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example
index 340864d2dfa2c..7342fe4821a0f 100644
--- a/config/gitlab.yml.example
+++ b/config/gitlab.yml.example
@@ -578,6 +578,10 @@ production: &base
     ci_runner_versions_reconciliation_worker:
       cron: "@daily"
 
+    # Periodically clean up stale runner machines.
+    ci_runners_stale_machines_cleanup_worker:
+      cron: "36 4 * * *"
+
   # GitLab EE only jobs. These jobs are automatically enabled for an EE
   # installation, and ignored for a CE installation.
   ee_cron_jobs:
diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb
index 55c21744ad028..8d0d95e01fe39 100644
--- a/config/initializers/1_settings.rb
+++ b/config/initializers/1_settings.rb
@@ -676,6 +676,9 @@
 Settings.cron_jobs['users_migrate_records_to_ghost_user_in_batches_worker'] ||= Settingslogic.new({})
 Settings.cron_jobs['users_migrate_records_to_ghost_user_in_batches_worker']['cron'] ||= '*/2 * * * *'
 Settings.cron_jobs['users_migrate_records_to_ghost_user_in_batches_worker']['job_class'] = 'Users::MigrateRecordsToGhostUserInBatchesWorker'
+Settings.cron_jobs['ci_runners_stale_machines_cleanup_worker'] ||= Settingslogic.new({})
+Settings.cron_jobs['ci_runners_stale_machines_cleanup_worker']['cron'] ||= '36 4 * * *'
+Settings.cron_jobs['ci_runners_stale_machines_cleanup_worker']['job_class'] = 'Ci::Runners::StaleMachinesCleanupCronWorker'
 
 Gitlab.ee do
   Settings.cron_jobs['analytics_devops_adoption_create_all_snapshots_worker'] ||= Settingslogic.new({})
diff --git a/ee/app/workers/ci/runners/stale_group_runners_prune_cron_worker.rb b/ee/app/workers/ci/runners/stale_group_runners_prune_cron_worker.rb
index 4cf1f55f46d91..694c4e52cdeaf 100644
--- a/ee/app/workers/ci/runners/stale_group_runners_prune_cron_worker.rb
+++ b/ee/app/workers/ci/runners/stale_group_runners_prune_cron_worker.rb
@@ -21,7 +21,7 @@ def perform
 
         result = ::Ci::Runners::StaleGroupRunnersPruneService.new.execute(namespace_ids)
         log_extra_metadata_on_done(:status, result.status)
-        result.payload.each { |key, value| log_extra_metadata_on_done(key, value) }
+        log_hash_metadata_on_done(result.payload)
       end
     end
   end
diff --git a/spec/factories/ci/runner_machines.rb b/spec/factories/ci/runner_machines.rb
index 09bf5d0844e2d..c382ebdcb262c 100644
--- a/spec/factories/ci/runner_machines.rb
+++ b/spec/factories/ci/runner_machines.rb
@@ -4,5 +4,10 @@
   factory :ci_runner_machine, class: 'Ci::RunnerMachine' do
     runner factory: :ci_runner
     machine_xid { "r_#{SecureRandom.hex.slice(0, 10)}" }
+
+    trait :stale do
+      created_at { 1.year.ago }
+      contacted_at { Ci::RunnerMachine::STALE_TIMEOUT.ago }
+    end
   end
 end
diff --git a/spec/models/ci/runner_machine_spec.rb b/spec/models/ci/runner_machine_spec.rb
index 9fc35006233e9..c658058c1312a 100644
--- a/spec/models/ci/runner_machine_spec.rb
+++ b/spec/models/ci/runner_machine_spec.rb
@@ -38,10 +38,11 @@
   describe '.stale', :freeze_time do
     subject { described_class.stale.ids }
 
-    let!(:runner_machine1) { create(:ci_runner_machine, created_at: 8.days.ago, contacted_at: 7.days.ago) }
-    let!(:runner_machine2) { create(:ci_runner_machine, created_at: 7.days.ago, contacted_at: nil) }
-    let!(:runner_machine3) { create(:ci_runner_machine, created_at: 5.days.ago, contacted_at: nil) }
-    let!(:runner_machine4) do
+    let!(:runner_machine1) { create(:ci_runner_machine, :stale) }
+    let!(:runner_machine2) { create(:ci_runner_machine, :stale, contacted_at: nil) }
+    let!(:runner_machine3) { create(:ci_runner_machine, created_at: 6.months.ago, contacted_at: Time.current) }
+    let!(:runner_machine4) { create(:ci_runner_machine, created_at: 5.days.ago) }
+    let!(:runner_machine5) do
       create(:ci_runner_machine, created_at: (7.days - 1.second).ago, contacted_at: (7.days - 1.second).ago)
     end
 
diff --git a/spec/services/ci/runners/stale_machines_cleanup_service_spec.rb b/spec/services/ci/runners/stale_machines_cleanup_service_spec.rb
new file mode 100644
index 0000000000000..456dbcebb84ae
--- /dev/null
+++ b/spec/services/ci/runners/stale_machines_cleanup_service_spec.rb
@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Ci::Runners::StaleMachinesCleanupService, feature_category: :runner_fleet do
+  let(:service) { described_class.new }
+  let!(:runner_machine3) { create(:ci_runner_machine, created_at: 6.months.ago, contacted_at: Time.current) }
+
+  subject(:response) { service.execute }
+
+  context 'with no stale runner machines' do
+    it 'does not clean any runner machines and returns :success status' do
+      expect do
+        expect(response).to be_success
+        expect(response.payload).to match({ deleted_machines: false })
+      end.not_to change { Ci::RunnerMachine.count }.from(1)
+    end
+  end
+
+  context 'with some stale runner machines' do
+    before do
+      create(:ci_runner_machine, :stale)
+      create(:ci_runner_machine, :stale, contacted_at: nil)
+    end
+
+    it 'only leaves non-stale runners' do
+      expect(response).to be_success
+      expect(response.payload).to match({ deleted_machines: true })
+      expect(Ci::RunnerMachine.all).to contain_exactly(runner_machine3)
+    end
+
+    context 'with more stale runners than MAX_DELETIONS' do
+      before do
+        stub_const("#{described_class}::MAX_DELETIONS", 1)
+      end
+
+      it 'only leaves non-stale runners' do
+        expect do
+          expect(response).to be_success
+          expect(response.payload).to match({ deleted_machines: true })
+        end.to change { Ci::RunnerMachine.count }.by(-Ci::Runners::StaleMachinesCleanupService::MAX_DELETIONS)
+      end
+    end
+  end
+end
diff --git a/spec/workers/ci/runners/stale_machines_cleanup_cron_worker_spec.rb b/spec/workers/ci/runners/stale_machines_cleanup_cron_worker_spec.rb
new file mode 100644
index 0000000000000..d8f620bc02425
--- /dev/null
+++ b/spec/workers/ci/runners/stale_machines_cleanup_cron_worker_spec.rb
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Ci::Runners::StaleMachinesCleanupCronWorker, feature_category: :runner_fleet do
+  let(:worker) { described_class.new }
+
+  describe '#perform', :freeze_time do
+    subject(:perform) { worker.perform }
+
+    let!(:runner_machine1) do
+      create(:ci_runner_machine, created_at: 7.days.ago, contacted_at: 7.days.ago)
+    end
+
+    let!(:runner_machine2) { create(:ci_runner_machine) }
+    let!(:runner_machine3) { create(:ci_runner_machine, created_at: 6.days.ago) }
+
+    it_behaves_like 'an idempotent worker' do
+      it 'delegates to Ci::Runners::StaleMachinesCleanupService' do
+        expect_next_instance_of(Ci::Runners::StaleMachinesCleanupService) do |service|
+          expect(service)
+            .to receive(:execute).and_call_original
+        end
+
+        perform
+
+        expect(worker.logging_extras).to eq({
+          "extra.ci_runners_stale_machines_cleanup_cron_worker.status" => :success,
+          "extra.ci_runners_stale_machines_cleanup_cron_worker.deleted_machines" => true
+        })
+      end
+
+      it 'cleans up stale runner machines', :aggregate_failures do
+        expect(Ci::RunnerMachine.stale.count).to eq 1
+
+        expect { perform }.to change { Ci::RunnerMachine.count }.from(3).to(2)
+
+        expect(Ci::RunnerMachine.all).to match_array [runner_machine2, runner_machine3]
+      end
+    end
+  end
+end
diff --git a/spec/workers/concerns/application_worker_spec.rb b/spec/workers/concerns/application_worker_spec.rb
index 5fde54b98f024..0abb029f1466b 100644
--- a/spec/workers/concerns/application_worker_spec.rb
+++ b/spec/workers/concerns/application_worker_spec.rb
@@ -103,6 +103,15 @@ def self.name
       expect(instance.logging_extras).to eq({ 'extra.gitlab_foo_bar_dummy_worker.key1' => "value1", 'extra.gitlab_foo_bar_dummy_worker.key2' => "value2" })
     end
 
+    it 'returns extra data to be logged that was set from #log_hash_metadata_on_done' do
+      instance.log_hash_metadata_on_done({ key1: 'value0', key2: 'value1' })
+
+      expect(instance.logging_extras).to match_array({
+        'extra.gitlab_foo_bar_dummy_worker.key1' => 'value0',
+        'extra.gitlab_foo_bar_dummy_worker.key2' => 'value1'
+      })
+    end
+
     context 'when nothing is set' do
       it 'returns {}' do
         expect(instance.logging_extras).to eq({})
-- 
GitLab