diff --git a/app/workers/background_migration_worker.rb b/app/workers/background_migration_worker.rb
index 74a12dbff77234783d4691e70fb9759ffa71564b..17aa43856c01311ef321c3e4caae0567d1b790e4 100644
--- a/app/workers/background_migration_worker.rb
+++ b/app/workers/background_migration_worker.rb
@@ -24,10 +24,14 @@ def self.minimum_interval
   # class_name - The class name of the background migration to run.
   # arguments - The arguments to pass to the migration class.
   # lease_attempts - The number of times we will try to obtain an exclusive
-  #   lease on the class before running anyway. Pass 0 to always run.
+  #   lease on the class before giving up. See MR for more discussion.
+  #   https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45298#note_434304956
   def perform(class_name, arguments = [], lease_attempts = 5)
     with_context(caller_id: class_name.to_s) do
-      should_perform, ttl = perform_and_ttl(class_name)
+      attempts_left = lease_attempts - 1
+      should_perform, ttl = perform_and_ttl(class_name, attempts_left)
+
+      break if should_perform.nil?
 
       if should_perform
         Gitlab::BackgroundMigration.perform(class_name, arguments)
@@ -37,32 +41,39 @@ def perform(class_name, arguments = [], lease_attempts = 5)
         # we'll reschedule the job in such a way that it is picked up again around
         # the time the lease expires.
         self.class
-          .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
+          .perform_in(ttl || self.class.minimum_interval, class_name, arguments, attempts_left)
       end
     end
   end
 
-  def perform_and_ttl(class_name)
-    if always_perform?
-      # In test environments `perform_in` will run right away. This can then
-      # lead to stack level errors in the above `#perform`. To work around this
-      # we'll just perform the migration right away in the test environment.
-      [true, nil]
-    else
-      lease = lease_for(class_name)
-      perform = !!lease.try_obtain
-
-      # If we managed to acquire the lease but the DB is not healthy, then we
-      # want to simply reschedule our job and try again _after_ the lease
-      # expires.
-      if perform && !healthy_database?
-        database_unhealthy_counter.increment
-
-        perform = false
-      end
+  def perform_and_ttl(class_name, attempts_left)
+    # In test environments `perform_in` will run right away. This can then
+    # lead to stack level errors in the above `#perform`. To work around this
+    # we'll just perform the migration right away in the test environment.
+    return [true, nil] if always_perform?
+
+    lease = lease_for(class_name)
+    lease_obtained = !!lease.try_obtain
+    healthy_db = healthy_database?
+    perform = lease_obtained && healthy_db
+
+    database_unhealthy_counter.increment if lease_obtained && !healthy_db
 
-      [perform, lease.ttl]
+    # If we've tried several times to get a lease with a healthy DB without success, just give up.
+    # Otherwise we could end up in an infinite rescheduling loop.
+    if !perform && attempts_left < 0
+      msg = if !lease_obtained
+              'Job could not get an exclusive lease after several tries. Giving up.'
+            else
+              'Database was unhealthy after several tries. Giving up.'
+            end
+
+      Sidekiq.logger.warn(class: class_name, message: msg, job_id: jid)
+
+      return [nil, nil]
     end
+
+    [perform, lease.ttl]
   end
 
   def lease_for(class_name)
diff --git a/changelogs/unreleased/267828-remove-minimum_interval-from-backgroundmigrationworker.yml b/changelogs/unreleased/267828-remove-minimum_interval-from-backgroundmigrationworker.yml
new file mode 100644
index 0000000000000000000000000000000000000000..09fc66a511c6b241227ed2945abf8dc0dca591dc
--- /dev/null
+++ b/changelogs/unreleased/267828-remove-minimum_interval-from-backgroundmigrationworker.yml
@@ -0,0 +1,5 @@
+---
+title: Limit number of times a background migration is rescheduled
+merge_request: 45298
+author:
+type: fixed
diff --git a/spec/workers/background_migration_worker_spec.rb b/spec/workers/background_migration_worker_spec.rb
index 15e93d62c7dfcff694632b9a066000649d37f1f4..8094efcaf04aadf0f47171b58f7c902455c23e71 100644
--- a/spec/workers/background_migration_worker_spec.rb
+++ b/spec/workers/background_migration_worker_spec.rb
@@ -12,45 +12,91 @@
   end
 
   describe '#perform' do
-    it 'performs a background migration' do
-      expect(Gitlab::BackgroundMigration)
-        .to receive(:perform)
-        .with('Foo', [10, 20])
+    before do
+      allow(worker).to receive(:jid).and_return(1)
+      expect(worker).to receive(:always_perform?).and_return(false)
+    end
 
-      worker.perform('Foo', [10, 20])
+    context 'when lease can be obtained' do
+      before do
+        expect(Gitlab::BackgroundMigration)
+          .to receive(:perform)
+          .with('Foo', [10, 20])
+      end
+
+      it 'performs a background migration' do
+        worker.perform('Foo', [10, 20])
+      end
+
+      context 'when lease_attempts is 1' do
+        it 'performs a background migration' do
+          worker.perform('Foo', [10, 20], 1)
+        end
+      end
     end
 
-    it 'reschedules a migration if it was performed recently' do
-      expect(worker)
-        .to receive(:always_perform?)
-        .and_return(false)
+    context 'when lease not obtained (migration of same class was performed recently)' do
+      before do
+        expect(Gitlab::BackgroundMigration).not_to receive(:perform)
+
+        worker.lease_for('Foo').try_obtain
+      end
 
-      worker.lease_for('Foo').try_obtain
+      it 'reschedules the migration and decrements the lease_attempts' do
+        expect(described_class)
+          .to receive(:perform_in)
+          .with(a_kind_of(Numeric), 'Foo', [10, 20], 4)
 
-      expect(Gitlab::BackgroundMigration)
-        .not_to receive(:perform)
+        worker.perform('Foo', [10, 20], 5)
+      end
 
-      expect(described_class)
-        .to receive(:perform_in)
-        .with(a_kind_of(Numeric), 'Foo', [10, 20])
+      context 'when lease_attempts is 1' do
+        it 'reschedules the migration and decrements the lease_attempts' do
+          expect(described_class)
+            .to receive(:perform_in)
+            .with(a_kind_of(Numeric), 'Foo', [10, 20], 0)
 
-      worker.perform('Foo', [10, 20])
+          worker.perform('Foo', [10, 20], 1)
+        end
+      end
+
+      context 'when lease_attempts is 0' do
+        it 'gives up performing the migration' do
+          expect(described_class).not_to receive(:perform_in)
+          expect(Sidekiq.logger).to receive(:warn).with(
+            class: 'Foo',
+            message: 'Job could not get an exclusive lease after several tries. Giving up.',
+            job_id: 1)
+
+          worker.perform('Foo', [10, 20], 0)
+        end
+      end
     end
 
-    it 'reschedules a migration if the database is not healthy' do
-      allow(worker)
-        .to receive(:always_perform?)
-        .and_return(false)
+    context 'when database is not healthy' do
+      before do
+        allow(worker).to receive(:healthy_database?).and_return(false)
+      end
 
-      allow(worker)
-        .to receive(:healthy_database?)
-        .and_return(false)
+      it 'reschedules a migration if the database is not healthy' do
+        expect(described_class)
+          .to receive(:perform_in)
+          .with(a_kind_of(Numeric), 'Foo', [10, 20], 4)
 
-      expect(described_class)
-        .to receive(:perform_in)
-        .with(a_kind_of(Numeric), 'Foo', [10, 20])
+        worker.perform('Foo', [10, 20])
+      end
 
-      worker.perform('Foo', [10, 20])
+      context 'when lease_attempts is 0' do
+        it 'gives up performing the migration' do
+          expect(described_class).not_to receive(:perform_in)
+          expect(Sidekiq.logger).to receive(:warn).with(
+            class: 'Foo',
+            message: 'Database was unhealthy after several tries. Giving up.',
+            job_id: 1)
+
+          worker.perform('Foo', [10, 20], 0)
+        end
+      end
     end
 
     it 'sets the class that will be executed as the caller_id' do