diff --git a/app/models/concerns/triggerable_hooks.rb b/app/models/concerns/triggerable_hooks.rb index db5df6c2c9f1e16064cd24f00958d6b7750848d9..8fe34632430b9de2df51469f18395f3529eaae11 100644 --- a/app/models/concerns/triggerable_hooks.rb +++ b/app/models/concerns/triggerable_hooks.rb @@ -29,11 +29,11 @@ def hooks_for(trigger) callable_scopes = triggers.keys + [:all] return none unless callable_scopes.include?(trigger) - public_send(trigger) # rubocop:disable GitlabSecurity/PublicSend + executable.public_send(trigger) # rubocop:disable GitlabSecurity/PublicSend end def select_active(hooks_scope, data) - select do |hook| + executable.select do |hook| ActiveHookFilter.new(hook).matches?(hooks_scope, data) end end diff --git a/app/models/hooks/project_hook.rb b/app/models/hooks/project_hook.rb index b625a70b4446d78dc3846862c9e79edb0b242a79..83858ee0c771431de7d4eb83e820129b66517b22 100644 --- a/app/models/hooks/project_hook.rb +++ b/app/models/hooks/project_hook.rb @@ -29,6 +29,10 @@ class ProjectHook < WebHook def pluralized_name _('Webhooks') end + + def web_hooks_disable_failed? + Feature.enabled?(:web_hooks_disable_failed, project) + end end ProjectHook.prepend_if_ee('EE::ProjectHook') diff --git a/app/models/hooks/service_hook.rb b/app/models/hooks/service_hook.rb index 4caa45a13d4fea562a0fe1b977b97519753ea338..b35542d5b9332c92efb367f8453dcbdef9d66e86 100644 --- a/app/models/hooks/service_hook.rb +++ b/app/models/hooks/service_hook.rb @@ -6,9 +6,7 @@ class ServiceHook < WebHook belongs_to :service validates :service, presence: true - # rubocop: disable CodeReuse/ServiceClass def execute(data, hook_name = 'service_hook') - WebHookService.new(self, data, hook_name).execute + super(data, hook_name) end - # rubocop: enable CodeReuse/ServiceClass end diff --git a/app/models/hooks/web_hook.rb b/app/models/hooks/web_hook.rb index dbd5a1b032a6dbdf7e507c629d6351a7355212d9..445381343186ea9cd0b3e92164b2e480d5947912 100644 --- a/app/models/hooks/web_hook.rb +++ b/app/models/hooks/web_hook.rb @@ -3,6 +3,11 @@ class WebHook < ApplicationRecord include Sortable + FAILURE_THRESHOLD = 3 # three strikes + INITIAL_BACKOFF = 10.minutes + MAX_BACKOFF = 1.day + BACKOFF_GROWTH_FACTOR = 2.0 + attr_encrypted :token, mode: :per_attribute_iv, algorithm: 'aes-256-gcm', @@ -21,15 +26,27 @@ class WebHook < ApplicationRecord validates :token, format: { without: /\n/ } validates :push_events_branch_filter, branch_filter: true + scope :executable, -> do + next all unless Feature.enabled?(:web_hooks_disable_failed) + + where('recent_failures <= ? AND (disabled_until IS NULL OR disabled_until < ?)', FAILURE_THRESHOLD, Time.current) + end + + def executable? + return true unless web_hooks_disable_failed? + + recent_failures <= FAILURE_THRESHOLD && (disabled_until.nil? || disabled_until < Time.current) + end + # rubocop: disable CodeReuse/ServiceClass def execute(data, hook_name) - WebHookService.new(self, data, hook_name).execute + WebHookService.new(self, data, hook_name).execute if executable? end # rubocop: enable CodeReuse/ServiceClass # rubocop: disable CodeReuse/ServiceClass def async_execute(data, hook_name) - WebHookService.new(self, data, hook_name).async_execute + WebHookService.new(self, data, hook_name).async_execute if executable? end # rubocop: enable CodeReuse/ServiceClass @@ -41,4 +58,26 @@ def allow_local_requests? def help_path 'user/project/integrations/webhooks' end + + def next_backoff + return MAX_BACKOFF if backoff_count >= 8 # optimization to prevent expensive exponentiation and possible overflows + + (INITIAL_BACKOFF * (BACKOFF_GROWTH_FACTOR**backoff_count)) + .clamp(INITIAL_BACKOFF, MAX_BACKOFF) + .seconds + end + + def disable! + update!(recent_failures: FAILURE_THRESHOLD + 1) + end + + def enable! + update!(recent_failures: 0, disabled_until: nil, backoff_count: 0) + end + + private + + def web_hooks_disable_failed? + Feature.enabled?(:web_hooks_disable_failed) + end end diff --git a/app/services/system_hooks_service.rb b/app/services/system_hooks_service.rb index d0099283afa3d5d306a1f19292a1c8d940f01c5f..2a2053cb912b4e14db5cf3a906ce03c94fbaee6f 100644 --- a/app/services/system_hooks_service.rb +++ b/app/services/system_hooks_service.rb @@ -10,7 +10,7 @@ def execute_hooks_for(model, event) end def execute_hooks(data, hooks_scope = :all) - SystemHook.hooks_for(hooks_scope).find_each do |hook| + SystemHook.executable.hooks_for(hooks_scope).find_each do |hook| hook.async_execute(data, 'system_hooks') end diff --git a/app/services/web_hook_service.rb b/app/services/web_hook_service.rb index 5a51b42f9f9479efca23c70e09dc014ca5e17301..0535bc625ac1fcbf12ec3aca47282f2860193751 100644 --- a/app/services/web_hook_service.rb +++ b/app/services/web_hook_service.rb @@ -6,6 +6,18 @@ class InternalErrorResponse attr_reader :body, :headers, :code + def success? + false + end + + def redirection? + false + end + + def internal_server_error? + true + end + def initialize @headers = Gitlab::HTTP::Response::Headers.new({}) @body = '' @@ -33,6 +45,8 @@ def initialize(hook, data, hook_name) end def execute + return { status: :error, message: 'Hook disabled' } unless hook.executable? + start_time = Gitlab::Metrics::System.monotonic_time response = if parsed_url.userinfo.blank? @@ -104,6 +118,8 @@ def make_request_with_auth end def log_execution(trigger:, url:, request_data:, response:, execution_duration:, error_message: nil) + handle_failure(response, hook) + WebHookLog.create( web_hook: hook, trigger: trigger, @@ -118,6 +134,17 @@ def log_execution(trigger:, url:, request_data:, response:, execution_duration:, ) end + def handle_failure(response, hook) + if response.success? || response.redirection? + hook.enable! + elsif response.internal_server_error? + next_backoff = hook.next_backoff + hook.update!(disabled_until: next_backoff.from_now, backoff_count: hook.backoff_count + 1) + else + hook.update!(recent_failures: hook.recent_failures + 1) + end + end + def build_headers(hook_name) @headers ||= begin { diff --git a/changelogs/unreleased/ajk-disable-broken-webhooks.yml b/changelogs/unreleased/ajk-disable-broken-webhooks.yml new file mode 100644 index 0000000000000000000000000000000000000000..5f09552068cac2ad22f6a2e0187e693cf4e04478 --- /dev/null +++ b/changelogs/unreleased/ajk-disable-broken-webhooks.yml @@ -0,0 +1,5 @@ +--- +title: Disable web-hooks that fail repeatedly +merge_request: 60837 +author: +type: changed diff --git a/config/feature_flags/development/web_hooks_disable_failed.yml b/config/feature_flags/development/web_hooks_disable_failed.yml new file mode 100644 index 0000000000000000000000000000000000000000..a54034d73e8a2cfe39ea9c010b958cd230d599cf --- /dev/null +++ b/config/feature_flags/development/web_hooks_disable_failed.yml @@ -0,0 +1,8 @@ +--- +name: web_hooks_disable_failed +introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/60837 +rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/329849 +milestone: '13.12' +type: development +group: group::ecosystem +default_enabled: false diff --git a/db/migrate/20210429181325_add_failure_tracking_to_web_hooks.rb b/db/migrate/20210429181325_add_failure_tracking_to_web_hooks.rb new file mode 100644 index 0000000000000000000000000000000000000000..4a34c2dd3073b203a42d7e87a1055bd460c8adbb --- /dev/null +++ b/db/migrate/20210429181325_add_failure_tracking_to_web_hooks.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +class AddFailureTrackingToWebHooks < ActiveRecord::Migration[6.0] + def change + change_table(:web_hooks, bulk: true) do |t| + t.integer :recent_failures, null: false, limit: 2, default: 0 + t.integer :backoff_count, null: false, limit: 2, default: 0 + t.column :disabled_until, :timestamptz + end + end +end diff --git a/db/migrate/20210504085144_add_index_on_web_hook_project_id_recent_failures.rb b/db/migrate/20210504085144_add_index_on_web_hook_project_id_recent_failures.rb new file mode 100644 index 0000000000000000000000000000000000000000..898a0ccd1c52d162e3a5ab1a8ef21452466119e5 --- /dev/null +++ b/db/migrate/20210504085144_add_index_on_web_hook_project_id_recent_failures.rb @@ -0,0 +1,17 @@ +# frozen_string_literal: true + +class AddIndexOnWebHookProjectIdRecentFailures < ActiveRecord::Migration[6.0] + include Gitlab::Database::MigrationHelpers + + INDEX_NAME = 'index_web_hooks_on_project_id_recent_failures' + + disable_ddl_transaction! + + def up + add_concurrent_index(:web_hooks, [:project_id, :recent_failures], name: INDEX_NAME) + end + + def down + remove_concurrent_index_by_name(:web_hooks, INDEX_NAME) + end +end diff --git a/db/schema_migrations/20210429181325 b/db/schema_migrations/20210429181325 new file mode 100644 index 0000000000000000000000000000000000000000..d778566a58034247811db3073135364303b70e8d --- /dev/null +++ b/db/schema_migrations/20210429181325 @@ -0,0 +1 @@ +9674f04640f897928925ff1e23ff6d3ff918627b7c2374713a31071678956614 \ No newline at end of file diff --git a/db/schema_migrations/20210504085144 b/db/schema_migrations/20210504085144 new file mode 100644 index 0000000000000000000000000000000000000000..67abcd8eece055e295818e03f4174552bee941e0 --- /dev/null +++ b/db/schema_migrations/20210504085144 @@ -0,0 +1 @@ +3cdf8e93c4b80867a5d8e086f3f44eaeb479e875abf16187b94b3f6238faf062 \ No newline at end of file diff --git a/db/structure.sql b/db/structure.sql index c962794e928a2126f3e261e7e334ec0265ca0236..5bf73b4955a5319dc54d2ed5a4100a9f436723ef 100644 --- a/db/structure.sql +++ b/db/structure.sql @@ -19111,7 +19111,10 @@ CREATE TABLE web_hooks ( releases_events boolean DEFAULT false NOT NULL, feature_flag_events boolean DEFAULT false NOT NULL, member_events boolean DEFAULT false NOT NULL, - subgroup_events boolean DEFAULT false NOT NULL + subgroup_events boolean DEFAULT false NOT NULL, + recent_failures smallint DEFAULT 0 NOT NULL, + backoff_count smallint DEFAULT 0 NOT NULL, + disabled_until timestamp with time zone ); CREATE SEQUENCE web_hooks_id_seq @@ -24578,6 +24581,8 @@ CREATE INDEX index_web_hooks_on_group_id ON web_hooks USING btree (group_id) WHE CREATE INDEX index_web_hooks_on_project_id ON web_hooks USING btree (project_id); +CREATE INDEX index_web_hooks_on_project_id_recent_failures ON web_hooks USING btree (project_id, recent_failures); + CREATE INDEX index_web_hooks_on_service_id ON web_hooks USING btree (service_id); CREATE INDEX index_web_hooks_on_type ON web_hooks USING btree (type); diff --git a/ee/app/models/hooks/group_hook.rb b/ee/app/models/hooks/group_hook.rb index c87a5bd75409546f420b3dbfe31949f064203819..f87c537e4272231c2954415cfa2e19e849a19e9f 100644 --- a/ee/app/models/hooks/group_hook.rb +++ b/ee/app/models/hooks/group_hook.rb @@ -32,4 +32,8 @@ class GroupHook < WebHook def pluralized_name _('Group Hooks') end + + def web_hooks_disable_failed? + Feature.enabled?(:web_hooks_disable_failed, group) + end end diff --git a/ee/spec/models/ee/group_spec.rb b/ee/spec/models/ee/group_spec.rb index 2e35f39bc05620cf395ebeb4201fa8748f352ce1..00b314b9525ce8a47f79ffab3498c8a4a57c312a 100644 --- a/ee/spec/models/ee/group_spec.rb +++ b/ee/spec/models/ee/group_spec.rb @@ -986,6 +986,19 @@ group.execute_hooks(data, :member_hooks) end end + + context 'when a hook is not executable' do + before do + group_hook.update!(recent_failures: 4) + end + + it 'does not execute the disabled hook' do + expect(WebHookService).to receive(:new).with(parent_group_hook, anything, anything).and_call_original + expect(WebHookService).not_to receive(:new).with(group_hook, anything, anything) + + group.execute_hooks(data, :member_hooks) + end + end end context 'when group_webhooks feature is disabled' do diff --git a/spec/models/hooks/web_hook_spec.rb b/spec/models/hooks/web_hook_spec.rb index 413e69fb071ad54391d636add7b2743552fc15b0..b528dbedd2cef305f86981010b495e0f8e2da769 100644 --- a/spec/models/hooks/web_hook_spec.rb +++ b/spec/models/hooks/web_hook_spec.rb @@ -3,7 +3,15 @@ require 'spec_helper' RSpec.describe WebHook do - let(:hook) { build(:project_hook) } + include AfterNextHelpers + + let_it_be(:project) { create(:project) } + + let(:hook) { build(:project_hook, project: project) } + + around do |example| + freeze_time { example.run } + end describe 'associations' do it { is_expected.to have_many(:web_hook_logs) } @@ -69,18 +77,30 @@ let(:data) { { key: 'value' } } let(:hook_name) { 'project hook' } - before do - expect(WebHookService).to receive(:new).with(hook, data, hook_name).and_call_original + it '#execute' do + expect_next(WebHookService).to receive(:execute) + + hook.execute(data, hook_name) end - it '#execute' do - expect_any_instance_of(WebHookService).to receive(:execute) + it 'does not execute non-executable hooks' do + hook.update!(disabled_until: 1.day.from_now) + + expect(WebHookService).not_to receive(:new) hook.execute(data, hook_name) end it '#async_execute' do - expect_any_instance_of(WebHookService).to receive(:async_execute) + expect_next(WebHookService).to receive(:async_execute) + + hook.async_execute(data, hook_name) + end + + it 'does not async execute non-executable hooks' do + hook.update!(disabled_until: 1.day.from_now) + + expect(WebHookService).not_to receive(:new) hook.async_execute(data, hook_name) end @@ -94,4 +114,170 @@ expect { web_hook.destroy! }.to change(web_hook.web_hook_logs, :count).by(-3) end end + + describe '.executable' do + let(:not_executable) do + [ + [0, Time.current], + [0, 1.minute.from_now], + [1, 1.minute.from_now], + [3, 1.minute.from_now], + [4, nil], + [4, 1.day.ago], + [4, 1.minute.from_now] + ].map do |(recent_failures, disabled_until)| + create(:project_hook, project: project, recent_failures: recent_failures, disabled_until: disabled_until) + end + end + + let(:executables) do + [ + [0, nil], + [0, 1.day.ago], + [1, nil], + [1, 1.day.ago], + [3, nil], + [3, 1.day.ago] + ].map do |(recent_failures, disabled_until)| + create(:project_hook, project: project, recent_failures: recent_failures, disabled_until: disabled_until) + end + end + + it 'finds the correct set of project hooks' do + expect(described_class.where(project_id: project.id).executable).to match_array executables + end + + context 'when the feature flag is not enabled' do + before do + stub_feature_flags(web_hooks_disable_failed: false) + end + + it 'is the same as all' do + expect(described_class.where(project_id: project.id).executable).to match_array(executables + not_executable) + end + end + end + + describe '#executable?' do + let(:web_hook) { create(:project_hook, project: project) } + + where(:recent_failures, :not_until, :executable) do + [ + [0, :not_set, true], + [0, :past, true], + [0, :future, false], + [0, :now, false], + [1, :not_set, true], + [1, :past, true], + [1, :future, false], + [3, :not_set, true], + [3, :past, true], + [3, :future, false], + [4, :not_set, false], + [4, :past, false], + [4, :future, false] + ] + end + + with_them do + # Phasing means we cannot put these values in the where block, + # which is not subject to the frozen time context. + let(:disabled_until) do + case not_until + when :not_set + nil + when :past + 1.minute.ago + when :future + 1.minute.from_now + when :now + Time.current + end + end + + before do + web_hook.update!(recent_failures: recent_failures, disabled_until: disabled_until) + end + + it 'has the correct state' do + expect(web_hook.executable?).to eq(executable) + end + + context 'when the feature flag is enabled for a project' do + before do + stub_feature_flags(web_hooks_disable_failed: project) + end + + it 'has the expected value' do + expect(web_hook.executable?).to eq(executable) + end + end + + context 'when the feature flag is not enabled' do + before do + stub_feature_flags(web_hooks_disable_failed: false) + end + + it 'is executable' do + expect(web_hook).to be_executable + end + end + end + end + + describe '#next_backoff' do + context 'when there was no last backoff' do + before do + hook.backoff_count = 0 + end + + it 'is 10 minutes' do + expect(hook.next_backoff).to eq(described_class::INITIAL_BACKOFF) + end + end + + context 'when we have backed off once' do + before do + hook.backoff_count = 1 + end + + it 'is twice the initial value' do + expect(hook.next_backoff).to eq(20.minutes) + end + end + + context 'when we have backed off 3 times' do + before do + hook.backoff_count = 3 + end + + it 'grows exponentially' do + expect(hook.next_backoff).to eq(80.minutes) + end + end + + context 'when the previous backoff was large' do + before do + hook.backoff_count = 8 # last value before MAX_BACKOFF + end + + it 'does not exceed the max backoff value' do + expect(hook.next_backoff).to eq(described_class::MAX_BACKOFF) + end + end + end + + describe '#enable!' do + it 'makes a hook executable' do + hook.recent_failures = 1000 + + expect { hook.enable! }.to change(hook, :executable?).from(false).to(true) + end + end + + describe '#disable!' do + it 'disables a hook' do + expect { hook.disable! }.to change(hook, :executable?).from(true).to(false) + end + end end diff --git a/spec/services/web_hook_service_spec.rb b/spec/services/web_hook_service_spec.rb index 2fe72ab31c2618964262a38fcf86ac0dc7257bc5..46dab4fa1715ce2778431bb02a8686a6ad9ddf12 100644 --- a/spec/services/web_hook_service_spec.rb +++ b/spec/services/web_hook_service_spec.rb @@ -21,6 +21,10 @@ let(:service_instance) { described_class.new(project_hook, data, :push_hooks) } + around do |example| + travel_to(Time.current) { example.run } + end + describe '#initialize' do before do stub_application_setting(setting_name => setting) @@ -120,10 +124,21 @@ expect { service_instance.execute }.to raise_error(StandardError) end + it 'does not execute disabled hooks' do + project_hook.update!(recent_failures: 4) + + expect(service_instance.execute).to eq({ status: :error, message: 'Hook disabled' }) + end + it 'handles exceptions' do - exceptions = [SocketError, OpenSSL::SSL::SSLError, Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, Net::OpenTimeout, Net::ReadTimeout, Gitlab::HTTP::BlockedUrlError, Gitlab::HTTP::RedirectionTooDeep] + exceptions = [ + SocketError, OpenSSL::SSL::SSLError, Errno::ECONNRESET, Errno::ECONNREFUSED, + Errno::EHOSTUNREACH, Net::OpenTimeout, Net::ReadTimeout, + Gitlab::HTTP::BlockedUrlError, Gitlab::HTTP::RedirectionTooDeep + ] exceptions.each do |exception_class| exception = exception_class.new('Exception message') + project_hook.enable! stub_full_request(project_hook.url, method: :post).to_raise(exception) expect(service_instance.execute).to eq({ status: :error, message: exception.to_s }) @@ -166,10 +181,11 @@ context 'with success' do before do stub_full_request(project_hook.url, method: :post).to_return(status: 200, body: 'Success') - service_instance.execute end it 'log successful execution' do + service_instance.execute + expect(hook_log.trigger).to eq('push_hooks') expect(hook_log.url).to eq(project_hook.url) expect(hook_log.request_headers).to eq(headers) @@ -178,15 +194,62 @@ expect(hook_log.execution_duration).to be > 0 expect(hook_log.internal_error_message).to be_nil end + + it 'does not increment the failure count' do + expect { service_instance.execute }.not_to change(project_hook, :recent_failures) + end + + it 'does not change the disabled_until attribute' do + expect { service_instance.execute }.not_to change(project_hook, :disabled_until) + end + + context 'when the hook had previously failed' do + before do + project_hook.update!(recent_failures: 2) + end + + it 'resets the failure count' do + expect { service_instance.execute }.to change(project_hook, :recent_failures).to(0) + end + end + end + + context 'with bad request' do + before do + stub_full_request(project_hook.url, method: :post).to_return(status: 400, body: 'Bad request') + end + + it 'logs failed execution' do + service_instance.execute + + expect(hook_log).to have_attributes( + trigger: eq('push_hooks'), + url: eq(project_hook.url), + request_headers: eq(headers), + response_body: eq('Bad request'), + response_status: eq('400'), + execution_duration: be > 0, + internal_error_message: be_nil + ) + end + + it 'increments the failure count' do + expect { service_instance.execute }.to change(project_hook, :recent_failures).by(1) + end + + it 'does not change the disabled_until attribute' do + expect { service_instance.execute }.not_to change(project_hook, :disabled_until) + end end context 'with exception' do before do stub_full_request(project_hook.url, method: :post).to_raise(SocketError.new('Some HTTP Post error')) - service_instance.execute end it 'log failed execution' do + service_instance.execute + expect(hook_log.trigger).to eq('push_hooks') expect(hook_log.url).to eq(project_hook.url) expect(hook_log.request_headers).to eq(headers) @@ -195,6 +258,47 @@ expect(hook_log.execution_duration).to be > 0 expect(hook_log.internal_error_message).to eq('Some HTTP Post error') end + + it 'does not increment the failure count' do + expect { service_instance.execute }.not_to change(project_hook, :recent_failures) + end + + it 'sets the disabled_until attribute' do + expect { service_instance.execute } + .to change(project_hook, :disabled_until).to(project_hook.next_backoff.from_now) + end + + it 'increases the backoff count' do + expect { service_instance.execute }.to change(project_hook, :backoff_count).by(1) + end + + context 'when the previous cool-off was near the maximum' do + before do + project_hook.update!(disabled_until: 5.minutes.ago, backoff_count: 8) + end + + it 'sets the disabled_until attribute' do + expect { service_instance.execute }.to change(project_hook, :disabled_until).to(1.day.from_now) + end + + it 'sets the last_backoff attribute' do + expect { service_instance.execute }.to change(project_hook, :backoff_count).by(1) + end + end + + context 'when we have backed-off many many times' do + before do + project_hook.update!(disabled_until: 5.minutes.ago, backoff_count: 365) + end + + it 'sets the disabled_until attribute' do + expect { service_instance.execute }.to change(project_hook, :disabled_until).to(1.day.from_now) + end + + it 'sets the last_backoff attribute' do + expect { service_instance.execute }.to change(project_hook, :backoff_count).by(1) + end + end end context 'with unsafe response body' do