diff --git a/.rubocop_todo/layout/argument_alignment.yml b/.rubocop_todo/layout/argument_alignment.yml index 35ed83513eae9f26fee1e008c1d257dc3961fcd9..ec7edb2d33a95e34e4a3334f77e7b07551821452 100644 --- a/.rubocop_todo/layout/argument_alignment.yml +++ b/.rubocop_todo/layout/argument_alignment.yml @@ -1736,7 +1736,6 @@ Layout/ArgumentAlignment: - 'lib/gitlab/seeders/ci/runner/runner_fleet_pipeline_seeder.rb' - 'lib/gitlab/setup_helper.rb' - 'lib/gitlab/sidekiq_config/worker.rb' - - 'lib/gitlab/sidekiq_daemon/memory_killer.rb' - 'lib/gitlab/spamcheck/client.rb' - 'lib/gitlab/usage/metrics/instrumentations/database_metric.rb' - 'lib/gitlab/usage_data.rb' diff --git a/.rubocop_todo/layout/line_length.yml b/.rubocop_todo/layout/line_length.yml index d3181bbeef2619b712f3c8bfaafa8c8df2a3d87e..b93145d89f6ba27097808a5999d51cfa5b167979 100644 --- a/.rubocop_todo/layout/line_length.yml +++ b/.rubocop_todo/layout/line_length.yml @@ -2932,7 +2932,6 @@ Layout/LineLength: - 'lib/gitlab/setup_helper.rb' - 'lib/gitlab/sidekiq_config.rb' - 'lib/gitlab/sidekiq_config/worker_router.rb' - - 'lib/gitlab/sidekiq_daemon/memory_killer.rb' - 'lib/gitlab/sidekiq_daemon/monitor.rb' - 'lib/gitlab/sidekiq_middleware/duplicate_jobs/strategies/deduplicates_when_scheduling.rb' - 'lib/gitlab/sidekiq_middleware/server_metrics.rb' @@ -4197,7 +4196,6 @@ Layout/LineLength: - 'spec/lib/gitlab/search_results_spec.rb' - 'spec/lib/gitlab/serializer/pagination_spec.rb' - 'spec/lib/gitlab/sidekiq_config/worker_router_spec.rb' - - 'spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb' - 'spec/lib/gitlab/sidekiq_logging/structured_logger_spec.rb' - 'spec/lib/gitlab/sidekiq_middleware/client_metrics_spec.rb' - 'spec/lib/gitlab/sidekiq_middleware/duplicate_jobs/duplicate_job_spec.rb' diff --git a/.rubocop_todo/lint/empty_block.yml b/.rubocop_todo/lint/empty_block.yml index 4cd66fdc30f68506c70a74421010606b7e915c9a..ae61e7cb862f073a7633fa9b633ec9d16a641e17 100644 --- a/.rubocop_todo/lint/empty_block.yml +++ b/.rubocop_todo/lint/empty_block.yml @@ -139,7 +139,6 @@ Lint/EmptyBlock: - 'spec/lib/gitlab/quick_actions/extractor_spec.rb' - 'spec/lib/gitlab/search_context/builder_spec.rb' - 'spec/lib/gitlab/session_spec.rb' - - 'spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb' - 'spec/lib/gitlab/sidekiq_logging/structured_logger_spec.rb' - 'spec/lib/gitlab/sidekiq_middleware/extra_done_log_metadata_spec.rb' - 'spec/lib/gitlab/sidekiq_middleware/server_metrics_spec.rb' diff --git a/.rubocop_todo/rspec/context_wording.yml b/.rubocop_todo/rspec/context_wording.yml index 5cdfbef0acca2c91ee2bef4b6b6f7903590848b6..679d9ba091aa79f5db7ed74dedc315d74006a689 100644 --- a/.rubocop_todo/rspec/context_wording.yml +++ b/.rubocop_todo/rspec/context_wording.yml @@ -2020,7 +2020,6 @@ RSpec/ContextWording: - 'spec/lib/gitlab/search_context/controller_concern_spec.rb' - 'spec/lib/gitlab/search_results_spec.rb' - 'spec/lib/gitlab/sidekiq_config/worker_router_spec.rb' - - 'spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb' - 'spec/lib/gitlab/sidekiq_logging/json_formatter_spec.rb' - 'spec/lib/gitlab/sidekiq_middleware/admin_mode/client_spec.rb' - 'spec/lib/gitlab/sidekiq_middleware/admin_mode/server_spec.rb' diff --git a/.rubocop_todo/rspec/missing_feature_category.yml b/.rubocop_todo/rspec/missing_feature_category.yml index 2941f2999d314337a6d444255511309b5c25ef3b..3f57c57ab61a6ff888b8c044a9c3a93691216b3e 100644 --- a/.rubocop_todo/rspec/missing_feature_category.yml +++ b/.rubocop_todo/rspec/missing_feature_category.yml @@ -4396,7 +4396,6 @@ RSpec/MissingFeatureCategory: - 'spec/lib/gitlab/sidekiq_config/worker_router_spec.rb' - 'spec/lib/gitlab/sidekiq_config/worker_spec.rb' - 'spec/lib/gitlab/sidekiq_config_spec.rb' - - 'spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb' - 'spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb' - 'spec/lib/gitlab/sidekiq_death_handler_spec.rb' - 'spec/lib/gitlab/sidekiq_logging/deduplication_logger_spec.rb' diff --git a/.rubocop_todo/rspec/return_from_stub.yml b/.rubocop_todo/rspec/return_from_stub.yml index 41428687eafca23d221bd363d036f51662e403ac..da4c41e2f6dc8c4610a656b6ba2f9967c5494a9a 100644 --- a/.rubocop_todo/rspec/return_from_stub.yml +++ b/.rubocop_todo/rspec/return_from_stub.yml @@ -162,7 +162,6 @@ RSpec/ReturnFromStub: - 'spec/lib/gitlab/redis/shared_state_spec.rb' - 'spec/lib/gitlab/redis/sidekiq_status_spec.rb' - 'spec/lib/gitlab/relative_positioning/range_spec.rb' - - 'spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb' - 'spec/lib/gitlab/sidekiq_logging/structured_logger_spec.rb' - 'spec/lib/gitlab/sidekiq_middleware/duplicate_jobs/duplicate_job_spec.rb' - 'spec/lib/gitlab/sidekiq_middleware/duplicate_jobs/strategies/until_executed_spec.rb' diff --git a/app/workers/gitlab/github_import/stage/finish_import_worker.rb b/app/workers/gitlab/github_import/stage/finish_import_worker.rb index 5197c1e1e881eb4f2ef400e849823b3767b598f4..e716eda5c997e2065cfd84eeb279a78533ddbf0c 100644 --- a/app/workers/gitlab/github_import/stage/finish_import_worker.rb +++ b/app/workers/gitlab/github_import/stage/finish_import_worker.rb @@ -12,10 +12,6 @@ class FinishImportWorker # rubocop:disable Scalability/IdempotentWorker include GithubImport::Queue include StageMethods - # technical debt: https://gitlab.com/gitlab-org/gitlab/issues/33991 - sidekiq_options memory_killer_memory_growth_kb: ENV.fetch('MEMORY_KILLER_FINISH_IMPORT_WORKER_MEMORY_GROWTH_KB', 50).to_i - sidekiq_options memory_killer_max_memory_growth_kb: ENV.fetch('MEMORY_KILLER_FINISH_IMPORT_WORKER_MAX_MEMORY_GROWTH_KB', 200_000).to_i - # project - An instance of Project. def import(_, project) @project = project diff --git a/app/workers/gitlab/github_import/stage/import_repository_worker.rb b/app/workers/gitlab/github_import/stage/import_repository_worker.rb index e13f43ee1f3b4de4d7059806beed1ad1f226772f..d998771b32854766596c4941f0bf86ece008ee20 100644 --- a/app/workers/gitlab/github_import/stage/import_repository_worker.rb +++ b/app/workers/gitlab/github_import/stage/import_repository_worker.rb @@ -12,10 +12,6 @@ class ImportRepositoryWorker # rubocop:disable Scalability/IdempotentWorker include GithubImport::Queue include StageMethods - # technical debt: https://gitlab.com/gitlab-org/gitlab/issues/33991 - sidekiq_options memory_killer_memory_growth_kb: ENV.fetch('MEMORY_KILLER_IMPORT_REPOSITORY_WORKER_MEMORY_GROWTH_KB', 50).to_i - sidekiq_options memory_killer_max_memory_growth_kb: ENV.fetch('MEMORY_KILLER_IMPORT_REPOSITORY_WORKER_MAX_MEMORY_GROWTH_KB', 300_000).to_i - # client - An instance of Gitlab::GithubImport::Client. # project - An instance of Project. def import(client, project) diff --git a/app/workers/repository_import_worker.rb b/app/workers/repository_import_worker.rb index 641b22918961279367fb397da37d594b234a52f4..c16071e8e31c05561734a2c344c16a79f2cf0489 100644 --- a/app/workers/repository_import_worker.rb +++ b/app/workers/repository_import_worker.rb @@ -14,10 +14,6 @@ class RepositoryImportWorker # rubocop:disable Scalability/IdempotentWorker sidekiq_options status_expiration: Gitlab::Import::StuckImportJob::IMPORT_JOBS_EXPIRATION worker_resource_boundary :memory - # technical debt: https://gitlab.com/gitlab-org/gitlab/issues/33991 - sidekiq_options memory_killer_memory_growth_kb: ENV.fetch('MEMORY_KILLER_REPOSITORY_IMPORT_WORKER_MEMORY_GROWTH_KB', 50).to_i - sidekiq_options memory_killer_max_memory_growth_kb: ENV.fetch('MEMORY_KILLER_REPOSITORY_IMPORT_WORKER_MAX_MEMORY_GROWTH_KB', 300_000).to_i - def perform(project_id) @project = Project.find_by_id(project_id) return if project.nil? || !start_import? diff --git a/config/feature_flags/ops/sidekiq_memory_killer_read_only_mode.yml b/config/feature_flags/ops/sidekiq_memory_killer_read_only_mode.yml deleted file mode 100644 index aa5cb754aff99c5d63ff737e520ef1e75cd45f41..0000000000000000000000000000000000000000 --- a/config/feature_flags/ops/sidekiq_memory_killer_read_only_mode.yml +++ /dev/null @@ -1,7 +0,0 @@ ---- -name: sidekiq_memory_killer_read_only_mode -introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/98519 -milestone: '15.5' -type: ops -group: group::application performance -default_enabled: false diff --git a/config/initializers/sidekiq.rb b/config/initializers/sidekiq.rb index cf8df814990c5a673b0c42caed87eb22b2f24294..b2baccf886fb9d6ed95987e9b7f2877fd6403239 100644 --- a/config/initializers/sidekiq.rb +++ b/config/initializers/sidekiq.rb @@ -32,8 +32,6 @@ def enable_semi_reliable_fetch_mode? queues_config_hash[:namespace] = Gitlab::Redis::Queues::SIDEKIQ_NAMESPACE enable_json_logs = Gitlab.config.sidekiq.log_format == 'json' -enable_sidekiq_memory_killer = ENV['SIDEKIQ_MEMORY_KILLER_MAX_RSS'].to_i.nonzero? && - !Gitlab::Utils.to_boolean(ENV['GITLAB_MEMORY_WATCHDOG_ENABLED'], default: true) Sidekiq.configure_server do |config| config[:strict] = false @@ -70,8 +68,6 @@ def enable_semi_reliable_fetch_mode? # To cancel job, it requires `SIDEKIQ_MONITOR_WORKER=1` to enable notification channel Gitlab::SidekiqDaemon::Monitor.instance.start - Gitlab::SidekiqDaemon::MemoryKiller.instance.start if enable_sidekiq_memory_killer - first_sidekiq_worker = !ENV['SIDEKIQ_WORKER_ID'] || ENV['SIDEKIQ_WORKER_ID'] == '0' health_checks = Settings.monitoring.sidekiq_health_checks diff --git a/doc/administration/sidekiq/sidekiq_memory_killer.md b/doc/administration/sidekiq/sidekiq_memory_killer.md index cb27d44a2e631c6b4b9fd2920bc5ea2db9bc978e..63d93919129313e9c365b385e6b73f762190ba58 100644 --- a/doc/administration/sidekiq/sidekiq_memory_killer.md +++ b/doc/administration/sidekiq/sidekiq_memory_killer.md @@ -56,9 +56,7 @@ Sidekiq memory limits are controlled using environment variables. If jobs do not finish during that time, all currently running jobs are interrupted with a `SIGTERM` signal sent to the Sidekiq process. -- `GITLAB_MEMORY_WATCHDOG_ENABLED`: enabled by default. Set the `GITLAB_MEMORY_WATCHDOG_ENABLED` to false, to use legacy - Daemon Sidekiq Memory Killer implementation used prior GitLab 15.9. Support for setting `GITLAB_MEMORY_WATCHDOG_ENABLED` - will be removed in GitLab 16.0. +- `GITLAB_MEMORY_WATCHDOG_ENABLED`: enabled by default. Set the `GITLAB_MEMORY_WATCHDOG_ENABLED` to false, to disable Watchdog from running. ### Monitor worker restarts diff --git a/lib/gitlab/sidekiq_daemon/memory_killer.rb b/lib/gitlab/sidekiq_daemon/memory_killer.rb deleted file mode 100644 index 1682d62d782d3786f8ef23caaa7436d403679e8f..0000000000000000000000000000000000000000 --- a/lib/gitlab/sidekiq_daemon/memory_killer.rb +++ /dev/null @@ -1,293 +0,0 @@ -# frozen_string_literal: true - -module Gitlab - module SidekiqDaemon - class MemoryKiller < Daemon - include ::Gitlab::Utils::StrongMemoize - - # Today 64-bit CPU support max 256T memory. It is big enough. - MAX_MEMORY_KB = 256 * 1024 * 1024 * 1024 - # RSS below `soft_limit_rss` is considered safe - SOFT_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_MAX_RSS', 2000000).to_i - # RSS above `hard_limit_rss` will be stopped - HARD_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS', MAX_MEMORY_KB).to_i - # RSS in range (soft_limit_rss, hard_limit_rss) is allowed for GRACE_BALLOON_SECONDS - GRACE_BALLOON_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_GRACE_TIME', 15 * 60).to_i - # Check RSS every CHECK_INTERVAL_SECONDS, minimum 2 seconds - CHECK_INTERVAL_SECONDS = [ENV.fetch('SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL', 3).to_i, 2].max - # Give Sidekiq up to 30 seconds to allow existing jobs to finish after exceeding the limit - SHUTDOWN_TIMEOUT_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT', 30).to_i - # Developer/admin should always set `memory_killer_max_memory_growth_kb` explicitly - # In case not set, default to 300M. This is for extra-safe. - DEFAULT_MAX_MEMORY_GROWTH_KB = 300_000 - - # Phases of memory killer - PHASE = { - running: 1, - above_soft_limit: 2, - stop_fetching_new_jobs: 3, - shutting_down: 4, - killing_sidekiq: 5 - }.freeze - - def initialize - super - - @enabled = true - @metrics = init_metrics - @sidekiq_daemon_monitor = Gitlab::SidekiqDaemon::Monitor.instance - end - - private - - def init_metrics - { - sidekiq_current_rss: ::Gitlab::Metrics.gauge(:sidekiq_current_rss, 'Current RSS of Sidekiq Worker'), - sidekiq_memory_killer_soft_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_soft_limit_rss, 'Current soft_limit_rss of Sidekiq Worker'), - sidekiq_memory_killer_hard_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_hard_limit_rss, 'Current hard_limit_rss of Sidekiq Worker'), - sidekiq_memory_killer_phase: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_phase, 'Current phase of Sidekiq Worker'), - sidekiq_memory_killer_running_jobs: ::Gitlab::Metrics.counter(:sidekiq_memory_killer_running_jobs_total, 'Current running jobs when limit was reached') - } - end - - def refresh_state(phase) - @phase = PHASE.fetch(phase) - @current_rss = get_rss_kb - @soft_limit_rss = get_soft_limit_rss_kb - @hard_limit_rss = get_hard_limit_rss_kb - @memory_total = get_memory_total_kb - - # track the current state as prometheus gauges - @metrics[:sidekiq_memory_killer_phase].set({}, @phase) - @metrics[:sidekiq_current_rss].set({}, @current_rss) - @metrics[:sidekiq_memory_killer_soft_limit_rss].set({}, @soft_limit_rss) - @metrics[:sidekiq_memory_killer_hard_limit_rss].set({}, @hard_limit_rss) - end - - def run_thread - Sidekiq.logger.info( - class: self.class.to_s, - action: 'start', - pid: pid, - message: 'Starting Gitlab::SidekiqDaemon::MemoryKiller Daemon' - ) - - while enabled? - begin - sleep(CHECK_INTERVAL_SECONDS) - restart_sidekiq unless rss_within_range? - rescue StandardError => e - log_exception(e, __method__) - rescue Exception => e # rubocop:disable Lint/RescueException - log_exception(e, __method__) - raise e - end - end - ensure - Sidekiq.logger.warn( - class: self.class.to_s, - action: 'stop', - pid: pid, - message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon' - ) - end - - def log_exception(exception, method) - Sidekiq.logger.warn( - class: self.class.to_s, - pid: pid, - message: "Exception from #{method}: #{exception.message}" - ) - end - - def stop_working - @enabled = false - end - - def enabled? - @enabled - end - - def restart_sidekiq - return if Feature.enabled?(:sidekiq_memory_killer_read_only_mode, type: :ops) - - # Tell Sidekiq to stop fetching new jobs - # We first SIGNAL and then wait given time - # We also monitor a number of running jobs and allow to restart early - refresh_state(:stop_fetching_new_jobs) - signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs') - return unless enabled? - - # Tell sidekiq to restart itself - # Keep extra safe to wait `Sidekiq[:timeout] + 2` seconds before SIGKILL - refresh_state(:shutting_down) - signal_and_wait(Sidekiq[:timeout] + 2, 'SIGTERM', 'gracefully shut down') - return unless enabled? - - # Ideally we should never reach this condition - # Wait for Sidekiq to shutdown gracefully, and kill it if it didn't - # Kill the whole pgroup, so we can be sure no children are left behind - refresh_state(:killing_sidekiq) - signal_pgroup('SIGKILL', 'die') - end - - def rss_within_range? - refresh_state(:running) - - deadline = Gitlab::Metrics::System.monotonic_time + GRACE_BALLOON_SECONDS.seconds - loop do - return true unless enabled? - - # RSS go above hard limit should trigger forcible shutdown right away - break if @current_rss > @hard_limit_rss - - # RSS go below the soft limit - return true if @current_rss < @soft_limit_rss - - # RSS did not go below the soft limit within deadline, restart - break if Gitlab::Metrics::System.monotonic_time > deadline - - sleep(CHECK_INTERVAL_SECONDS) - - refresh_state(:above_soft_limit) - - log_rss_out_of_range(false) - end - - # There are two chances to break from loop: - # - above hard limit, or - # - above soft limit after deadline - # When `above hard limit`, it immediately go to `stop_fetching_new_jobs` - # So ignore `above hard limit` and always set `above_soft_limit` here - refresh_state(:above_soft_limit) - log_rss_out_of_range - - false - end - - def log_rss_out_of_range(deadline_exceeded = true) - reason = out_of_range_description(@current_rss, - @hard_limit_rss, - @soft_limit_rss, - deadline_exceeded) - - running_jobs = fetch_running_jobs - - Sidekiq.logger.warn( - class: self.class.to_s, - pid: pid, - message: 'Sidekiq worker RSS out of range', - current_rss: @current_rss, - soft_limit_rss: @soft_limit_rss, - hard_limit_rss: @hard_limit_rss, - memory_total_kb: @memory_total, - reason: reason, - running_jobs: running_jobs) - - increment_worker_counters(running_jobs, deadline_exceeded) - end - - def increment_worker_counters(running_jobs, deadline_exceeded) - running_jobs.each do |job| - @metrics[:sidekiq_memory_killer_running_jobs].increment({ worker_class: job[:worker_class], deadline_exceeded: deadline_exceeded }) - end - end - - def fetch_running_jobs - @sidekiq_daemon_monitor.jobs.map do |jid, job| - { - jid: jid, - worker_class: job[:worker_class].name - } - end - end - - def out_of_range_description(rss, hard_limit, soft_limit, deadline_exceeded) - if rss > hard_limit - "current_rss(#{rss}) > hard_limit_rss(#{hard_limit})" - elsif deadline_exceeded - "current_rss(#{rss}) > soft_limit_rss(#{soft_limit}) longer than GRACE_BALLOON_SECONDS(#{GRACE_BALLOON_SECONDS})" - else - "current_rss(#{rss}) > soft_limit_rss(#{soft_limit})" - end - end - - def get_memory_total_kb - Gitlab::Metrics::System.memory_total / 1.kilobytes - end - - def get_rss_kb - Gitlab::Metrics::System.memory_usage_rss[:total] / 1.kilobytes - end - - def get_soft_limit_rss_kb - SOFT_LIMIT_RSS_KB + rss_increase_by_jobs - end - - def get_hard_limit_rss_kb - HARD_LIMIT_RSS_KB - end - - def signal_and_wait(time, signal, explanation) - Sidekiq.logger.warn( - class: self.class.to_s, - pid: pid, - signal: signal, - explanation: explanation, - wait_time: time, - message: "Sending signal and waiting" - ) - Process.kill(signal, pid) - - deadline = Gitlab::Metrics::System.monotonic_time + time - - # Sleep until thread killed or timeout reached - sleep(CHECK_INTERVAL_SECONDS) while enabled? && Gitlab::Metrics::System.monotonic_time < deadline - end - - def signal_pgroup(signal, explanation) - if Process.getpgrp == pid - pid_or_pgrp_str = 'PGRP' - pid_to_signal = 0 - else - pid_or_pgrp_str = 'PID' - pid_to_signal = pid - end - - Sidekiq.logger.warn( - class: self.class.to_s, - signal: signal, - pid: pid, - message: "sending Sidekiq worker #{pid_or_pgrp_str}-#{pid} #{signal} (#{explanation})" - ) - Process.kill(signal, pid_to_signal) - end - - def rss_increase_by_jobs - @sidekiq_daemon_monitor.jobs.sum do |_, job| - rss_increase_by_job(job) - end - end - - def rss_increase_by_job(job) - memory_growth_kb = get_job_options(job, 'memory_killer_memory_growth_kb', 0).to_i - max_memory_growth_kb = get_job_options(job, 'memory_killer_max_memory_growth_kb', DEFAULT_MAX_MEMORY_GROWTH_KB).to_i - - return 0 if memory_growth_kb == 0 - - time_elapsed = [Gitlab::Metrics::System.monotonic_time - job[:started_at], 0].max - [memory_growth_kb * time_elapsed, max_memory_growth_kb].min - end - - def get_job_options(job, key, default) - job[:worker_class].sidekiq_options.fetch(key, default) - rescue StandardError - default - end - - def pid - Process.pid - end - end - end -end diff --git a/spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb b/spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb deleted file mode 100644 index 6f46a5aea3b2eb40ed41aef3584162d14aa44a7a..0000000000000000000000000000000000000000 --- a/spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb +++ /dev/null @@ -1,562 +0,0 @@ -# frozen_string_literal: true - -require 'spec_helper' - -RSpec.describe Gitlab::SidekiqDaemon::MemoryKiller do - let(:memory_killer) { described_class.new } - let(:sidekiq_daemon_monitor) { instance_double(Gitlab::SidekiqDaemon::Monitor) } - let(:running_jobs) { {} } - let(:pid) { 12345 } - let(:worker) do - Class.new do - def self.name - 'DummyWorker' - end - end - end - - before do - stub_const('DummyWorker', worker) - allow(Sidekiq.logger).to receive(:info) - allow(Sidekiq.logger).to receive(:warn) - allow(Gitlab::SidekiqDaemon::Monitor).to receive(:instance).and_return(sidekiq_daemon_monitor) - allow(sidekiq_daemon_monitor).to receive(:jobs).and_return(running_jobs) - allow(memory_killer).to receive(:pid).and_return(pid) - - # make sleep no-op - allow(memory_killer).to receive(:sleep) {} - end - - describe '#run_thread' do - subject { memory_killer.send(:run_thread) } - - before do - # let enabled? return 3 times: true, true, false - allow(memory_killer).to receive(:enabled?).and_return(true, true, false) - end - - context 'when structured logging is used' do - it 'logs start message once' do - expect(Sidekiq.logger).to receive(:info).once - .with( - class: described_class.to_s, - action: 'start', - pid: pid, - message: 'Starting Gitlab::SidekiqDaemon::MemoryKiller Daemon') - - subject - end - - it 'logs StandardError message twice' do - expect(Sidekiq.logger).to receive(:warn).twice - .with( - class: described_class.to_s, - pid: pid, - message: "Exception from run_thread: My Exception") - - expect(memory_killer).to receive(:rss_within_range?) - .twice - .and_raise(StandardError, 'My Exception') - - expect { subject }.not_to raise_exception - end - - it 'logs exception message once and raise exception and log stop message' do - expect(Sidekiq.logger).to receive(:warn).once - .with( - class: described_class.to_s, - pid: pid, - message: "Exception from run_thread: My Exception") - - expect(memory_killer).to receive(:rss_within_range?) - .once - .and_raise(Exception, 'My Exception') - - expect(memory_killer).to receive(:sleep).with(Gitlab::SidekiqDaemon::MemoryKiller::CHECK_INTERVAL_SECONDS) - expect(Sidekiq.logger).to receive(:warn).once - .with( - class: described_class.to_s, - action: 'stop', - pid: pid, - message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon') - - expect { subject }.to raise_exception(Exception, 'My Exception') - end - - it 'logs stop message once' do - expect(Sidekiq.logger).to receive(:warn).once - .with( - class: described_class.to_s, - action: 'stop', - pid: pid, - message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon') - - subject - end - end - - it 'not invoke restart_sidekiq when rss in range' do - expect(memory_killer).to receive(:rss_within_range?) - .twice - .and_return(true) - - expect(memory_killer).not_to receive(:restart_sidekiq) - - subject - end - - it 'invoke restart_sidekiq when rss not in range' do - expect(memory_killer).to receive(:rss_within_range?) - .at_least(:once) - .and_return(false) - - expect(memory_killer).to receive(:restart_sidekiq) - .at_least(:once) - - subject - end - end - - describe '#stop_working' do - subject { memory_killer.send(:stop_working) } - - it 'changes enable? to false' do - expect { subject }.to change { memory_killer.send(:enabled?) } - .from(true).to(false) - end - end - - describe '#rss_within_range?' do - let(:shutdown_timeout_seconds) { 7 } - let(:check_interval_seconds) { 2 } - let(:grace_balloon_seconds) { 5 } - - subject { memory_killer.send(:rss_within_range?) } - - before do - stub_const("#{described_class}::SHUTDOWN_TIMEOUT_SECONDS", shutdown_timeout_seconds) - stub_const("#{described_class}::CHECK_INTERVAL_SECONDS", check_interval_seconds) - stub_const("#{described_class}::GRACE_BALLOON_SECONDS", grace_balloon_seconds) - allow(Process).to receive(:getpgrp).and_return(pid) - allow(Sidekiq).to receive(:[]).with(:timeout).and_return(9) - end - - it 'return true when everything is within limit', :aggregate_failures do - expect(memory_killer).to receive(:get_rss_kb).and_return(100) - expect(memory_killer).to receive(:get_soft_limit_rss_kb).and_return(200) - expect(memory_killer).to receive(:get_hard_limit_rss_kb).and_return(300) - expect(memory_killer).to receive(:get_memory_total_kb).and_return(3072) - - expect(memory_killer).to receive(:refresh_state) - .with(:running) - .and_call_original - - expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original - expect(memory_killer).not_to receive(:log_rss_out_of_range) - - expect(subject).to be true - end - - it 'return false when rss exceeds hard_limit_rss', :aggregate_failures do - expect(memory_killer).to receive(:get_rss_kb).at_least(:once).and_return(400) - expect(memory_killer).to receive(:get_soft_limit_rss_kb).at_least(:once).and_return(200) - expect(memory_killer).to receive(:get_hard_limit_rss_kb).at_least(:once).and_return(300) - expect(memory_killer).to receive(:get_memory_total_kb).at_least(:once).and_return(3072) - - expect(memory_killer).to receive(:refresh_state) - .with(:running) - .and_call_original - - expect(memory_killer).to receive(:refresh_state) - .with(:above_soft_limit) - .and_call_original - - expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_call_original - - expect(memory_killer).to receive(:out_of_range_description).with(400, 300, 200, true) - - expect(subject).to be false - end - - it 'return false when rss exceed hard_limit_rss after a while', :aggregate_failures do - expect(memory_killer).to receive(:get_rss_kb).and_return(250, 400, 400) - expect(memory_killer).to receive(:get_soft_limit_rss_kb).at_least(:once).and_return(200) - expect(memory_killer).to receive(:get_hard_limit_rss_kb).at_least(:once).and_return(300) - expect(memory_killer).to receive(:get_memory_total_kb).at_least(:once).and_return(3072) - - expect(memory_killer).to receive(:refresh_state) - .with(:running) - .and_call_original - - expect(memory_killer).to receive(:refresh_state) - .at_least(:once) - .with(:above_soft_limit) - .and_call_original - - expect(Gitlab::Metrics::System).to receive(:monotonic_time).twice.and_call_original - expect(memory_killer).to receive(:sleep).with(check_interval_seconds) - expect(memory_killer).to receive(:out_of_range_description).with(400, 300, 200, false) - expect(memory_killer).to receive(:out_of_range_description).with(400, 300, 200, true) - - expect(subject).to be false - end - - it 'return true when rss below soft_limit_rss after a while within GRACE_BALLOON_SECONDS', :aggregate_failures do - expect(memory_killer).to receive(:get_rss_kb).and_return(250, 100) - expect(memory_killer).to receive(:get_soft_limit_rss_kb).and_return(200, 200) - expect(memory_killer).to receive(:get_hard_limit_rss_kb).and_return(300, 300) - expect(memory_killer).to receive(:get_memory_total_kb).and_return(3072, 3072) - - expect(memory_killer).to receive(:refresh_state) - .with(:running) - .and_call_original - - expect(memory_killer).to receive(:refresh_state) - .with(:above_soft_limit) - .and_call_original - - expect(Gitlab::Metrics::System).to receive(:monotonic_time).twice.and_call_original - expect(memory_killer).to receive(:sleep).with(check_interval_seconds) - - expect(memory_killer).to receive(:out_of_range_description).with(100, 300, 200, false) - - expect(subject).to be true - end - - context 'when exceeds GRACE_BALLOON_SECONDS' do - let(:grace_balloon_seconds) { 0 } - - it 'return false when rss exceed soft_limit_rss', :aggregate_failures do - allow(memory_killer).to receive(:get_rss_kb).and_return(250) - allow(memory_killer).to receive(:get_soft_limit_rss_kb).and_return(200) - allow(memory_killer).to receive(:get_hard_limit_rss_kb).and_return(300) - allow(memory_killer).to receive(:get_memory_total_kb).and_return(3072) - - expect(memory_killer).to receive(:refresh_state) - .with(:running) - .and_call_original - - expect(memory_killer).to receive(:refresh_state) - .with(:above_soft_limit) - .and_call_original - - expect(memory_killer).to receive(:out_of_range_description).with(250, 300, 200, true) - - expect(subject).to be false - end - end - end - - describe '#restart_sidekiq' do - let(:shutdown_timeout_seconds) { 7 } - - subject { memory_killer.send(:restart_sidekiq) } - - context 'when sidekiq_memory_killer_read_only_mode is enabled' do - before do - stub_feature_flags(sidekiq_memory_killer_read_only_mode: true) - end - - it 'does not send signal' do - expect(memory_killer).not_to receive(:refresh_state) - expect(memory_killer).not_to receive(:signal_and_wait) - - subject - end - end - - context 'when sidekiq_memory_killer_read_only_mode is disabled' do - before do - stub_const("#{described_class}::SHUTDOWN_TIMEOUT_SECONDS", shutdown_timeout_seconds) - stub_feature_flags(sidekiq_memory_killer_read_only_mode: false) - allow(Sidekiq).to receive(:[]).with(:timeout).and_return(9) - allow(memory_killer).to receive(:get_rss_kb).and_return(100) - allow(memory_killer).to receive(:get_soft_limit_rss_kb).and_return(200) - allow(memory_killer).to receive(:get_hard_limit_rss_kb).and_return(300) - allow(memory_killer).to receive(:get_memory_total_kb).and_return(3072) - end - - it 'send signal' do - expect(memory_killer).to receive(:refresh_state) - .with(:stop_fetching_new_jobs) - .ordered - .and_call_original - expect(memory_killer).to receive(:signal_and_wait) - .with(shutdown_timeout_seconds, 'SIGTSTP', 'stop fetching new jobs') - .ordered - - expect(memory_killer).to receive(:refresh_state) - .with(:shutting_down) - .ordered - .and_call_original - expect(memory_killer).to receive(:signal_and_wait) - .with(11, 'SIGTERM', 'gracefully shut down') - .ordered - - expect(memory_killer).to receive(:refresh_state) - .with(:killing_sidekiq) - .ordered - .and_call_original - expect(memory_killer).to receive(:signal_pgroup) - .with('SIGKILL', 'die') - .ordered - - subject - end - end - end - - describe '#signal_and_wait' do - let(:time) { 0.1 } - let(:signal) { 'my-signal' } - let(:explanation) { 'my-explanation' } - let(:check_interval_seconds) { 0.1 } - - subject { memory_killer.send(:signal_and_wait, time, signal, explanation) } - - before do - stub_const("#{described_class}::CHECK_INTERVAL_SECONDS", check_interval_seconds) - end - - it 'send signal and wait till deadline' do - expect(Process).to receive(:kill) - .with(signal, pid) - .ordered - - expect(Gitlab::Metrics::System).to receive(:monotonic_time) - .and_call_original - .at_least(3) - - expect(memory_killer).to receive(:enabled?).and_return(true).at_least(:twice) - expect(memory_killer).to receive(:sleep).at_least(:once).and_call_original - - subject - end - end - - describe '#signal_pgroup' do - let(:signal) { 'my-signal' } - let(:explanation) { 'my-explanation' } - - subject { memory_killer.send(:signal_pgroup, signal, explanation) } - - it 'send signal to this process if it is not group leader' do - expect(Process).to receive(:getpgrp).and_return(pid + 1) - - expect(Sidekiq.logger).to receive(:warn).once - .with( - class: described_class.to_s, - signal: signal, - pid: pid, - message: "sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})") - expect(Process).to receive(:kill).with(signal, pid).ordered - - subject - end - - it 'send signal to whole process group as group leader' do - expect(Process).to receive(:getpgrp).and_return(pid) - - expect(Sidekiq.logger).to receive(:warn).once - .with( - class: described_class.to_s, - signal: signal, - pid: pid, - message: "sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})") - expect(Process).to receive(:kill).with(signal, 0).ordered - - subject - end - end - - describe '#log_rss_out_of_range' do - let(:current_rss) { 100 } - let(:soft_limit_rss) { 200 } - let(:hard_limit_rss) { 300 } - let(:memory_total) { 3072 } - let(:jid) { 1 } - let(:reason) { 'rss out of range reason description' } - let(:queue) { 'default' } - - let(:metrics) { memory_killer.instance_variable_get(:@metrics) } - let(:running_jobs) { { jid => { worker_class: DummyWorker } } } - - before do - allow(memory_killer).to receive(:get_rss_kb).and_return(*current_rss) - allow(memory_killer).to receive(:get_soft_limit_rss_kb).and_return(soft_limit_rss) - allow(memory_killer).to receive(:get_hard_limit_rss_kb).and_return(hard_limit_rss) - allow(memory_killer).to receive(:get_memory_total_kb).and_return(memory_total) - - memory_killer.send(:refresh_state, :running) - end - - subject { memory_killer.send(:log_rss_out_of_range) } - - it 'invoke sidekiq logger warn' do - expect(memory_killer).to receive(:out_of_range_description).with(current_rss, hard_limit_rss, soft_limit_rss, true).and_return(reason) - expect(Sidekiq.logger).to receive(:warn) - .with( - class: described_class.to_s, - pid: pid, - message: 'Sidekiq worker RSS out of range', - current_rss: current_rss, - hard_limit_rss: hard_limit_rss, - soft_limit_rss: soft_limit_rss, - reason: reason, - running_jobs: [jid: jid, worker_class: 'DummyWorker'], - memory_total_kb: memory_total) - - expect(metrics[:sidekiq_memory_killer_running_jobs]).to receive(:increment) - .with({ worker_class: "DummyWorker", deadline_exceeded: true }) - - subject - end - end - - describe '#out_of_range_description' do - let(:hard_limit) { 300 } - let(:soft_limit) { 200 } - let(:grace_balloon_seconds) { 12 } - let(:deadline_exceeded) { true } - - subject { memory_killer.send(:out_of_range_description, rss, hard_limit, soft_limit, deadline_exceeded) } - - context 'when rss > hard_limit' do - let(:rss) { 400 } - - it 'tells reason' do - expect(subject).to eq("current_rss(#{rss}) > hard_limit_rss(#{hard_limit})") - end - end - - context 'when rss <= hard_limit' do - let(:rss) { 300 } - - context 'deadline exceeded' do - let(:deadline_exceeded) { true } - - it 'tells reason' do - stub_const("#{described_class}::GRACE_BALLOON_SECONDS", grace_balloon_seconds) - expect(subject).to eq("current_rss(#{rss}) > soft_limit_rss(#{soft_limit}) longer than GRACE_BALLOON_SECONDS(#{grace_balloon_seconds})") - end - end - - context 'deadline not exceeded' do - let(:deadline_exceeded) { false } - - it 'tells reason' do - expect(subject).to eq("current_rss(#{rss}) > soft_limit_rss(#{soft_limit})") - end - end - end - end - - describe '#rss_increase_by_jobs' do - let(:running_jobs) { { 'job1' => { worker_class: "Job1" }, 'job2' => { worker_class: "Job2" } } } - - subject { memory_killer.send(:rss_increase_by_jobs) } - - before do - allow(memory_killer).to receive(:rss_increase_by_job).and_return(11, 22) - end - - it 'adds up individual rss_increase_by_job' do - expect(subject).to eq(33) - end - - context 'when there is no running job' do - let(:running_jobs) { {} } - - it 'return 0 if no job' do - expect(subject).to eq(0) - end - end - end - - describe '#rss_increase_by_job' do - let(:worker_class) { Chaos::SleepWorker } - let(:job) { { worker_class: worker_class, started_at: 321 } } - let(:max_memory_kb) { 100000 } - - subject { memory_killer.send(:rss_increase_by_job, job) } - - before do - stub_const("#{described_class}::DEFAULT_MAX_MEMORY_GROWTH_KB", max_memory_kb) - end - - it 'return 0 if memory_growth_kb return 0' do - expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_memory_growth_kb', 0).and_return(0) - expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_max_memory_growth_kb', max_memory_kb).and_return(0) - - expect(Time).not_to receive(:now) - expect(subject).to eq(0) - end - - it 'return time factored growth value when it does not exceed max growth limit for whilited job' do - expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_memory_growth_kb', 0).and_return(10) - expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_max_memory_growth_kb', max_memory_kb).and_return(100) - - expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(323) - expect(subject).to eq(20) - end - - it 'return max growth limit when time factored growth value exceed max growth limit for whilited job' do - expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_memory_growth_kb', 0).and_return(10) - expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_max_memory_growth_kb', max_memory_kb).and_return(100) - - expect(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(332) - expect(subject).to eq(100) - end - end - - describe '#get_job_options' do - let(:worker_class) { Chaos::SleepWorker } - let(:job) { { worker_class: worker_class, started_at: 321 } } - let(:key) { 'my-key' } - let(:default) { 'my-default' } - - subject { memory_killer.send(:get_job_options, job, key, default) } - - it 'return default if key is not defined' do - expect(worker_class).to receive(:sidekiq_options).and_return({ "retry" => 5 }) - - expect(subject).to eq(default) - end - - it 'return default if get StandardError when retrieve sidekiq_options' do - expect(worker_class).to receive(:sidekiq_options).and_raise(StandardError) - - expect(subject).to eq(default) - end - - it 'return right value if sidekiq_options has the key' do - expect(worker_class).to receive(:sidekiq_options).and_return({ key => 10 }) - - expect(subject).to eq(10) - end - end - - describe '#refresh_state' do - let(:metrics) { memory_killer.instance_variable_get(:@metrics) } - - subject { memory_killer.send(:refresh_state, :shutting_down) } - - it 'calls gitlab metrics gauge set methods' do - expect(memory_killer).to receive(:get_rss_kb) { 1010 } - expect(memory_killer).to receive(:get_soft_limit_rss_kb) { 1020 } - expect(memory_killer).to receive(:get_hard_limit_rss_kb) { 1040 } - expect(memory_killer).to receive(:get_memory_total_kb) { 3072 } - - expect(metrics[:sidekiq_memory_killer_phase]).to receive(:set) - .with({}, described_class::PHASE[:shutting_down]) - expect(metrics[:sidekiq_current_rss]).to receive(:set) - .with({}, 1010) - expect(metrics[:sidekiq_memory_killer_soft_limit_rss]).to receive(:set) - .with({}, 1020) - expect(metrics[:sidekiq_memory_killer_hard_limit_rss]).to receive(:set) - .with({}, 1040) - - subject - end - end -end diff --git a/spec/support/rspec_order_todo.yml b/spec/support/rspec_order_todo.yml index 3b35b43d25c6020dd186361dcc0392f92e3a3615..71a9ac4652c3bb02a617969878be8086e4b89c24 100644 --- a/spec/support/rspec_order_todo.yml +++ b/spec/support/rspec_order_todo.yml @@ -7007,7 +7007,6 @@ - './spec/lib/gitlab/sidekiq_config/worker_matcher_spec.rb' - './spec/lib/gitlab/sidekiq_config/worker_router_spec.rb' - './spec/lib/gitlab/sidekiq_config/worker_spec.rb' -- './spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb' - './spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb' - './spec/lib/gitlab/sidekiq_death_handler_spec.rb' - './spec/lib/gitlab/sidekiq_logging/deduplication_logger_spec.rb'