Commit 30c391b0, authored by Sean McGivern

Merge branch '370079-watchdog-memory-limits' into 'master'

Reap workers on excessive memory growth

See merge request gitlab-org/gitlab!96241
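This change extends the existing heap-fragmentation watchdog so it also reaps workers whose unique memory grows too far beyond a reference process. For orientation, a minimal sketch of how such a watchdog could be wired up on a background thread; the thread/loop wiring and logger choice are assumptions for illustration, not code from this merge request:

    # Hypothetical wiring, assuming the GitLab application environment; not part of this commit.
    require 'logger'

    watchdog = Gitlab::Memory::Watchdog.new(
      handler: Gitlab::Memory::Watchdog::TermProcessHandler.new, # sends SIGTERM to the current process
      logger: Logger.new($stdout),
      max_strikes: 5 # consecutive violations tolerated before the handler is invoked
    )

    # `call` loops: sleep, then check heap fragmentation and memory growth each iteration.
    monitor = Thread.new { watchdog.call }

    at_exit { watchdog.stop } # ask the loop to finish on shutdown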
@@ -4,6 +4,11 @@
 module Gitlab
   module Cluster
+    # We take advantage of the fact that the application is pre-loaded in the primary
+    # process. If it's a pre-fork server like Puma, this will be the Puma master process.
+    # Otherwise it is the worker itself such as for Sidekiq.
+    PRIMARY_PID = $$
+
     #
     # LifecycleEvents lets Rails initializers register application startup hooks
     # that are sensitive to forking. For example, to defer the creation of
...
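The PRIMARY_PID constant works because this file is loaded in the primary process before any forking happens, and forked children inherit the constant's value. A standalone plain-Ruby illustration of that behaviour (hypothetical, not from this diff):

    # Illustration only: a value captured before fork still names the parent afterwards.
    PARENT_PID = $$ # same as Process.pid at load time

    child = fork do
      puts Process.pid == PARENT_PID # => false: the child has its own PID
      puts PARENT_PID                # => still the parent's PID, inherited at fork time
    end

    Process.waitpid(child)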
@@ -16,8 +16,9 @@ module Memory
     # The duration for which a process may be above a given fragmentation
     # threshold is computed as `max_strikes * sleep_time_seconds`.
     class Watchdog
-      DEFAULT_SLEEP_TIME_SECONDS = 60
-      DEFAULT_HEAP_FRAG_THRESHOLD = 0.5
+      DEFAULT_SLEEP_TIME_SECONDS = 60 * 5
+      DEFAULT_MAX_HEAP_FRAG = 0.5
+      DEFAULT_MAX_MEM_GROWTH = 3.0
       DEFAULT_MAX_STRIKES = 5
 
       # This handler does nothing. It returns `false` to indicate to the
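With the new defaults, the tolerance window described at the top of the class (`max_strikes * sleep_time_seconds`) grows accordingly; a quick check using only values from this diff:

    sleep_time_seconds = 60 * 5 # DEFAULT_SLEEP_TIME_SECONDS
    max_strikes        = 5      # DEFAULT_MAX_STRIKES

    # Per the class comment, a process may stay above a threshold for roughly:
    max_strikes * sleep_time_seconds # => 1500 seconds, i.e. 25 minutes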
@@ -29,7 +30,7 @@ class Watchdog
       class NullHandler
         include Singleton
 
-        def on_high_heap_fragmentation(value)
+        def call
           # NOP
           false
         end
@@ -41,7 +42,7 @@ def initialize(pid = $$)
           @pid = pid
         end
 
-        def on_high_heap_fragmentation(value)
+        def call
           Process.kill(:TERM, @pid)
           true
         end
@@ -55,7 +56,7 @@ def initialize(puma_options = ::Puma.cli_config.options)
           @worker = ::Puma::Cluster::WorkerHandle.new(0, $$, 0, puma_options)
         end
 
-        def on_high_heap_fragmentation(value)
+        def call
           @worker.term
           true
         end
@@ -63,6 +64,9 @@ def on_high_heap_fragmentation(value)
       # max_heap_fragmentation:
       #   The degree to which the Ruby heap is allowed to be fragmented. Range [0,1].
+      # max_mem_growth:
+      #   A multiplier for how much excess private memory a worker can map compared to a reference process
+      #   (itself or the primary in a pre-fork server.)
       # max_strikes:
       #   How many times the process is allowed to be above max_heap_fragmentation before
       #   a handler is invoked.
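To make the multiplier concrete, a worked example with invented numbers (the 300 MiB reference value is purely illustrative):

    max_mem_growth = 3.0               # DEFAULT_MAX_MEM_GROWTH
    reference_uss  = 300 * 1024 * 1024 # e.g. the primary process maps ~300 MiB of unique memory

    memory_limit = max_mem_growth * reference_uss
    # => 943_718_400 bytes: the worker is in violation once its own USS exceeds ~900 MiB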
@@ -71,7 +75,8 @@ def on_high_heap_fragmentation(value)
       def initialize(
         handler: NullHandler.instance,
         logger: Logger.new($stdout),
-        max_heap_fragmentation: ENV['GITLAB_MEMWD_MAX_HEAP_FRAG']&.to_f || DEFAULT_HEAP_FRAG_THRESHOLD,
+        max_heap_fragmentation: ENV['GITLAB_MEMWD_MAX_HEAP_FRAG']&.to_f || DEFAULT_MAX_HEAP_FRAG,
+        max_mem_growth: ENV['GITLAB_MEMWD_MAX_MEM_GROWTH']&.to_f || DEFAULT_MAX_MEM_GROWTH,
         max_strikes: ENV['GITLAB_MEMWD_MAX_STRIKES']&.to_i || DEFAULT_MAX_STRIKES,
         sleep_time_seconds: ENV['GITLAB_MEMWD_SLEEP_TIME_SEC']&.to_i || DEFAULT_SLEEP_TIME_SECONDS,
         **options)
@@ -79,17 +84,37 @@ def initialize(
         @handler = handler
         @logger = logger
-        @max_heap_fragmentation = max_heap_fragmentation
         @sleep_time_seconds = sleep_time_seconds
         @max_strikes = max_strikes
 
+        @stats = {
+          heap_frag: {
+            max: max_heap_fragmentation,
+            strikes: 0
+          },
+          mem_growth: {
+            max: max_mem_growth,
+            strikes: 0
+          }
+        }
+
         @alive = true
-        @strikes = 0
 
         init_prometheus_metrics(max_heap_fragmentation)
       end
 
-      attr_reader :strikes, :max_heap_fragmentation, :max_strikes, :sleep_time_seconds
+      attr_reader :max_strikes, :sleep_time_seconds
+
+      def max_heap_fragmentation
+        @stats[:heap_frag][:max]
+      end
+
+      def max_mem_growth
+        @stats[:mem_growth][:max]
+      end
+
+      def strikes(stat)
+        @stats[stat][:strikes]
+      end
 
       def call
         @logger.info(log_labels.merge(message: 'started'))
@@ -97,7 +122,10 @@ def call
         while @alive
           sleep(@sleep_time_seconds)
 
-          monitor_heap_fragmentation if Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
+          next unless Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
+
+          monitor_heap_fragmentation
+          monitor_memory_growth
         end
 
         @logger.info(log_labels.merge(message: 'stopped'))
@@ -109,32 +137,73 @@ def stop
 
       private
 
-      def monitor_heap_fragmentation
-        heap_fragmentation = Gitlab::Metrics::Memory.gc_heap_fragmentation
+      def monitor_memory_condition(stat_key)
+        return unless @alive
 
-        if heap_fragmentation > @max_heap_fragmentation
-          @strikes += 1
-          @heap_frag_violations.increment
+        stat = @stats[stat_key]
+
+        ok, labels = yield(stat)
+
+        if ok
+          stat[:strikes] = 0
         else
-          @strikes = 0
+          stat[:strikes] += 1
+          @counter_violations.increment(reason: stat_key.to_s)
         end
 
-        if @strikes > @max_strikes
-          # If the handler returns true, it means the event is handled and we can shut down.
-          @alive = !handle_heap_fragmentation_limit_exceeded(heap_fragmentation)
-          @strikes = 0
+        if stat[:strikes] > @max_strikes
+          @alive = !memory_limit_exceeded_callback(stat_key, labels)
+          stat[:strikes] = 0
         end
       end
 
-      def handle_heap_fragmentation_limit_exceeded(value)
-        @logger.warn(
-          log_labels.merge(
-            message: 'heap fragmentation limit exceeded',
-            memwd_cur_heap_frag: value
-          ))
-        @heap_frag_violations_handled.increment
+      def monitor_heap_fragmentation
+        monitor_memory_condition(:heap_frag) do |stat|
+          heap_fragmentation = Gitlab::Metrics::Memory.gc_heap_fragmentation
+          [
+            heap_fragmentation <= stat[:max],
+            {
+              message: 'heap fragmentation limit exceeded',
+              memwd_cur_heap_frag: heap_fragmentation,
+              memwd_max_heap_frag: stat[:max]
+            }
+          ]
+        end
+      end
+
+      def monitor_memory_growth
+        monitor_memory_condition(:mem_growth) do |stat|
+          worker_uss = Gitlab::Metrics::System.memory_usage_uss_pss[:uss]
+          reference_uss = reference_mem[:uss]
+          memory_limit = stat[:max] * reference_uss
+          [
+            worker_uss <= memory_limit,
+            {
+              message: 'memory limit exceeded',
+              memwd_uss_bytes: worker_uss,
+              memwd_ref_uss_bytes: reference_uss,
+              memwd_max_uss_bytes: memory_limit
+            }
+          ]
+        end
+      end
+
+      # On pre-fork systems this would be the primary process memory from which workers fork.
+      # Otherwise it is the current process' memory.
+      #
+      # We initialize this lazily because in the initializer the application may not have
+      # finished booting yet, which would yield an incorrect baseline.
+      def reference_mem
+        @reference_mem ||= Gitlab::Metrics::System.memory_usage_uss_pss(pid: Gitlab::Cluster::PRIMARY_PID)
+      end
+
+      def memory_limit_exceeded_callback(stat_key, handler_labels)
+        all_labels = log_labels.merge(handler_labels)
+          .merge(memwd_cur_strikes: strikes(stat_key))
+        @logger.warn(all_labels)
+        @counter_violations_handled.increment(reason: stat_key.to_s)
 
-        handler.on_high_heap_fragmentation(value)
+        handler.call
       end
 
       def handler
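After this change the handler contract is uniform: the watchdog invokes `#call` with no arguments and treats a truthy result as "the violation was handled, stop monitoring". A hypothetical handler (not part of this merge request) that only logs and keeps the watchdog alive would need no more than:

    # Hypothetical handler satisfying the watchdog's protocol: respond to #call and
    # return true (handled, watchdog stops) or false (keep monitoring).
    class LogOnlyHandler
      def initialize(logger)
        @logger = logger
      end

      def call
        @logger.warn('memory watchdog violation observed; taking no action')
        false
      end
    end

    # Usage sketch:
    # Gitlab::Memory::Watchdog.new(handler: LogOnlyHandler.new(Logger.new($stdout)), logger: Logger.new($stdout))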
@@ -151,9 +220,7 @@ def log_labels
           worker_id: worker_id,
           memwd_handler_class: handler.class.name,
           memwd_sleep_time_s: @sleep_time_seconds,
-          memwd_max_heap_frag: @max_heap_fragmentation,
           memwd_max_strikes: @max_strikes,
-          memwd_cur_strikes: @strikes,
           memwd_rss_bytes: process_rss_bytes
         }
       end
@@ -174,14 +241,14 @@ def init_prometheus_metrics(max_heap_fragmentation)
         @heap_frag_limit.set({}, max_heap_fragmentation)
 
         default_labels = { pid: worker_id }
-        @heap_frag_violations = Gitlab::Metrics.counter(
-          :gitlab_memwd_heap_frag_violations_total,
-          'Total number of times heap fragmentation in a Ruby process exceeded its allowed maximum',
+        @counter_violations = Gitlab::Metrics.counter(
+          :gitlab_memwd_violations_total,
+          'Total number of times a Ruby process violated a memory threshold',
           default_labels
         )
-        @heap_frag_violations_handled = Gitlab::Metrics.counter(
-          :gitlab_memwd_heap_frag_violations_handled_total,
-          'Total number of times heap fragmentation violations in a Ruby process were handled',
+        @counter_violations_handled = Gitlab::Metrics.counter(
+          :gitlab_memwd_violations_handled_total,
+          'Total number of times Ruby process memory violations were handled',
           default_labels
         )
       end
...
 # frozen_string_literal: true
 
 require 'spec_helper'
+require_relative '../../../../lib/gitlab/cluster/lifecycle_events'
 
 RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do
   context 'watchdog' do
@@ -8,23 +9,31 @@
     let(:handler) { instance_double(described_class::NullHandler) }
     let(:heap_frag_limit_gauge) { instance_double(::Prometheus::Client::Gauge) }
-    let(:heap_frag_violations_counter) { instance_double(::Prometheus::Client::Counter) }
-    let(:heap_frag_violations_handled_counter) { instance_double(::Prometheus::Client::Counter) }
+    let(:violations_counter) { instance_double(::Prometheus::Client::Counter) }
+    let(:violations_handled_counter) { instance_double(::Prometheus::Client::Counter) }
 
     let(:sleep_time) { 0.1 }
     let(:max_heap_fragmentation) { 0.2 }
+    let(:max_mem_growth) { 2 }
+
+    # Defaults that will not trigger any events.
+    let(:fragmentation) { 0 }
+    let(:worker_memory) { 0 }
+    let(:primary_memory) { 0 }
+    let(:max_strikes) { 0 }
 
     # Tests should set this to control the number of loop iterations in `call`.
     let(:watchdog_iterations) { 1 }
 
     subject(:watchdog) do
       described_class.new(handler: handler, logger: logger, sleep_time_seconds: sleep_time,
-                          max_strikes: max_strikes, max_heap_fragmentation: max_heap_fragmentation).tap do |instance|
+                          max_strikes: max_strikes, max_mem_growth: max_mem_growth,
+                          max_heap_fragmentation: max_heap_fragmentation).tap do |instance|
         # We need to defuse `sleep` and stop the internal loop after N iterations.
         iterations = 0
-        expect(instance).to receive(:sleep) do
-          instance.stop if (iterations += 1) >= watchdog_iterations
-        end.at_most(watchdog_iterations)
+        allow(instance).to receive(:sleep) do
+          instance.stop if (iterations += 1) > watchdog_iterations
+        end
       end
     end
@@ -33,34 +42,35 @@ def stub_prometheus_metrics
         .with(:gitlab_memwd_heap_frag_limit, anything)
         .and_return(heap_frag_limit_gauge)
       allow(Gitlab::Metrics).to receive(:counter)
-        .with(:gitlab_memwd_heap_frag_violations_total, anything, anything)
-        .and_return(heap_frag_violations_counter)
+        .with(:gitlab_memwd_violations_total, anything, anything)
+        .and_return(violations_counter)
       allow(Gitlab::Metrics).to receive(:counter)
-        .with(:gitlab_memwd_heap_frag_violations_handled_total, anything, anything)
-        .and_return(heap_frag_violations_handled_counter)
+        .with(:gitlab_memwd_violations_handled_total, anything, anything)
+        .and_return(violations_handled_counter)
 
       allow(heap_frag_limit_gauge).to receive(:set)
-      allow(heap_frag_violations_counter).to receive(:increment)
-      allow(heap_frag_violations_handled_counter).to receive(:increment)
+      allow(violations_counter).to receive(:increment)
+      allow(violations_handled_counter).to receive(:increment)
    end
 
    before do
      stub_prometheus_metrics
 
-      allow(handler).to receive(:on_high_heap_fragmentation).and_return(true)
+      allow(handler).to receive(:call).and_return(true)
      allow(logger).to receive(:warn)
      allow(logger).to receive(:info)
 
      allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return(fragmentation)
+      allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).and_return({ uss: worker_memory })
+      allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).with(
+        pid: Gitlab::Cluster::PRIMARY_PID
+      ).and_return({ uss: primary_memory })
 
      allow(::Prometheus::PidProvider).to receive(:worker_id).and_return('worker_1')
    end
 
    context 'when created' do
-      let(:fragmentation) { 0 }
-      let(:max_strikes) { 0 }
-
      it 'sets the heap fragmentation limit gauge' do
        expect(heap_frag_limit_gauge).to receive(:set).with({}, max_heap_fragmentation)
@@ -71,7 +81,8 @@ def stub_prometheus_metrics
      it 'initializes with defaults' do
        watchdog = described_class.new(handler: handler, logger: logger)
 
-        expect(watchdog.max_heap_fragmentation).to eq(described_class::DEFAULT_HEAP_FRAG_THRESHOLD)
+        expect(watchdog.max_heap_fragmentation).to eq(described_class::DEFAULT_MAX_HEAP_FRAG)
+        expect(watchdog.max_mem_growth).to eq(described_class::DEFAULT_MAX_MEM_GROWTH)
        expect(watchdog.max_strikes).to eq(described_class::DEFAULT_MAX_STRIKES)
        expect(watchdog.sleep_time_seconds).to eq(described_class::DEFAULT_SLEEP_TIME_SECONDS)
      end
@@ -82,6 +93,7 @@ def stub_prometheus_metrics
          stub_env('GITLAB_MEMWD_MAX_HEAP_FRAG', 1)
          stub_env('GITLAB_MEMWD_MAX_STRIKES', 2)
          stub_env('GITLAB_MEMWD_SLEEP_TIME_SEC', 3)
+          stub_env('GITLAB_MEMWD_MAX_MEM_GROWTH', 4)
        end
 
        it 'initializes with these settings' do
@@ -90,30 +102,17 @@ def stub_prometheus_metrics
          expect(watchdog.max_heap_fragmentation).to eq(1)
          expect(watchdog.max_strikes).to eq(2)
          expect(watchdog.sleep_time_seconds).to eq(3)
+          expect(watchdog.max_mem_growth).to eq(4)
        end
      end
    end
 
-    context 'when process does not exceed heap fragmentation threshold' do
-      let(:fragmentation) { max_heap_fragmentation - 0.1 }
-      let(:max_strikes) { 0 } # To rule out that we were granting too many strikes.
-
-      it 'does not signal the handler' do
-        expect(handler).not_to receive(:on_high_heap_fragmentation)
-
-        watchdog.call
-      end
-    end
-
-    context 'when process exceeds heap fragmentation threshold permanently' do
-      let(:fragmentation) { max_heap_fragmentation + 0.1 }
-      let(:max_strikes) { 3 }
-
+    shared_examples 'has strikes left' do |stat|
      context 'when process has not exceeded allowed number of strikes' do
        let(:watchdog_iterations) { max_strikes }
 
        it 'does not signal the handler' do
-          expect(handler).not_to receive(:on_high_heap_fragmentation)
+          expect(handler).not_to receive(:call)
 
          watchdog.call
        end
@@ -125,119 +124,228 @@ def stub_prometheus_metrics
        end
 
        it 'increments the violations counter' do
-          expect(heap_frag_violations_counter).to receive(:increment).exactly(watchdog_iterations)
+          expect(violations_counter).to receive(:increment).with(reason: stat).exactly(watchdog_iterations)
 
          watchdog.call
        end
 
        it 'does not increment violations handled counter' do
-          expect(heap_frag_violations_handled_counter).not_to receive(:increment)
+          expect(violations_handled_counter).not_to receive(:increment)
 
          watchdog.call
        end
      end
+    end
 
-      context 'when process exceeds the allowed number of strikes' do
-        let(:watchdog_iterations) { max_strikes + 1 }
-
-        it 'signals the handler and resets strike counter' do
-          expect(handler).to receive(:on_high_heap_fragmentation).and_return(true)
-
-          watchdog.call
-
-          expect(watchdog.strikes).to eq(0)
-        end
-
-        it 'logs the event' do
-          expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024)
-          expect(logger).to receive(:warn).with({
-            message: 'heap fragmentation limit exceeded',
-            pid: Process.pid,
-            worker_id: 'worker_1',
-            memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble',
-            memwd_sleep_time_s: sleep_time,
-            memwd_max_heap_frag: max_heap_fragmentation,
-            memwd_cur_heap_frag: fragmentation,
-            memwd_max_strikes: max_strikes,
-            memwd_cur_strikes: max_strikes + 1,
-            memwd_rss_bytes: 1024
-          })
-
-          watchdog.call
-        end
-
-        it 'increments both the violations and violations handled counters' do
-          expect(heap_frag_violations_counter).to receive(:increment).exactly(watchdog_iterations)
-          expect(heap_frag_violations_handled_counter).to receive(:increment)
-
-          watchdog.call
-        end
-
-        context 'when enforce_memory_watchdog ops toggle is off' do
-          before do
-            stub_feature_flags(enforce_memory_watchdog: false)
-          end
-
-          it 'always uses the NullHandler' do
-            expect(handler).not_to receive(:on_high_heap_fragmentation)
-            expect(described_class::NullHandler.instance).to(
-              receive(:on_high_heap_fragmentation).with(fragmentation).and_return(true)
-            )
-
-            watchdog.call
-          end
-        end
-
-        context 'when handler result is true' do
-          it 'considers the event handled and stops itself' do
-            expect(handler).to receive(:on_high_heap_fragmentation).once.and_return(true)
-            expect(logger).to receive(:info).with(hash_including(message: 'stopped'))
-
-            watchdog.call
-          end
-        end
-
-        context 'when handler result is false' do
-          let(:max_strikes) { 0 } # to make sure the handler fires each iteration
-          let(:watchdog_iterations) { 3 }
-
-          it 'keeps running' do
-            expect(heap_frag_violations_counter).to receive(:increment).exactly(watchdog_iterations)
-            expect(heap_frag_violations_handled_counter).to receive(:increment).exactly(watchdog_iterations)
-            # Return true the third time to terminate the daemon.
-            expect(handler).to receive(:on_high_heap_fragmentation).and_return(false, false, true)
-
-            watchdog.call
-          end
-        end
-      end
-    end
+    shared_examples 'no strikes left' do |stat|
+      it 'signals the handler and resets strike counter' do
+        expect(handler).to receive(:call).and_return(true)
+
+        watchdog.call
+
+        expect(watchdog.strikes(stat.to_sym)).to eq(0)
+      end
+
+      it 'increments both the violations and violations handled counters' do
+        expect(violations_counter).to receive(:increment).with(reason: stat).exactly(watchdog_iterations)
+        expect(violations_handled_counter).to receive(:increment).with(reason: stat)
+
+        watchdog.call
+      end
+
+      context 'when enforce_memory_watchdog ops toggle is off' do
+        before do
+          stub_feature_flags(enforce_memory_watchdog: false)
+        end
+
+        it 'always uses the NullHandler' do
+          expect(handler).not_to receive(:call)
+          expect(described_class::NullHandler.instance).to receive(:call).and_return(true)
+
+          watchdog.call
+        end
+      end
+
+      context 'when handler result is true' do
+        it 'considers the event handled and stops itself' do
+          expect(handler).to receive(:call).once.and_return(true)
+          expect(logger).to receive(:info).with(hash_including(message: 'stopped'))
+
+          watchdog.call
+        end
+      end
+
+      context 'when handler result is false' do
+        let(:max_strikes) { 0 } # to make sure the handler fires each iteration
+        let(:watchdog_iterations) { 3 }
+
+        it 'keeps running' do
+          expect(violations_counter).to receive(:increment).exactly(watchdog_iterations)
+          expect(violations_handled_counter).to receive(:increment).exactly(watchdog_iterations)
+          # Return true the third time to terminate the daemon.
+          expect(handler).to receive(:call).and_return(false, false, true)
+
+          watchdog.call
+        end
+      end
+    end
 
-    context 'when process exceeds heap fragmentation threshold temporarily' do
-      let(:fragmentation) { max_heap_fragmentation }
-      let(:max_strikes) { 1 }
-      let(:watchdog_iterations) { 4 }
-
-      before do
-        allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return(
-          fragmentation - 0.1,
-          fragmentation + 0.2,
-          fragmentation - 0.1,
-          fragmentation + 0.1
-        )
-      end
-
-      it 'does not signal the handler' do
-        expect(handler).not_to receive(:on_high_heap_fragmentation)
-
-        watchdog.call
-      end
-    end
+    context 'when monitoring memory growth' do
+      let(:primary_memory) { 2048 }
+
+      context 'when process does not exceed threshold' do
+        let(:worker_memory) { max_mem_growth * primary_memory - 1 }
+
+        it 'does not signal the handler' do
+          expect(handler).not_to receive(:call)
+
+          watchdog.call
+        end
+      end
+
+      context 'when process exceeds threshold permanently' do
+        let(:worker_memory) { max_mem_growth * primary_memory + 1 }
+        let(:max_strikes) { 3 }
+
+        it_behaves_like 'has strikes left', 'mem_growth'
+
+        context 'when process exceeds the allowed number of strikes' do
+          let(:watchdog_iterations) { max_strikes + 1 }
+
+          it_behaves_like 'no strikes left', 'mem_growth'
+
+          it 'only reads reference memory once' do
+            expect(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss)
+              .with(pid: Gitlab::Cluster::PRIMARY_PID)
+              .once
+
+            watchdog.call
+          end
+
+          it 'logs the event' do
+            expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024)
+            expect(logger).to receive(:warn).with({
+              message: 'memory limit exceeded',
+              pid: Process.pid,
+              worker_id: 'worker_1',
+              memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble',
+              memwd_sleep_time_s: sleep_time,
+              memwd_max_uss_bytes: max_mem_growth * primary_memory,
+              memwd_ref_uss_bytes: primary_memory,
+              memwd_uss_bytes: worker_memory,
+              memwd_rss_bytes: 1024,
+              memwd_max_strikes: max_strikes,
+              memwd_cur_strikes: max_strikes + 1
+            })
+
+            watchdog.call
+          end
+        end
+      end
+
+      context 'when process exceeds threshold temporarily' do
+        let(:worker_memory) { max_mem_growth * primary_memory }
+        let(:max_strikes) { 1 }
+        let(:watchdog_iterations) { 4 }
+
+        before do
+          allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).and_return(
+            { uss: worker_memory - 0.1 },
+            { uss: worker_memory + 0.2 },
+            { uss: worker_memory - 0.1 },
+            { uss: worker_memory + 0.1 }
+          )
+          allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).with(
+            pid: Gitlab::Cluster::PRIMARY_PID
+          ).and_return({ uss: primary_memory })
+        end
+
+        it 'does not signal the handler' do
+          expect(handler).not_to receive(:call)
+
+          watchdog.call
+        end
+      end
+    end
+
+    context 'when monitoring heap fragmentation' do
+      context 'when process does not exceed threshold' do
+        let(:fragmentation) { max_heap_fragmentation - 0.1 }
+
+        it 'does not signal the handler' do
+          expect(handler).not_to receive(:call)
+
+          watchdog.call
+        end
+      end
+
+      context 'when process exceeds threshold permanently' do
+        let(:fragmentation) { max_heap_fragmentation + 0.1 }
+        let(:max_strikes) { 3 }
+
+        it_behaves_like 'has strikes left', 'heap_frag'
+
+        context 'when process exceeds the allowed number of strikes' do
+          let(:watchdog_iterations) { max_strikes + 1 }
+
+          it_behaves_like 'no strikes left', 'heap_frag'
+
+          it 'logs the event' do
+            expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024)
+            expect(logger).to receive(:warn).with({
+              message: 'heap fragmentation limit exceeded',
+              pid: Process.pid,
+              worker_id: 'worker_1',
+              memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble',
+              memwd_sleep_time_s: sleep_time,
+              memwd_max_heap_frag: max_heap_fragmentation,
+              memwd_cur_heap_frag: fragmentation,
+              memwd_max_strikes: max_strikes,
+              memwd_cur_strikes: max_strikes + 1,
+              memwd_rss_bytes: 1024
+            })
+
+            watchdog.call
+          end
+        end
+      end
+
+      context 'when process exceeds threshold temporarily' do
+        let(:fragmentation) { max_heap_fragmentation }
+        let(:max_strikes) { 1 }
+        let(:watchdog_iterations) { 4 }
+
+        before do
+          allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return(
+            fragmentation - 0.1,
+            fragmentation + 0.2,
+            fragmentation - 0.1,
+            fragmentation + 0.1
+          )
+        end
+
+        it 'does not signal the handler' do
+          expect(handler).not_to receive(:call)
+
+          watchdog.call
+        end
+      end
+    end
+
+    context 'when both memory fragmentation and growth exceed thresholds' do
+      let(:fragmentation) { max_heap_fragmentation + 0.1 }
+      let(:primary_memory) { 2048 }
+      let(:worker_memory) { max_mem_growth * primary_memory + 1 }
+      let(:watchdog_iterations) { max_strikes + 1 }
+
+      it 'only calls the handler once' do
+        expect(handler).to receive(:call).once.and_return(true)
+
+        watchdog.call
+      end
+    end
 
    context 'when gitlab_memory_watchdog ops toggle is off' do
-      let(:fragmentation) { 0 }
-      let(:max_strikes) { 0 }
-
      before do
        stub_feature_flags(gitlab_memory_watchdog: false)
      end
@@ -247,6 +355,12 @@ def stub_prometheus_metrics
 
        watchdog.call
      end
+
+      it 'does not monitor memory growth' do
+        expect(Gitlab::Metrics::System).not_to receive(:memory_usage_uss_pss)
+
+        watchdog.call
+      end
    end
  end
@@ -254,9 +368,9 @@ def stub_prometheus_metrics
  context 'NullHandler' do
    subject(:handler) { described_class::NullHandler.instance }
 
-    describe '#on_high_heap_fragmentation' do
+    describe '#call' do
      it 'does nothing' do
-        expect(handler.on_high_heap_fragmentation(1.0)).to be(false)
+        expect(handler.call).to be(false)
      end
    end
  end
@@ -264,11 +378,11 @@ def stub_prometheus_metrics
  context 'TermProcessHandler' do
    subject(:handler) { described_class::TermProcessHandler.new(42) }
 
-    describe '#on_high_heap_fragmentation' do
+    describe '#call' do
      it 'sends SIGTERM to the current process' do
        expect(Process).to receive(:kill).with(:TERM, 42)
 
-        expect(handler.on_high_heap_fragmentation(1.0)).to be(true)
+        expect(handler.call).to be(true)
      end
    end
  end
@@ -286,12 +400,12 @@ def stub_prometheus_metrics
      stub_const('::Puma::Cluster::WorkerHandle', puma_worker_handle_class)
    end
 
-    describe '#on_high_heap_fragmentation' do
+    describe '#call' do
      it 'invokes orderly termination via Puma API' do
        expect(puma_worker_handle_class).to receive(:new).and_return(puma_worker_handle)
        expect(puma_worker_handle).to receive(:term)
 
-        expect(handler.on_high_heap_fragmentation(1.0)).to be(true)
+        expect(handler.call).to be(true)
      end
    end
  end
...