Skip to content
代码片段 群组 项目
未验证 提交 9dd56719 编辑于 作者: Craig Miskell's avatar Craig Miskell 提交者: Sean McGivern
浏览文件

Be more vigorous in killing workers when shutting down sidekiq-cluster

In https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/8511
and https://gitlab.com/gitlab-com/gl-infra/production/issues/1309
we have seen cases where sidekiq-cluster has been asked to restart,
but it has left one or more sidekiq worker processes in a stuck/hung
state burning CPU.

With this, sidekiq-cluster now waits for the worker processes to
terminate themselves cleanly (and push jobs back onto the queue) and
another few seconds beyond, then if any processes remain, kills them
hard (:KILL signal) before exiting as usual (allowing whater process
monitor is in place to restart sidekiq-cluster)
上级 16363f28
No related branches found
No related tags found
无相关合并请求
......@@ -143,6 +143,14 @@ def self.all_alive?(pids)
true
end
def self.any_alive?(pids)
pids_alive(pids).any?
end
def self.pids_alive(pids)
pids.select { |pid| signal(pid, 0) }
end
def self.write_pid(path)
File.open(path, 'w') do |handle|
handle.write(Process.pid.to_s)
......
......@@ -7,6 +7,12 @@
module Gitlab
module SidekiqCluster
class CLI
# How often to check for processes when terminating
CHECK_TERMINATE_INTERVAL_SECONDS = 1
# How long to wait in total when asking for a clean termination
# Sidekiq default to self-terminate is 25s
TERMINATE_TIMEOUT_SECONDS = 30
CommandError = Class.new(StandardError)
def initialize(log_output = STDERR)
......@@ -63,10 +69,19 @@ def write_pid
SidekiqCluster.write_pid(@pid) if @pid
end
def wait_for_termination(check_interval = CHECK_TERMINATE_INTERVAL_SECONDS, terminate_timeout = TERMINATE_TIMEOUT_SECONDS)
deadline = Gitlab::Metrics::System.monotonic_time + terminate_timeout
sleep(check_interval) while SidekiqCluster.any_alive?(@processes) && Gitlab::Metrics::System.monotonic_time < deadline
# Hard-stop any that remain; they're likely stuck
SidekiqCluster.signal_processes(SidekiqCluster.pids_alive(@processes), :KILL)
end
def trap_signals
SidekiqCluster.trap_terminate do |signal|
@alive = false
SidekiqCluster.signal_processes(@processes, signal)
wait_for_termination
end
SidekiqCluster.trap_forward do |signal|
......
......@@ -84,6 +84,51 @@
end
end
describe '#wait_for_termination' do
it 'waits for termination of all sub-processes and succeeds after 3 checks' do
expect(Gitlab::SidekiqCluster).to receive(:any_alive?)
.with(an_instance_of(Array)).and_return(true, true, true, false)
expect(Gitlab::SidekiqCluster).to receive(:pids_alive)
.with([]).and_return([])
expect(Gitlab::SidekiqCluster).to receive(:signal_processes)
.with([], :KILL)
# Check every 0.1s, for no more than 1 second total
cli.wait_for_termination(0.1, 1)
end
context 'with hanging workers' do
before do
expect(cli).to receive(:write_pid)
expect(cli).to receive(:trap_signals)
expect(cli).to receive(:start_loop)
end
it 'hard kills workers after timeout expires' do
worker_pids = [101, 102, 103]
expect(Gitlab::SidekiqCluster).to receive(:start)
.with([['foo']], default_options)
.and_return(worker_pids)
expect(Gitlab::SidekiqCluster).to receive(:any_alive?)
.with(worker_pids).and_return(true).at_least(10).times
expect(Gitlab::SidekiqCluster).to receive(:pids_alive)
.with(worker_pids).and_return([102])
expect(Gitlab::SidekiqCluster).to receive(:signal_processes)
.with([102], :KILL)
cli.run(%w(foo))
# Check every 0.1s, for no more than 1 second total
cli.wait_for_termination(0.1, 1)
end
end
end
describe '#trap_signals' do
it 'traps the termination and forwarding signals' do
expect(Gitlab::SidekiqCluster).to receive(:trap_terminate)
......
......@@ -146,6 +146,26 @@
end
end
describe '.any_alive?' do
it 'returns true if at least one process is alive' do
processes = [1, 2]
allow(described_class).to receive(:signal).with(1, 0).and_return(true)
allow(described_class).to receive(:signal).with(2, 0).and_return(false)
expect(described_class.any_alive?(processes)).to eq(true)
end
it 'returns false when all threads are dead' do
processes = [1, 2]
allow(described_class).to receive(:signal).with(1, 0).and_return(false)
allow(described_class).to receive(:signal).with(2, 0).and_return(false)
expect(described_class.any_alive?(processes)).to eq(false)
end
end
describe '.write_pid' do
it 'writes the PID of the current process to the given file' do
handle = double(:handle)
......
0% 加载中 .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册