Commit 625e38e0, authored by Matthias Käppler, committed by Sean McGivern

Address several edge cases in ProcessSupervisor

Fixed:
- Do not determine `alive` status from the pid set returned
  by the caller, since this can lead to a "bouncing" state.
- Swap the loop check with the sleep call, since sleep suspends
  the calling thread, which may otherwise lead to the callback
  being invoked after `alive` has been invalidated by another
  thread.
- Do not trap INT and TERM by default; this was swallowing
  these signals for the Puma master.

Changelog: fixed
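
For orientation before reading the diff, here is a minimal, hypothetical sketch of what the supervision loop looks like once these fixes are applied. The class name SupervisorSketch and the alive? helper are invented for illustration; only the option names (term_signals, health_check_interval_seconds) and the overall control flow follow the changes below, and the real Gitlab::ProcessSupervisor additionally handles signal trapping/forwarding and termination timeouts.

require 'set'

# Illustrative sketch only, not the real class.
class SupervisorSketch
  def initialize(health_check_interval_seconds: 5, term_signals: [])
    # INT/TERM are no longer trapped by default; callers opt in explicitly.
    # (Trap installation itself is omitted from this sketch.)
    @interval = health_check_interval_seconds
    @term_signals = term_signals
    @pids = Set.new # a Set, so duplicate PIDs returned by the callback collapse
    @alive = false
  end

  def supervise(pid_or_pids, &on_process_death)
    @pids = Array(pid_or_pids).to_set
    @on_process_death = on_process_death
    @alive = true
  end

  def shutdown
    @alive = false
  end

  def run_thread
    while @alive
      # Check first, then sleep: the sleep suspends this thread, so another
      # thread may clear @alive in the meantime. Re-testing the loop condition
      # right after waking avoids invoking the callback mid-shutdown.
      check_process_health
      sleep(@interval)
    end
  end

  private

  def check_process_health
    existing_pids = @pids.select { |pid| alive?(pid) }.to_set
    dead_pids = @pids - existing_pids
    return if dead_pids.empty?

    # @alive is no longer derived from whatever the callback returns; that
    # inference was the source of the "bouncing" state mentioned above.
    new_pids = Array(@on_process_death.call(dead_pids.to_a))
    @pids = existing_pids + new_pids.to_set
  end

  def alive?(pid)
    Process.kill(0, pid)
    true
  rescue Errno::ESRCH
    false
  end
end

Keeping @pids as a Set is also what makes the new "deduplicates PIDs returned from callback" spec further down hold: a callback that hands back an already-supervised PID does not grow the set.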
Parent 07a34882
@@ -20,7 +20,7 @@ def initialize(
health_check_interval_seconds: DEFAULT_HEALTH_CHECK_INTERVAL_SECONDS,
check_terminate_interval_seconds: DEFAULT_TERMINATE_INTERVAL_SECONDS,
terminate_timeout_seconds: DEFAULT_TERMINATE_TIMEOUT_SECONDS,
term_signals: %i(INT TERM),
term_signals: [],
forwarded_signals: [],
**options)
super(**options)
@@ -31,7 +31,7 @@ def initialize(
@check_terminate_interval_seconds = check_terminate_interval_seconds
@terminate_timeout_seconds = terminate_timeout_seconds
@pids = []
@pids = Set.new
@alive = false
end
@@ -43,7 +43,7 @@ def initialize(
# If the block returns a non-empty list of IDs, the supervisor will
# start observing those processes instead. Otherwise it will shut down.
def supervise(pid_or_pids, &on_process_death)
@pids = Array(pid_or_pids)
@pids = Array(pid_or_pids).to_set
@on_process_death = on_process_death
trap_signals!
@@ -56,7 +56,6 @@ def shutdown(signal = :TERM)
return unless @alive
stop_processes(signal)
stop
end
def supervised_pids
@@ -75,26 +74,25 @@ def stop_working
def run_thread
while @alive
sleep(@health_check_interval_seconds)
check_process_health
sleep(@health_check_interval_seconds)
end
end
def check_process_health
unless all_alive?
existing_pids = live_pids # Capture this value for the duration of the block.
existing_pids = live_pids.to_set # Capture this value for the duration of the block.
dead_pids = @pids - existing_pids
new_pids = Array(@on_process_death.call(dead_pids))
@pids = existing_pids + new_pids
@alive = @pids.any?
new_pids = Array(@on_process_death.call(dead_pids.to_a))
@pids = existing_pids + new_pids.to_set
end
end
def stop_processes(signal)
# Set this prior to shutting down so that shutdown hooks which read `alive`
# know the supervisor is about to shut down.
@alive = false
stop_working
# Shut down supervised processes.
signal_all(signal)
......
@@ -22,8 +22,6 @@ def start_for_puma
supervisor = PumaProcessSupervisor.instance
supervisor.supervise(start_server.call) do
next unless supervisor.alive
Gitlab::AppLogger.info('Puma metrics server terminated, restarting...')
start_server.call
end
......
@@ -20,7 +20,7 @@
module Gitlab
module SidekiqCluster
class CLI
THREAD_NAME = 'supervisor'
THREAD_NAME = 'sidekiq-cluster'
# The signals that should terminate both the master and workers.
TERMINATE_SIGNALS = %i(INT TERM).freeze
@@ -134,23 +134,17 @@ def start_and_supervise_workers(queue_groups)
)
metrics_server_pid = start_metrics_server
all_pids = worker_pids + Array(metrics_server_pid)
supervisor.supervise(all_pids) do |dead_pids|
supervisor.supervise(worker_pids + Array(metrics_server_pid)) do |dead_pids|
# If we're not in the process of shutting down the cluster,
# and the metrics server died, restart it.
if supervisor.alive && dead_pids.include?(metrics_server_pid)
if dead_pids == Array(metrics_server_pid)
@logger.info('Sidekiq metrics server terminated, restarting...')
metrics_server_pid = restart_metrics_server
all_pids = worker_pids + Array(metrics_server_pid)
else
# If a worker process died we'll just terminate the whole cluster.
# We let an external system (runit, kubernetes) handle the restart.
@logger.info('A worker terminated, shutting down the cluster')
ProcessManagement.signal_processes(all_pids - dead_pids, :TERM)
# Signal supervisor not to respawn workers and shut down.
supervisor.shutdown
[]
end
end
......
@@ -440,32 +440,18 @@
end
it 'stops the entire process cluster if one of the workers has been terminated' do
expect(supervisor).to receive(:alive).and_return(true)
expect(supervisor).to receive(:supervise).and_yield([2])
expect(MetricsServer).to receive(:start_for_sidekiq).once.and_return(metrics_server_pid)
expect(Gitlab::ProcessManagement).to receive(:signal_processes).with([42, 99], :TERM)
expect(supervisor).to receive(:supervise).and_yield([2, 99])
expect(supervisor).to receive(:shutdown)
cli.run(%w(foo))
end
context 'when the supervisor is alive' do
it 'restarts the metrics server when it is down' do
expect(supervisor).to receive(:alive).and_return(true)
expect(supervisor).to receive(:supervise).and_yield([metrics_server_pid])
expect(MetricsServer).to receive(:start_for_sidekiq).twice.and_return(metrics_server_pid)
it 'restarts the metrics server when it is down' do
expect(supervisor).to receive(:supervise).and_yield([metrics_server_pid])
expect(MetricsServer).to receive(:start_for_sidekiq).twice.and_return(metrics_server_pid)
cli.run(%w(foo))
end
end
context 'when the supervisor is shutting down' do
it 'does not restart the metrics server' do
expect(supervisor).to receive(:alive).and_return(false)
expect(supervisor).to receive(:supervise).and_yield([metrics_server_pid])
expect(MetricsServer).to receive(:start_for_sidekiq).once.and_return(metrics_server_pid)
cli.run(%w(foo))
end
cli.run(%w(foo))
end
end
end
......
@@ -6,6 +6,7 @@
let(:health_check_interval_seconds) { 0.1 }
let(:check_terminate_interval_seconds) { 1 }
let(:forwarded_signals) { [] }
let(:term_signals) { [] }
let(:process_ids) { [spawn_process, spawn_process] }
def spawn_process
@@ -19,7 +20,8 @@ def spawn_process
health_check_interval_seconds: health_check_interval_seconds,
check_terminate_interval_seconds: check_terminate_interval_seconds,
terminate_timeout_seconds: 1 + check_terminate_interval_seconds,
forwarded_signals: forwarded_signals
forwarded_signals: forwarded_signals,
term_signals: term_signals
)
end
@@ -29,6 +31,8 @@ def spawn_process
rescue Errno::ESRCH
# Ignore if a process wasn't actually alive.
end
supervisor.stop
end
describe '#supervise' do
@@ -60,7 +64,7 @@ def spawn_process
[42] # Fake starting a new process in place of the terminated one.
end
# Terminate the supervised process.
# Terminate a supervised process.
Process.kill('TERM', process_ids.first)
await_condition(sleep_sec: health_check_interval_seconds) do
@@ -71,6 +75,72 @@ def spawn_process
expect(Gitlab::ProcessManagement.process_alive?(process_ids.last)).to be(true)
expect(supervisor.supervised_pids).to match_array([process_ids.last, 42])
end
it 'deduplicates PIDs returned from callback' do
expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
pids_killed = []
supervisor.supervise(process_ids) do |dead_pids|
pids_killed = dead_pids
# Fake a new process having the same pid as one that was just terminated.
[process_ids.last]
end
# Terminate a supervised process.
Process.kill('TERM', process_ids.first)
await_condition(sleep_sec: health_check_interval_seconds) do
pids_killed == [process_ids.first]
end
expect(supervisor.supervised_pids).to contain_exactly(process_ids.last)
end
it 'accepts single PID returned from callback' do
expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
pids_killed = []
supervisor.supervise(process_ids) do |dead_pids|
pids_killed = dead_pids
42
end
# Terminate a supervised process.
Process.kill('TERM', process_ids.first)
await_condition(sleep_sec: health_check_interval_seconds) do
pids_killed == [process_ids.first]
end
expect(supervisor.supervised_pids).to contain_exactly(42, process_ids.last)
end
context 'but supervisor has entered shutdown' do
it 'does not trigger callback again' do
expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
callback_count = 0
supervisor.supervise(process_ids) do |dead_pids|
callback_count += 1
Thread.new { supervisor.shutdown }
[42]
end
# Terminate the supervised processes to trigger more than 1 callback.
Process.kill('TERM', process_ids.first)
Process.kill('TERM', process_ids.last)
await_condition(sleep_sec: health_check_interval_seconds * 3) do
supervisor.alive == false
end
# Since we shut down the supervisor during the first callback, it should not
# be called anymore.
expect(callback_count).to eq(1)
end
end
end
context 'signal handling' do
@@ -82,6 +152,8 @@ def spawn_process
end
context 'termination signals' do
let(:term_signals) { %i(INT TERM) }
context 'when TERM results in timely shutdown of processes' do
it 'forwards them to observed processes without waiting for grace period to expire' do
allow(Gitlab::ProcessManagement).to receive(:any_alive?).and_return(false)
......
@@ -267,28 +267,13 @@
end
context 'when the supervisor callback is invoked' do
context 'and the supervisor is alive' do
it 'restarts the metrics server' do
expect(supervisor).to receive(:alive).and_return(true)
expect(supervisor).to receive(:supervise).and_yield
expect(Process).to receive(:spawn).with(
include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
).twice.and_return(42)
described_class.start_for_puma
end
end
it 'restarts the metrics server' do
expect(supervisor).to receive(:supervise).and_yield
expect(Process).to receive(:spawn).with(
include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
).twice.and_return(42)
context 'and the supervisor is not alive' do
it 'does not restart the server' do
expect(supervisor).to receive(:alive).and_return(false)
expect(supervisor).to receive(:supervise).and_yield
expect(Process).to receive(:spawn).with(
include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
).once.and_return(42)
described_class.start_for_puma
end
described_class.start_for_puma
end
end
end
......