Commit 30c391b0, authored by Sean McGivern

Merge branch '370079-watchdog-memory-limits' into 'master'

Reap workers on excessive memory growth

See merge request gitlab-org/gitlab!96241
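This change extends the existing heap-fragmentation watchdog so it also reaps workers whose unique memory grows too far beyond a reference process. For orientation, a minimal sketch of how such a watchdog could be wired up on a background thread; the thread/loop wiring and logger choice are assumptions for illustration, not code from this merge request:

    # Hypothetical wiring, assuming the GitLab application environment; not part of this commit.
    require 'logger'

    watchdog = Gitlab::Memory::Watchdog.new(
      handler: Gitlab::Memory::Watchdog::TermProcessHandler.new, # sends SIGTERM to the current process
      logger: Logger.new($stdout),
      max_strikes: 5 # consecutive violations tolerated before the handler is invoked
    )

    # `call` loops: sleep, then check heap fragmentation and memory growth each iteration.
    monitor = Thread.new { watchdog.call }

    at_exit { watchdog.stop } # ask the loop to finish on shutdown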
@@ -4,6 +4,11 @@
 module Gitlab
   module Cluster
+    # We take advantage of the fact that the application is pre-loaded in the primary
+    # process. If it's a pre-fork server like Puma, this will be the Puma master process.
+    # Otherwise it is the worker itself such as for Sidekiq.
+    PRIMARY_PID = $$
+
     #
     # LifecycleEvents lets Rails initializers register application startup hooks
     # that are sensitive to forking. For example, to defer the creation of
...
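The PRIMARY_PID constant works because this file is loaded in the primary process before any forking happens, and forked children inherit the constant's value. A standalone plain-Ruby illustration of that behaviour (hypothetical, not from this diff):

    # Illustration only: a value captured before fork still names the parent afterwards.
    PARENT_PID = $$ # same as Process.pid at load time

    child = fork do
      puts Process.pid == PARENT_PID # => false: the child has its own PID
      puts PARENT_PID                # => still the parent's PID, inherited at fork time
    end

    Process.waitpid(child)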
@@ -16,8 +16,9 @@ module Memory
     # The duration for which a process may be above a given fragmentation
     # threshold is computed as `max_strikes * sleep_time_seconds`.
     class Watchdog
-      DEFAULT_SLEEP_TIME_SECONDS = 60
-      DEFAULT_HEAP_FRAG_THRESHOLD = 0.5
+      DEFAULT_SLEEP_TIME_SECONDS = 60 * 5
+      DEFAULT_MAX_HEAP_FRAG = 0.5
+      DEFAULT_MAX_MEM_GROWTH = 3.0
       DEFAULT_MAX_STRIKES = 5
 
       # This handler does nothing. It returns `false` to indicate to the
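With the new defaults, the tolerance window described at the top of the class (`max_strikes * sleep_time_seconds`) grows accordingly; a quick check using only values from this diff:

    sleep_time_seconds = 60 * 5 # DEFAULT_SLEEP_TIME_SECONDS
    max_strikes        = 5      # DEFAULT_MAX_STRIKES

    # Per the class comment, a process may stay above a threshold for roughly:
    max_strikes * sleep_time_seconds # => 1500 seconds, i.e. 25 minutes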
@@ -29,7 +30,7 @@ class Watchdog
       class NullHandler
         include Singleton
 
-        def on_high_heap_fragmentation(value)
+        def call
           # NOP
           false
         end
@@ -41,7 +42,7 @@ def initialize(pid = $$)
           @pid = pid
         end
 
-        def on_high_heap_fragmentation(value)
+        def call
           Process.kill(:TERM, @pid)
           true
         end
@@ -55,7 +56,7 @@ def initialize(puma_options = ::Puma.cli_config.options)
           @worker = ::Puma::Cluster::WorkerHandle.new(0, $$, 0, puma_options)
         end
 
-        def on_high_heap_fragmentation(value)
+        def call
           @worker.term
           true
         end
@@ -63,6 +64,9 @@ def on_high_heap_fragmentation(value)
       # max_heap_fragmentation:
       #   The degree to which the Ruby heap is allowed to be fragmented. Range [0,1].
+      # max_mem_growth:
+      #   A multiplier for how much excess private memory a worker can map compared to a reference process
+      #   (itself or the primary in a pre-fork server.)
       # max_strikes:
       #   How many times the process is allowed to be above max_heap_fragmentation before
       #   a handler is invoked.
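To make the multiplier concrete, a worked example with invented numbers (the 300 MiB reference value is purely illustrative):

    max_mem_growth = 3.0               # DEFAULT_MAX_MEM_GROWTH
    reference_uss  = 300 * 1024 * 1024 # e.g. the primary process maps ~300 MiB of unique memory

    memory_limit = max_mem_growth * reference_uss
    # => 943_718_400 bytes: the worker is in violation once its own USS exceeds ~900 MiB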
@@ -71,7 +75,8 @@ def on_high_heap_fragmentation(value)
       def initialize(
         handler: NullHandler.instance,
         logger: Logger.new($stdout),
-        max_heap_fragmentation: ENV['GITLAB_MEMWD_MAX_HEAP_FRAG']&.to_f || DEFAULT_HEAP_FRAG_THRESHOLD,
+        max_heap_fragmentation: ENV['GITLAB_MEMWD_MAX_HEAP_FRAG']&.to_f || DEFAULT_MAX_HEAP_FRAG,
+        max_mem_growth: ENV['GITLAB_MEMWD_MAX_MEM_GROWTH']&.to_f || DEFAULT_MAX_MEM_GROWTH,
         max_strikes: ENV['GITLAB_MEMWD_MAX_STRIKES']&.to_i || DEFAULT_MAX_STRIKES,
         sleep_time_seconds: ENV['GITLAB_MEMWD_SLEEP_TIME_SEC']&.to_i || DEFAULT_SLEEP_TIME_SECONDS,
         **options)
@@ -79,17 +84,37 @@ def initialize(
         @handler = handler
         @logger = logger
-        @max_heap_fragmentation = max_heap_fragmentation
         @sleep_time_seconds = sleep_time_seconds
         @max_strikes = max_strikes
 
+        @stats = {
+          heap_frag: {
+            max: max_heap_fragmentation,
+            strikes: 0
+          },
+          mem_growth: {
+            max: max_mem_growth,
+            strikes: 0
+          }
+        }
+
         @alive = true
-        @strikes = 0
 
         init_prometheus_metrics(max_heap_fragmentation)
       end
 
-      attr_reader :strikes, :max_heap_fragmentation, :max_strikes, :sleep_time_seconds
+      attr_reader :max_strikes, :sleep_time_seconds
+
+      def max_heap_fragmentation
+        @stats[:heap_frag][:max]
+      end
+
+      def max_mem_growth
+        @stats[:mem_growth][:max]
+      end
+
+      def strikes(stat)
+        @stats[stat][:strikes]
+      end
 
       def call
         @logger.info(log_labels.merge(message: 'started'))
@@ -97,7 +122,10 @@ def call
         while @alive
           sleep(@sleep_time_seconds)
 
-          monitor_heap_fragmentation if Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
+          next unless Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
+
+          monitor_heap_fragmentation
+          monitor_memory_growth
         end
 
         @logger.info(log_labels.merge(message: 'stopped'))
@@ -109,32 +137,73 @@ def stop
 
       private
 
-      def monitor_heap_fragmentation
-        heap_fragmentation = Gitlab::Metrics::Memory.gc_heap_fragmentation
+      def monitor_memory_condition(stat_key)
+        return unless @alive
 
-        if heap_fragmentation > @max_heap_fragmentation
-          @strikes += 1
-          @heap_frag_violations.increment
+        stat = @stats[stat_key]
+
+        ok, labels = yield(stat)
+
+        if ok
+          stat[:strikes] = 0
         else
-          @strikes = 0
+          stat[:strikes] += 1
+          @counter_violations.increment(reason: stat_key.to_s)
         end
 
-        if @strikes > @max_strikes
-          # If the handler returns true, it means the event is handled and we can shut down.
-          @alive = !handle_heap_fragmentation_limit_exceeded(heap_fragmentation)
-          @strikes = 0
+        if stat[:strikes] > @max_strikes
+          @alive = !memory_limit_exceeded_callback(stat_key, labels)
+          stat[:strikes] = 0
         end
       end
 
-      def handle_heap_fragmentation_limit_exceeded(value)
-        @logger.warn(
-          log_labels.merge(
-            message: 'heap fragmentation limit exceeded',
-            memwd_cur_heap_frag: value
-          ))
-        @heap_frag_violations_handled.increment
+      def monitor_heap_fragmentation
+        monitor_memory_condition(:heap_frag) do |stat|
+          heap_fragmentation = Gitlab::Metrics::Memory.gc_heap_fragmentation
+          [
+            heap_fragmentation <= stat[:max],
+            {
+              message: 'heap fragmentation limit exceeded',
+              memwd_cur_heap_frag: heap_fragmentation,
+              memwd_max_heap_frag: stat[:max]
+            }
+          ]
+        end
+      end
+
+      def monitor_memory_growth
+        monitor_memory_condition(:mem_growth) do |stat|
+          worker_uss = Gitlab::Metrics::System.memory_usage_uss_pss[:uss]
+          reference_uss = reference_mem[:uss]
+          memory_limit = stat[:max] * reference_uss
+          [
+            worker_uss <= memory_limit,
+            {
+              message: 'memory limit exceeded',
+              memwd_uss_bytes: worker_uss,
+              memwd_ref_uss_bytes: reference_uss,
+              memwd_max_uss_bytes: memory_limit
+            }
+          ]
+        end
+      end
+
+      # On pre-fork systems this would be the primary process memory from which workers fork.
+      # Otherwise it is the current process' memory.
+      #
+      # We initialize this lazily because in the initializer the application may not have
+      # finished booting yet, which would yield an incorrect baseline.
+      def reference_mem
+        @reference_mem ||= Gitlab::Metrics::System.memory_usage_uss_pss(pid: Gitlab::Cluster::PRIMARY_PID)
+      end
+
+      def memory_limit_exceeded_callback(stat_key, handler_labels)
+        all_labels = log_labels.merge(handler_labels)
+          .merge(memwd_cur_strikes: strikes(stat_key))
+        @logger.warn(all_labels)
+        @counter_violations_handled.increment(reason: stat_key.to_s)
 
-        handler.on_high_heap_fragmentation(value)
+        handler.call
       end
 
       def handler
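After this change the handler contract is uniform: the watchdog invokes `#call` with no arguments and treats a truthy result as "the violation was handled, stop monitoring". A hypothetical handler (not part of this merge request) that only logs and keeps the watchdog alive would need no more than:

    # Hypothetical handler satisfying the watchdog's protocol: respond to #call and
    # return true (handled, watchdog stops) or false (keep monitoring).
    class LogOnlyHandler
      def initialize(logger)
        @logger = logger
      end

      def call
        @logger.warn('memory watchdog violation observed; taking no action')
        false
      end
    end

    # Usage sketch:
    # Gitlab::Memory::Watchdog.new(handler: LogOnlyHandler.new(Logger.new($stdout)), logger: Logger.new($stdout))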
@@ -151,9 +220,7 @@ def log_labels
           worker_id: worker_id,
           memwd_handler_class: handler.class.name,
           memwd_sleep_time_s: @sleep_time_seconds,
-          memwd_max_heap_frag: @max_heap_fragmentation,
           memwd_max_strikes: @max_strikes,
-          memwd_cur_strikes: @strikes,
           memwd_rss_bytes: process_rss_bytes
         }
       end
@@ -174,14 +241,14 @@ def init_prometheus_metrics(max_heap_fragmentation)
         @heap_frag_limit.set({}, max_heap_fragmentation)
 
         default_labels = { pid: worker_id }
-        @heap_frag_violations = Gitlab::Metrics.counter(
-          :gitlab_memwd_heap_frag_violations_total,
-          'Total number of times heap fragmentation in a Ruby process exceeded its allowed maximum',
+        @counter_violations = Gitlab::Metrics.counter(
+          :gitlab_memwd_violations_total,
+          'Total number of times a Ruby process violated a memory threshold',
           default_labels
         )
-        @heap_frag_violations_handled = Gitlab::Metrics.counter(
-          :gitlab_memwd_heap_frag_violations_handled_total,
-          'Total number of times heap fragmentation violations in a Ruby process were handled',
+        @counter_violations_handled = Gitlab::Metrics.counter(
+          :gitlab_memwd_violations_handled_total,
+          'Total number of times Ruby process memory violations were handled',
           default_labels
         )
       end
...
 # frozen_string_literal: true
 
 require 'spec_helper'
+require_relative '../../../../lib/gitlab/cluster/lifecycle_events'
 
 RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do
   context 'watchdog' do
@@ -8,23 +9,31 @@
     let(:handler) { instance_double(described_class::NullHandler) }
     let(:heap_frag_limit_gauge) { instance_double(::Prometheus::Client::Gauge) }
-    let(:heap_frag_violations_counter) { instance_double(::Prometheus::Client::Counter) }
-    let(:heap_frag_violations_handled_counter) { instance_double(::Prometheus::Client::Counter) }
+    let(:violations_counter) { instance_double(::Prometheus::Client::Counter) }
+    let(:violations_handled_counter) { instance_double(::Prometheus::Client::Counter) }
 
     let(:sleep_time) { 0.1 }
     let(:max_heap_fragmentation) { 0.2 }
+    let(:max_mem_growth) { 2 }
+
+    # Defaults that will not trigger any events.
+    let(:fragmentation) { 0 }
+    let(:worker_memory) { 0 }
+    let(:primary_memory) { 0 }
+    let(:max_strikes) { 0 }
 
     # Tests should set this to control the number of loop iterations in `call`.
     let(:watchdog_iterations) { 1 }
 
     subject(:watchdog) do
       described_class.new(handler: handler, logger: logger, sleep_time_seconds: sleep_time,
-                          max_strikes: max_strikes, max_heap_fragmentation: max_heap_fragmentation).tap do |instance|
+                          max_strikes: max_strikes, max_mem_growth: max_mem_growth,
+                          max_heap_fragmentation: max_heap_fragmentation).tap do |instance|
         # We need to defuse `sleep` and stop the internal loop after N iterations.
         iterations = 0
-        expect(instance).to receive(:sleep) do
-          instance.stop if (iterations += 1) >= watchdog_iterations
-        end.at_most(watchdog_iterations)
+        allow(instance).to receive(:sleep) do
+          instance.stop if (iterations += 1) > watchdog_iterations
+        end
       end
     end
@@ -33,34 +42,35 @@ def stub_prometheus_metrics
         .with(:gitlab_memwd_heap_frag_limit, anything)
         .and_return(heap_frag_limit_gauge)
       allow(Gitlab::Metrics).to receive(:counter)
-        .with(:gitlab_memwd_heap_frag_violations_total, anything, anything)
-        .and_return(heap_frag_violations_counter)
+        .with(:gitlab_memwd_violations_total, anything, anything)
+        .and_return(violations_counter)
       allow(Gitlab::Metrics).to receive(:counter)
-        .with(:gitlab_memwd_heap_frag_violations_handled_total, anything, anything)
-        .and_return(heap_frag_violations_handled_counter)
+        .with(:gitlab_memwd_violations_handled_total, anything, anything)
+        .and_return(violations_handled_counter)
 
       allow(heap_frag_limit_gauge).to receive(:set)
-      allow(heap_frag_violations_counter).to receive(:increment)
-      allow(heap_frag_violations_handled_counter).to receive(:increment)
+      allow(violations_counter).to receive(:increment)
+      allow(violations_handled_counter).to receive(:increment)
    end
 
    before do
      stub_prometheus_metrics
 
-      allow(handler).to receive(:on_high_heap_fragmentation).and_return(true)
+      allow(handler).to receive(:call).and_return(true)
      allow(logger).to receive(:warn)
      allow(logger).to receive(:info)
 
      allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return(fragmentation)
+      allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).and_return({ uss: worker_memory })
+      allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).with(
+        pid: Gitlab::Cluster::PRIMARY_PID
+      ).and_return({ uss: primary_memory })
 
      allow(::Prometheus::PidProvider).to receive(:worker_id).and_return('worker_1')
    end
 
    context 'when created' do
-      let(:fragmentation) { 0 }
-      let(:max_strikes) { 0 }
-
      it 'sets the heap fragmentation limit gauge' do
        expect(heap_frag_limit_gauge).to receive(:set).with({}, max_heap_fragmentation)
@@ -71,7 +81,8 @@ def stub_prometheus_metrics
      it 'initializes with defaults' do
        watchdog = described_class.new(handler: handler, logger: logger)
 
-        expect(watchdog.max_heap_fragmentation).to eq(described_class::DEFAULT_HEAP_FRAG_THRESHOLD)
+        expect(watchdog.max_heap_fragmentation).to eq(described_class::DEFAULT_MAX_HEAP_FRAG)
+        expect(watchdog.max_mem_growth).to eq(described_class::DEFAULT_MAX_MEM_GROWTH)
        expect(watchdog.max_strikes).to eq(described_class::DEFAULT_MAX_STRIKES)
        expect(watchdog.sleep_time_seconds).to eq(described_class::DEFAULT_SLEEP_TIME_SECONDS)
      end
@@ -82,6 +93,7 @@ def stub_prometheus_metrics
          stub_env('GITLAB_MEMWD_MAX_HEAP_FRAG', 1)
          stub_env('GITLAB_MEMWD_MAX_STRIKES', 2)
          stub_env('GITLAB_MEMWD_SLEEP_TIME_SEC', 3)
+          stub_env('GITLAB_MEMWD_MAX_MEM_GROWTH', 4)
        end
 
        it 'initializes with these settings' do
@@ -90,30 +102,17 @@ def stub_prometheus_metrics
          expect(watchdog.max_heap_fragmentation).to eq(1)
          expect(watchdog.max_strikes).to eq(2)
          expect(watchdog.sleep_time_seconds).to eq(3)
+          expect(watchdog.max_mem_growth).to eq(4)
        end
      end
    end
 
-    context 'when process does not exceed heap fragmentation threshold' do
-      let(:fragmentation) { max_heap_fragmentation - 0.1 }
-      let(:max_strikes) { 0 } # To rule out that we were granting too many strikes.
-
-      it 'does not signal the handler' do
-        expect(handler).not_to receive(:on_high_heap_fragmentation)
-
-        watchdog.call
-      end
-    end
-
-    context 'when process exceeds heap fragmentation threshold permanently' do
-      let(:fragmentation) { max_heap_fragmentation + 0.1 }
-      let(:max_strikes) { 3 }
-
+    shared_examples 'has strikes left' do |stat|
      context 'when process has not exceeded allowed number of strikes' do
        let(:watchdog_iterations) { max_strikes }
 
        it 'does not signal the handler' do
-          expect(handler).not_to receive(:on_high_heap_fragmentation)
+          expect(handler).not_to receive(:call)
 
          watchdog.call
        end
@@ -125,119 +124,228 @@ def stub_prometheus_metrics
        end
 
        it 'increments the violations counter' do
-          expect(heap_frag_violations_counter).to receive(:increment).exactly(watchdog_iterations)
+          expect(violations_counter).to receive(:increment).with(reason: stat).exactly(watchdog_iterations)
 
          watchdog.call
        end
 
        it 'does not increment violations handled counter' do
-          expect(heap_frag_violations_handled_counter).not_to receive(:increment)
+          expect(violations_handled_counter).not_to receive(:increment)
 
          watchdog.call
        end
      end
+    end
 
-      context 'when process exceeds the allowed number of strikes' do
-        let(:watchdog_iterations) { max_strikes + 1 }
-
-        it 'signals the handler and resets strike counter' do
-          expect(handler).to receive(:on_high_heap_fragmentation).and_return(true)
-
-          watchdog.call
-
-          expect(watchdog.strikes).to eq(0)
-        end
-
-        it 'logs the event' do
-          expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024)
-          expect(logger).to receive(:warn).with({
-            message: 'heap fragmentation limit exceeded',
-            pid: Process.pid,
-            worker_id: 'worker_1',
-            memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble',
-            memwd_sleep_time_s: sleep_time,
-            memwd_max_heap_frag: max_heap_fragmentation,
-            memwd_cur_heap_frag: fragmentation,
-            memwd_max_strikes: max_strikes,
-            memwd_cur_strikes: max_strikes + 1,
-            memwd_rss_bytes: 1024
-          })
-
-          watchdog.call
-        end
-
-        it 'increments both the violations and violations handled counters' do
-          expect(heap_frag_violations_counter).to receive(:increment).exactly(watchdog_iterations)
-          expect(heap_frag_violations_handled_counter).to receive(:increment)
-
-          watchdog.call
-        end
-
-        context 'when enforce_memory_watchdog ops toggle is off' do
-          before do
-            stub_feature_flags(enforce_memory_watchdog: false)
-          end
-
-          it 'always uses the NullHandler' do
-            expect(handler).not_to receive(:on_high_heap_fragmentation)
-            expect(described_class::NullHandler.instance).to(
-              receive(:on_high_heap_fragmentation).with(fragmentation).and_return(true)
-            )
-
-            watchdog.call
-          end
-        end
-
-        context 'when handler result is true' do
-          it 'considers the event handled and stops itself' do
-            expect(handler).to receive(:on_high_heap_fragmentation).once.and_return(true)
-            expect(logger).to receive(:info).with(hash_including(message: 'stopped'))
-
-            watchdog.call
-          end
-        end
-
-        context 'when handler result is false' do
-          let(:max_strikes) { 0 } # to make sure the handler fires each iteration
-          let(:watchdog_iterations) { 3 }
-
-          it 'keeps running' do
-            expect(heap_frag_violations_counter).to receive(:increment).exactly(watchdog_iterations)
-            expect(heap_frag_violations_handled_counter).to receive(:increment).exactly(watchdog_iterations)
-            # Return true the third time to terminate the daemon.
-            expect(handler).to receive(:on_high_heap_fragmentation).and_return(false, false, true)
-
-            watchdog.call
-          end
-        end
-      end
-    end
+    shared_examples 'no strikes left' do |stat|
+      it 'signals the handler and resets strike counter' do
+        expect(handler).to receive(:call).and_return(true)
+
+        watchdog.call
+
+        expect(watchdog.strikes(stat.to_sym)).to eq(0)
+      end
+
+      it 'increments both the violations and violations handled counters' do
+        expect(violations_counter).to receive(:increment).with(reason: stat).exactly(watchdog_iterations)
+        expect(violations_handled_counter).to receive(:increment).with(reason: stat)
+
+        watchdog.call
+      end
+
+      context 'when enforce_memory_watchdog ops toggle is off' do
+        before do
+          stub_feature_flags(enforce_memory_watchdog: false)
+        end
+
+        it 'always uses the NullHandler' do
+          expect(handler).not_to receive(:call)
+          expect(described_class::NullHandler.instance).to receive(:call).and_return(true)
+
+          watchdog.call
+        end
+      end
+
+      context 'when handler result is true' do
+        it 'considers the event handled and stops itself' do
+          expect(handler).to receive(:call).once.and_return(true)
+          expect(logger).to receive(:info).with(hash_including(message: 'stopped'))
+
+          watchdog.call
+        end
+      end
+
+      context 'when handler result is false' do
+        let(:max_strikes) { 0 } # to make sure the handler fires each iteration
+        let(:watchdog_iterations) { 3 }
+
+        it 'keeps running' do
+          expect(violations_counter).to receive(:increment).exactly(watchdog_iterations)
+          expect(violations_handled_counter).to receive(:increment).exactly(watchdog_iterations)
+          # Return true the third time to terminate the daemon.
+          expect(handler).to receive(:call).and_return(false, false, true)
+
+          watchdog.call
+        end
+      end
+    end
 
-    context 'when process exceeds heap fragmentation threshold temporarily' do
-      let(:fragmentation) { max_heap_fragmentation }
-      let(:max_strikes) { 1 }
-      let(:watchdog_iterations) { 4 }
-
-      before do
-        allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return(
-          fragmentation - 0.1,
-          fragmentation + 0.2,
-          fragmentation - 0.1,
-          fragmentation + 0.1
-        )
-      end
-
-      it 'does not signal the handler' do
-        expect(handler).not_to receive(:on_high_heap_fragmentation)
-
-        watchdog.call
-      end
-    end
+    context 'when monitoring memory growth' do
+      let(:primary_memory) { 2048 }
+
+      context 'when process does not exceed threshold' do
+        let(:worker_memory) { max_mem_growth * primary_memory - 1 }
+
+        it 'does not signal the handler' do
+          expect(handler).not_to receive(:call)
+
+          watchdog.call
+        end
+      end
+
+      context 'when process exceeds threshold permanently' do
+        let(:worker_memory) { max_mem_growth * primary_memory + 1 }
+        let(:max_strikes) { 3 }
+
+        it_behaves_like 'has strikes left', 'mem_growth'
+
+        context 'when process exceeds the allowed number of strikes' do
+          let(:watchdog_iterations) { max_strikes + 1 }
+
+          it_behaves_like 'no strikes left', 'mem_growth'
+
+          it 'only reads reference memory once' do
+            expect(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss)
+              .with(pid: Gitlab::Cluster::PRIMARY_PID)
+              .once
+
+            watchdog.call
+          end
+
+          it 'logs the event' do
+            expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024)
+            expect(logger).to receive(:warn).with({
+              message: 'memory limit exceeded',
+              pid: Process.pid,
+              worker_id: 'worker_1',
+              memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble',
+              memwd_sleep_time_s: sleep_time,
+              memwd_max_uss_bytes: max_mem_growth * primary_memory,
+              memwd_ref_uss_bytes: primary_memory,
+              memwd_uss_bytes: worker_memory,
+              memwd_rss_bytes: 1024,
+              memwd_max_strikes: max_strikes,
+              memwd_cur_strikes: max_strikes + 1
+            })
+
+            watchdog.call
+          end
+        end
+      end
+
+      context 'when process exceeds threshold temporarily' do
+        let(:worker_memory) { max_mem_growth * primary_memory }
+        let(:max_strikes) { 1 }
+        let(:watchdog_iterations) { 4 }
+
+        before do
+          allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).and_return(
+            { uss: worker_memory - 0.1 },
+            { uss: worker_memory + 0.2 },
+            { uss: worker_memory - 0.1 },
+            { uss: worker_memory + 0.1 }
+          )
+          allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).with(
+            pid: Gitlab::Cluster::PRIMARY_PID
+          ).and_return({ uss: primary_memory })
+        end
+
+        it 'does not signal the handler' do
+          expect(handler).not_to receive(:call)
+
+          watchdog.call
+        end
+      end
+    end
+
+    context 'when monitoring heap fragmentation' do
+      context 'when process does not exceed threshold' do
+        let(:fragmentation) { max_heap_fragmentation - 0.1 }
+
+        it 'does not signal the handler' do
+          expect(handler).not_to receive(:call)
+
+          watchdog.call
+        end
+      end
+
+      context 'when process exceeds threshold permanently' do
+        let(:fragmentation) { max_heap_fragmentation + 0.1 }
+        let(:max_strikes) { 3 }
+
+        it_behaves_like 'has strikes left', 'heap_frag'
+
+        context 'when process exceeds the allowed number of strikes' do
+          let(:watchdog_iterations) { max_strikes + 1 }
+
+          it_behaves_like 'no strikes left', 'heap_frag'
+
+          it 'logs the event' do
+            expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024)
+            expect(logger).to receive(:warn).with({
+              message: 'heap fragmentation limit exceeded',
+              pid: Process.pid,
+              worker_id: 'worker_1',
+              memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble',
+              memwd_sleep_time_s: sleep_time,
+              memwd_max_heap_frag: max_heap_fragmentation,
+              memwd_cur_heap_frag: fragmentation,
+              memwd_max_strikes: max_strikes,
+              memwd_cur_strikes: max_strikes + 1,
+              memwd_rss_bytes: 1024
+            })
+
+            watchdog.call
+          end
+        end
+      end
+
+      context 'when process exceeds threshold temporarily' do
+        let(:fragmentation) { max_heap_fragmentation }
+        let(:max_strikes) { 1 }
+        let(:watchdog_iterations) { 4 }
+
+        before do
+          allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return(
+            fragmentation - 0.1,
+            fragmentation + 0.2,
+            fragmentation - 0.1,
+            fragmentation + 0.1
+          )
+        end
+
+        it 'does not signal the handler' do
+          expect(handler).not_to receive(:call)
+
+          watchdog.call
+        end
+      end
+    end
+
+    context 'when both memory fragmentation and growth exceed thresholds' do
+      let(:fragmentation) { max_heap_fragmentation + 0.1 }
+      let(:primary_memory) { 2048 }
+      let(:worker_memory) { max_mem_growth * primary_memory + 1 }
+      let(:watchdog_iterations) { max_strikes + 1 }
+
+      it 'only calls the handler once' do
+        expect(handler).to receive(:call).once.and_return(true)
+
+        watchdog.call
+      end
+    end
 
    context 'when gitlab_memory_watchdog ops toggle is off' do
-      let(:fragmentation) { 0 }
-      let(:max_strikes) { 0 }
-
      before do
        stub_feature_flags(gitlab_memory_watchdog: false)
      end
@@ -247,6 +355,12 @@ def stub_prometheus_metrics
 
        watchdog.call
      end
+
+      it 'does not monitor memory growth' do
+        expect(Gitlab::Metrics::System).not_to receive(:memory_usage_uss_pss)
+
+        watchdog.call
+      end
    end
  end
@@ -254,9 +368,9 @@ def stub_prometheus_metrics
  context 'NullHandler' do
    subject(:handler) { described_class::NullHandler.instance }
 
-    describe '#on_high_heap_fragmentation' do
+    describe '#call' do
      it 'does nothing' do
-        expect(handler.on_high_heap_fragmentation(1.0)).to be(false)
+        expect(handler.call).to be(false)
      end
    end
  end
@@ -264,11 +378,11 @@ def stub_prometheus_metrics
  context 'TermProcessHandler' do
    subject(:handler) { described_class::TermProcessHandler.new(42) }
 
-    describe '#on_high_heap_fragmentation' do
+    describe '#call' do
      it 'sends SIGTERM to the current process' do
        expect(Process).to receive(:kill).with(:TERM, 42)
 
-        expect(handler.on_high_heap_fragmentation(1.0)).to be(true)
+        expect(handler.call).to be(true)
      end
    end
  end
@@ -286,12 +400,12 @@ def stub_prometheus_metrics
      stub_const('::Puma::Cluster::WorkerHandle', puma_worker_handle_class)
    end
 
-    describe '#on_high_heap_fragmentation' do
+    describe '#call' do
      it 'invokes orderly termination via Puma API' do
        expect(puma_worker_handle_class).to receive(:new).and_return(puma_worker_handle)
        expect(puma_worker_handle).to receive(:term)
 
-        expect(handler.on_high_heap_fragmentation(1.0)).to be(true)
+        expect(handler.call).to be(true)
      end
    end
  end
...