From 20bff3bca6d08e219c5b57f819923f0608cdb9e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matthias=20K=C3=A4ppler?= <mkaeppler@gitlab.com>
Date: Tue, 5 May 2020 14:33:56 +0000
Subject: [PATCH] Report USS+PSS from ruby_sampler

Reports the process's unique set size (USS) and proportional set size (PSS) to Prometheus.

The change is behind the `collect_memory_uss_pss` feature flag.
---
 lib/gitlab/metrics/samplers/ruby_sampler.rb   |  31 +++--
 lib/gitlab/metrics/system.rb                  |  81 +++++++------
 .../metrics/samplers/ruby_sampler_spec.rb     |  17 ++-
 spec/lib/gitlab/metrics/system_spec.rb        | 113 ++++++++++++++++--
 4 files changed, 178 insertions(+), 64 deletions(-)

diff --git a/lib/gitlab/metrics/samplers/ruby_sampler.rb b/lib/gitlab/metrics/samplers/ruby_sampler.rb
index c38769f39a933..5cd2a86a10663 100644
--- a/lib/gitlab/metrics/samplers/ruby_sampler.rb
+++ b/lib/gitlab/metrics/samplers/ruby_sampler.rb
@@ -34,14 +34,16 @@ def labels
 
         def init_metrics
           metrics = {
-            file_descriptors:               ::Gitlab::Metrics.gauge(with_prefix(:file, :descriptors), 'File descriptors used', labels),
-            memory_bytes:                   ::Gitlab::Metrics.gauge(with_prefix(:memory, :bytes), 'Memory used', labels),
-            process_cpu_seconds_total:      ::Gitlab::Metrics.gauge(with_prefix(:process, :cpu_seconds_total), 'Process CPU seconds total'),
-            process_max_fds:                ::Gitlab::Metrics.gauge(with_prefix(:process, :max_fds), 'Process max fds'),
-            process_resident_memory_bytes:  ::Gitlab::Metrics.gauge(with_prefix(:process, :resident_memory_bytes), 'Memory used', labels),
-            process_start_time_seconds:     ::Gitlab::Metrics.gauge(with_prefix(:process, :start_time_seconds), 'Process start time seconds'),
-            sampler_duration:               ::Gitlab::Metrics.counter(with_prefix(:sampler, :duration_seconds_total), 'Sampler time', labels),
-            gc_duration_seconds:            ::Gitlab::Metrics.histogram(with_prefix(:gc, :duration_seconds), 'GC time', labels, GC_REPORT_BUCKETS)
+            file_descriptors:                  ::Gitlab::Metrics.gauge(with_prefix(:file, :descriptors), 'File descriptors used', labels),
+            memory_bytes:                      ::Gitlab::Metrics.gauge(with_prefix(:memory, :bytes), 'Memory used (RSS)', labels),
+            process_cpu_seconds_total:         ::Gitlab::Metrics.gauge(with_prefix(:process, :cpu_seconds_total), 'Process CPU seconds total'),
+            process_max_fds:                   ::Gitlab::Metrics.gauge(with_prefix(:process, :max_fds), 'Process max fds'),
+            process_resident_memory_bytes:     ::Gitlab::Metrics.gauge(with_prefix(:process, :resident_memory_bytes), 'Memory used (RSS)', labels),
+            process_unique_memory_bytes:       ::Gitlab::Metrics.gauge(with_prefix(:process, :unique_memory_bytes), 'Memory used (USS)', labels),
+            process_proportional_memory_bytes: ::Gitlab::Metrics.gauge(with_prefix(:process, :proportional_memory_bytes), 'Memory used (PSS)', labels),
+            process_start_time_seconds:        ::Gitlab::Metrics.gauge(with_prefix(:process, :start_time_seconds), 'Process start time seconds'),
+            sampler_duration:                  ::Gitlab::Metrics.counter(with_prefix(:sampler, :duration_seconds_total), 'Sampler time', labels),
+            gc_duration_seconds:               ::Gitlab::Metrics.histogram(with_prefix(:gc, :duration_seconds), 'GC time', labels, GC_REPORT_BUCKETS)
           }
 
           GC.stat.keys.each do |key|
@@ -85,10 +87,15 @@ def sample_gc_reports
         end
 
         def set_memory_usage_metrics
-          memory_usage = System.memory_usage
-
-          metrics[:memory_bytes].set(labels, memory_usage)
-          metrics[:process_resident_memory_bytes].set(labels, memory_usage)
+          memory_rss = System.memory_usage
+          metrics[:memory_bytes].set(labels, memory_rss)
+          metrics[:process_resident_memory_bytes].set(labels, memory_rss)
+
+          if Feature.enabled?(:collect_memory_uss_pss)
+            memory_uss_pss = System.memory_usage_uss_pss
+            metrics[:process_unique_memory_bytes].set(labels, memory_uss_pss[:uss])
+            metrics[:process_proportional_memory_bytes].set(labels, memory_uss_pss[:pss])
+          end
         end
       end
     end
diff --git a/lib/gitlab/metrics/system.rb b/lib/gitlab/metrics/system.rb
index 2a61b3de405d5..d01b6bc5b5012 100644
--- a/lib/gitlab/metrics/system.rb
+++ b/lib/gitlab/metrics/system.rb
@@ -7,47 +7,37 @@ module Metrics
     # This module relies on the /proc filesystem being available. If /proc is
     # not available the methods of this module will be stubbed.
     module System
-      if File.exist?('/proc')
-        # Returns the current process' memory usage in bytes.
-        def self.memory_usage
-          mem   = 0
-          match = File.read('/proc/self/status').match(/VmRSS:\s+(\d+)/)
-
-          if match && match[1]
-            mem = match[1].to_f * 1024
-          end
-
-          mem
-        end
-
-        def self.file_descriptor_count
-          Dir.glob('/proc/self/fd/*').length
-        end
-
-        def self.max_open_file_descriptors
-          match = File.read('/proc/self/limits').match(/Max open files\s*(\d+)/)
-
-          return unless match && match[1]
+      PROC_STATUS_PATH = '/proc/self/status'
+      PROC_SMAPS_ROLLUP_PATH = '/proc/self/smaps_rollup'
+      PROC_LIMITS_PATH = '/proc/self/limits'
+      PROC_FD_GLOB = '/proc/self/fd/*'
+
+      PRIVATE_PAGES_PATTERN = /^(Private_Clean|Private_Dirty|Private_Hugetlb):\s+(?<value>\d+)/.freeze
+      PSS_PATTERN = /^Pss:\s+(?<value>\d+)/.freeze
+      RSS_PATTERN = /VmRSS:\s+(?<value>\d+)/.freeze
+      MAX_OPEN_FILES_PATTERN = /Max open files\s*(?<value>\d+)/.freeze
+
+      # Returns the current process' RSS (resident set size) in bytes.
+      def self.memory_usage
+        sum_matches(PROC_STATUS_PATH, rss: RSS_PATTERN)[:rss].kilobytes
+      end
 
-          match[1].to_i
-        end
-      else
-        def self.memory_usage
-          0.0
-        end
+      # Returns the current process' USS/PSS (unique/proportional set size) in bytes.
+      def self.memory_usage_uss_pss
+        sum_matches(PROC_SMAPS_ROLLUP_PATH, uss: PRIVATE_PAGES_PATTERN, pss: PSS_PATTERN)
+          .transform_values(&:kilobytes)
+      end
 
-        def self.file_descriptor_count
-          0
-        end
+      def self.file_descriptor_count
+        Dir.glob(PROC_FD_GLOB).length
+      end
 
-        def self.max_open_file_descriptors
-          0
-        end
+      def self.max_open_file_descriptors
+        sum_matches(PROC_LIMITS_PATH, max_fds: MAX_OPEN_FILES_PATTERN)[:max_fds]
       end
 
       def self.cpu_time
-        Process
-          .clock_gettime(Process::CLOCK_PROCESS_CPUTIME_ID, :float_second)
+        Process.clock_gettime(Process::CLOCK_PROCESS_CPUTIME_ID, :float_second)
       end
 
       # Returns the current real time in a given precision.
@@ -78,6 +68,27 @@ def self.thread_cpu_duration(start_time)
 
         end_time - start_time
       end
+
+      # Given a path to a file in /proc and a hash of (metric, pattern) pairs,
+      # sums up all values found for those patterns under the respective metric.
+      def self.sum_matches(proc_file, **patterns)
+        results = patterns.transform_values { 0 }
+
+        begin
+          File.foreach(proc_file) do |line|
+            patterns.each do |metric, pattern|
+              match = line.match(pattern)
+              value = match&.named_captures&.fetch('value', 0)
+              results[metric] += value.to_i
+            end
+          end
+        rescue Errno::ENOENT
+          # This means the procfile we're reading from did not exist;
+          # this is safe to ignore, since we initialize each metric to 0
+        end
+
+        results
+      end
     end
   end
 end
diff --git a/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb b/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb
index 8c4071a7ed169..9d8ec2d9b2153 100644
--- a/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb
+++ b/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb
@@ -19,20 +19,19 @@
   end
 
   describe '#sample' do
-    it 'samples various statistics' do
-      expect(Gitlab::Metrics::System).to receive(:cpu_time)
-      expect(Gitlab::Metrics::System).to receive(:file_descriptor_count)
-      expect(Gitlab::Metrics::System).to receive(:memory_usage)
-      expect(Gitlab::Metrics::System).to receive(:max_open_file_descriptors)
-      expect(sampler).to receive(:sample_gc)
+    it 'adds a metric containing the process resident memory bytes' do
+      expect(Gitlab::Metrics::System).to receive(:memory_usage).and_return(9000)
+
+      expect(sampler.metrics[:process_resident_memory_bytes]).to receive(:set).with({}, 9000)
 
       sampler.sample
     end
 
-    it 'adds a metric containing the process resident memory bytes' do
-      expect(Gitlab::Metrics::System).to receive(:memory_usage).and_return(9000)
+    it 'adds a metric containing the process unique and proportional memory bytes' do
+      expect(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).and_return(uss: 9000, pss: 10_000)
 
-      expect(sampler.metrics[:process_resident_memory_bytes]).to receive(:set).with({}, 9000)
+      expect(sampler.metrics[:process_unique_memory_bytes]).to receive(:set).with({}, 9000)
+      expect(sampler.metrics[:process_proportional_memory_bytes]).to receive(:set).with({}, 10_000)
 
       sampler.sample
     end
diff --git a/spec/lib/gitlab/metrics/system_spec.rb b/spec/lib/gitlab/metrics/system_spec.rb
index a5aa80686fdf9..37d26bd9d63e2 100644
--- a/spec/lib/gitlab/metrics/system_spec.rb
+++ b/spec/lib/gitlab/metrics/system_spec.rb
@@ -3,33 +3,122 @@
 require 'spec_helper'
 
 describe Gitlab::Metrics::System do
-  if File.exist?('/proc')
+  context 'when /proc files exist' do
+    # Fixtures pulled from:
+    # Linux carbon 5.3.0-7648-generic #41~1586789791~19.10~9593806-Ubuntu SMP Mon Apr 13 17:50:40 UTC  x86_64 x86_64 x86_64 GNU/Linux
+    let(:proc_status) do
+      # most rows omitted for brevity
+      <<~SNIP
+      Name:       less
+      VmHWM:      2468 kB
+      VmRSS:      2468 kB
+      RssAnon:    260 kB
+      SNIP
+    end
+
+    let(:proc_smaps_rollup) do
+      # full snapshot
+      <<~SNIP
+      Rss:                2564 kB
+      Pss:                 503 kB
+      Pss_Anon:            312 kB
+      Pss_File:            191 kB
+      Pss_Shmem:             0 kB
+      Shared_Clean:       2100 kB
+      Shared_Dirty:          0 kB
+      Private_Clean:       152 kB
+      Private_Dirty:       312 kB
+      Referenced:         2564 kB
+      Anonymous:           312 kB
+      LazyFree:              0 kB
+      AnonHugePages:         0 kB
+      ShmemPmdMapped:        0 kB
+      Shared_Hugetlb:        0 kB
+      Private_Hugetlb:       0 kB
+      Swap:                  0 kB
+      SwapPss:               0 kB
+      Locked:                0 kB
+      SNIP
+    end
+
+    let(:proc_limits) do
+      # full snapshot
+      <<~SNIP
+      Limit                     Soft Limit           Hard Limit           Units
+      Max cpu time              unlimited            unlimited            seconds
+      Max file size             unlimited            unlimited            bytes
+      Max data size             unlimited            unlimited            bytes
+      Max stack size            8388608              unlimited            bytes
+      Max core file size        0                    unlimited            bytes
+      Max resident set          unlimited            unlimited            bytes
+      Max processes             126519               126519               processes
+      Max open files            1024                 1048576              files
+      Max locked memory         67108864             67108864             bytes
+      Max address space         unlimited            unlimited            bytes
+      Max file locks            unlimited            unlimited            locks
+      Max pending signals       126519               126519               signals
+      Max msgqueue size         819200               819200               bytes
+      Max nice priority         0                    0
+      Max realtime priority     0                    0
+      Max realtime timeout      unlimited            unlimited            us
+      SNIP
+    end
+
     describe '.memory_usage' do
-      it "returns the process' memory usage in bytes" do
-        expect(described_class.memory_usage).to be > 0
+      it "returns the process' resident set size (RSS) in bytes" do
+        mock_existing_proc_file('/proc/self/status', proc_status)
+
+        expect(described_class.memory_usage).to eq(2527232)
       end
     end
 
     describe '.file_descriptor_count' do
       it 'returns the amount of open file descriptors' do
-        expect(described_class.file_descriptor_count).to be > 0
+        expect(Dir).to receive(:glob).and_return(['/some/path', '/some/other/path'])
+
+        expect(described_class.file_descriptor_count).to eq(2)
       end
     end
 
     describe '.max_open_file_descriptors' do
       it 'returns the max allowed open file descriptors' do
-        expect(described_class.max_open_file_descriptors).to be > 0
+        mock_existing_proc_file('/proc/self/limits', proc_limits)
+
+        expect(described_class.max_open_file_descriptors).to eq(1024)
+      end
+    end
+
+    describe '.memory_usage_uss_pss' do
+      it "returns the process' unique and proportional set size (USS/PSS) in bytes" do
+        mock_existing_proc_file('/proc/self/smaps_rollup', proc_smaps_rollup)
+
+        # USS: (Private_Clean (152 kB) + Private_Dirty (312 kB) + Private_Hugetlb (0 kB)) * 1024; PSS: Pss (503 kB) * 1024
+        expect(described_class.memory_usage_uss_pss).to eq(uss: 475136, pss: 515072)
       end
     end
-  else
+  end
+
+  context 'when /proc files do not exist' do
+    before do
+      mock_missing_proc_file
+    end
+
     describe '.memory_usage' do
-      it 'returns 0.0' do
-        expect(described_class.memory_usage).to eq(0.0)
+      it 'returns 0' do
+        expect(described_class.memory_usage).to eq(0)
+      end
+    end
+
+    describe '.memory_usage_uss_pss' do
+      it "returns 0 for all components" do
+        expect(described_class.memory_usage_uss_pss).to eq(uss: 0, pss: 0)
       end
     end
 
     describe '.file_descriptor_count' do
       it 'returns 0' do
+        expect(Dir).to receive(:glob).and_return([])
+
         expect(described_class.file_descriptor_count).to eq(0)
       end
     end
@@ -98,4 +187,12 @@
       expect(described_class.thread_cpu_duration(start_time)).to be_nil
     end
   end
+
+  def mock_existing_proc_file(path, content)
+    allow(File).to receive(:foreach).with(path) { |_path, &block| content.each_line(&block) }
+  end
+
+  def mock_missing_proc_file
+    allow(File).to receive(:foreach).and_raise(Errno::ENOENT)
+  end
 end
-- 
GitLab