From adb93622cca7edbe15f5c75daaa39c9eb975e196 Mon Sep 17 00:00:00 2001
From: johnwparent <john.parent@kitware.com>
Date: Wed, 28 Jun 2023 14:51:57 -0400
Subject: [PATCH] Diffs: Model files newly returned from gitaly

Gitaly's https://gitlab.com/gitlab-org/gitaly/-/merge_requests/5680
allows for returning information about all files in
a diff, not just those that do not exceed file limitations.
Add handling for this in Gitlab to request the metadata for diffs that
exceed set limits and ensure diff collections process
the added metadata as diffs to be stored.

lib/gitlab/git/diff#Diff: Add overflow attr

gitlab/git/diff.rb#Diff: make `collect_patch_overage` a predicate method

gitlab/git/diff.rb#Diff: prune guard on overflow

gitlab/git/diff.rb#Diff: fixup typo

gitlab/git/diff_collection.rb#DiffCollection: force consideration of feature flag

gitlab/git/diff: Review formatting suggestions

gitlab/git/diff_collection: make collect_all_paths a class method

lib/gitlab/git/diff.rb: Reduce one off method useage

spec/lib/gitlab/git/diff_spec.rb: Correct constructor signature

spec/lib/gitlab/git/diff_spec.rb: rubocop

lib/gitlab/git/diff.rb: rm dangling ref

spec/lib/gitlab/git/diff_spec.rb: simulate patch pruned by gitaly
---
 .../development/collect_all_diff_paths.yml    |  8 +++++
 lib/gitlab/git/diff.rb                        | 34 ++++++++++++++++---
 lib/gitlab/git/diff_collection.rb             | 13 +++++--
 spec/lib/gitlab/git/diff_spec.rb              | 25 ++++++++++++++
 4 files changed, 73 insertions(+), 7 deletions(-)
 create mode 100644 config/feature_flags/development/collect_all_diff_paths.yml

diff --git a/config/feature_flags/development/collect_all_diff_paths.yml b/config/feature_flags/development/collect_all_diff_paths.yml
new file mode 100644
index 0000000000000..d382889fe1690
--- /dev/null
+++ b/config/feature_flags/development/collect_all_diff_paths.yml
@@ -0,0 +1,8 @@
+---
+name: collect_all_diff_paths
+introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/124907
+rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/421460
+milestone: '16.4'
+type: development
+group: "group::gitaly"
+default_enabled: false
diff --git a/lib/gitlab/git/diff.rb b/lib/gitlab/git/diff.rb
index 72f7413500f04..2dd85445602e8 100644
--- a/lib/gitlab/git/diff.rb
+++ b/lib/gitlab/git/diff.rb
@@ -81,7 +81,7 @@ def between(repo, head, base, options = {}, *paths)
         #    exceeded
         def filter_diff_options(options, default_options = {})
           allowed_options = [:ignore_whitespace_change, :max_files, :max_lines,
-                             :limits, :expanded]
+                             :limits, :expanded, :collect_all_paths]
 
           if default_options
             actual_defaults = default_options.dup
@@ -126,6 +126,10 @@ def patch_safe_limit_bytes(limit = patch_hard_limit_bytes)
           limit / 10
         end
 
+        def collect_patch_overage?
+          !!Feature.enabled?(:collect_all_diff_paths)
+        end
+
         # Returns the limit for a single diff file (patch).
         #
         # Patches surpassing this limit shouldn't be persisted in the database
@@ -198,9 +202,13 @@ def too_large?
       # This is used by `to_hash` and `init_from_hash`.
       alias_method :too_large, :too_large?
 
-      def too_large!
+      def prune!
         @diff = ''
         @line_count = 0
+      end
+
+      def too_large!
+        prune!
         @too_large = true
       end
 
@@ -211,11 +219,19 @@ def collapsed?
       end
 
       def collapse!
-        @diff = ''
-        @line_count = 0
+        prune!
         @collapsed = true
       end
 
+      def overflow?
+        return @overflow if defined?(@overflow)
+
+        # If overflow is not defined, we're
+        # not recieveing a diff from Gitaly
+        # and overflow has no meaning
+        false
+      end
+
       def json_safe_diff
         return @diff unless detect_binary?(@diff)
 
@@ -248,7 +264,7 @@ def init_from_hash(hash)
       end
 
       def init_from_gitaly(gitaly_diff)
-        @diff = gitaly_diff.respond_to?(:patch) ? encode!(gitaly_diff.patch) : ''
+        @diff = gitaly_diff.try(:patch).present? ? encode!(gitaly_diff.patch) : ''
         @new_path = encode!(gitaly_diff.to_path.dup)
         @old_path = encode!(gitaly_diff.from_path.dup)
         @a_mode = gitaly_diff.old_mode.to_s(8)
@@ -257,11 +273,19 @@ def init_from_gitaly(gitaly_diff)
         @renamed_file = gitaly_diff.from_path != gitaly_diff.to_path
         @deleted_file = gitaly_diff.to_id == BLANK_SHA
         @too_large = gitaly_diff.too_large if gitaly_diff.respond_to?(:too_large)
+        gitaly_overflow = gitaly_diff.try(:overflow_marker)
+        @overflow = Diff.collect_patch_overage? && gitaly_overflow
 
         collapse! if gitaly_diff.respond_to?(:collapsed) && gitaly_diff.collapsed
+        # Diffs exceeding limits returned from gitaly when "collect_all_paths" are enabled
+        # are already pruned, but should be "collapsed" as they have no content
+        @collapsed = true if @overflow
       end
 
       def prune_diff_if_eligible
+        # If we have overflow, diffs are already pruned, retain line counts
+        return if overflow?
+
         if too_large?
           ::Gitlab::Metrics.add_event(:patch_hard_limit_bytes_hit)
 
diff --git a/lib/gitlab/git/diff_collection.rb b/lib/gitlab/git/diff_collection.rb
index b4dd880ceb7d7..c021268a62a5f 100644
--- a/lib/gitlab/git/diff_collection.rb
+++ b/lib/gitlab/git/diff_collection.rb
@@ -13,6 +13,10 @@ def self.default_limits
         { max_files: ::Commit.diff_safe_max_files, max_lines: ::Commit.diff_safe_max_lines }
       end
 
+      def self.collect_all_paths?(collect_all_paths)
+        Gitlab::Git::Diff.collect_patch_overage? ? collect_all_paths : false
+      end
+
       def self.limits(options = {})
         limits = {}
         defaults = default_limits
@@ -25,7 +29,7 @@ def self.limits(options = {})
         limits[:safe_max_bytes] = limits[:safe_max_files] * 5.kilobytes # Average 5 KB per file
         limits[:max_patch_bytes] = Gitlab::Git::Diff.patch_hard_limit_bytes
         limits[:max_patch_bytes_for_file_extension] = options.fetch(:max_patch_bytes_for_file_extension, {})
-
+        limits[:collect_all_paths] = collect_all_paths?(options.fetch(:collect_all_paths, false))
         limits
       end
 
@@ -164,7 +168,12 @@ def each_gitaly_patch
 
           if raw.overflow_marker
             @overflow = true
-            break
+            # If we're requesting patches with `collect_all_paths` enabled, then
+            # Once we hit the overflow marker, gitlay has still returned diffs, just without
+            # patches, only metadata
+            unless @limits[:collect_all_paths]
+              break
+            end
           end
 
           yield @array[i] = diff
diff --git a/spec/lib/gitlab/git/diff_spec.rb b/spec/lib/gitlab/git/diff_spec.rb
index 6745c700b92ef..4d78e194da87a 100644
--- a/spec/lib/gitlab/git/diff_spec.rb
+++ b/spec/lib/gitlab/git/diff_spec.rb
@@ -131,6 +131,31 @@
           expect(diff.diff).to be_utf8
         end
       end
+
+      context 'using a diff that it too large but collecting all paths' do
+        let(:gitaly_diff) do
+          Gitlab::GitalyClient::Diff.new(
+            from_path: '.gitmodules',
+            to_path: '.gitmodules',
+            old_mode: 0100644,
+            new_mode: 0100644,
+            from_id: '0792c58905eff3432b721f8c4a64363d8e28d9ae',
+            to_id: 'efd587ccb47caf5f31fc954edb21f0a713d9ecc3',
+            overflow_marker: true,
+            collapsed: false,
+            too_large: false,
+            patch: ''
+          )
+        end
+
+        let(:diff) { described_class.new(gitaly_diff) }
+
+        it 'is already pruned and collapsed but not too large' do
+          expect(diff.diff).to be_empty
+          expect(diff).not_to be_too_large
+          expect(diff).to be_collapsed
+        end
+      end
     end
 
     context 'using a Gitaly::CommitDelta' do
-- 
GitLab