From f14647fdae4a07c3c59665576b70f847ab866c58 Mon Sep 17 00:00:00 2001
From: Stan Hu <stanhu@gmail.com>
Date: Fri, 16 Aug 2019 19:53:56 +0000
Subject: [PATCH] Expire project caches once per push instead of once per ref

Previously `ProjectCacheWorker` would be scheduled once per ref, which
would generate unnecessary I/O and load on Sidekiq, especially if many
tags or branches were pushed at once. `ProjectCacheWorker` would expire
three items:

1. Repository size: This only needs to be updated once per push.
2. Commit count: This only needs to be updated if the default branch
   is updated.
3. Project method caches: This only needs to be updated if the default
   branch changes, but only if certain files change (e.g. README,
   CHANGELOG, etc.).

Because the third item requires looking at the actual changes in the
commit deltas, we schedule one `ProjectCacheWorker` to handle the first
two cases, and schedule a separate `ProjectCacheWorker` for the third
case if it is needed. As a result, this brings down the number of
`ProjectCacheWorker` jobs from N to 2.

Closes https://gitlab.com/gitlab-org/gitlab-ce/issues/52046
---
 app/models/repository.rb                      |  8 ++-
 app/services/git/base_hooks_service.rb        | 12 ++--
 app/workers/post_receive.rb                   | 30 ++++++--
 app/workers/project_cache_worker.rb           |  6 +-
 .../sh-post-receive-cache-clear-once.yml      |  5 ++
 lib/gitlab/git_post_receive.rb                | 11 +++
 spec/lib/gitlab/git_post_receive_spec.rb      | 45 +++++++++++-
 spec/models/repository_spec.rb                | 20 +++++-
 .../services/git/branch_hooks_service_spec.rb | 10 ++-
 spec/workers/post_receive_spec.rb             | 71 +++++++++++++++++--
 spec/workers/project_cache_worker_spec.rb     | 10 +++
 11 files changed, 201 insertions(+), 27 deletions(-)
 create mode 100644 changelogs/unreleased/sh-post-receive-cache-clear-once.yml

diff --git a/app/models/repository.rb b/app/models/repository.rb
index 9d45a12fa6e4..6f63cd32da49 100644
--- a/app/models/repository.rb
+++ b/app/models/repository.rb
@@ -389,11 +389,15 @@ def expire_content_cache
     expire_statistics_caches
   end
 
-  # Runs code after a repository has been created.
-  def after_create
+  def expire_status_cache
     expire_exists_cache
     expire_root_ref_cache
     expire_emptiness_caches
+  end
+
+  # Runs code after a repository has been created.
+  def after_create
+    expire_status_cache
 
     repository_event(:create_repository)
   end
diff --git a/app/services/git/base_hooks_service.rb b/app/services/git/base_hooks_service.rb
index 1db18fcf4015..3fd384441960 100644
--- a/app/services/git/base_hooks_service.rb
+++ b/app/services/git/base_hooks_service.rb
@@ -8,8 +8,6 @@ class BaseHooksService < ::BaseService
     PROCESS_COMMIT_LIMIT = 100
 
     def execute
-      project.repository.after_create if project.empty_repo?
-
       create_events
       create_pipelines
       execute_project_hooks
@@ -70,11 +68,11 @@ def execute_project_hooks
     end
 
     def enqueue_invalidate_cache
-      ProjectCacheWorker.perform_async(
-        project.id,
-        invalidated_file_types,
-        [:commit_count, :repository_size]
-      )
+      file_types = invalidated_file_types
+
+      return unless file_types.present?
+
+      ProjectCacheWorker.perform_async(project.id, file_types, [], false)
     end
 
     def base_params
diff --git a/app/workers/post_receive.rb b/app/workers/post_receive.rb
index 622bd6f1f48a..61d34981458c 100644
--- a/app/workers/post_receive.rb
+++ b/app/workers/post_receive.rb
@@ -42,10 +42,8 @@ def process_project_changes(post_received)
     user = identify_user(post_received)
     return false unless user
 
-    # Expire the branches cache so we have updated data for this push
-    post_received.project.repository.expire_branches_cache if post_received.includes_branches?
-    # We only need to expire tags once per push
-    post_received.project.repository.expire_caches_for_tags if post_received.includes_tags?
+    # We only need to expire certain caches once per push
+    expire_caches(post_received)
 
     post_received.enum_for(:changes_refs).with_index do |(oldrev, newrev, ref), index|
       service_klass =
@@ -74,6 +72,30 @@ def process_project_changes(post_received)
     after_project_changes_hooks(post_received, user, refs.to_a, changes)
   end
 
+  # Expire the project, branch, and tag cache once per push. Schedule an
+  # update for the repository size and commit count if necessary.
+  def expire_caches(post_received)
+    project = post_received.project
+
+    project.repository.expire_status_cache if project.empty_repo?
+    project.repository.expire_branches_cache if post_received.includes_branches?
+    project.repository.expire_caches_for_tags if post_received.includes_tags?
+
+    enqueue_repository_cache_update(post_received)
+  end
+
+  def enqueue_repository_cache_update(post_received)
+    stats_to_invalidate = [:repository_size]
+    stats_to_invalidate << :commit_count if post_received.includes_default_branch?
+
+    ProjectCacheWorker.perform_async(
+      post_received.project.id,
+      [],
+      stats_to_invalidate,
+      true
+    )
+  end
+
   def after_project_changes_hooks(post_received, user, refs, changes)
     hook_data = Gitlab::DataBuilder::Repository.update(post_received.project, user, changes, refs)
     SystemHooksService.new.execute_hooks(hook_data, :repository_update_hooks)
diff --git a/app/workers/project_cache_worker.rb b/app/workers/project_cache_worker.rb
index 4e8ea9031397..5ac860c93e00 100644
--- a/app/workers/project_cache_worker.rb
+++ b/app/workers/project_cache_worker.rb
@@ -12,13 +12,15 @@ class ProjectCacheWorker
   #         CHANGELOG.
   # statistics - An Array containing columns from ProjectStatistics to
   #              refresh, if empty all columns will be refreshed
+  # refresh_statistics - A boolean that determines whether project statistics should
+  #                     be updated.
   # rubocop: disable CodeReuse/ActiveRecord
-  def perform(project_id, files = [], statistics = [])
+  def perform(project_id, files = [], statistics = [], refresh_statistics = true)
     project = Project.find_by(id: project_id)
 
     return unless project
 
-    update_statistics(project, statistics)
+    update_statistics(project, statistics) if refresh_statistics
 
     return unless project.repository.exists?
 
diff --git a/changelogs/unreleased/sh-post-receive-cache-clear-once.yml b/changelogs/unreleased/sh-post-receive-cache-clear-once.yml
new file mode 100644
index 000000000000..b677adf78d93
--- /dev/null
+++ b/changelogs/unreleased/sh-post-receive-cache-clear-once.yml
@@ -0,0 +1,5 @@
+---
+title: Expire project caches once per push instead of once per ref
+merge_request: 31876
+author:
+type: performance
diff --git a/lib/gitlab/git_post_receive.rb b/lib/gitlab/git_post_receive.rb
index 24d752b8a4b7..2a8bcd015a8c 100644
--- a/lib/gitlab/git_post_receive.rb
+++ b/lib/gitlab/git_post_receive.rb
@@ -39,6 +39,17 @@ def includes_tags?
       end
     end
 
+    def includes_default_branch?
+      # If the branch doesn't have a default branch yet, we presume the
+      # first branch pushed will be the default.
+      return true unless project.default_branch.present?
+
+      enum_for(:changes_refs).any? do |_oldrev, _newrev, ref|
+        Gitlab::Git.branch_ref?(ref) &&
+          Gitlab::Git.branch_name(ref) == project.default_branch
+      end
+    end
+
     private
 
     def deserialize_changes(changes)
diff --git a/spec/lib/gitlab/git_post_receive_spec.rb b/spec/lib/gitlab/git_post_receive_spec.rb
index 4c20d945585b..f0df3794e290 100644
--- a/spec/lib/gitlab/git_post_receive_spec.rb
+++ b/spec/lib/gitlab/git_post_receive_spec.rb
@@ -3,7 +3,7 @@
 require 'spec_helper'
 
 describe ::Gitlab::GitPostReceive do
-  let(:project) { create(:project) }
+  set(:project) { create(:project, :repository) }
 
   subject { described_class.new(project, "project-#{project.id}", changes.dup, {}) }
 
@@ -92,4 +92,47 @@
       end
     end
   end
+
+  describe '#includes_default_branch?' do
+    context 'with no default branch' do
+      let(:changes) do
+        <<~EOF
+          654321 210987 refs/heads/test1
+          654322 210986 refs/tags/#{project.default_branch}
+          654323 210985 refs/heads/test3
+        EOF
+      end
+
+      it 'returns false' do
+        expect(subject.includes_default_branch?).to be_falsey
+      end
+    end
+
+    context 'with a project with no default branch' do
+      let(:changes) do
+        <<~EOF
+          654321 210987 refs/heads/test1
+        EOF
+      end
+
+      it 'returns true' do
+        expect(project).to receive(:default_branch).and_return(nil)
+        expect(subject.includes_default_branch?).to be_truthy
+      end
+    end
+
+    context 'with default branch' do
+      let(:changes) do
+        <<~EOF
+          654322 210986 refs/heads/test1
+          654321 210987 refs/tags/test2
+          654323 210985 refs/heads/#{project.default_branch}
+        EOF
+      end
+
+      it 'returns true' do
+        expect(subject.includes_default_branch?).to be_truthy
+      end
+    end
+  end
 end
diff --git a/spec/models/repository_spec.rb b/spec/models/repository_spec.rb
index e68de2e73a8d..419e1dc2459a 100644
--- a/spec/models/repository_spec.rb
+++ b/spec/models/repository_spec.rb
@@ -1815,22 +1815,36 @@ def mock_gitaly(second_response)
   end
 
   describe '#after_create' do
+    it 'calls expire_status_cache' do
+      expect(repository).to receive(:expire_status_cache)
+
+      repository.after_create
+    end
+
+    it 'logs an event' do
+      expect(repository).to receive(:repository_event).with(:create_repository)
+
+      repository.after_create
+    end
+  end
+
+  describe '#expire_status_cache' do
     it 'flushes the exists cache' do
       expect(repository).to receive(:expire_exists_cache)
 
-      repository.after_create
+      repository.expire_status_cache
     end
 
     it 'flushes the root ref cache' do
       expect(repository).to receive(:expire_root_ref_cache)
 
-      repository.after_create
+      repository.expire_status_cache
     end
 
     it 'flushes the emptiness caches' do
       expect(repository).to receive(:expire_emptiness_caches)
 
-      repository.after_create
+      repository.expire_status_cache
     end
   end
 
diff --git a/spec/services/git/branch_hooks_service_spec.rb b/spec/services/git/branch_hooks_service_spec.rb
index 8af51848b7b8..3929f51a0e2f 100644
--- a/spec/services/git/branch_hooks_service_spec.rb
+++ b/spec/services/git/branch_hooks_service_spec.rb
@@ -158,9 +158,13 @@
     let(:blank_sha) { Gitlab::Git::BLANK_SHA }
 
     def clears_cache(extended: [])
-      expect(ProjectCacheWorker)
-        .to receive(:perform_async)
-        .with(project.id, extended, %i[commit_count repository_size])
+      expect(service).to receive(:invalidated_file_types).and_return(extended)
+
+      if extended.present?
+        expect(ProjectCacheWorker)
+          .to receive(:perform_async)
+          .with(project.id, extended, [], false)
+      end
 
       service.execute
     end
diff --git a/spec/workers/post_receive_spec.rb b/spec/workers/post_receive_spec.rb
index 3b69b81f12e4..c8a0c22b0e8c 100644
--- a/spec/workers/post_receive_spec.rb
+++ b/spec/workers/post_receive_spec.rb
@@ -37,6 +37,29 @@ def perform(changes: base64_changes)
   end
 
   describe "#process_project_changes" do
+    context 'with an empty project' do
+      let(:empty_project) { create(:project, :empty_repo) }
+      let(:changes) { "123456 789012 refs/heads/tést1\n" }
+
+      before do
+        allow_any_instance_of(Gitlab::GitPostReceive).to receive(:identify).and_return(empty_project.owner)
+        allow(Gitlab::GlRepository).to receive(:parse).and_return([empty_project, Gitlab::GlRepository::PROJECT])
+      end
+
+      it 'expire the status cache' do
+        expect(empty_project.repository).to receive(:expire_status_cache)
+
+        perform
+      end
+
+      it 'schedules a cache update for commit count and size' do
+        expect(ProjectCacheWorker).to receive(:perform_async)
+                                        .with(empty_project.id, [], [:repository_size, :commit_count], true)
+
+        perform
+      end
+    end
+
     context 'empty changes' do
       it "does not call any PushService but runs after project hooks" do
         expect(Git::BranchPushService).not_to receive(:new)
@@ -67,15 +90,22 @@ def perform(changes: base64_changes)
       context "branches" do
         let(:changes) do
           <<~EOF
-            '123456 789012 refs/heads/tést1'
-            '123456 789012 refs/heads/tést2'
+            123456 789012 refs/heads/tést1
+            123456 789012 refs/heads/tést2
           EOF
         end
 
         it 'expires the branches cache' do
           expect(project.repository).to receive(:expire_branches_cache).once
 
-          described_class.new.perform(gl_repository, key_id, base64_changes)
+          perform
+        end
+
+        it 'expires the status cache' do
+          expect(project).to receive(:empty_repo?).and_return(true)
+          expect(project.repository).to receive(:expire_status_cache)
+
+          perform
         end
 
         it 'calls Git::BranchPushService' do
@@ -87,6 +117,30 @@ def perform(changes: base64_changes)
 
           perform
         end
+
+        it 'schedules a cache update for repository size only' do
+          expect(ProjectCacheWorker).to receive(:perform_async)
+                                          .with(project.id, [], [:repository_size], true)
+
+          perform
+        end
+
+        context 'with a default branch' do
+          let(:changes) do
+            <<~EOF
+              123456 789012 refs/heads/tést1
+              123456 789012 refs/heads/tést2
+              678912 123455 refs/heads/#{project.default_branch}
+            EOF
+          end
+
+          it 'schedules a cache update for commit count and size' do
+            expect(ProjectCacheWorker).to receive(:perform_async)
+                                            .with(project.id, [], [:repository_size, :commit_count], true)
+
+            perform
+          end
+        end
       end
 
       context "tags" do
@@ -107,7 +161,7 @@ def perform(changes: base64_changes)
         it 'does not expire branches cache' do
           expect(project.repository).not_to receive(:expire_branches_cache)
 
-          described_class.new.perform(gl_repository, key_id, base64_changes)
+          perform
         end
 
         it "only invalidates tags once" do
@@ -115,7 +169,7 @@ def perform(changes: base64_changes)
           expect(project.repository).to receive(:expire_caches_for_tags).once.and_call_original
           expect(project.repository).to receive(:expire_tags_cache).once.and_call_original
 
-          described_class.new.perform(gl_repository, key_id, base64_changes)
+          perform
         end
 
         it "calls Git::TagPushService" do
@@ -129,6 +183,13 @@ def perform(changes: base64_changes)
 
           perform
         end
+
+        it 'schedules a single ProjectCacheWorker update' do
+          expect(ProjectCacheWorker).to receive(:perform_async)
+                                          .with(project.id, [], [:repository_size], true)
+
+          perform
+        end
       end
 
       context "merge-requests" do
diff --git a/spec/workers/project_cache_worker_spec.rb b/spec/workers/project_cache_worker_spec.rb
index edc55920b8ee..7f3c4881b89f 100644
--- a/spec/workers/project_cache_worker_spec.rb
+++ b/spec/workers/project_cache_worker_spec.rb
@@ -49,6 +49,16 @@
         worker.perform(project.id, %w(readme))
       end
 
+      context 'with statistics disabled' do
+        let(:statistics) { [] }
+
+        it 'does not update the project statistics' do
+          expect(worker).not_to receive(:update_statistics)
+
+          worker.perform(project.id, [], [], false)
+        end
+      end
+
       context 'with statistics' do
         let(:statistics) { %w(repository_size) }
 
-- 
GitLab