From a4fe627e5da23a4bfae2aa0ad58f43219e587787 Mon Sep 17 00:00:00 2001
From: David Fernandez <dfernandez@gitlab.com>
Date: Fri, 11 Feb 2022 18:03:48 +0100
Subject: [PATCH] Guard worker will check ongoing migrations

By pinging the Container Registry API. If the status returned is
coeherent with an ongoing migration, the job will skip the migration.
The migration is aborted otherwise.
---
 app/models/container_repository.rb            | 10 +++-
 .../migration/guard_worker.rb                 | 56 +++++++++++++++++-
 lib/container_registry/gitlab_api_client.rb   |  4 ++
 lib/container_registry/migration.rb           |  5 ++
 spec/models/container_repository_spec.rb      | 10 ++++
 .../migration/guard_worker_spec.rb            | 59 +++++++++++++++++--
 6 files changed, 134 insertions(+), 10 deletions(-)

diff --git a/app/models/container_repository.rb b/app/models/container_repository.rb
index 7e28c9e079f60..38bd8143bf776 100644
--- a/app/models/container_repository.rb
+++ b/app/models/container_repository.rb
@@ -274,9 +274,7 @@ def finish_pre_import_and_start_import
   def retry_aborted_migration
     return unless migration_state == 'import_aborted'
 
-    import_status = gitlab_api_client.import_status(self.path)
-
-    case import_status
+    case external_import_status
     when 'native'
       raise NativeImportError
     when 'import_in_progress'
@@ -322,6 +320,12 @@ def last_import_step_done_at
     [migration_pre_import_done_at, migration_import_done_at, migration_aborted_at].compact.max
   end
 
+  def external_import_status
+    strong_memoize(:import_status) do
+      gitlab_api_client.import_status(self.path)
+    end
+  end
+
   # rubocop: disable CodeReuse/ServiceClass
   def registry
     @registry ||= begin
diff --git a/app/workers/container_registry/migration/guard_worker.rb b/app/workers/container_registry/migration/guard_worker.rb
index 1237b6058e47f..77ae111c1cb84 100644
--- a/app/workers/container_registry/migration/guard_worker.rb
+++ b/app/workers/container_registry/migration/guard_worker.rb
@@ -21,18 +21,68 @@ def perform
 
         repositories = ::ContainerRepository.with_stale_migration(step_before_timestamp)
                                             .limit(max_capacity)
+        aborts_count = 0
+        long_running_migration_ids = []
 
         # the #to_a is safe as the amount of entries is limited.
         # In addition, we're calling #each in the next line and we don't want two different SQL queries for these two lines
         log_extra_metadata_on_done(:stale_migrations_count, repositories.to_a.size)
 
         repositories.each do |repository|
-          repository.abort_import
+          if abortable?(repository)
+            repository.abort_import
+            aborts_count += 1
+          else
+            long_running_migration_ids << repository.id if long_running_migration?(repository)
+          end
+        end
+
+        log_extra_metadata_on_done(:aborted_stale_migrations_count, aborts_count)
+
+        if long_running_migration_ids.any?
+          log_extra_metadata_on_done(:long_running_stale_migration_container_repository_ids, long_running_migration_ids)
         end
       end
 
       private
 
+      # This can ping the Container Registry API.
+      # We loop on a set of repositories to calls this function (see #perform)
+      # In the worst case scenario, we have a n+1 API calls situation here.
+      #
+      # This is reasonable because the maximum amount of repositories looped
+      # on is `25`. See ::ContainerRegistry::Migration.capacity.
+      #
+      # TODO We can remove this n+1 situation by having a Container Registry API
+      # endpoint that accepts multiple repository paths at once. This is issue
+      # https://gitlab.com/gitlab-org/container-registry/-/issues/582
+      def abortable?(repository)
+        # early return to save one Container Registry API request
+        return true unless repository.importing? || repository.pre_importing?
+        return true unless external_migration_in_progress?(repository)
+
+        false
+      end
+
+      def long_running_migration?(repository)
+        migration_start_timestamp(repository).before?(long_running_migration_threshold)
+      end
+
+      def external_migration_in_progress?(repository)
+        status = repository.external_import_status
+
+        (status == 'pre_import_in_progress' && repository.pre_importing?) ||
+          (status == 'import_in_progress' && repository.importing?)
+      end
+
+      def migration_start_timestamp(repository)
+        if repository.pre_importing?
+          repository.migration_pre_import_started_at
+        else
+          repository.migration_import_started_at
+        end
+      end
+
       def step_before_timestamp
         ::ContainerRegistry::Migration.max_step_duration.seconds.ago
       end
@@ -42,6 +92,10 @@ def max_capacity
         # is not properly applied
         ::ContainerRegistry::Migration.capacity * 2
       end
+
+      def long_running_migration_threshold
+        @threshold ||= 30.minutes.ago
+      end
     end
   end
 end
diff --git a/lib/container_registry/gitlab_api_client.rb b/lib/container_registry/gitlab_api_client.rb
index aa3d8ff47c431..20b8e1d419b74 100644
--- a/lib/container_registry/gitlab_api_client.rb
+++ b/lib/container_registry/gitlab_api_client.rb
@@ -25,6 +25,7 @@ def self.supports_gitlab_api?
       end
     end
 
+    # https://gitlab.com/gitlab-org/container-registry/-/blob/master/docs-gitlab/api.md#compliance-check
     def supports_gitlab_api?
       strong_memoize(:supports_gitlab_api) do
         registry_features = Gitlab::CurrentSettings.container_registry_features || []
@@ -35,16 +36,19 @@ def supports_gitlab_api?
       end
     end
 
+    # https://gitlab.com/gitlab-org/container-registry/-/blob/master/docs-gitlab/api.md#import-repository
     def pre_import_repository(path)
       response = start_import_for(path, pre: true)
       IMPORT_RESPONSES.fetch(response.status, :error)
     end
 
+    # https://gitlab.com/gitlab-org/container-registry/-/blob/master/docs-gitlab/api.md#import-repository
     def import_repository(path)
       response = start_import_for(path, pre: false)
       IMPORT_RESPONSES.fetch(response.status, :error)
     end
 
+    # https://gitlab.com/gitlab-org/container-registry/-/blob/master/docs-gitlab/api.md#get-repository-import-status
     def import_status(path)
       body_hash = response_body(faraday.get(import_url_for(path)))
       body_hash['status'] || 'error'
diff --git a/lib/container_registry/migration.rb b/lib/container_registry/migration.rb
index 322804418eb05..b03c94e5ebfcb 100644
--- a/lib/container_registry/migration.rb
+++ b/lib/container_registry/migration.rb
@@ -34,6 +34,11 @@ def self.enqueue_waiting_time
     end
 
     def self.capacity
+      # Increasing capacity numbers will increase the n+1 API calls we can have
+      # in ContainerRegistry::Migration::GuardWorker#external_migration_in_progress?
+      #
+      # TODO: See https://gitlab.com/gitlab-org/container-registry/-/issues/582
+      #
       return 25 if Feature.enabled?(:container_registry_migration_phase2_capacity_25)
       return 10 if Feature.enabled?(:container_registry_migration_phase2_capacity_10)
       return 1 if Feature.enabled?(:container_registry_migration_phase2_capacity_1)
diff --git a/spec/models/container_repository_spec.rb b/spec/models/container_repository_spec.rb
index 77de9e83f0a0d..3524c810ff455 100644
--- a/spec/models/container_repository_spec.rb
+++ b/spec/models/container_repository_spec.rb
@@ -1179,6 +1179,16 @@ def repository_creation_race(path)
     end
   end
 
+  describe '#external_import_status' do
+    subject { repository.external_import_status }
+
+    it 'returns the response from the client' do
+      expect(repository.gitlab_api_client).to receive(:import_status).with(repository.path).and_return('test')
+
+      expect(subject).to eq('test')
+    end
+  end
+
   describe '.with_stale_migration' do
     let_it_be(:repository) { create(:container_repository) }
     let_it_be(:stale_pre_importing_old_timestamp) { create(:container_repository, :pre_importing, migration_pre_import_started_at: 10.minutes.ago) }
diff --git a/spec/workers/container_registry/migration/guard_worker_spec.rb b/spec/workers/container_registry/migration/guard_worker_spec.rb
index 480e8adbd5cac..7d1df320d4eb5 100644
--- a/spec/workers/container_registry/migration/guard_worker_spec.rb
+++ b/spec/workers/container_registry/migration/guard_worker_spec.rb
@@ -26,11 +26,30 @@
         allow(::Gitlab).to receive(:com?).and_return(true)
       end
 
+      shared_examples 'not aborting any migration' do
+        it 'will not abort the migration' do
+          expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 1)
+          expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_stale_migrations_count, 0)
+          expect(worker).to receive(:log_extra_metadata_on_done).with(:long_running_stale_migration_container_repository_ids, [stale_migration.id])
+
+          expect { subject }
+              .to not_change(pre_importing_migrations, :count)
+              .and not_change(pre_import_done_migrations, :count)
+              .and not_change(importing_migrations, :count)
+              .and not_change(import_done_migrations, :count)
+              .and not_change(import_aborted_migrations, :count)
+              .and not_change { stale_migration.reload.migration_state }
+              .and not_change { ongoing_migration.migration_state }
+        end
+      end
+
       context 'with no stale migrations' do
         it_behaves_like 'an idempotent worker'
 
         it 'will not update any migration state' do
           expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 0)
+          expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_stale_migrations_count, 0)
+
           expect { subject }
             .to not_change(pre_importing_migrations, :count)
             .and not_change(pre_import_done_migrations, :count)
@@ -41,10 +60,19 @@
 
       context 'with pre_importing stale migrations' do
         let(:ongoing_migration) { create(:container_repository, :pre_importing) }
-        let(:stale_migration) { create(:container_repository, :pre_importing, migration_pre_import_started_at: 10.minutes.ago) }
+        let(:stale_migration) { create(:container_repository, :pre_importing, migration_pre_import_started_at: 35.minutes.ago) }
+        let(:import_status) { 'test' }
+
+        before do
+          allow_next_instance_of(ContainerRegistry::GitlabApiClient) do |client|
+            allow(client).to receive(:import_status).and_return(import_status)
+          end
+        end
 
         it 'will abort the migration' do
           expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 1)
+          expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_stale_migrations_count, 1)
+
           expect { subject }
               .to change(pre_importing_migrations, :count).by(-1)
               .and not_change(pre_import_done_migrations, :count)
@@ -54,18 +82,26 @@
               .and change { stale_migration.reload.migration_state }.from('pre_importing').to('import_aborted')
               .and not_change { ongoing_migration.migration_state }
         end
+
+        context 'the client returns pre_import_in_progress' do
+          let(:import_status) { 'pre_import_in_progress' }
+
+          it_behaves_like 'not aborting any migration'
+        end
       end
 
       context 'with pre_import_done stale migrations' do
         let(:ongoing_migration) { create(:container_repository, :pre_import_done) }
-        let(:stale_migration) { create(:container_repository, :pre_import_done, migration_pre_import_done_at: 10.minutes.ago) }
+        let(:stale_migration) { create(:container_repository, :pre_import_done, migration_pre_import_done_at: 35.minutes.ago) }
 
         before do
           allow(::ContainerRegistry::Migration).to receive(:max_step_duration).and_return(5.minutes)
-          expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 1)
         end
 
         it 'will abort the migration' do
+          expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 1)
+          expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_stale_migrations_count, 1)
+
           expect { subject }
               .to not_change(pre_importing_migrations, :count)
               .and change(pre_import_done_migrations, :count).by(-1)
@@ -79,14 +115,19 @@
 
       context 'with importing stale migrations' do
         let(:ongoing_migration) { create(:container_repository, :importing) }
-        let(:stale_migration) { create(:container_repository, :importing, migration_import_started_at: 10.minutes.ago) }
+        let(:stale_migration) { create(:container_repository, :importing, migration_import_started_at: 35.minutes.ago) }
+        let(:import_status) { 'test' }
 
         before do
-          allow(::ContainerRegistry::Migration).to receive(:max_step_duration).and_return(5.minutes)
-          expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 1)
+          allow_next_instance_of(ContainerRegistry::GitlabApiClient) do |client|
+            allow(client).to receive(:import_status).and_return(import_status)
+          end
         end
 
         it 'will abort the migration' do
+          expect(worker).to receive(:log_extra_metadata_on_done).with(:stale_migrations_count, 1)
+          expect(worker).to receive(:log_extra_metadata_on_done).with(:aborted_stale_migrations_count, 1)
+
           expect { subject }
               .to not_change(pre_importing_migrations, :count)
               .and not_change(pre_import_done_migrations, :count)
@@ -96,6 +137,12 @@
               .and change { stale_migration.reload.migration_state }.from('importing').to('import_aborted')
               .and not_change { ongoing_migration.migration_state }
         end
+
+        context 'the client returns import_in_progress' do
+          let(:import_status) { 'import_in_progress' }
+
+          it_behaves_like 'not aborting any migration'
+        end
       end
     end
 
-- 
GitLab