From e4a7ff58142e84458a3f18e838de333c2ab9891c Mon Sep 17 00:00:00 2001
From: George Koltsov <gkoltsov@gitlab.com>
Date: Fri, 28 Feb 2025 21:07:58 +0000
Subject: [PATCH] Move Import file removal to a cron job

Update file-based Project/Group Import to remove import_file
used to do the import to a cron job and not during the import
process to allow future restarts of the process.

Changelog: other
---
 app/models/import_export_upload.rb            |  2 +
 .../groups/import_export/import_service.rb    |  2 +-
 .../import/import_file_cleanup_service.rb     | 19 +++++++
 app/workers/all_queues.yml                    | 20 +++++++
 .../import/import_file_cleanup_worker.rb      | 18 +++++++
 .../import/remove_import_file_worker.rb       | 34 ++++++++++++
 app/workers/repository_import_worker.rb       | 14 +++++
 config/initializers/1_settings.rb             |  3 ++
 config/sidekiq_queues.yml                     |  2 +
 ...port_export_uploads_on_update_at_and_id.rb | 16 ++++++
 db/schema_migrations/20250227104609           |  1 +
 db/structure.sql                              |  2 +
 lib/gitlab/import_export/importer.rb          |  3 +-
 .../lib/gitlab/import_export/importer_spec.rb |  2 +-
 spec/models/import_export_upload_spec.rb      | 10 +++-
 .../import_file_cleanup_service_spec.rb       | 35 +++++++++++++
 .../import/import_file_cleanup_worker_spec.rb | 19 +++++++
 .../import/remove_import_file_worker_spec.rb  | 52 +++++++++++++++++++
 spec/workers/repository_import_worker_spec.rb | 40 ++++++++++----
 19 files changed, 278 insertions(+), 16 deletions(-)
 create mode 100644 app/services/import/import_file_cleanup_service.rb
 create mode 100644 app/workers/gitlab/import/import_file_cleanup_worker.rb
 create mode 100644 app/workers/gitlab/import/remove_import_file_worker.rb
 create mode 100644 db/migrate/20250227104609_add_partial_index_to_import_export_uploads_on_update_at_and_id.rb
 create mode 100644 db/schema_migrations/20250227104609
 create mode 100644 spec/services/import/import_file_cleanup_service_spec.rb
 create mode 100644 spec/workers/gitlab/import/import_file_cleanup_worker_spec.rb
 create mode 100644 spec/workers/gitlab/import/remove_import_file_worker_spec.rb

diff --git a/app/models/import_export_upload.rb b/app/models/import_export_upload.rb
index ce4c1e1b91520..654f3935c0162 100644
--- a/app/models/import_export_upload.rb
+++ b/app/models/import_export_upload.rb
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 class ImportExportUpload < ApplicationRecord
+  include EachBatch
   include WithUploads
 
   belongs_to :project
@@ -21,6 +22,7 @@ class ImportExportUpload < ApplicationRecord
 
   scope :updated_before, ->(date) { where('updated_at < ?', date) }
   scope :with_export_file, -> { where.not(export_file: nil) }
+  scope :with_import_file, -> { where.not(import_file: nil) }
 
   def retrieve_upload(_identifier, paths)
     Upload.find_by(model: self, path: paths)
diff --git a/app/services/groups/import_export/import_service.rb b/app/services/groups/import_export/import_service.rb
index 1969c29121f70..7dd3b06c04479 100644
--- a/app/services/groups/import_export/import_service.rb
+++ b/app/services/groups/import_export/import_service.rb
@@ -30,6 +30,7 @@ def execute
         Gitlab::Tracking.event(self.class.name, 'create', label: 'import_group_from_file')
 
         if valid_user_permissions? && import_file && valid_import_file? && restorers.all?(&:restore)
+          remove_import_file
           notify_success
 
           Gitlab::Tracking.event(
@@ -47,7 +48,6 @@ def execute
 
       ensure
         remove_base_tmp_dir
-        remove_import_file
       end
 
       private
diff --git a/app/services/import/import_file_cleanup_service.rb b/app/services/import/import_file_cleanup_service.rb
new file mode 100644
index 0000000000000..4273029aaaa4d
--- /dev/null
+++ b/app/services/import/import_file_cleanup_service.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+module Import
+  class ImportFileCleanupService
+    LAST_MODIFIED = 72.hours
+    BATCH_SIZE = 100
+
+    def execute
+      ImportExportUpload
+        .with_import_file
+        .updated_before(LAST_MODIFIED.ago)
+        .each_batch(of: BATCH_SIZE) do |batch|
+        batch.each do |upload|
+          ::Gitlab::Import::RemoveImportFileWorker.perform_async(upload.id)
+        end
+      end
+    end
+  end
+end
diff --git a/app/workers/all_queues.yml b/app/workers/all_queues.yml
index f98e31efe9b2f..37400d360b8c0 100644
--- a/app/workers/all_queues.yml
+++ b/app/workers/all_queues.yml
@@ -636,6 +636,16 @@
   :idempotent: false
   :tags: []
   :queue_namespace: :cronjob
+- :name: cronjob:import_import_file_cleanup
+  :worker_name: Gitlab::Import::ImportFileCleanupWorker
+  :feature_category: :importers
+  :has_external_dependencies: false
+  :urgency: :low
+  :resource_boundary: :unknown
+  :weight: 1
+  :idempotent: true
+  :tags: []
+  :queue_namespace: :cronjob
 - :name: cronjob:import_stuck_project_import_jobs
   :worker_name: Gitlab::Import::StuckProjectImportJobsWorker
   :feature_category: :importers
@@ -3820,6 +3830,16 @@
   :idempotent: true
   :tags: []
   :queue_namespace:
+- :name: import_remove_import_file
+  :worker_name: Gitlab::Import::RemoveImportFileWorker
+  :feature_category: :importers
+  :has_external_dependencies: false
+  :urgency: :low
+  :resource_boundary: :unknown
+  :weight: 1
+  :idempotent: true
+  :tags: []
+  :queue_namespace:
 - :name: import_user_mapping_assignment_from_csv
   :worker_name: Import::UserMapping::AssignmentFromCsvWorker
   :feature_category: :importers
diff --git a/app/workers/gitlab/import/import_file_cleanup_worker.rb b/app/workers/gitlab/import/import_file_cleanup_worker.rb
new file mode 100644
index 0000000000000..9180df79b6042
--- /dev/null
+++ b/app/workers/gitlab/import/import_file_cleanup_worker.rb
@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Import
+    class ImportFileCleanupWorker
+      include ApplicationWorker
+      include CronjobQueue # rubocop:disable Scalability/CronWorkerContext -- no context in this worker
+
+      idempotent!
+      feature_category :importers
+      data_consistency :sticky
+
+      def perform
+        ::Import::ImportFileCleanupService.new.execute
+      end
+    end
+  end
+end
diff --git a/app/workers/gitlab/import/remove_import_file_worker.rb b/app/workers/gitlab/import/remove_import_file_worker.rb
new file mode 100644
index 0000000000000..66a7414b186d4
--- /dev/null
+++ b/app/workers/gitlab/import/remove_import_file_worker.rb
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Import
+    class RemoveImportFileWorker
+      include ApplicationWorker
+
+      idempotent!
+      feature_category :importers
+      data_consistency :sticky
+
+      def perform(upload_id)
+        upload = ImportExportUpload.find_by_id(upload_id)
+
+        return unless upload
+
+        upload.remove_import_file!
+        upload.save!
+
+        logger.info(
+          message: 'Removed ImportExportUpload import_file',
+          project_id: upload.project_id,
+          group_id: upload.group_id
+        )
+      end
+
+      private
+
+      def logger
+        @logger ||= ::Import::Framework::Logger.build
+      end
+    end
+  end
+end
diff --git a/app/workers/repository_import_worker.rb b/app/workers/repository_import_worker.rb
index 2eabcd39f2312..fe3645fe24b2b 100644
--- a/app/workers/repository_import_worker.rb
+++ b/app/workers/repository_import_worker.rb
@@ -6,6 +6,7 @@ class RepositoryImportWorker # rubocop:disable Scalability/IdempotentWorker
   data_consistency :always
   include ExceptionBacktrace
   include ProjectStartImport
+  include Sidekiq::InterruptionsExhausted
 
   feature_category :importers
   worker_has_external_dependencies!
@@ -14,6 +15,10 @@ class RepositoryImportWorker # rubocop:disable Scalability/IdempotentWorker
   sidekiq_options status_expiration: Gitlab::Import::StuckImportJob::IMPORT_JOBS_EXPIRATION
   worker_resource_boundary :memory
 
+  sidekiq_interruptions_exhausted do |job|
+    new.perform_failure(job['args'].first)
+  end
+
   def perform(project_id)
     Gitlab::QueryLimiting.disable!('https://gitlab.com/gitlab-org/gitlab/-/issues/464677')
 
@@ -37,6 +42,15 @@ def perform(project_id)
     end
   end
 
+  def perform_failure(project_id)
+    @project = Project.find_by_id(project_id)
+    import_export_upload = @project.import_export_uploads.find_by_user_id(project.creator.id)
+
+    fail_import('Import process reached the maximum number of interruptions')
+
+    ::Gitlab::Import::RemoveImportFileWorker.perform_async(import_export_upload.id)
+  end
+
   private
 
   attr_reader :project
diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb
index 8a02b72906abd..7df1eab32de65 100644
--- a/config/initializers/1_settings.rb
+++ b/config/initializers/1_settings.rb
@@ -536,6 +536,9 @@
 Settings.cron_jobs['import_export_project_cleanup_worker'] ||= {}
 Settings.cron_jobs['import_export_project_cleanup_worker']['cron'] ||= '0 * * * *'
 Settings.cron_jobs['import_export_project_cleanup_worker']['job_class'] = 'ImportExportProjectCleanupWorker'
+Settings.cron_jobs['gitlab_import_import_file_cleanup_worker'] ||= {}
+Settings.cron_jobs['gitlab_import_import_file_cleanup_worker']['cron'] ||= '30 * * * *'
+Settings.cron_jobs['gitlab_import_import_file_cleanup_worker']['job_class'] = 'Gitlab::Import::ImportFileCleanupWorker'
 Settings.cron_jobs['ci_archive_traces_cron_worker'] ||= {}
 Settings.cron_jobs['ci_archive_traces_cron_worker']['cron'] ||= '17 * * * *'
 Settings.cron_jobs['ci_archive_traces_cron_worker']['job_class'] = 'Ci::ArchiveTracesCronWorker'
diff --git a/config/sidekiq_queues.yml b/config/sidekiq_queues.yml
index 582d299ffc2ab..46b240129f405 100644
--- a/config/sidekiq_queues.yml
+++ b/config/sidekiq_queues.yml
@@ -471,6 +471,8 @@
   - 1
 - - import_refresh_import_jid
   - 1
+- - import_remove_import_file
+  - 1
 - - import_user_mapping_assignment_from_csv
   - 1
 - - incident_management
diff --git a/db/migrate/20250227104609_add_partial_index_to_import_export_uploads_on_update_at_and_id.rb b/db/migrate/20250227104609_add_partial_index_to_import_export_uploads_on_update_at_and_id.rb
new file mode 100644
index 0000000000000..da2861d678144
--- /dev/null
+++ b/db/migrate/20250227104609_add_partial_index_to_import_export_uploads_on_update_at_and_id.rb
@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+
+class AddPartialIndexToImportExportUploadsOnUpdateAtAndId < Gitlab::Database::Migration[2.2]
+  milestone '17.10'
+  disable_ddl_transaction!
+
+  INDEX_NAME = 'idx_import_export_uploads_updated_at_id_import_file'
+
+  def up
+    add_concurrent_index :import_export_uploads, [:updated_at, :id], where: 'import_file IS NOT NULL', name: INDEX_NAME
+  end
+
+  def down
+    remove_concurrent_index_by_name :import_export_uploads, INDEX_NAME
+  end
+end
diff --git a/db/schema_migrations/20250227104609 b/db/schema_migrations/20250227104609
new file mode 100644
index 0000000000000..f05e6f4c82dd8
--- /dev/null
+++ b/db/schema_migrations/20250227104609
@@ -0,0 +1 @@
+ce4fb636326638e05acb2a7dc0109706faa85b117232e7dd5e9f32cf07240152
\ No newline at end of file
diff --git a/db/structure.sql b/db/structure.sql
index 9fb6101a663be..aed86583716c5 100644
--- a/db/structure.sql
+++ b/db/structure.sql
@@ -31058,6 +31058,8 @@ CREATE INDEX idx_hosted_runner_usage_on_project_billing_month ON ci_gitlab_hoste
 
 CREATE UNIQUE INDEX idx_hosted_runner_usage_unique ON ci_gitlab_hosted_runner_monthly_usages USING btree (runner_id, billing_month, root_namespace_id, project_id);
 
+CREATE INDEX idx_import_export_uploads_updated_at_id_import_file ON import_export_uploads USING btree (updated_at, id) WHERE (import_file IS NOT NULL);
+
 CREATE UNIQUE INDEX idx_import_placeholder_memberships_on_source_user_group_id ON import_placeholder_memberships USING btree (source_user_id, group_id);
 
 CREATE INDEX idx_import_placeholder_memberships_on_source_user_id_and_id ON import_placeholder_memberships USING btree (source_user_id, id);
diff --git a/lib/gitlab/import_export/importer.rb b/lib/gitlab/import_export/importer.rb
index de1ffdabf7b75..0fcdd1e0460ff 100644
--- a/lib/gitlab/import_export/importer.rb
+++ b/lib/gitlab/import_export/importer.rb
@@ -19,6 +19,8 @@ def initialize(project)
 
       def execute
         if import_file && check_version! && restorers.all?(&:restore) && overwrite_project
+          remove_import_file
+
           project
         else
           raise Projects::ImportService::Error, shared.errors.to_sentence
@@ -32,7 +34,6 @@ def execute
         raise Projects::ImportService::Error, e.message
       ensure
         remove_base_tmp_dir
-        remove_import_file
       end
 
       private
diff --git a/spec/lib/gitlab/import_export/importer_spec.rb b/spec/lib/gitlab/import_export/importer_spec.rb
index 71d8a316b1fc3..9d1932e92f5d5 100644
--- a/spec/lib/gitlab/import_export/importer_spec.rb
+++ b/spec/lib/gitlab/import_export/importer_spec.rb
@@ -2,7 +2,7 @@
 
 require 'spec_helper'
 
-RSpec.describe Gitlab::ImportExport::Importer do
+RSpec.describe Gitlab::ImportExport::Importer, feature_category: :importers do
   let(:user) { create(:user) }
   let(:test_path) { "#{Dir.tmpdir}/importer_spec" }
   let(:shared) { project.import_export_shared }
diff --git a/spec/models/import_export_upload_spec.rb b/spec/models/import_export_upload_spec.rb
index 6ac7ef0611d3e..51704bed78b36 100644
--- a/spec/models/import_export_upload_spec.rb
+++ b/spec/models/import_export_upload_spec.rb
@@ -2,7 +2,7 @@
 
 require 'spec_helper'
 
-RSpec.describe ImportExportUpload do
+RSpec.describe ImportExportUpload, feature_category: :importers do
   let(:project) { create(:project) }
 
   subject(:import_export_upload) { described_class.new(project: project) }
@@ -35,7 +35,7 @@
 
   describe 'scopes' do
     let_it_be(:upload1) { create(:import_export_upload, export_file: fixture_file_upload('spec/fixtures/project_export.tar.gz')) }
-    let_it_be(:upload2) { create(:import_export_upload, export_file: nil) }
+    let_it_be(:upload2) { create(:import_export_upload, export_file: nil, import_file: fixture_file_upload('spec/fixtures/project_export.tar.gz')) }
     let_it_be(:upload3) { create(:import_export_upload, export_file: fixture_file_upload('spec/fixtures/project_export.tar.gz'), updated_at: 25.hours.ago) }
     let_it_be(:upload4) { create(:import_export_upload, export_file: nil, updated_at: 2.days.ago) }
 
@@ -45,6 +45,12 @@
       end
     end
 
+    describe '.with_import_file' do
+      it 'returns uploads with import file' do
+        expect(described_class.with_import_file).to contain_exactly(upload2)
+      end
+    end
+
     describe '.updated_before' do
       it 'returns uploads for a specified date' do
         expect(described_class.updated_before(24.hours.ago)).to contain_exactly(upload3, upload4)
diff --git a/spec/services/import/import_file_cleanup_service_spec.rb b/spec/services/import/import_file_cleanup_service_spec.rb
new file mode 100644
index 0000000000000..bc1939a449606
--- /dev/null
+++ b/spec/services/import/import_file_cleanup_service_spec.rb
@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Import::ImportFileCleanupService, feature_category: :importers do
+  subject(:service) { described_class.new }
+
+  describe '#execute' do
+    it 'enqueues a removal job for old import_file' do
+      upload = create(
+        :import_export_upload,
+        updated_at: (described_class::LAST_MODIFIED + 1.hour).ago,
+        import_file: fixture_file_upload('spec/fixtures/project_export.tar.gz')
+      )
+
+      expect(::Gitlab::Import::RemoveImportFileWorker).to receive(:perform_async).with(upload.id)
+
+      service.execute
+    end
+
+    context 'when import_file is new' do
+      it 'does not enqueue removal job' do
+        create(
+          :import_export_upload,
+          updated_at: (described_class::LAST_MODIFIED - 1.hour).ago,
+          import_file: fixture_file_upload('spec/fixtures/project_export.tar.gz')
+        )
+
+        expect(::Gitlab::Import::RemoveImportFileWorker).not_to receive(:perform_async)
+
+        service.execute
+      end
+    end
+  end
+end
diff --git a/spec/workers/gitlab/import/import_file_cleanup_worker_spec.rb b/spec/workers/gitlab/import/import_file_cleanup_worker_spec.rb
new file mode 100644
index 0000000000000..72fe32a53d156
--- /dev/null
+++ b/spec/workers/gitlab/import/import_file_cleanup_worker_spec.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Gitlab::Import::ImportFileCleanupWorker, feature_category: :importers do
+  subject(:worker) { described_class.new }
+
+  describe '#perform' do
+    it_behaves_like 'an idempotent worker'
+
+    it 'executes Import::ImportFileCleanupService' do
+      expect_next_instance_of(Import::ImportFileCleanupService) do |service|
+        expect(service).to receive(:execute)
+      end
+
+      worker.perform
+    end
+  end
+end
diff --git a/spec/workers/gitlab/import/remove_import_file_worker_spec.rb b/spec/workers/gitlab/import/remove_import_file_worker_spec.rb
new file mode 100644
index 0000000000000..b7ca4e4c74a86
--- /dev/null
+++ b/spec/workers/gitlab/import/remove_import_file_worker_spec.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Gitlab::Import::RemoveImportFileWorker, feature_category: :importers do
+  let(:upload) do
+    create(
+      :import_export_upload,
+      updated_at: 4.days.ago,
+      import_file: fixture_file_upload('spec/fixtures/project_export.tar.gz')
+    )
+  end
+
+  subject(:worker) { described_class.new }
+
+  describe '#perform' do
+    before do
+      allow_next_instance_of(::Import::Framework::Logger) do |logger|
+        allow(logger).to receive(:info)
+      end
+    end
+
+    it_behaves_like 'an idempotent worker' do
+      let(:job_args) { [upload.id] }
+    end
+
+    it 'removes import_file of the upload and logs' do
+      expect_next_instance_of(::Import::Framework::Logger) do |logger|
+        expect(logger)
+          .to receive(:info)
+          .with(
+            message: 'Removed ImportExportUpload import_file',
+            project_id: upload.project_id,
+            group_id: upload.group_id
+          )
+      end
+
+      expect { worker.perform(upload.id) }.to change { upload.reload.import_file.file.nil? }.to(true)
+    end
+
+    context 'when upload cannot be found' do
+      it 'returns' do
+        expect(ImportExportUpload).to receive(:find_by_id).with(upload.id).and_return(nil)
+        allow(upload).to receive(:remove_import_file!)
+
+        worker.perform(upload.id)
+
+        expect(upload).not_to have_received(:remove_import_file!)
+      end
+    end
+  end
+end
diff --git a/spec/workers/repository_import_worker_spec.rb b/spec/workers/repository_import_worker_spec.rb
index 44a0a0e0cf0f2..3eff7f20ffd97 100644
--- a/spec/workers/repository_import_worker_spec.rb
+++ b/spec/workers/repository_import_worker_spec.rb
@@ -3,18 +3,18 @@
 require 'spec_helper'
 
 RSpec.describe RepositoryImportWorker, feature_category: :importers do
-  describe '#perform' do
-    let(:project) { build_stubbed(:project, :import_scheduled, import_state: import_state, import_url: 'url') }
-    let(:import_state) { create(:import_state, status: :scheduled) }
-    let(:jid) { '12345678' }
-
-    before do
-      allow(subject).to receive(:jid).and_return(jid)
-      allow(Project).to receive(:find_by_id).with(project.id).and_return(project)
-      allow(project).to receive(:after_import)
-      allow(import_state).to receive(:start).and_return(true)
-    end
+  let(:project) { build_stubbed(:project, :import_scheduled, import_state: import_state, import_url: 'url') }
+  let(:import_state) { create(:import_state, status: :scheduled) }
+  let(:jid) { '12345678' }
+
+  before do
+    allow(subject).to receive(:jid).and_return(jid)
+    allow(Project).to receive(:find_by_id).with(project.id).and_return(project)
+    allow(project).to receive(:after_import)
+    allow(import_state).to receive(:start).and_return(true)
+  end
 
+  describe '#perform' do
     context 'when project not found (deleted)' do
       before do
         allow(Project).to receive(:find_by_id).with(project.id).and_return(nil)
@@ -88,4 +88,22 @@
       end
     end
   end
+
+  describe '.sidekiq_interruptions_exhausted' do
+    it 'sets import status to failed and removes import_file' do
+      user = build_stubbed(:user)
+      upload = build_stubbed(:import_export_upload, project: project, user: user)
+      job = { 'args' => [project.id] }
+
+      allow(import_state).to receive(:mark_as_failed)
+      allow(project).to receive_message_chain(:import_export_uploads, :find_by_user_id).and_return(upload)
+      expect(::Gitlab::Import::RemoveImportFileWorker).to receive(:perform_async).with(upload.id)
+
+      described_class.interruptions_exhausted_block.call(job)
+
+      expect(import_state)
+        .to have_received(:mark_as_failed)
+        .with('Import process reached the maximum number of interruptions')
+    end
+  end
 end
-- 
GitLab