diff --git a/app/services/bulk_imports/file_download_service.rb b/app/services/bulk_imports/file_download_service.rb index a2c8ba5b1cdef469876226c45c302205a1fbc2c7..45f1350df92acb014de821a3318fd6d227d39fac 100644 --- a/app/services/bulk_imports/file_download_service.rb +++ b/app/services/bulk_imports/file_download_service.rb @@ -10,10 +10,11 @@ # @param filename [String] Name of the file to download, if known. Use remote filename if none given. module BulkImports class FileDownloadService + include ::BulkImports::FileDownloads::FilenameFetch + include ::BulkImports::FileDownloads::Validations + ServiceError = Class.new(StandardError) - REMOTE_FILENAME_PATTERN = %r{filename="(?<filename>[^"]+)"}.freeze - FILENAME_SIZE_LIMIT = 255 # chars before the extension DEFAULT_FILE_SIZE_LIMIT = 5.gigabytes DEFAULT_ALLOWED_CONTENT_TYPES = %w(application/gzip application/octet-stream).freeze @@ -74,6 +75,10 @@ def download_file raise e end + def raise_error(message) + raise ServiceError, message + end + def http_client @http_client ||= BulkImports::Clients::HTTP.new( url: configuration.url, @@ -85,24 +90,20 @@ def allow_local_requests? ::Gitlab::CurrentSettings.allow_local_requests_from_web_hooks_and_services? end - def headers - @headers ||= http_client.head(relative_url).headers - end - - def validate_filepath - Gitlab::Utils.check_path_traversal!(filepath) + def response_headers + @response_headers ||= http_client.head(relative_url).headers end def validate_tmpdir Gitlab::Utils.check_allowed_absolute_path!(tmpdir, [Dir.tmpdir]) end - def validate_symlink - if File.lstat(filepath).symlink? - File.delete(filepath) + def filepath + @filepath ||= File.join(@tmpdir, filename) + end - raise(ServiceError, 'Invalid downloaded file') - end + def filename + @filename.presence || remote_filename end def validate_url @@ -113,61 +114,5 @@ def validate_url schemes: %w(http https) ) end - - def validate_content_length - validate_size!(headers['content-length']) - end - - def validate_size!(size) - if size.blank? - raise ServiceError, 'Missing content-length header' - elsif size.to_i > file_size_limit - raise ServiceError, "File size %{size} exceeds limit of %{limit}" % { - size: ActiveSupport::NumberHelper.number_to_human_size(size), - limit: ActiveSupport::NumberHelper.number_to_human_size(file_size_limit) - } - end - end - - def validate_content_type - content_type = headers['content-type'] - - raise(ServiceError, 'Invalid content type') if content_type.blank? || allowed_content_types.exclude?(content_type) - end - - def filepath - @filepath ||= File.join(@tmpdir, filename) - end - - def filename - @filename.presence || remote_filename - end - - # Fetch the remote filename information from the request content-disposition header - # - Raises if the filename does not exist - # - If the filename is longer then 255 chars truncate it - # to be a total of 255 chars (with the extension) - def remote_filename - @remote_filename ||= - headers['content-disposition'].to_s - .match(REMOTE_FILENAME_PATTERN) # matches the filename pattern - .then { |match| match&.named_captures || {} } # ensures the match is a hash - .fetch('filename') # fetches the 'filename' key or raise KeyError - .then(&File.method(:basename)) # Ensures to remove path from the filename (../ for instance) - .then(&method(:ensure_filename_size)) # Ensures the filename is within the FILENAME_SIZE_LIMIT - rescue KeyError - raise ServiceError, 'Remote filename not provided in content-disposition header' - end - - def ensure_filename_size(filename) - if filename.length <= FILENAME_SIZE_LIMIT - filename - else - extname = File.extname(filename) - basename = File.basename(filename, extname)[0, FILENAME_SIZE_LIMIT] - - "#{basename}#{extname}" - end - end end end diff --git a/app/workers/all_queues.yml b/app/workers/all_queues.yml index a1958f1b540b169ff02b7a1e76aa28eaedeb86eb..e62e7019a112055351c9135f1d9367d41ef821a8 100644 --- a/app/workers/all_queues.yml +++ b/app/workers/all_queues.yml @@ -1083,6 +1083,15 @@ :weight: 1 :idempotent: false :tags: [] +- :name: github_importer:github_import_import_release_attachments + :worker_name: Gitlab::GithubImport::ImportReleaseAttachmentsWorker + :feature_category: :importers + :has_external_dependencies: true + :urgency: :low + :resource_boundary: :unknown + :weight: 1 + :idempotent: false + :tags: [] - :name: github_importer:github_import_refresh_import_jid :worker_name: Gitlab::GithubImport::RefreshImportJidWorker :feature_category: :importers @@ -1101,6 +1110,15 @@ :weight: 1 :idempotent: false :tags: [] +- :name: github_importer:github_import_stage_import_attachments + :worker_name: Gitlab::GithubImport::Stage::ImportAttachmentsWorker + :feature_category: :importers + :has_external_dependencies: false + :urgency: :low + :resource_boundary: :unknown + :weight: 1 + :idempotent: false + :tags: [] - :name: github_importer:github_import_stage_import_base_data :worker_name: Gitlab::GithubImport::Stage::ImportBaseDataWorker :feature_category: :importers diff --git a/app/workers/gitlab/github_import/advance_stage_worker.rb b/app/workers/gitlab/github_import/advance_stage_worker.rb index 70d18d8004cbf3517c21fc619d8fb56cd81d11a1..f65710e911058f16c951af3e0b14895612915e8a 100644 --- a/app/workers/gitlab/github_import/advance_stage_worker.rb +++ b/app/workers/gitlab/github_import/advance_stage_worker.rb @@ -25,6 +25,7 @@ class AdvanceStageWorker # rubocop:disable Scalability/IdempotentWorker issues_and_diff_notes: Stage::ImportIssuesAndDiffNotesWorker, issue_events: Stage::ImportIssueEventsWorker, notes: Stage::ImportNotesWorker, + attachments: Stage::ImportAttachmentsWorker, lfs_objects: Stage::ImportLfsObjectsWorker, finish: Stage::FinishImportWorker }.freeze diff --git a/app/workers/gitlab/github_import/import_release_attachments_worker.rb b/app/workers/gitlab/github_import/import_release_attachments_worker.rb new file mode 100644 index 0000000000000000000000000000000000000000..c6f45ec1d7c7378ee6debade27fa5b8b7053a3b3 --- /dev/null +++ b/app/workers/gitlab/github_import/import_release_attachments_worker.rb @@ -0,0 +1,21 @@ +# frozen_string_literal: true + +module Gitlab + module GithubImport + class ImportReleaseAttachmentsWorker # rubocop:disable Scalability/IdempotentWorker + include ObjectImporter + + def representation_class + Representation::ReleaseAttachments + end + + def importer_class + Importer::ReleaseAttachmentsImporter + end + + def object_type + :release_attachment + end + end + end +end diff --git a/app/workers/gitlab/github_import/stage/import_attachments_worker.rb b/app/workers/gitlab/github_import/stage/import_attachments_worker.rb new file mode 100644 index 0000000000000000000000000000000000000000..17cc14656e44115ab1cfe6b0900bdf97a210a7cf --- /dev/null +++ b/app/workers/gitlab/github_import/stage/import_attachments_worker.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +module Gitlab + module GithubImport + module Stage + class ImportAttachmentsWorker # rubocop:disable Scalability/IdempotentWorker + include ApplicationWorker + + data_consistency :always + + sidekiq_options retry: 5 + include GithubImport::Queue + include StageMethods + + # client - An instance of Gitlab::GithubImport::Client. + # project - An instance of Project. + def import(client, project) + return skip_to_next_stage(project) if feature_disabled?(project) + + waiters = importers.each_with_object({}) do |importer, hash| + waiter = start_importer(project, importer, client) + hash[waiter.key] = waiter.jobs_remaining + end + move_to_next_stage(project, waiters) + end + + private + + # For future issue/mr/milestone/etc attachments importers + def importers + [::Gitlab::GithubImport::Importer::ReleasesAttachmentsImporter] + end + + def start_importer(project, importer, client) + info(project.id, message: "starting importer", importer: importer.name) + importer.new(project, client).execute + end + + def skip_to_next_stage(project) + info(project.id, message: "skipping importer", importer: 'Attachments') + move_to_next_stage(project) + end + + def move_to_next_stage(project, waiters = {}) + AdvanceStageWorker.perform_async(project.id, waiters, :lfs_objects) + end + + def feature_disabled?(project) + Feature.disabled?(:github_importer_attachments_import, project.group, type: :ops) + end + end + end + end +end diff --git a/app/workers/gitlab/github_import/stage/import_notes_worker.rb b/app/workers/gitlab/github_import/stage/import_notes_worker.rb index 167b3e147a3c5565cf62f2aee9071da17a1d8ecc..b53e31ce40e97137fa2f1e2db7345e806d6754da 100644 --- a/app/workers/gitlab/github_import/stage/import_notes_worker.rb +++ b/app/workers/gitlab/github_import/stage/import_notes_worker.rb @@ -21,11 +21,7 @@ def import(client, project) hash[waiter.key] = waiter.jobs_remaining end - AdvanceStageWorker.perform_async( - project.id, - waiters, - :lfs_objects - ) + AdvanceStageWorker.perform_async(project.id, waiters, :attachments) end def importers(project) diff --git a/config/feature_flags/ops/github_importer_attachments_import.yml b/config/feature_flags/ops/github_importer_attachments_import.yml new file mode 100644 index 0000000000000000000000000000000000000000..0450634a0cfa42691f56fa1ebc6e2bb0ada0926c --- /dev/null +++ b/config/feature_flags/ops/github_importer_attachments_import.yml @@ -0,0 +1,8 @@ +--- +name: github_importer_attachments_import +introduced_by_url: +rollout_issue_url: +milestone: '15.3' +type: ops +group: group::import +default_enabled: false diff --git a/lib/bulk_imports/file_downloads/filename_fetch.rb b/lib/bulk_imports/file_downloads/filename_fetch.rb new file mode 100644 index 0000000000000000000000000000000000000000..b6bb0fd8c817906803b5f0887fc6fa5e49c542cc --- /dev/null +++ b/lib/bulk_imports/file_downloads/filename_fetch.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +module BulkImports + module FileDownloads + module FilenameFetch + REMOTE_FILENAME_PATTERN = %r{filename="(?<filename>[^"]+)"}.freeze + FILENAME_SIZE_LIMIT = 255 # chars before the extension + + def raise_error(message) + raise NotImplementedError + end + + private + + # Fetch the remote filename information from the request content-disposition header + # - Raises if the filename does not exist + # - If the filename is longer then 255 chars truncate it + # to be a total of 255 chars (with the extension) + # rubocop:disable Gitlab/ModuleWithInstanceVariables + def remote_filename + @remote_filename ||= begin + pattern = BulkImports::FileDownloads::FilenameFetch::REMOTE_FILENAME_PATTERN + name = response_headers['content-disposition'].to_s + .match(pattern) # matches the filename pattern + .then { |match| match&.named_captures || {} } # ensures the match is a hash + .fetch('filename') # fetches the 'filename' key or raise KeyError + + name = File.basename(name) # Ensures to remove path from the filename (../ for instance) + ensure_filename_size(name) # Ensures the filename is within the FILENAME_SIZE_LIMIT + end + rescue KeyError + raise_error 'Remote filename not provided in content-disposition header' + end + # rubocop:enable Gitlab/ModuleWithInstanceVariables + + def ensure_filename_size(filename) + limit = BulkImports::FileDownloads::FilenameFetch::FILENAME_SIZE_LIMIT + return filename if filename.length <= limit + + extname = File.extname(filename) + basename = File.basename(filename, extname)[0, limit] + "#{basename}#{extname}" + end + end + end +end diff --git a/lib/bulk_imports/file_downloads/validations.rb b/lib/bulk_imports/file_downloads/validations.rb new file mode 100644 index 0000000000000000000000000000000000000000..ae94267a6e840f8283e10359f2493155cddc1ce7 --- /dev/null +++ b/lib/bulk_imports/file_downloads/validations.rb @@ -0,0 +1,58 @@ +# frozen_string_literal: true + +module BulkImports + module FileDownloads + module Validations + def raise_error(message) + raise NotImplementedError + end + + def filepath + raise NotImplementedError + end + + def file_size_limit + raise NotImplementedError + end + + def response_headers + raise NotImplementedError + end + + private + + def validate_filepath + Gitlab::Utils.check_path_traversal!(filepath) + end + + def validate_content_type + content_type = response_headers['content-type'] + + raise_error('Invalid content type') if content_type.blank? || allowed_content_types.exclude?(content_type) + end + + def validate_symlink + return unless File.lstat(filepath).symlink? + + File.delete(filepath) + raise_error 'Invalid downloaded file' + end + + def validate_content_length + validate_size!(response_headers['content-length']) + end + + def validate_size!(size) + if size.blank? + raise_error 'Missing content-length header' + elsif size.to_i > file_size_limit + raise_error format( + "File size %{size} exceeds limit of %{limit}", + size: ActiveSupport::NumberHelper.number_to_human_size(size), + limit: ActiveSupport::NumberHelper.number_to_human_size(file_size_limit) + ) + end + end + end + end +end diff --git a/lib/gitlab/github_import/attachments_downloader.rb b/lib/gitlab/github_import/attachments_downloader.rb new file mode 100644 index 0000000000000000000000000000000000000000..0827aec9bea627d6c8e9fb1ea4ee0a4b8a8784e4 --- /dev/null +++ b/lib/gitlab/github_import/attachments_downloader.rb @@ -0,0 +1,61 @@ +# frozen_string_literal: true + +module Gitlab + module GithubImport + class AttachmentsDownloader + include ::Gitlab::ImportExport::CommandLineUtil + include ::BulkImports::FileDownloads::FilenameFetch + include ::BulkImports::FileDownloads::Validations + + DownloadError = Class.new(StandardError) + + FILENAME_SIZE_LIMIT = 255 # chars before the extension + DEFAULT_FILE_SIZE_LIMIT = 25.megabytes + TMP_DIR = File.join(Dir.tmpdir, 'github_attachments').freeze + + attr_reader :file_url, :filename, :file_size_limit + + def initialize(file_url, file_size_limit: DEFAULT_FILE_SIZE_LIMIT) + @file_url = file_url + @file_size_limit = file_size_limit + + filename = URI(file_url).path.split('/').last + @filename = ensure_filename_size(filename) + end + + def perform + validate_content_length + validate_filepath + + file = download + validate_symlink + file + end + + private + + def raise_error(message) + raise DownloadError, message + end + + def response_headers + @response_headers ||= + Gitlab::HTTP.perform_request(Net::HTTP::Head, file_url, {}).headers + end + + def download + file = File.open(filepath, 'wb') + Gitlab::HTTP.perform_request(Net::HTTP::Get, file_url, stream_body: true) { |batch| file.write(batch) } + file + end + + def filepath + @filepath ||= begin + dir = File.join(TMP_DIR, SecureRandom.uuid) + mkdir_p dir + File.join(dir, filename) + end + end + end + end +end diff --git a/lib/gitlab/github_import/importer/release_attachments_importer.rb b/lib/gitlab/github_import/importer/release_attachments_importer.rb new file mode 100644 index 0000000000000000000000000000000000000000..ad72907a66c5278c0216cf6ba1fbd9a02dbb7a1a --- /dev/null +++ b/lib/gitlab/github_import/importer/release_attachments_importer.rb @@ -0,0 +1,55 @@ +# frozen_string_literal: true + +module Gitlab + module GithubImport + module Importer + class ReleaseAttachmentsImporter + attr_reader :release_db_id, :release_description, :project + + # release - An instance of `ReleaseAttachments`. + # project - An instance of `Project`. + # client - An instance of `Gitlab::GithubImport::Client`. + def initialize(release_attachments, project, _client = nil) + @release_db_id = release_attachments.release_db_id + @release_description = release_attachments.description + @project = project + end + + def execute + attachment_urls = MarkdownText.fetch_attachment_urls(release_description) + new_description = attachment_urls.reduce(release_description) do |description, url| + new_url = download_attachment(url) + description.gsub(url, new_url) + end + + Release.find(release_db_id).update_column(:description, new_description) + end + + private + + # in: github attachment markdown url + # out: gitlab attachment markdown url + def download_attachment(markdown_url) + url = extract_url_from_markdown(markdown_url) + name_prefix = extract_name_from_markdown(markdown_url) + + file = ::Gitlab::GithubImport::AttachmentsDownloader.new(url).perform + uploader = UploadService.new(project, file, FileUploader).execute + "#{name_prefix}(#{uploader.to_h[:url]})" + end + + # in: "" + # out: https://user-images.githubusercontent.com/.. + def extract_url_from_markdown(text) + text.match(%r{https://.*\)$}).to_a[0].chop + end + + # in: "" + # out: ![image-icon] + def extract_name_from_markdown(text) + text.match(%r{^!?\[.*\]}).to_a[0] + end + end + end + end +end diff --git a/lib/gitlab/github_import/importer/releases_attachments_importer.rb b/lib/gitlab/github_import/importer/releases_attachments_importer.rb new file mode 100644 index 0000000000000000000000000000000000000000..7221c802d83904b660fc4ad39ce28fe78f0415d9 --- /dev/null +++ b/lib/gitlab/github_import/importer/releases_attachments_importer.rb @@ -0,0 +1,59 @@ +# frozen_string_literal: true + +module Gitlab + module GithubImport + module Importer + class ReleasesAttachmentsImporter + include ParallelScheduling + + BATCH_SIZE = 100 + + # The method that will be called for traversing through all the objects to + # import, yielding them to the supplied block. + def each_object_to_import + project.releases.select(:id, :description).each_batch(of: BATCH_SIZE, column: :id) do |batch| + batch.each do |release| + next if already_imported?(release) + + Gitlab::GithubImport::ObjectCounter.increment(project, object_type, :fetched) + + yield release + + # We mark the object as imported immediately so we don't end up + # scheduling it multiple times. + mark_as_imported(release) + end + end + end + + def representation_class + Representation::ReleaseAttachments + end + + def importer_class + ReleaseAttachmentsImporter + end + + def sidekiq_worker_class + ImportReleaseAttachmentsWorker + end + + def collection_method + :release_attachments + end + + def object_type + :release_attachment + end + + def id_for_already_imported_cache(release) + release.id + end + + def object_representation(object) + representation_class.from_db_record(object) + end + end + end + end +end diff --git a/lib/gitlab/github_import/markdown_text.rb b/lib/gitlab/github_import/markdown_text.rb index 692016bd0053172e70287f7908b1310d2b169462..bf2856bc77fe03a0406c2dfe9417657c69fa69f6 100644 --- a/lib/gitlab/github_import/markdown_text.rb +++ b/lib/gitlab/github_import/markdown_text.rb @@ -1,5 +1,8 @@ # frozen_string_literal: true +# This class includes overriding Kernel#format method +# what makes impossible to use it here +# rubocop:disable Style/FormatString module Gitlab module GithubImport class MarkdownText @@ -8,6 +11,21 @@ class MarkdownText ISSUE_REF_MATCHER = '%{github_url}/%{import_source}/issues' PULL_REF_MATCHER = '%{github_url}/%{import_source}/pull' + MEDIA_TYPES = %w[gif jpeg jpg mov mp4 png svg webm].freeze + DOC_TYPES = %w[ + csv docx fodg fodp fods fodt gz log md odf odg odp ods + odt pdf pptx tgz txt xls xlsx zip + ].freeze + ALL_TYPES = (MEDIA_TYPES + DOC_TYPES).freeze + + # On github.com we have base url for docs and CDN url for media. + # On github EE as far as we can know there is no CDN urls and media is placed on base url. + # To no escape the escaping symbol we use single quotes instead of double with interpolation. + # rubocop:disable Style/StringConcatenation + CDN_URL_MATCHER = '(!\[.+\]\(%{github_media_cdn}/\d+/(\w|-)+\.(' + MEDIA_TYPES.join('|') + ')\))' + BASE_URL_MATCHER = '(\[.+\]\(%{github_url}/.+/.+/files/\d+/.+\.(' + ALL_TYPES.join('|') + ')\))' + # rubocop:enable Style/StringConcatenation + class << self def format(*args) new(*args).to_s @@ -24,8 +42,20 @@ def convert_ref_links(text, project) .gsub(pull_ref_matcher, url_helpers.project_merge_requests_url(project)) end + def fetch_attachment_urls(text) + cdn_url_matcher = CDN_URL_MATCHER % { github_media_cdn: Regexp.escape(github_media_cdn) } + doc_url_matcher = BASE_URL_MATCHER % { github_url: Regexp.escape(github_url) } + + text.scan(Regexp.new(cdn_url_matcher)).map(&:first) + + text.scan(Regexp.new(doc_url_matcher)).map(&:first) + end + private + def github_media_cdn + 'https://user-images.githubusercontent.com' + end + # Returns github domain without slash in the end def github_url oauth_config = Gitlab::Auth::OAuth::Provider.config_for('github') || {} @@ -63,3 +93,4 @@ def format end end end +# rubocop:enable Style/FormatString diff --git a/lib/gitlab/github_import/parallel_scheduling.rb b/lib/gitlab/github_import/parallel_scheduling.rb index a8c18c74d246cf2054790a3f2529f76424006b05..bf5046de36c581fe7edd597831813fd56f657007 100644 --- a/lib/gitlab/github_import/parallel_scheduling.rb +++ b/lib/gitlab/github_import/parallel_scheduling.rb @@ -63,7 +63,7 @@ def execute # Imports all the objects in sequence in the current thread. def sequential_import each_object_to_import do |object| - repr = representation_class.from_api_response(object, additional_object_data) + repr = object_representation(object) importer_class.new(repr, project, client).execute end @@ -83,7 +83,7 @@ def spread_parallel_import import_arguments = [] each_object_to_import do |object| - repr = representation_class.from_api_response(object, additional_object_data) + repr = object_representation(object) import_arguments << [project.id, repr.to_hash, waiter.key] @@ -210,6 +210,10 @@ def additional_object_data {} end + def object_representation(object) + representation_class.from_api_response(object, additional_object_data) + end + def info(project_id, extra = {}) Logger.info(log_attributes(project_id, extra)) end diff --git a/lib/gitlab/github_import/representation/release_attachments.rb b/lib/gitlab/github_import/representation/release_attachments.rb new file mode 100644 index 0000000000000000000000000000000000000000..fd272be2405551980dbd5d7a070e1ab6d110d19a --- /dev/null +++ b/lib/gitlab/github_import/representation/release_attachments.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +# This class only partly represents Release record from DB and +# is used to connect ReleasesAttachmentsImporter with ReleaseAttachmentsImporter +# without modifying ObjectImporter a lot. +# Attachments are inside release's `description`. +module Gitlab + module GithubImport + module Representation + class ReleaseAttachments + include ToHash + include ExposeAttribute + + attr_reader :attributes + + expose_attribute :release_db_id, :description + + # Builds a event from a GitHub API response. + # + # release - An instance of `Release` model. + def self.from_db_record(release) + new( + release_db_id: release.id, + description: release.description + ) + end + + def self.from_json_hash(raw_hash) + new Representation.symbolize_hash(raw_hash) + end + + # attributes - A Hash containing the event details. The keys of this + # Hash (and any nested hashes) must be symbols. + def initialize(attributes) + @attributes = attributes + end + + def github_identifiers + { db_id: release_db_id } + end + end + end + end +end diff --git a/spec/lib/bulk_imports/file_downloads/filename_fetch_spec.rb b/spec/lib/bulk_imports/file_downloads/filename_fetch_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..a77eba06027937ae2a62ba159052c20af14de350 --- /dev/null +++ b/spec/lib/bulk_imports/file_downloads/filename_fetch_spec.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe BulkImports::FileDownloads::FilenameFetch do + let(:dummy_instance) { dummy_class.new } + let(:dummy_class) do + Class.new do + include BulkImports::FileDownloads::FilenameFetch + end + end + + describe '#raise_error' do + it { expect { dummy_instance.raise_error('text') }.to raise_exception(NotImplementedError) } + end +end diff --git a/spec/lib/bulk_imports/file_downloads/validations_spec.rb b/spec/lib/bulk_imports/file_downloads/validations_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..85f45c2a8f0be418ecb33f5780cfc83c71e0faff --- /dev/null +++ b/spec/lib/bulk_imports/file_downloads/validations_spec.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe BulkImports::FileDownloads::Validations do + let(:dummy_instance) { dummy_class.new } + let(:dummy_class) do + Class.new do + include BulkImports::FileDownloads::Validations + end + end + + describe '#raise_error' do + it { expect { dummy_instance.raise_error('text') }.to raise_exception(NotImplementedError) } + end + + describe '#filepath' do + it { expect { dummy_instance.filepath }.to raise_exception(NotImplementedError) } + end + + describe '#response_headers' do + it { expect { dummy_instance.response_headers }.to raise_exception(NotImplementedError) } + end + + describe '#file_size_limit' do + it { expect { dummy_instance.file_size_limit }.to raise_exception(NotImplementedError) } + end +end diff --git a/spec/lib/gitlab/github_import/attachments_downloader_spec.rb b/spec/lib/gitlab/github_import/attachments_downloader_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..bdce0bb1624e629d0e451339c084460856a8e324 --- /dev/null +++ b/spec/lib/gitlab/github_import/attachments_downloader_spec.rb @@ -0,0 +1,75 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::GithubImport::AttachmentsDownloader do + subject(:downloader) { described_class.new(file_url) } + + let_it_be(:file_url) { 'https://example.com/avatar.png' } + let_it_be(:content_type) { 'application/octet-stream' } + + let(:content_length) { 1000 } + let(:chunk_double) { instance_double(HTTParty::FragmentWithResponse, code: 200) } + let(:headers_double) do + instance_double( + HTTParty::Response, + code: 200, + success?: true, + parsed_response: {}, + headers: { + 'content-length' => content_length, + 'content-type' => content_type + } + ) + end + + before do + allow(Gitlab::HTTP).to receive(:perform_request) + .with(Net::HTTP::Get, file_url, stream_body: true).and_yield(chunk_double) + allow(Gitlab::HTTP).to receive(:perform_request) + .with(Net::HTTP::Head, file_url, {}).and_return(headers_double) + end + + context 'when file valid' do + it 'downloads file' do + file = downloader.perform + + expect(File.exist?(file.path)).to eq(true) + end + end + + context 'when filename is malicious' do + let_it_be(:file_url) { 'https://example.com/ava%2F..%2Ftar.png' } + + it 'raises expected exception' do + expect { downloader.perform }.to raise_exception( + Gitlab::Utils::PathTraversalAttackError, + 'Invalid path' + ) + end + end + + context 'when file size exceeds limit' do + let(:content_length) { 26.megabytes } + + it 'raises expected exception' do + expect { downloader.perform }.to raise_exception( + Gitlab::GithubImport::AttachmentsDownloader::DownloadError, + 'File size 26 MB exceeds limit of 25 MB' + ) + end + end + + context 'when file name length exceeds limit' do + before do + stub_const('BulkImports::FileDownloads::FilenameFetch::FILENAME_SIZE_LIMIT', 2) + end + + it 'chops filename' do + file = downloader.perform + + expect(File.exist?(file.path)).to eq(true) + expect(File.basename(file)).to eq('av.png') + end + end +end diff --git a/spec/lib/gitlab/github_import/importer/release_attachments_importer_spec.rb b/spec/lib/gitlab/github_import/importer/release_attachments_importer_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..81fe8c5fbc55796f63edf4d2c8ec09e1ef4bb50b --- /dev/null +++ b/spec/lib/gitlab/github_import/importer/release_attachments_importer_spec.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::GithubImport::Importer::ReleaseAttachmentsImporter do + subject(:importer) { described_class.new(release_attachments, project, client) } + + let_it_be(:project) { create(:project) } + + let(:client) { instance_double('Gitlab::GithubImport::Client') } + let(:release) { create(:release, project: project, description: description) } + let(:release_attachments) do + Gitlab::GithubImport::Representation::ReleaseAttachments + .from_json_hash(release_db_id: release.id, description: release.description) + end + + let(:doc_url) { 'https://github.com/nickname/public-test-repo/files/9020437/git-cheat-sheet.txt' } + let(:image_url) { 'https://user-images.githubusercontent.com/6833842/0cf366b61ef2.jpeg' } + let(:description) do + <<-TEXT.strip + Some text... + + [special-doc](#{doc_url}) +  + TEXT + end + + describe '#execute' do + let(:downloader_stub) { instance_double(Gitlab::GithubImport::AttachmentsDownloader) } + let(:tmp_stub_doc) { Tempfile.create('attachment_download_test.txt') } + let(:tmp_stub_image) { Tempfile.create('image.jpeg') } + + context 'when importing doc attachment' do + before do + allow(Gitlab::GithubImport::AttachmentsDownloader).to receive(:new).with(doc_url) + .and_return(downloader_stub) + allow(Gitlab::GithubImport::AttachmentsDownloader).to receive(:new).with(image_url) + .and_return(downloader_stub) + allow(downloader_stub).to receive(:perform).and_return(tmp_stub_doc, tmp_stub_image) + + allow(UploadService).to receive(:new) + .with(project, tmp_stub_doc, FileUploader).and_call_original + allow(UploadService).to receive(:new) + .with(project, tmp_stub_image, FileUploader).and_call_original + end + + it 'updates release description with new attachment url' do + importer.execute + + release.reload + expect(release.description).to start_with("Some text...\n\n [special-doc](/uploads/") + expect(release.description).to include(' + end + end + end +end diff --git a/spec/lib/gitlab/github_import/importer/releases_attachments_importer_spec.rb b/spec/lib/gitlab/github_import/importer/releases_attachments_importer_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..1aeb3462cd50bf1e8e4f09d29bb6653ddf66a214 --- /dev/null +++ b/spec/lib/gitlab/github_import/importer/releases_attachments_importer_spec.rb @@ -0,0 +1,74 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::GithubImport::Importer::ReleasesAttachmentsImporter do + subject { described_class.new(project, client) } + + let_it_be(:project) { create(:project) } + + let(:client) { instance_double(Gitlab::GithubImport::Client) } + + describe '#each_object_to_import', :clean_gitlab_redis_cache do + let!(:release_1) { create(:release, project: project) } + let!(:release_2) { create(:release, project: project) } + + it 'iterates each project release' do + list = [] + subject.each_object_to_import do |object| + list << object + end + expect(list).to contain_exactly(release_1, release_2) + end + + context 'when release is already processed' do + it "doesn't process this release" do + subject.mark_as_imported(release_1) + + list = [] + subject.each_object_to_import do |object| + list << object + end + expect(list).to contain_exactly(release_2) + end + end + end + + describe '#representation_class' do + it { expect(subject.representation_class).to eq(Gitlab::GithubImport::Representation::ReleaseAttachments) } + end + + describe '#importer_class' do + it { expect(subject.importer_class).to eq(Gitlab::GithubImport::Importer::ReleaseAttachmentsImporter) } + end + + describe '#sidekiq_worker_class' do + it { expect(subject.sidekiq_worker_class).to eq(Gitlab::GithubImport::ImportReleaseAttachmentsWorker) } + end + + describe '#collection_method' do + it { expect(subject.collection_method).to eq(:release_attachments) } + end + + describe '#object_type' do + it { expect(subject.object_type).to eq(:release_attachment) } + end + + describe '#id_for_already_imported_cache' do + let(:release) { build_stubbed(:release) } + + it { expect(subject.id_for_already_imported_cache(release)).to eq(release.id) } + end + + describe '#object_representation' do + let(:release) { build_stubbed(:release) } + + it 'returns release attachments representation' do + representation = subject.object_representation(release) + + expect(representation.class).to eq subject.representation_class + expect(representation.release_db_id).to eq release.id + expect(representation.description).to eq release.description + end + end +end diff --git a/spec/lib/gitlab/github_import/markdown_text_spec.rb b/spec/lib/gitlab/github_import/markdown_text_spec.rb index ad45469a4c34611dfa66acd88a63140564be070f..1da6bb064039593b966d113c77d65e2509f33c3e 100644 --- a/spec/lib/gitlab/github_import/markdown_text_spec.rb +++ b/spec/lib/gitlab/github_import/markdown_text_spec.rb @@ -60,6 +60,34 @@ end end + describe '.fetch_attachment_urls' do + let(:image_extension) { described_class::MEDIA_TYPES.sample } + let(:image_attachment) do + "" + end + + let(:doc_extension) { described_class::DOC_TYPES.sample } + let(:doc_attachment) do + "[some-doc](https://github.com/nickname/public-test-repo/"\ + "files/9020437/git-cheat-sheet.#{doc_extension})" + end + + let(:text) do + <<-TEXT + Comment with an attachment + #{image_attachment} + #{FFaker::Lorem.sentence} + #{doc_attachment} + TEXT + end + + it 'fetches attachment urls' do + expect(described_class.fetch_attachment_urls(text)) + .to contain_exactly(image_attachment, doc_attachment) + end + end + describe '#to_s' do it 'returns the text when the author was found' do author = double(:author, login: 'Alice') diff --git a/spec/lib/gitlab/github_import/parallel_scheduling_spec.rb b/spec/lib/gitlab/github_import/parallel_scheduling_spec.rb index 738e7c88d7d0ecddceb4ce8af8eaec85dbcff0fd..860bb60f3ed4e760484a1db9c75152140b968684 100644 --- a/spec/lib/gitlab/github_import/parallel_scheduling_spec.rb +++ b/spec/lib/gitlab/github_import/parallel_scheduling_spec.rb @@ -15,6 +15,10 @@ def importer_class Class end + def sidekiq_worker_class + Class + end + def object_type :dummy end diff --git a/spec/lib/gitlab/github_import/representation/release_attachments_spec.rb b/spec/lib/gitlab/github_import/representation/release_attachments_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..0ef9dad6a132644563557795ee86183636b65cb6 --- /dev/null +++ b/spec/lib/gitlab/github_import/representation/release_attachments_spec.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::GithubImport::Representation::ReleaseAttachments do + shared_examples 'a Release attachments data' do + it 'returns an instance of ReleaseAttachments' do + expect(representation).to be_an_instance_of(described_class) + end + + it 'includes release DB id' do + expect(representation.release_db_id).to eq 42 + end + + it 'includes release description' do + expect(representation.description).to eq 'Some text here..' + end + end + + describe '.from_db_record' do + let(:release) { build_stubbed(:release, id: 42, description: 'Some text here..') } + + it_behaves_like 'a Release attachments data' do + let(:representation) { described_class.from_db_record(release) } + end + end + + describe '.from_json_hash' do + it_behaves_like 'a Release attachments data' do + let(:hash) do + { + 'release_db_id' => 42, + 'description' => 'Some text here..' + } + end + + let(:representation) { described_class.from_json_hash(hash) } + end + end + + describe '#github_identifiers' do + it 'returns a hash with needed identifiers' do + release_id = rand(100) + representation = described_class.new(release_db_id: release_id, description: 'text') + + expect(representation.github_identifiers).to eq({ db_id: release_id }) + end + end +end diff --git a/spec/services/bulk_imports/file_download_service_spec.rb b/spec/services/bulk_imports/file_download_service_spec.rb index 81229cc8431c1575b2d5bb2424d29afbf51fc119..ec9cc719e24beab6b47c94f917dced4a2a5d3c68 100644 --- a/spec/services/bulk_imports/file_download_service_spec.rb +++ b/spec/services/bulk_imports/file_download_service_spec.rb @@ -277,7 +277,7 @@ let_it_be(:content_disposition) { 'filename="../../xxx.b"' } before do - stub_const("#{described_class}::FILENAME_SIZE_LIMIT", 1) + stub_const('BulkImports::FileDownloads::FilenameFetch::FILENAME_SIZE_LIMIT', 1) end it 'raises an error when the filename is not provided in the request header' do diff --git a/spec/workers/every_sidekiq_worker_spec.rb b/spec/workers/every_sidekiq_worker_spec.rb index b9d7998225be98fe8a51d6e1b273da50cbb32e07..b41f2f123fa737bc26196605f8fabd4b46cf93ee 100644 --- a/spec/workers/every_sidekiq_worker_spec.rb +++ b/spec/workers/every_sidekiq_worker_spec.rb @@ -257,6 +257,7 @@ 'GeoRepositoryDestroyWorker' => 3, 'GitGarbageCollectWorker' => false, 'Gitlab::GithubImport::AdvanceStageWorker' => 3, + 'Gitlab::GithubImport::ImportReleaseAttachmentsWorker' => 5, 'Gitlab::GithubImport::ImportDiffNoteWorker' => 5, 'Gitlab::GithubImport::ImportIssueWorker' => 5, 'Gitlab::GithubImport::ImportIssueEventWorker' => 5, @@ -271,6 +272,7 @@ 'Gitlab::GithubImport::Stage::ImportIssuesAndDiffNotesWorker' => 5, 'Gitlab::GithubImport::Stage::ImportIssueEventsWorker' => 5, 'Gitlab::GithubImport::Stage::ImportLfsObjectsWorker' => 5, + 'Gitlab::GithubImport::Stage::ImportAttachmentsWorker' => 5, 'Gitlab::GithubImport::Stage::ImportNotesWorker' => 5, 'Gitlab::GithubImport::Stage::ImportPullRequestsMergedByWorker' => 5, 'Gitlab::GithubImport::Stage::ImportPullRequestsReviewsWorker' => 5, diff --git a/spec/workers/gitlab/github_import/import_release_attachments_worker_spec.rb b/spec/workers/gitlab/github_import/import_release_attachments_worker_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..cd53c6ee9c0451f11f46bb0d93f072560c67f813 --- /dev/null +++ b/spec/workers/gitlab/github_import/import_release_attachments_worker_spec.rb @@ -0,0 +1,48 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::GithubImport::ImportReleaseAttachmentsWorker do + subject(:worker) { described_class.new } + + describe '#import' do + let(:import_state) { create(:import_state, :started) } + + let(:project) do + instance_double('Project', full_path: 'foo/bar', id: 1, import_state: import_state) + end + + let(:client) { instance_double('Gitlab::GithubImport::Client') } + let(:importer) { instance_double('Gitlab::GithubImport::Importer::ReleaseAttachmentsImporter') } + + let(:release_hash) do + { + 'release_db_id' => rand(100), + 'description' => <<-TEXT + Some text... + +  + TEXT + } + end + + it 'imports an issue event' do + expect(Gitlab::GithubImport::Importer::ReleaseAttachmentsImporter) + .to receive(:new) + .with( + an_instance_of(Gitlab::GithubImport::Representation::ReleaseAttachments), + project, + client + ) + .and_return(importer) + + expect(importer).to receive(:execute) + + expect(Gitlab::GithubImport::ObjectCounter) + .to receive(:increment) + .and_call_original + + worker.import(project, client, release_hash) + end + end +end diff --git a/spec/workers/gitlab/github_import/stage/import_attachments_worker_spec.rb b/spec/workers/gitlab/github_import/stage/import_attachments_worker_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..afed0debd90afb3d573d6bbdba9c31f56cd88eb4 --- /dev/null +++ b/spec/workers/gitlab/github_import/stage/import_attachments_worker_spec.rb @@ -0,0 +1,48 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::GithubImport::Stage::ImportAttachmentsWorker do + subject(:worker) { described_class.new } + + let(:project) { create(:project) } + let!(:group) { create(:group, projects: [project]) } + let(:feature_flag_state) { [group] } + + describe '#import' do + let(:importer) { instance_double('Gitlab::GithubImport::Importer::ReleasesAttachmentsImporter') } + let(:client) { instance_double('Gitlab::GithubImport::Client') } + + before do + stub_feature_flags(github_importer_attachments_import: feature_flag_state) + end + + it 'imports release attachments' do + waiter = Gitlab::JobWaiter.new(2, '123') + + expect(Gitlab::GithubImport::Importer::ReleasesAttachmentsImporter) + .to receive(:new) + .with(project, client) + .and_return(importer) + + expect(importer).to receive(:execute).and_return(waiter) + + expect(Gitlab::GithubImport::AdvanceStageWorker) + .to receive(:perform_async) + .with(project.id, { '123' => 2 }, :lfs_objects) + + worker.import(client, project) + end + + context 'when feature flag is disabled' do + let(:feature_flag_state) { false } + + it 'skips release attachments import and calls next stage' do + expect(Gitlab::GithubImport::Importer::ReleasesAttachmentsImporter).not_to receive(:new) + expect(Gitlab::GithubImport::AdvanceStageWorker).to receive(:perform_async).with(project.id, {}, :lfs_objects) + + worker.import(client, project) + end + end + end +end diff --git a/spec/workers/gitlab/github_import/stage/import_notes_worker_spec.rb b/spec/workers/gitlab/github_import/stage/import_notes_worker_spec.rb index f9f21e4dfa282cfda88080fc5aad441fe035da0e..adf20d24a7e84ec74a27f8285d418bd531fe2391 100644 --- a/spec/workers/gitlab/github_import/stage/import_notes_worker_spec.rb +++ b/spec/workers/gitlab/github_import/stage/import_notes_worker_spec.rb @@ -26,7 +26,7 @@ expect(Gitlab::GithubImport::AdvanceStageWorker) .to receive(:perform_async) - .with(project.id, { '123' => 2 }, :lfs_objects) + .with(project.id, { '123' => 2 }, :attachments) worker.import(client, project) end