Skip to content
代码片段 群组 项目
未验证 提交 3d09c222 编辑于 作者: Michael Kozono's avatar Michael Kozono 提交者: GitLab
浏览文件

Merge branch '417197-geo-orphaned-uploads-lead-to-sync-timed-out-after-28800' into 'master'

Fix: Geo: Orphaned uploads lead to "Sync timed out after 28800"

See merge request https://gitlab.com/gitlab-org/gitlab/-/merge_requests/142456



Merged-by: default avatarMichael Kozono <mkozono@gitlab.com>
Approved-by: default avatarIan Baum <ibaum@gitlab.com>
Approved-by: default avatarMichael Kozono <mkozono@gitlab.com>
Reviewed-by: default avatarMichael Kozono <mkozono@gitlab.com>
Reviewed-by: default avatarAakriti Gupta <agupta@gitlab.com>
Co-authored-by: default avatarKyle Yetter <kyetter@ebackpack.com>
No related branches found
No related tags found
无相关合并请求
......@@ -313,6 +313,23 @@ def ok_to_skip_download?
resource_exists? && skip_download_can_rely_on_verification?
end
# Downloading replicators need to verify the state of the resource and its association
# prior to attempting downloads to avoid triggering exceptions. Call this method
# to trigger validation and receive an error detail message string if any issues are present.
#
# Override this method in subclasses to provide more context-specific error details
#
# NOTE: we've kept this implementation pretty simple so far for the sake of
# detecting problems with replicating Upload and other blob data types.
# If we find later that we need to add this more extensively, or detect
# multiple errors (ala ActiveRecord models), we should revisit and refactor
# this implementation
#
# @return [String|nil] an error message String if validation fails, nil if validation succeeds
def predownload_validation_failure
nil
end
# @abstract
# @return [String] a checksum representing the data
def calculate_checksum
......
......@@ -17,5 +17,21 @@ def self.verification_feature_flag_enabled?
def carrierwave_uploader
model_record.retrieve_uploader
end
# Do not allow download unless the upload's owner `model` is present.
# Otherwise, attempting to build file paths will raise an exception.
def predownload_validation_failure
error_message = super
return error_message if error_message
upload = model_record
unless upload.model.present?
missing_model = "#{upload.model_type} with ID #{upload.model_id}"
return "The model which owns Upload with ID #{upload.id} is missing: #{missing_model}"
end
nil
end
end
end
......@@ -25,33 +25,37 @@ def initialize(replicator:)
def execute
try_obtain_lease do
start_time = Time.current
sync_successful = false
registry.start!
download_result = ::Gitlab::Geo::Replication::BlobDownloader.new(replicator: @replicator).execute
mark_as_synced = download_result.success
if mark_as_synced
registry.synced!
else
message = download_result.reason
error = download_result.extra_details&.delete(:error)
if error
Gitlab::ErrorTracking.track_exception(
error,
replicable_name: @replicator.replicable_name,
model_record_id: @replicator.model_record_id
)
begin
downloader = ::Gitlab::Geo::Replication::BlobDownloader.new(replicator: @replicator)
download_result = downloader.execute
sync_successful = process_download_result(download_result)
rescue StandardError => error
# if an exception raises here, it will be stuck in "started" state until
# the cleanup process forces it to failed much later.
# To avoid that, catch the error, mark sync as
# failed, and re-raise the exception here.
registry.failed!(
message: "Error while attempting to sync",
error: error
)
track_exception(error)
raise error
ensure
# make sure we're not stuck in a started state still
if registry.started?
sync_successful ? registry.synced! : registry.failed!(message: "Unknown system error")
end
registry.failed!(message: message, error: error, missing_on_primary: download_result.primary_missing_file)
end
log_download(mark_as_synced, download_result, start_time)
log_download(sync_successful, download_result, start_time)
!!mark_as_synced
sync_successful
end
end
......@@ -61,6 +65,22 @@ def registry
@registry ||= @replicator.registry
end
def process_download_result(download_result)
if download_result.success
registry.synced!
return true
end
message = download_result.reason
error = download_result.extra_details&.delete(:error)
track_exception(error) if error
registry.failed!(message: message, error: error, missing_on_primary: download_result.primary_missing_file)
false
end
def log_download(mark_as_synced, download_result, start_time)
metadata = {
replicable_name: @replicator.replicable_name,
......@@ -77,6 +97,14 @@ def log_download(mark_as_synced, download_result, start_time)
log_info("Blob download", metadata)
end
def track_exception(exception)
Gitlab::ErrorTracking.track_exception(
exception,
replicable_name: @replicator.replicable_name,
model_record_id: @replicator.model_record_id
)
end
def lease_key
@lease_key ||= "#{self.class.name.underscore}:#{@replicator.replicable_name}:#{@replicator.model_record.id}"
end
......
......@@ -86,6 +86,11 @@ def check_preconditions
return failure_result(reason: 'Skipping transfer as there is no Primary node to download from')
end
predownload_error_message = replicator.predownload_validation_failure
if predownload_error_message.present?
return failure_result(reason: "Skipping transfer due to validation error: #{predownload_error_message}")
end
if file_storage?
if File.directory?(absolute_path)
return failure_result(reason: 'Skipping transfer as destination exist and is a directory')
......
......@@ -6,4 +6,29 @@
let(:model_record) { create(:upload, :with_file) }
include_examples 'a blob replicator'
describe "predownload_validation_failure" do
context "when upload is valid and has an associated model/owner" do
it "returns nil" do
expect(replicator.predownload_validation_failure).to be_nil
end
end
context "when upload is orphaned from its own model association" do
before do
# break the model association on the upload
model_record.model_id = -1
model_record.save!(validate: false)
model_record.reload
end
it "returns an error string" do
upload = model_record
missing_model = "#{upload.model_type} with ID #{upload.model_id}"
expect(replicator.predownload_validation_failure).to eq(
"The model which owns Upload with ID #{upload.id} is missing: #{missing_model}"
)
end
end
end
end
......@@ -22,24 +22,23 @@
describe "#execute" do
let(:downloader) { double(:downloader) }
before do
expect(downloader).to receive(:execute).and_return(result)
expect(::Gitlab::Geo::Replication::BlobDownloader).to receive(:new).and_return(downloader)
end
context 'when exception is raised' do
context 'when the downloader result object contains an error' do
let(:error) { StandardError.new('Error') }
let(:result) do
double(
:result,
success: false,
primary_missing_file:
false,
primary_missing_file: false,
bytes_downloaded: 0,
reason: 'foo',
extra_details: { error: error })
end
before do
expect(downloader).to receive(:execute).and_return(result)
expect(::Gitlab::Geo::Replication::BlobDownloader).to receive(:new).and_return(downloader)
end
it 'tracks exception' do
expect(Gitlab::ErrorTracking).to receive(:track_exception).with(
error,
......@@ -70,7 +69,64 @@
end
end
context 'when the replicator fails pre-download validation' do
before do
expect(replicator).to receive(:predownload_validation_failure).and_return(
"This upload is busted"
)
end
it "creates the registry" do
expect do
subject.execute
end.to change { registry_class.count }.by(1)
end
it "sets sync state to failed" do
subject.execute
expect(registry_class.last).to be_failed
end
it "captures the error details in the registry record" do
subject.execute
expect(registry_class.last.last_sync_failure).to include("This upload is busted")
end
end
context 'when exception is raised by the downloader' do
let(:error) { StandardError.new('Some data inconsistency') }
before do
expect(downloader).to receive(:execute).and_raise(error)
expect(::Gitlab::Geo::Replication::BlobDownloader).to receive(:new).and_return(downloader)
end
it 'marks the replicator registry record as failed' do
expect { subject.execute }.to raise_error(error)
registry = replicator.registry
expect(registry).to be_failed
expect(registry.last_sync_failure).to include("Error while attempting to sync")
end
it 'reports the exception' do
expect(Gitlab::ErrorTracking).to receive(:track_exception).with(
error,
replicable_name: replicator.replicable_name,
model_record_id: replicator.model_record_id
)
expect { subject.execute }.to raise_error(error)
end
end
context "when it can obtain the exclusive lease" do
before do
expect(downloader).to receive(:execute).and_return(result)
expect(::Gitlab::Geo::Replication::BlobDownloader).to receive(:new).and_return(downloader)
end
context "when the registry record does not exist" do
context "when the downloader returns success" do
let(:result) { double(:result, success: true, primary_missing_file: false, bytes_downloaded: 123, reason: nil, extra_details: nil) }
......
0% 加载中 .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册