From 14e68ac13830c99f81c4e0b505dba29cb7c746eb Mon Sep 17 00:00:00 2001 From: Ravi Kumar <rkumar@gitlab.com> Date: Fri, 2 Feb 2024 14:09:41 +0000 Subject: [PATCH] Reindex wikis to adopt new structure of id Reindex wikis to adopt the new id structure. The group wikis and project wikis id will be prefixed by g_ and p_ respectively. After the reindexing, all the wiki docs will have schema_version 2402. Changelog: fixed MR: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/143273 EE: true --- GITLAB_ELASTICSEARCH_INDEXER_VERSION | 2 +- ...20240130215043_reindex_wikis_to_fix_id.yml | 10 ++ .../20240130215043_reindex_wikis_to_fix_id.rb | 64 +++++++ ee/lib/gitlab/elastic/indexer.rb | 2 +- ...0130215043_reindex_wikis_to_fix_id_spec.rb | 161 ++++++++++++++++++ ee/spec/lib/gitlab/elastic/indexer_spec.rb | 8 +- 6 files changed, 241 insertions(+), 6 deletions(-) create mode 100644 ee/elastic/docs/20240130215043_reindex_wikis_to_fix_id.yml create mode 100644 ee/elastic/migrate/20240130215043_reindex_wikis_to_fix_id.rb create mode 100644 ee/spec/elastic/migrate/20240130215043_reindex_wikis_to_fix_id_spec.rb diff --git a/GITLAB_ELASTICSEARCH_INDEXER_VERSION b/GITLAB_ELASTICSEARCH_INDEXER_VERSION index 6016e8addc4d0..f6cdf40983ffd 100644 --- a/GITLAB_ELASTICSEARCH_INDEXER_VERSION +++ b/GITLAB_ELASTICSEARCH_INDEXER_VERSION @@ -1 +1 @@ -4.6.0 +4.7.0 diff --git a/ee/elastic/docs/20240130215043_reindex_wikis_to_fix_id.yml b/ee/elastic/docs/20240130215043_reindex_wikis_to_fix_id.yml new file mode 100644 index 0000000000000..2cca3a0049542 --- /dev/null +++ b/ee/elastic/docs/20240130215043_reindex_wikis_to_fix_id.yml @@ -0,0 +1,10 @@ +--- +name: ReindexWikisToFixId +version: '20240130215043' +description: Reindex all wikis to adopt new structure(g_ or p_) of id +group: group::global search +milestone: '16.9' +introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/143273 +obsolete: false +marked_obsolete_by_url: +marked_obsolete_in_milestone: diff --git a/ee/elastic/migrate/20240130215043_reindex_wikis_to_fix_id.rb b/ee/elastic/migrate/20240130215043_reindex_wikis_to_fix_id.rb new file mode 100644 index 0000000000000..db50407b58cf6 --- /dev/null +++ b/ee/elastic/migrate/20240130215043_reindex_wikis_to_fix_id.rb @@ -0,0 +1,64 @@ +# frozen_string_literal: true + +class ReindexWikisToFixId < Elastic::Migration + include Elastic::MigrationHelper + + batched! + throttle_delay 5.minutes + retry_on_failure + + ELASTIC_TIMEOUT = '5m' + MAX_BATCH_SIZE = 50 + SCHEMA_VERSION = 24_02 + + def migrate + if completed? + log 'Migration Completed', total_remaining: 0 + return + end + + set_migration_state(batch_size: batch_size) if migration_state[:batch_size].blank? + + remaining_rids_to_reindex.each do |rid| + m = rid.match(/wiki_(?<type>project|group)_(?<id>\d+)/) + ElasticWikiIndexerWorker.perform_in(rand(throttle_delay).seconds, m[:id], m[:type].capitalize, force: true) + end + end + + def completed? + total_remaining = remaining_documents_count + set_migration_state(documents_remaining: total_remaining) + log('Checking if migration is finished', total_remaining: total_remaining) + total_remaining == 0 + end + + def batch_size + migration_state[:batch_size].presence || [get_number_of_shards(index_name: index_name), MAX_BATCH_SIZE].min + end + + private + + def remaining_rids_to_reindex + results = client.search( + index: index_name, + body: { + size: 0, query: query_with_old_schema_version, aggs: { rids: { terms: { size: batch_size, field: 'rid' } } } + } + ) + rids_hist = results.dig('aggregations', 'rids', 'buckets') || [] + rids_hist.pluck('key') # rubocop: disable CodeReuse/ActiveRecord -- no ActiveRecord relation + end + + def remaining_documents_count + helper.refresh_index(index_name: index_name) + client.count(index: index_name, body: { query: query_with_old_schema_version })['count'] + end + + def query_with_old_schema_version + { range: { schema_version: { lt: SCHEMA_VERSION } } } + end + + def index_name + Elastic::Latest::WikiConfig.index_name + end +end diff --git a/ee/lib/gitlab/elastic/indexer.rb b/ee/lib/gitlab/elastic/indexer.rb index b2868743cfb03..ebef51832eb13 100644 --- a/ee/lib/gitlab/elastic/indexer.rb +++ b/ee/lib/gitlab/elastic/indexer.rb @@ -13,7 +13,7 @@ class Indexer Error = Class.new(StandardError) BLOB_SCHEMA_VERSION = 23_08 COMMIT_SCHEMA_VERSION = 23_06 - WIKI_SCHEMA_VERSION = 23_10 + WIKI_SCHEMA_VERSION = 24_02 class << self def indexer_version diff --git a/ee/spec/elastic/migrate/20240130215043_reindex_wikis_to_fix_id_spec.rb b/ee/spec/elastic/migrate/20240130215043_reindex_wikis_to_fix_id_spec.rb new file mode 100644 index 0000000000000..e7fca36415e3f --- /dev/null +++ b/ee/spec/elastic/migrate/20240130215043_reindex_wikis_to_fix_id_spec.rb @@ -0,0 +1,161 @@ +# frozen_string_literal: true + +require 'spec_helper' +require_relative 'migration_shared_examples' +require File.expand_path('ee/elastic/migrate/20240130215043_reindex_wikis_to_fix_id.rb') + +RSpec.describe ReindexWikisToFixId, :elastic_clean, :sidekiq_inline, feature_category: :global_search do + let(:version) { 20240130215043 } + let(:migration) { described_class.new(version) } + let(:helper) { Gitlab::Elastic::Helper.new } + let(:client) { ::Gitlab::Search::Client.new } + let(:index_name) { Elastic::Latest::WikiConfig.index_name } + let_it_be(:project) { create(:project, :wiki_repo) } + let_it_be(:project2) { create(:project, :wiki_repo) } + let_it_be(:project3) { create(:project, :wiki_repo) } + let_it_be(:group) { create(:group) } + let_it_be(:group2) { create(:group) } + let_it_be(:group3) { create(:group) } + let_it_be(:group_wiki) { create(:group_wiki, group: group) } + let_it_be(:group_wiki2) { create(:group_wiki, group: group2) } + let_it_be(:group_wiki3) { create(:group_wiki, group: group3) } + let_it_be(:project_wiki) { create(:project_wiki, project: project) } + let_it_be(:project_wiki2) { create(:project_wiki, project: project2) } + let_it_be(:project_wiki3) { create(:project_wiki, project: project3) } + + before do + stub_ee_application_setting(elasticsearch_search: true, elasticsearch_indexing: true) + allow(::Gitlab::CurrentSettings).to receive(:elasticsearch_indexes_project?).with(anything).and_return true + allow(::Gitlab::CurrentSettings).to receive(:elasticsearch_indexes_namespace?).with(anything).and_return true + allow(migration).to receive(:helper).and_return(helper) + set_elasticsearch_migration_to :reindex_wikis_to_fix_routing, including: false + allow(migration).to receive(:client).and_return(client) + [project_wiki, project_wiki2, project_wiki3, group_wiki, group_wiki2, group_wiki3].each do |wiki| + wiki.create_page('index_page', 'Bla bla term') + wiki.create_page('index_page2', 'Bla bla term') + wiki.index_wiki_blobs + end + ensure_elasticsearch_index! # ensure objects are indexed + end + + describe 'migration_options' do + before do + set_old_schema_version_in_all_documents! + end + + it 'has migration options set', :aggregate_failures do + batch_size = [migration.get_number_of_shards(index_name: index_name), described_class::MAX_BATCH_SIZE].min + expect(migration).to be_batched + expect(migration.batch_size).to eq batch_size + expect(migration.throttle_delay).to eq(5.minutes) + expect(migration).to be_retry_on_failure + end + end + + describe '.migrate' do + context 'if migration is completed' do + it 'performs logging and does not call ElasticWikiIndexerWorker' do + expect(migration).to receive(:log).with("Setting migration_state to #{{ documents_remaining: 0 }.to_json}").once + expect(migration).to receive(:log).with('Checking if migration is finished', { total_remaining: 0 }).once + expect(migration).to receive(:log).with('Migration Completed', { total_remaining: 0 }).once + expect(ElasticWikiIndexerWorker).not_to receive(:perform_in) + migration.migrate + end + end + + context 'if migration is not completed' do + let(:batch_size) { migration.batch_size } + + before do + set_old_schema_version_in_all_documents! + end + + it 'performs logging and calls ElasticWikiIndexerWorker' do + expect(migration).to receive(:log).with( + "Setting migration_state to #{{ documents_remaining: 2 * total_rids }.to_json}").once + expect(migration).to receive(:log).with("Setting migration_state to #{{ batch_size: batch_size }.to_json}").once + expect(migration).to receive(:log).with('Checking if migration is finished', + { total_remaining: 2 * total_rids }).once + delay = a_value_between(0, migration.throttle_delay.seconds) + expect(ElasticWikiIndexerWorker).to receive(:perform_in).exactly(batch_size).times.with(delay, anything, + anything, force: true) + + migration.migrate + end + end + end + + describe 'integration test' do + let(:batch_size) { 2 } + + before do + set_old_schema_version_in_all_documents! + allow(migration).to receive(:batch_size).and_return(batch_size) + # Remove elasticsearch for project2 and group2 + allow(::Gitlab::CurrentSettings).to receive(:elasticsearch_indexes_project?).with(project2).and_return false + allow(::Gitlab::CurrentSettings).to receive(:elasticsearch_indexes_namespace?).with(group2).and_return false + # Delete project3 and group3 + project3.delete + group3.delete + end + + it "migration will be completed and delete docs of the container that don't use elasticsearch or deleted" do + initial_rids_to_reindex = total_rids + expect(remaining_rids_to_reindex).to eq initial_rids_to_reindex + expect(migration).not_to be_completed + migration.migrate + expect(migration).not_to be_completed + expect(remaining_rids_to_reindex).to eq initial_rids_to_reindex - batch_size + 10.times do + break if migration.completed? + + migration.migrate + sleep 0.01 + end + expect(migration).to be_completed + # Less project3(deleted), group3(deleted), project2(not used elasticsearch), group2(not used elasticsearch) + expect(total_rids).to eq initial_rids_to_reindex - 4 + end + end + + describe '.completed?' do + subject { migration.completed? } + + context 'when all the documents have the new schema_version(2402)' do + # With the 4.7.0 GITLAB_ELASTICSEARCH_INDEXER_VERSION all the new wikis will have schema_version 2402 + it 'returns true' do + is_expected.to be true + end + end + + context 'when some items are missing new schema_version' do + before do + set_old_schema_version_in_all_documents! + end + + it 'returns false' do + is_expected.to be false + end + end + end + + def set_old_schema_version_in_all_documents! + client.update_by_query(index: index_name, refresh: true, conflicts: 'proceed', + body: { script: { lang: 'painless', source: 'ctx._source.schema_version = 2310' } } + ) + end + + def total_rids + helper.refresh_index(index_name: index_name) + client.search( + index: index_name, body: { size: 0, aggs: { rids: { terms: { field: 'rid' } } } } + ).dig('aggregations', 'rids', 'buckets').size + end + + def remaining_rids_to_reindex + helper.refresh_index(index_name: index_name) + client.search(index: index_name, + body: { size: 0, query: { range: { schema_version: { lt: described_class::SCHEMA_VERSION } } }, + aggs: { rids: { terms: { field: 'rid' } } } }).dig('aggregations', 'rids', 'buckets').size + end +end diff --git a/ee/spec/lib/gitlab/elastic/indexer_spec.rb b/ee/spec/lib/gitlab/elastic/indexer_spec.rb index af1522a9f1343..7259729527842 100644 --- a/ee/spec/lib/gitlab/elastic/indexer_spec.rb +++ b/ee/spec/lib/gitlab/elastic/indexer_spec.rb @@ -505,7 +505,7 @@ def indexed_commits_for(term) '--skip-commits', "--wiki-access-level=#{project.wiki_access_level}", "--archived=false", - "--schema-version-wiki=2310", + "--schema-version-wiki=#{described_class::WIKI_SCHEMA_VERSION}", "--traversal-ids=#{project.namespace_ancestry}", "#{project.wiki.repository.disk_path}.git" ], @@ -535,7 +535,7 @@ def indexed_commits_for(term) '--skip-commits', "--wiki-access-level=#{project.wiki_access_level}", "--archived=false", - "--schema-version-wiki=2310", + "--schema-version-wiki=#{described_class::WIKI_SCHEMA_VERSION}", "--traversal-ids=#{project.namespace_ancestry}", "#{project.wiki.repository.disk_path}.git" ], @@ -595,7 +595,7 @@ def indexed_commits_for(term) '--blob-type=wiki_blob', '--skip-commits', "--wiki-access-level=#{project.wiki_access_level}", - "--schema-version-wiki=2310", + "--schema-version-wiki=#{described_class::WIKI_SCHEMA_VERSION}", "--traversal-ids=#{project.namespace_ancestry}", "#{project.wiki.repository.disk_path}.git" ], @@ -639,7 +639,7 @@ def indexed_commits_for(term) '--blob-type=wiki_blob', '--skip-commits', "--wiki-access-level=#{group.wiki_access_level}", - "--schema-version-wiki=2310", + "--schema-version-wiki=#{described_class::WIKI_SCHEMA_VERSION}", "--traversal-ids=#{group.elastic_namespace_ancestry}", "#{group.wiki.repository.disk_path}.git" ], nil, hash_including('ELASTIC_CONNECTION_INFO' => elasticsearch_config.to_json, 'RAILS_ENV' => Rails.env) -- GitLab