From 14e68ac13830c99f81c4e0b505dba29cb7c746eb Mon Sep 17 00:00:00 2001
From: Ravi Kumar <rkumar@gitlab.com>
Date: Fri, 2 Feb 2024 14:09:41 +0000
Subject: [PATCH] Reindex wikis to adopt new structure of id

Reindex wikis to adopt the new id structure. The group wikis and project
wikis id will be prefixed by g_ and p_ respectively. After the
reindexing, all the wiki docs will have schema_version 2402.

Changelog: fixed
MR: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/143273
EE: true
---
 GITLAB_ELASTICSEARCH_INDEXER_VERSION          |   2 +-
 ...20240130215043_reindex_wikis_to_fix_id.yml |  10 ++
 .../20240130215043_reindex_wikis_to_fix_id.rb |  64 +++++++
 ee/lib/gitlab/elastic/indexer.rb              |   2 +-
 ...0130215043_reindex_wikis_to_fix_id_spec.rb | 161 ++++++++++++++++++
 ee/spec/lib/gitlab/elastic/indexer_spec.rb    |   8 +-
 6 files changed, 241 insertions(+), 6 deletions(-)
 create mode 100644 ee/elastic/docs/20240130215043_reindex_wikis_to_fix_id.yml
 create mode 100644 ee/elastic/migrate/20240130215043_reindex_wikis_to_fix_id.rb
 create mode 100644 ee/spec/elastic/migrate/20240130215043_reindex_wikis_to_fix_id_spec.rb

diff --git a/GITLAB_ELASTICSEARCH_INDEXER_VERSION b/GITLAB_ELASTICSEARCH_INDEXER_VERSION
index 6016e8addc4d0..f6cdf40983ffd 100644
--- a/GITLAB_ELASTICSEARCH_INDEXER_VERSION
+++ b/GITLAB_ELASTICSEARCH_INDEXER_VERSION
@@ -1 +1 @@
-4.6.0
+4.7.0
diff --git a/ee/elastic/docs/20240130215043_reindex_wikis_to_fix_id.yml b/ee/elastic/docs/20240130215043_reindex_wikis_to_fix_id.yml
new file mode 100644
index 0000000000000..2cca3a0049542
--- /dev/null
+++ b/ee/elastic/docs/20240130215043_reindex_wikis_to_fix_id.yml
@@ -0,0 +1,10 @@
+---
+name: ReindexWikisToFixId
+version: '20240130215043'
+description: Reindex all wikis to adopt new structure(g_ or p_) of id
+group: group::global search
+milestone: '16.9'
+introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/143273
+obsolete: false
+marked_obsolete_by_url:
+marked_obsolete_in_milestone:
diff --git a/ee/elastic/migrate/20240130215043_reindex_wikis_to_fix_id.rb b/ee/elastic/migrate/20240130215043_reindex_wikis_to_fix_id.rb
new file mode 100644
index 0000000000000..db50407b58cf6
--- /dev/null
+++ b/ee/elastic/migrate/20240130215043_reindex_wikis_to_fix_id.rb
@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+
+class ReindexWikisToFixId < Elastic::Migration
+  include Elastic::MigrationHelper
+
+  batched!
+  throttle_delay 5.minutes
+  retry_on_failure
+
+  ELASTIC_TIMEOUT = '5m'
+  MAX_BATCH_SIZE = 50
+  SCHEMA_VERSION = 24_02
+
+  def migrate
+    if completed?
+      log 'Migration Completed', total_remaining: 0
+      return
+    end
+
+    set_migration_state(batch_size: batch_size) if migration_state[:batch_size].blank?
+
+    remaining_rids_to_reindex.each do |rid|
+      m = rid.match(/wiki_(?<type>project|group)_(?<id>\d+)/)
+      ElasticWikiIndexerWorker.perform_in(rand(throttle_delay).seconds, m[:id], m[:type].capitalize, force: true)
+    end
+  end
+
+  def completed?
+    total_remaining = remaining_documents_count
+    set_migration_state(documents_remaining: total_remaining)
+    log('Checking if migration is finished', total_remaining: total_remaining)
+    total_remaining == 0
+  end
+
+  def batch_size
+    migration_state[:batch_size].presence || [get_number_of_shards(index_name: index_name), MAX_BATCH_SIZE].min
+  end
+
+  private
+
+  def remaining_rids_to_reindex
+    results = client.search(
+      index: index_name,
+      body: {
+        size: 0, query: query_with_old_schema_version, aggs: { rids: { terms: { size: batch_size, field: 'rid' } } }
+      }
+    )
+    rids_hist = results.dig('aggregations', 'rids', 'buckets') || []
+    rids_hist.pluck('key') # rubocop: disable CodeReuse/ActiveRecord -- no ActiveRecord relation
+  end
+
+  def remaining_documents_count
+    helper.refresh_index(index_name: index_name)
+    client.count(index: index_name, body: { query: query_with_old_schema_version })['count']
+  end
+
+  def query_with_old_schema_version
+    { range: { schema_version: { lt: SCHEMA_VERSION } } }
+  end
+
+  def index_name
+    Elastic::Latest::WikiConfig.index_name
+  end
+end
diff --git a/ee/lib/gitlab/elastic/indexer.rb b/ee/lib/gitlab/elastic/indexer.rb
index b2868743cfb03..ebef51832eb13 100644
--- a/ee/lib/gitlab/elastic/indexer.rb
+++ b/ee/lib/gitlab/elastic/indexer.rb
@@ -13,7 +13,7 @@ class Indexer
       Error = Class.new(StandardError)
       BLOB_SCHEMA_VERSION = 23_08
       COMMIT_SCHEMA_VERSION = 23_06
-      WIKI_SCHEMA_VERSION = 23_10
+      WIKI_SCHEMA_VERSION = 24_02
 
       class << self
         def indexer_version
diff --git a/ee/spec/elastic/migrate/20240130215043_reindex_wikis_to_fix_id_spec.rb b/ee/spec/elastic/migrate/20240130215043_reindex_wikis_to_fix_id_spec.rb
new file mode 100644
index 0000000000000..e7fca36415e3f
--- /dev/null
+++ b/ee/spec/elastic/migrate/20240130215043_reindex_wikis_to_fix_id_spec.rb
@@ -0,0 +1,161 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+require_relative 'migration_shared_examples'
+require File.expand_path('ee/elastic/migrate/20240130215043_reindex_wikis_to_fix_id.rb')
+
+RSpec.describe ReindexWikisToFixId, :elastic_clean, :sidekiq_inline, feature_category: :global_search do
+  let(:version) { 20240130215043 }
+  let(:migration) { described_class.new(version) }
+  let(:helper) { Gitlab::Elastic::Helper.new }
+  let(:client) { ::Gitlab::Search::Client.new }
+  let(:index_name) { Elastic::Latest::WikiConfig.index_name }
+  let_it_be(:project) { create(:project, :wiki_repo) }
+  let_it_be(:project2) { create(:project, :wiki_repo) }
+  let_it_be(:project3) { create(:project, :wiki_repo) }
+  let_it_be(:group) { create(:group) }
+  let_it_be(:group2) { create(:group) }
+  let_it_be(:group3) { create(:group) }
+  let_it_be(:group_wiki) { create(:group_wiki, group: group) }
+  let_it_be(:group_wiki2) { create(:group_wiki, group: group2) }
+  let_it_be(:group_wiki3) { create(:group_wiki, group: group3) }
+  let_it_be(:project_wiki) { create(:project_wiki, project: project) }
+  let_it_be(:project_wiki2) { create(:project_wiki, project: project2) }
+  let_it_be(:project_wiki3) { create(:project_wiki, project: project3) }
+
+  before do
+    stub_ee_application_setting(elasticsearch_search: true, elasticsearch_indexing: true)
+    allow(::Gitlab::CurrentSettings).to receive(:elasticsearch_indexes_project?).with(anything).and_return true
+    allow(::Gitlab::CurrentSettings).to receive(:elasticsearch_indexes_namespace?).with(anything).and_return true
+    allow(migration).to receive(:helper).and_return(helper)
+    set_elasticsearch_migration_to :reindex_wikis_to_fix_routing, including: false
+    allow(migration).to receive(:client).and_return(client)
+    [project_wiki, project_wiki2, project_wiki3, group_wiki, group_wiki2, group_wiki3].each do |wiki|
+      wiki.create_page('index_page', 'Bla bla term')
+      wiki.create_page('index_page2', 'Bla bla term')
+      wiki.index_wiki_blobs
+    end
+    ensure_elasticsearch_index! # ensure objects are indexed
+  end
+
+  describe 'migration_options' do
+    before do
+      set_old_schema_version_in_all_documents!
+    end
+
+    it 'has migration options set', :aggregate_failures do
+      batch_size = [migration.get_number_of_shards(index_name: index_name), described_class::MAX_BATCH_SIZE].min
+      expect(migration).to be_batched
+      expect(migration.batch_size).to eq batch_size
+      expect(migration.throttle_delay).to eq(5.minutes)
+      expect(migration).to be_retry_on_failure
+    end
+  end
+
+  describe '.migrate' do
+    context 'if migration is completed' do
+      it 'performs logging and does not call ElasticWikiIndexerWorker' do
+        expect(migration).to receive(:log).with("Setting migration_state to #{{ documents_remaining: 0 }.to_json}").once
+        expect(migration).to receive(:log).with('Checking if migration is finished', { total_remaining: 0 }).once
+        expect(migration).to receive(:log).with('Migration Completed', { total_remaining: 0 }).once
+        expect(ElasticWikiIndexerWorker).not_to receive(:perform_in)
+        migration.migrate
+      end
+    end
+
+    context 'if migration is not completed' do
+      let(:batch_size) { migration.batch_size }
+
+      before do
+        set_old_schema_version_in_all_documents!
+      end
+
+      it 'performs logging and calls ElasticWikiIndexerWorker' do
+        expect(migration).to receive(:log).with(
+          "Setting migration_state to #{{ documents_remaining: 2 * total_rids }.to_json}").once
+        expect(migration).to receive(:log).with("Setting migration_state to #{{ batch_size: batch_size }.to_json}").once
+        expect(migration).to receive(:log).with('Checking if migration is finished',
+          { total_remaining: 2 * total_rids }).once
+        delay = a_value_between(0, migration.throttle_delay.seconds)
+        expect(ElasticWikiIndexerWorker).to receive(:perform_in).exactly(batch_size).times.with(delay, anything,
+          anything, force: true)
+
+        migration.migrate
+      end
+    end
+  end
+
+  describe 'integration test' do
+    let(:batch_size) { 2 }
+
+    before do
+      set_old_schema_version_in_all_documents!
+      allow(migration).to receive(:batch_size).and_return(batch_size)
+      # Remove elasticsearch for project2 and group2
+      allow(::Gitlab::CurrentSettings).to receive(:elasticsearch_indexes_project?).with(project2).and_return false
+      allow(::Gitlab::CurrentSettings).to receive(:elasticsearch_indexes_namespace?).with(group2).and_return false
+      # Delete project3 and group3
+      project3.delete
+      group3.delete
+    end
+
+    it "migration will be completed and delete docs of the container that don't use elasticsearch or deleted" do
+      initial_rids_to_reindex = total_rids
+      expect(remaining_rids_to_reindex).to eq initial_rids_to_reindex
+      expect(migration).not_to be_completed
+      migration.migrate
+      expect(migration).not_to be_completed
+      expect(remaining_rids_to_reindex).to eq initial_rids_to_reindex - batch_size
+      10.times do
+        break if migration.completed?
+
+        migration.migrate
+        sleep 0.01
+      end
+      expect(migration).to be_completed
+      # Less project3(deleted), group3(deleted), project2(not used elasticsearch), group2(not used elasticsearch)
+      expect(total_rids).to eq initial_rids_to_reindex - 4
+    end
+  end
+
+  describe '.completed?' do
+    subject { migration.completed? }
+
+    context 'when all the documents have the new schema_version(2402)' do
+      # With the 4.7.0 GITLAB_ELASTICSEARCH_INDEXER_VERSION all the new wikis will have schema_version 2402
+      it 'returns true' do
+        is_expected.to be true
+      end
+    end
+
+    context 'when some items are missing new schema_version' do
+      before do
+        set_old_schema_version_in_all_documents!
+      end
+
+      it 'returns false' do
+        is_expected.to be false
+      end
+    end
+  end
+
+  def set_old_schema_version_in_all_documents!
+    client.update_by_query(index: index_name, refresh: true, conflicts: 'proceed',
+      body: { script: { lang: 'painless', source: 'ctx._source.schema_version = 2310' } }
+    )
+  end
+
+  def total_rids
+    helper.refresh_index(index_name: index_name)
+    client.search(
+      index: index_name, body: { size: 0, aggs: { rids: { terms: { field: 'rid' } } } }
+    ).dig('aggregations', 'rids', 'buckets').size
+  end
+
+  def remaining_rids_to_reindex
+    helper.refresh_index(index_name: index_name)
+    client.search(index: index_name,
+      body: { size: 0, query: { range: { schema_version: { lt: described_class::SCHEMA_VERSION } } },
+              aggs: { rids: { terms: { field: 'rid' } } } }).dig('aggregations', 'rids', 'buckets').size
+  end
+end
diff --git a/ee/spec/lib/gitlab/elastic/indexer_spec.rb b/ee/spec/lib/gitlab/elastic/indexer_spec.rb
index af1522a9f1343..7259729527842 100644
--- a/ee/spec/lib/gitlab/elastic/indexer_spec.rb
+++ b/ee/spec/lib/gitlab/elastic/indexer_spec.rb
@@ -505,7 +505,7 @@ def indexed_commits_for(term)
               '--skip-commits',
               "--wiki-access-level=#{project.wiki_access_level}",
               "--archived=false",
-              "--schema-version-wiki=2310",
+              "--schema-version-wiki=#{described_class::WIKI_SCHEMA_VERSION}",
               "--traversal-ids=#{project.namespace_ancestry}",
               "#{project.wiki.repository.disk_path}.git"
             ],
@@ -535,7 +535,7 @@ def indexed_commits_for(term)
             '--skip-commits',
             "--wiki-access-level=#{project.wiki_access_level}",
             "--archived=false",
-            "--schema-version-wiki=2310",
+            "--schema-version-wiki=#{described_class::WIKI_SCHEMA_VERSION}",
             "--traversal-ids=#{project.namespace_ancestry}",
             "#{project.wiki.repository.disk_path}.git"
           ],
@@ -595,7 +595,7 @@ def indexed_commits_for(term)
               '--blob-type=wiki_blob',
               '--skip-commits',
               "--wiki-access-level=#{project.wiki_access_level}",
-              "--schema-version-wiki=2310",
+              "--schema-version-wiki=#{described_class::WIKI_SCHEMA_VERSION}",
               "--traversal-ids=#{project.namespace_ancestry}",
               "#{project.wiki.repository.disk_path}.git"
             ],
@@ -639,7 +639,7 @@ def indexed_commits_for(term)
           '--blob-type=wiki_blob',
           '--skip-commits',
           "--wiki-access-level=#{group.wiki_access_level}",
-          "--schema-version-wiki=2310",
+          "--schema-version-wiki=#{described_class::WIKI_SCHEMA_VERSION}",
           "--traversal-ids=#{group.elastic_namespace_ancestry}",
           "#{group.wiki.repository.disk_path}.git"
         ], nil, hash_including('ELASTIC_CONNECTION_INFO' => elasticsearch_config.to_json, 'RAILS_ENV' => Rails.env)
-- 
GitLab