From f2e2ead8e4988288d617f9f7399e80db3e7c21d7 Mon Sep 17 00:00:00 2001
From: Omar Qunsul <oqunsul@gitlab.com>
Date: Wed, 23 Aug 2023 10:33:02 +0000
Subject: [PATCH] Detecting duplicate btree indexes

Adding tests to detect duplicate btree indexes. This will
help us avoid adding unnecessary indexes

Changelog: other
---
 spec/db/schema_spec.rb                        |  43 +++
 .../helpers/database/duplicate_indexes.rb     |  77 +++++
 .../helpers/database/duplicate_indexes.yml    | 264 ++++++++++++++++++
 .../database/duplicate_indexes_spec.rb        | 108 +++++++
 4 files changed, 492 insertions(+)
 create mode 100644 spec/support/helpers/database/duplicate_indexes.rb
 create mode 100644 spec/support/helpers/database/duplicate_indexes.yml
 create mode 100644 spec/support_specs/database/duplicate_indexes_spec.rb

diff --git a/spec/db/schema_spec.rb b/spec/db/schema_spec.rb
index a7c00d2be326..fbd9c4d32cca 100644
--- a/spec/db/schema_spec.rb
+++ b/spec/db/schema_spec.rb
@@ -187,6 +187,44 @@
               expect(ignored_columns).to match_array(ignored_columns - foreign_keys)
             end
           end
+
+          context 'btree indexes' do
+            it 'only has existing indexes in the ignored duplicate indexes file duplicate_indexes.yml' do
+              table_ignored_indexes = (ignored_indexes[table] || {}).to_a.flatten.uniq
+              indexes_by_name = indexes.map(&:name)
+              expect(indexes_by_name).to include(*table_ignored_indexes)
+            end
+
+            it 'does not have any duplicated indexes' do
+              duplicate_indexes = Database::DuplicateIndexes.new(table, indexes).duplicate_indexes
+              expect(duplicate_indexes).to be_an_instance_of Hash
+
+              table_ignored_indexes = ignored_indexes[table] || {}
+
+              # We ignore all the indexes that are explicitly ignored in duplicate_indexes.yml
+              duplicate_indexes.each do |index, matching_indexes|
+                duplicate_indexes[index] = matching_indexes.reject do |matching_index|
+                  table_ignored_indexes.fetch(index.name, []).include?(matching_index.name) ||
+                    table_ignored_indexes.fetch(matching_index.name, []).include?(index.name)
+                end
+
+                duplicate_indexes.delete(index) if duplicate_indexes[index].empty?
+              end
+
+              if duplicate_indexes.present?
+                btree_index = duplicate_indexes.each_key.first
+                matching_indexes = duplicate_indexes[btree_index]
+
+                error_message = <<~ERROR
+                    Duplicate index: #{btree_index.name} with #{matching_indexes.map(&:name)}
+                    #{btree_index.name} : #{btree_index.columns.inspect}
+                    #{matching_indexes.first.name} : #{matching_indexes.first.columns.inspect}.
+                    Consider dropping the indexes #{matching_indexes.map(&:name).join(', ')}
+                ERROR
+                raise error_message
+              end
+            end
+          end
         end
       end
     end
@@ -409,4 +447,9 @@ def ignored_limit_enums(model)
   def ignored_jsonb_columns(model)
     IGNORED_JSONB_COLUMNS.fetch(model, [])
   end
+
+  def ignored_indexes
+    duplicate_indexes_file_path = "spec/support/helpers/database/duplicate_indexes.yml"
+    @ignored_indexes ||= YAML.load_file(Rails.root.join(duplicate_indexes_file_path)) || {}
+  end
 end
diff --git a/spec/support/helpers/database/duplicate_indexes.rb b/spec/support/helpers/database/duplicate_indexes.rb
new file mode 100644
index 000000000000..0ad8ee1e0554
--- /dev/null
+++ b/spec/support/helpers/database/duplicate_indexes.rb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+module Database
+  class DuplicateIndexes
+    attr_accessor :table_name, :indexes
+
+    BTREE_INDEX_STRUCT = Struct.new(:name, :columns, :unique)
+
+    def initialize(table_name, indexes)
+      @table_name = table_name
+      @indexes = indexes
+    end
+
+    def duplicate_indexes
+      ret = {}
+
+      btree_indexes.each do |btree_index|
+        matching_indexes = matching_indexes_for(btree_index)
+        next unless matching_indexes.any?
+
+        ret[btree_index] = matching_indexes
+      end
+
+      ret
+    end
+
+    def self.btree_index_struct(index)
+      BTREE_INDEX_STRUCT.new(
+        index.name,
+        Array.wrap(index.columns).map do |column|
+          # https://apidock.com/rails/ActiveRecord/ConnectionAdapters/PostgreSQL/SchemaStatements/indexes
+          # asc is the default order
+          column_order = index.orders.is_a?(Symbol) ? index.orders : (index.orders[column] || :asc)
+          { name: column, order: column_order }
+        end,
+        index.unique
+      )
+    end
+
+    private
+
+    def btree_indexes
+      return @btree_indexes if @btree_indexes
+
+      # We only scan non-conditional btree indexes
+      @btree_indexes = indexes.select do |index|
+        index.using == :btree && index.where.nil? && index.opclasses.blank?
+      end
+
+      @btree_indexes = @btree_indexes.map { |index| self.class.btree_index_struct(index) }
+    end
+
+    def matching_indexes_for(btree_index)
+      all_matching_indexes = []
+
+      # When comparing btree_index with other_index, btree_index is the index that can have more
+      # columns than the other_index.
+      (1..btree_index.columns.length).each do |subset_length|
+        columns = btree_index.columns.first(subset_length)
+        matching_indexes = btree_indexes.reject { |other_index| other_index == btree_index }.select do |other_index|
+          other_index.columns == columns
+        end
+
+        # For now we ignore other indexes that are UNIQUE and match a subset of
+        # the btree_index columns, as UNIQUE indexes are still needed to enforce uniqueness
+        # constraints on a subset of the columns.
+        matching_indexes = matching_indexes.reject do |other_index|
+          (other_index.unique && (other_index.columns.length < btree_index.columns.length))
+        end
+
+        all_matching_indexes += matching_indexes
+      end
+
+      all_matching_indexes
+    end
+  end
+end
diff --git a/spec/support/helpers/database/duplicate_indexes.yml b/spec/support/helpers/database/duplicate_indexes.yml
new file mode 100644
index 000000000000..c90cfa62ae0e
--- /dev/null
+++ b/spec/support/helpers/database/duplicate_indexes.yml
@@ -0,0 +1,264 @@
+---
+# It maps table_name to {index1: array_of_duplicate_indexes, index2: array_of_duplicate_indexes, ... }
+abuse_reports:
+  idx_abuse_reports_user_id_status_and_category:
+  - index_abuse_reports_on_user_id
+alert_management_http_integrations:
+  index_http_integrations_on_project_and_endpoint:
+  - index_alert_management_http_integrations_on_project_id
+analytics_cycle_analytics_group_stages:
+  index_group_stages_on_group_id_group_value_stream_id_and_name:
+  - index_analytics_ca_group_stages_on_group_id
+approval_project_rules_users:
+  index_approval_project_rules_users_1:
+  - index_approval_project_rules_users_on_approval_project_rule_id
+approvals:
+  index_approvals_on_merge_request_id_and_created_at:
+  - index_approvals_on_merge_request_id
+board_group_recent_visits:
+  index_board_group_recent_visits_on_user_group_and_board:
+  - index_board_group_recent_visits_on_user_id
+board_project_recent_visits:
+  index_board_project_recent_visits_on_user_project_and_board:
+  - index_board_project_recent_visits_on_user_id
+board_user_preferences:
+  index_board_user_preferences_on_user_id_and_board_id:
+  - index_board_user_preferences_on_user_id
+boards_epic_board_recent_visits:
+  index_epic_board_recent_visits_on_user_group_and_board:
+  - index_boards_epic_board_recent_visits_on_user_id
+boards_epic_user_preferences:
+  index_boards_epic_user_preferences_on_board_user_epic_unique:
+  - index_boards_epic_user_preferences_on_board_id
+bulk_import_batch_trackers:
+  i_bulk_import_trackers_id_batch_number:
+  - index_bulk_import_batch_trackers_on_tracker_id
+bulk_import_export_batches:
+  i_bulk_import_export_batches_id_batch_number:
+  - index_bulk_import_export_batches_on_export_id
+ci_job_artifacts:
+  index_ci_job_artifacts_on_id_project_id_and_created_at:
+  - index_ci_job_artifacts_on_project_id
+  index_ci_job_artifacts_on_id_project_id_and_file_type:
+  - index_ci_job_artifacts_on_project_id
+  index_ci_job_artifacts_on_project_id_and_id:
+  - index_ci_job_artifacts_on_project_id
+ci_pipeline_artifacts:
+  index_ci_pipeline_artifacts_on_pipeline_id_and_file_type:
+  - index_ci_pipeline_artifacts_on_pipeline_id
+ci_stages:
+  index_ci_stages_on_pipeline_id_and_name:
+  - index_ci_stages_on_pipeline_id
+  index_ci_stages_on_pipeline_id_and_position:
+  - index_ci_stages_on_pipeline_id
+dast_site_tokens:
+  index_dast_site_token_on_project_id_and_url:
+  - index_dast_site_tokens_on_project_id
+design_management_designs:
+  index_design_management_designs_on_iid_and_project_id:
+  - index_design_management_designs_on_project_id
+design_management_designs_versions:
+  design_management_designs_versions_uniqueness:
+  - index_design_management_designs_versions_on_design_id
+error_tracking_errors:
+  index_et_errors_on_project_id_and_status_and_id:
+  - index_error_tracking_errors_on_project_id
+  index_et_errors_on_project_id_and_status_events_count_id_desc:
+  - index_error_tracking_errors_on_project_id
+  index_et_errors_on_project_id_and_status_first_seen_at_id_desc:
+  - index_error_tracking_errors_on_project_id
+  index_et_errors_on_project_id_and_status_last_seen_at_id_desc:
+  - index_error_tracking_errors_on_project_id
+geo_node_namespace_links:
+  index_geo_node_namespace_links_on_geo_node_id_and_namespace_id:
+  - index_geo_node_namespace_links_on_geo_node_id
+in_product_marketing_emails:
+  index_in_product_marketing_emails_on_user_campaign:
+  - index_in_product_marketing_emails_on_user_id
+  index_in_product_marketing_emails_on_user_track_series:
+  - index_in_product_marketing_emails_on_user_id
+incident_management_oncall_participants:
+  index_inc_mgmnt_oncall_participants_on_user_id_and_rotation_id:
+  - index_inc_mgmnt_oncall_participants_on_oncall_user_id
+incident_management_oncall_schedules:
+  index_im_oncall_schedules_on_project_id_and_iid:
+  - index_incident_management_oncall_schedules_on_project_id
+instance_audit_events_streaming_headers:
+  idx_instance_external_audit_event_destination_id_key_uniq:
+  - idx_headers_instance_external_audit_event_destination_id
+issue_links:
+  index_issue_links_on_source_id_and_target_id:
+  - index_issue_links_on_source_id
+issues:
+  index_issues_on_author_id_and_id_and_created_at:
+  - index_issues_on_author_id
+jira_connect_subscriptions:
+  idx_jira_connect_subscriptions_on_installation_id_namespace_id:
+  - idx_jira_connect_subscriptions_on_installation_id
+list_user_preferences:
+  index_list_user_preferences_on_user_id_and_list_id:
+  - index_list_user_preferences_on_user_id
+member_tasks:
+  index_member_tasks_on_member_id_and_project_id:
+  - index_member_tasks_on_member_id
+members:
+  index_members_on_member_namespace_id_compound:
+  - index_members_on_member_namespace_id
+merge_request_assignees:
+  index_merge_request_assignees_on_merge_request_id_and_user_id:
+  - index_merge_request_assignees_on_merge_request_id
+merge_request_metrics:
+  index_mr_metrics_on_target_project_id_merged_at_nulls_last:
+  - index_merge_request_metrics_on_target_project_id
+merge_requests:
+  index_merge_requests_on_author_id_and_created_at:
+  - index_merge_requests_on_author_id
+  index_merge_requests_on_author_id_and_id:
+  - index_merge_requests_on_author_id
+  index_merge_requests_on_author_id_and_target_project_id:
+  - index_merge_requests_on_author_id
+ml_candidate_params:
+  index_ml_candidate_params_on_candidate_id_on_name:
+  - index_ml_candidate_params_on_candidate_id
+ml_candidates:
+  index_ml_candidates_on_project_id_on_internal_id:
+  - index_ml_candidates_on_project_id
+ml_model_versions:
+  index_ml_model_versions_on_project_id_and_model_id_and_version:
+  - index_ml_model_versions_on_project_id
+ml_models:
+  index_ml_models_on_project_id_and_name:
+  - index_ml_models_on_project_id
+p_ci_runner_machine_builds:
+  index_p_ci_runner_machine_builds_on_runner_machine_id:
+  - index_ci_runner_machine_builds_on_runner_machine_id
+packages_debian_group_distributions:
+  uniq_pkgs_debian_group_distributions_group_id_and_codename:
+  - index_packages_debian_group_distributions_on_group_id
+  uniq_pkgs_debian_group_distributions_group_id_and_suite:
+  - index_packages_debian_group_distributions_on_group_id
+packages_debian_project_distributions:
+  uniq_pkgs_debian_project_distributions_project_id_and_codename:
+  - index_packages_debian_project_distributions_on_project_id
+  uniq_pkgs_debian_project_distributions_project_id_and_suite:
+  - index_packages_debian_project_distributions_on_project_id
+packages_tags:
+  index_packages_tags_on_package_id_and_updated_at:
+  - index_packages_tags_on_package_id
+pages_deployments:
+  index_pages_deployments_unique_path_prefix_by_project:
+  - index_pages_deployments_on_project_id
+pages_domains:
+  index_pages_domains_on_project_id_and_enabled_until:
+  - index_pages_domains_on_project_id
+  index_pages_domains_on_verified_at_and_enabled_until:
+  - index_pages_domains_on_verified_at
+personal_access_tokens:
+  index_pat_on_user_id_and_expires_at:
+  - index_personal_access_tokens_on_user_id
+pm_affected_packages:
+  i_affected_packages_unique_for_upsert:
+  - index_pm_affected_packages_on_pm_advisory_id
+pm_package_version_licenses:
+  i_pm_package_version_licenses_join_ids:
+  - index_pm_package_version_licenses_on_pm_package_version_id
+pm_package_versions:
+  i_pm_package_versions_on_package_id_and_version:
+  - index_pm_package_versions_on_pm_package_id
+project_compliance_standards_adherence:
+  u_project_compliance_standards_adherence_for_reporting:
+  - index_project_compliance_standards_adherence_on_project_id
+project_relation_exports:
+  index_project_export_job_relation:
+  - index_project_relation_exports_on_project_export_job_id
+project_repositories:
+  index_project_repositories_on_shard_id_and_project_id:
+  - index_project_repositories_on_shard_id
+project_topics:
+  index_project_topics_on_project_id_and_topic_id:
+  - index_project_topics_on_project_id
+projects:
+  index_projects_api_path_id_desc:
+  - index_on_projects_path
+  index_projects_on_path_and_id:
+  - index_on_projects_path
+protected_environments:
+  index_protected_environments_on_project_id_and_name:
+  - index_protected_environments_on_project_id
+protected_tags:
+  index_protected_tags_on_project_id_and_name:
+  - index_protected_tags_on_project_id
+related_epic_links:
+  index_related_epic_links_on_source_id_and_target_id:
+  - index_related_epic_links_on_source_id
+requirements_management_test_reports:
+  idx_test_reports_on_issue_id_created_at_and_id:
+  - index_requirements_management_test_reports_on_issue_id
+sbom_component_versions:
+  index_sbom_component_versions_on_component_id_and_version:
+  - index_sbom_component_versions_on_component_id
+sbom_occurrences:
+  index_sbom_occurrences_for_input_file_path_search:
+  - index_sbom_occurrences_on_project_id_component_id
+  - index_sbom_occurrences_on_project_id
+  idx_sbom_occurrences_on_project_id_and_source_id:
+  - index_sbom_occurrences_on_project_id
+  index_sbom_occurrences_on_project_id_and_id:
+  - index_sbom_occurrences_on_project_id
+  index_sbom_occurrences_on_project_id_component_id:
+  - index_sbom_occurrences_on_project_id
+  index_sbom_occurrences_on_project_id_and_component_id_and_id:
+  - index_sbom_occurrences_on_project_id_component_id
+  - index_sbom_occurrences_on_project_id
+  index_sbom_occurrences_on_project_id_and_package_manager:
+  - index_sbom_occurrences_on_project_id
+scan_result_policies:
+  index_scan_result_policies_on_position_in_configuration:
+  - index_scan_result_policies_on_policy_configuration_id
+search_namespace_index_assignments:
+  index_search_namespace_index_assignments_uniqueness_index_type:
+  - index_search_namespace_index_assignments_on_namespace_id
+  index_search_namespace_index_assignments_uniqueness_on_index_id:
+  - index_search_namespace_index_assignments_on_namespace_id
+sprints:
+  sequence_is_unique_per_iterations_cadence_id:
+  - index_sprints_iterations_cadence_id
+taggings:
+  taggings_idx:
+  - index_taggings_on_tag_id
+term_agreements:
+  term_agreements_unique_index:
+  - index_term_agreements_on_user_id
+todos:
+  index_todos_on_author_id_and_created_at:
+  - index_todos_on_author_id
+user_callouts:
+  index_user_callouts_on_user_id_and_feature_name:
+  - index_user_callouts_on_user_id
+users:
+  index_users_on_state_and_user_type:
+  - index_users_on_state
+vulnerabilities:
+  index_vulnerabilities_project_id_state_severity_default_branch:
+  - index_vulnerabilities_on_project_id_and_state_and_severity
+vulnerability_external_issue_links:
+  idx_vulnerability_ext_issue_links_on_vulne_id_and_ext_issue:
+  - index_vulnerability_external_issue_links_on_vulnerability_id
+vulnerability_finding_links:
+  finding_link_name_url_idx:
+  - finding_links_on_vulnerability_occurrence_id
+vulnerability_finding_signatures:
+  idx_vuln_signatures_uniqueness_signature_sha:
+  - index_vulnerability_finding_signatures_on_finding_id
+vulnerability_flags:
+  index_vulnerability_flags_on_unique_columns:
+  - index_vulnerability_flags_on_vulnerability_occurrence_id
+web_hook_logs:
+  index_web_hook_logs_on_web_hook_id_and_created_at:
+  - index_web_hook_logs_part_on_web_hook_id
+web_hooks:
+  index_web_hooks_on_project_id_recent_failures:
+  - index_web_hooks_on_project_id
+work_item_hierarchy_restrictions:
+  index_work_item_hierarchy_restrictions_on_parent_and_child:
+  - index_work_item_hierarchy_restrictions_on_parent_type_id
diff --git a/spec/support_specs/database/duplicate_indexes_spec.rb b/spec/support_specs/database/duplicate_indexes_spec.rb
new file mode 100644
index 000000000000..bbcb80f0593d
--- /dev/null
+++ b/spec/support_specs/database/duplicate_indexes_spec.rb
@@ -0,0 +1,108 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Database::DuplicateIndexes, feature_category: :database do
+  index_class = ActiveRecord::ConnectionAdapters::IndexDefinition
+  let(:default_index_options) { { using: :btree, orders: {}, unique: false, opclasses: {}, where: nil } }
+
+  let(:table_name) { 'foobar' }
+  let(:index1) { instance_double(index_class, default_index_options.merge(name: 'index1', columns: %w[user_id])) }
+  let(:index1_copy) { instance_double(index_class, default_index_options.merge(name: 'index1b', columns: %w[user_id])) }
+  let(:index2) { instance_double(index_class, default_index_options.merge(name: 'index2', columns: %w[project_id])) }
+  let(:index3) do
+    instance_double(index_class, default_index_options.merge(name: 'index3', columns: %w[user_id project_id]))
+  end
+
+  let(:index3_inverse) do
+    instance_double(index_class, default_index_options.merge(name: 'index3_inverse', columns: %w[project_id user_id]))
+  end
+
+  let(:index1_unique) do
+    instance_double(index_class, default_index_options.merge(name: 'index1_unique', columns: %w[user_id], unique: true))
+  end
+
+  let(:index1_desc) do
+    instance_double(
+      index_class,
+      default_index_options.merge(name: 'index1', columns: %w[user_id], orders: { user_id: 'desc' })
+    )
+  end
+
+  let(:index3_with_where) do
+    instance_double(
+      index_class,
+      default_index_options.merge(name: 'index3_with_where', columns: %w[user_id project_id], where: "id > 100")
+    )
+  end
+
+  subject(:duplicate_indexes) do
+    described_class.new(table_name, indexes).duplicate_indexes
+  end
+
+  context 'when there are no duplicate indexes' do
+    let(:indexes) { [index1, index2] }
+
+    it { expect(duplicate_indexes).to be_empty }
+  end
+
+  context 'when overlapping indexes' do
+    let(:indexes) { [index1, index3] }
+
+    it 'detects a duplicate index between index1 and index3' do
+      expected_duplicate_indexes = { index_struct(index3) => [index_struct(index1)] }
+
+      expect(duplicate_indexes).to eq(expected_duplicate_indexes)
+    end
+  end
+
+  context 'when the indexes have the inverse order of columns' do
+    let(:indexes) { [index3, index3_inverse] }
+
+    it 'does not detect duplicate indexes between index3 and index3_inverse' do
+      expect(duplicate_indexes).to eq({})
+    end
+  end
+
+# For now we ignore other indexes that are UNIQUE and match a subset of
+# the btree_index columns, as UNIQUE indexes are still needed to enforce uniqueness
+# constraints on a subset of the columns.
+  context 'when the index with matching sub-columns is unique' do
+    let(:indexes) { [index3, index1_unique] }
+
+    it 'does not detect duplicate indexes between index3 and index1_unique' do
+      expect(duplicate_indexes).to eq({})
+    end
+  end
+
+  context 'when one of the indexes is a conditional index' do
+    let(:indexes) { [index3, index3_with_where] }
+
+    it 'does not detect duplicate indexes between index3 and index3_with_where' do
+      expect(duplicate_indexes).to eq({})
+    end
+  end
+
+  context 'when identical indexes' do
+    let(:indexes) { [index1, index1_copy] }
+
+    it 'detects a duplicate index between index1 and index1_copy' do
+      expected_duplicate_indexes = {
+        index_struct(index1) => [index_struct(index1_copy)],
+        index_struct(index1_copy) => [index_struct(index1)]
+      }
+
+      expect(duplicate_indexes).to eq(expected_duplicate_indexes)
+    end
+  end
+
+  context 'when indexes have the same columns but with different order' do
+    let(:indexes) { [index1, index1_desc] }
+
+    it { expect(duplicate_indexes).to be_empty }
+  end
+
+  def index_struct(index)
+    Database::DuplicateIndexes.btree_index_struct(index)
+  end
+end
-- 
GitLab