From f4fa78e183dea282f7f6ff5b2d14842f093d7ca0 Mon Sep 17 00:00:00 2001
From: Dmitry Gruzd <dgruzd@gitlab.com>
Date: Tue, 5 Dec 2023 11:53:50 +0000
Subject: [PATCH] Cache Zoekt responses

---
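This patch extracts the Zoekt HTTP call from `zoekt_search_and_wrap` into
`zoekt_search`, reworks `zoekt_extract_results` into
`zoekt_extract_result_pages` (which buckets line matches by zero-based page),
and caches the page buckets in Redis via the new `Search::Zoekt::Cache`,
guarded by the `zoekt_cache_search_responses` ops flag (default off).

A minimal sketch of the call flow as wired up in
`Gitlab::Zoekt::SearchResults#zoekt_search_and_wrap`; the query, user and
project IDs below are illustrative:

    # Illustrative values; the constructor arguments mirror the ones passed
    # from zoekt_search_and_wrap in this patch.
    cache = ::Search::Zoekt::Cache.new(
      'def initialize',
      current_user: user,
      page: 1,
      per_page: 20,            # DEFAULT_PER_PAGE
      project_ids: [1, 2, 3],
      max_per_page: 40         # DEFAULT_PER_PAGE * 2
    )

    # With the flag enabled, #fetch returns the cached [results, total_count]
    # pair when present. On a miss it yields a page limit of at least
    # MAX_PAGES (10), so a single Zoekt round trip can warm the cache for
    # follow-up pages, then stores each page under its own key for 5 minutes.
    results, total_count = cache.fetch do |page_limit|
      zoekt_search(query, per_page: 20, page_limit: page_limit, options: options)
    end
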
 .../ops/zoekt_cache_search_responses.yml      |   8 +
 ee/lib/gitlab/zoekt/search_results.rb         |  99 +++++++++----
 ee/lib/search/zoekt/cache.rb                  |  95 ++++++++++++
 .../lib/gitlab/zoekt/search_results_spec.rb   |  16 ++
 ee/spec/lib/search/zoekt/cache_spec.rb        | 139 ++++++++++++++++++
 5 files changed, 327 insertions(+), 30 deletions(-)
 create mode 100644 ee/config/feature_flags/ops/zoekt_cache_search_responses.yml
 create mode 100644 ee/lib/search/zoekt/cache.rb
 create mode 100644 ee/spec/lib/search/zoekt/cache_spec.rb

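For reference, the cache layout is one Redis key per result page, built by
`Search::Zoekt::Cache#cache_key`; the user id and fingerprint below are
fabricated:

    # Key shape: "cache:zoekt:{<user_id>}/<fingerprint>/<per_page>/<page>",
    # where the fingerprint is SHA256("<query>-<sorted project ids>").
    cache.cache_key(page: 1)
    # => "cache:zoekt:{42}/9f86d081884c7d65.../20/1"

The user id (0 for anonymous) is wrapped in braces to act as a Redis Cluster
hash tag: all of a user's cached pages hash to the same slot, which is what
lets `update_cache!` write every page in a single MULTI block.
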
diff --git a/ee/config/feature_flags/ops/zoekt_cache_search_responses.yml b/ee/config/feature_flags/ops/zoekt_cache_search_responses.yml
new file mode 100644
index 0000000000000..b67bacef64629
--- /dev/null
+++ b/ee/config/feature_flags/ops/zoekt_cache_search_responses.yml
@@ -0,0 +1,8 @@
+---
+name: zoekt_cache_search_responses
+introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/137337
+rollout_issue_url:
+milestone: '16.7'
+type: ops
+group: group::global search
+default_enabled: false
diff --git a/ee/lib/gitlab/zoekt/search_results.rb b/ee/lib/gitlab/zoekt/search_results.rb
index 91b64b1613887..faa7abca96b48 100644
--- a/ee/lib/gitlab/zoekt/search_results.rb
+++ b/ee/lib/gitlab/zoekt/search_results.rb
@@ -25,11 +25,11 @@ def initialize(current_user, query, limit_project_ids = nil, node_id:, order_by:
         @filters = filters
       end
 
-      def objects(scope, page: 1, per_page: DEFAULT_PER_PAGE, preload_method: nil)
+      def objects(_scope, page: 1, per_page: DEFAULT_PER_PAGE, preload_method: nil)
         blobs(page: page, per_page: per_page, preload_method: preload_method)
       end
 
-      def formatted_count(scope)
+      def formatted_count(_scope)
         limited_counter_with_delimiter(blobs_count)
       end
 
@@ -60,7 +60,7 @@ def parse_zoekt_search_result(result, project)
         )
       end
 
-      def aggregations(scope)
+      def aggregations(_scope)
         []
       end
 
@@ -114,64 +114,103 @@ def limited_counter_with_delimiter(count)
         end
       end
 
-      def search_as_found_blob(query, repositories, page:, per_page:, options:, preload_method:)
+      def search_as_found_blob(query, _repositories, page:, per_page:, options:, preload_method:)
         zoekt_search_and_wrap(query, page: page,
-                              per_page: per_page,
-                              options: options,
-                              preload_method: preload_method) do |result, project|
+          per_page: per_page,
+          options: options,
+          preload_method: preload_method) do |result, project|
           parse_zoekt_search_result(result, project)
         end
       end
 
-      def zoekt_extract_results(search_result, per_page:, offset:)
-        results = []
+      # Performs a Zoekt search and parses the response
+      #
+      # @param query [String] search query
+      # @param per_page [Integer] how many documents per page
+      # @param options [Hash] additional options
+      # @param page_limit [Integer] maximum number of pages we parse
+      # @return [Array(Hash, Integer)] the paged results and the total count
+      def zoekt_search(query, per_page:, options:, page_limit:)
+        response = ::Gitlab::Search::Zoekt::Client.search(
+          query,
+          num: ZOEKT_COUNT_LIMIT,
+          project_ids: options[:project_ids],
+          node_id: options[:node_id]
+        )
+
+        if response[:Error]
+          @blobs_count = 0
+          @error = response[:Error]
+          return [{}, 0]
+        end
+
+        total_count = response[:Result][:MatchCount].clamp(0, ZOEKT_COUNT_LIMIT)
+        results = zoekt_extract_result_pages(response, per_page: per_page, page_limit: page_limit)
 
+        [results, total_count]
+      end
+
+      # Extracts results from the Zoekt response
+      #
+      # @param response [Hash] JSON response converted to hash
+      # @param per_page [Integer] how many documents per page
+      # @param page_limit [Integer] maximum number of pages we parse
+      # @return [Hash{Integer => Array<Hash>}] results hash with pages as keys (zero-based)
+      def zoekt_extract_result_pages(response, per_page:, page_limit:)
+        results = {}
         i = 0
-        (search_result[:Result][:Files] || []).each do |r|
-          project_id = r[:Repository].to_i
 
-          r[:LineMatches].each do |match|
-            i += 1
+        files = response.dig(:Result, :Files) || []
+        files.each do |file|
+          project_id = file[:Repository].to_i
 
-            next if i <= offset
-            return results if i > offset + per_page
+          file[:LineMatches].each do |match|
+            current_page = i / per_page
+            return results if current_page == page_limit
 
-            results << {
+            results[current_page] ||= []
+            results[current_page] << {
               project_id: project_id,
               content: [match[:Before], match[:Line], match[:After]].compact.map { |l| Base64.decode64(l) }.join("\n"),
               line: match[:LineNumber],
-              path: r[:FileName]
+              path: file[:FileName]
             }
+
+            i += 1
           end
         end
 
         results
       end
 
-      def zoekt_search_and_wrap(query, page: 1, per_page: 20, options: {}, preload_method: nil, &blk)
-        search_result = ::Gitlab::Search::Zoekt::Client.search(
+      def zoekt_search_and_wrap(query, per_page:, page: 1, options: {}, preload_method: nil, &blk)
+        zoekt_cache = ::Search::Zoekt::Cache.new(
           query,
-          num: ZOEKT_COUNT_LIMIT,
+          current_user: current_user,
+          page: page,
+          per_page: per_page,
           project_ids: options[:project_ids],
-          node_id: options[:node_id]
+          max_per_page: DEFAULT_PER_PAGE * 2
         )
 
-        if search_result[:Error]
-          @blobs_count = 0
-          @error = search_result[:Error]
-          return Kaminari.paginate_array([])
+        search_results, total_count = zoekt_cache.fetch do |page_limit|
+          zoekt_search(query, per_page: per_page, page_limit: page_limit, options: options)
         end
 
-        total_count = search_result[:Result][:MatchCount].clamp(0, ZOEKT_COUNT_LIMIT)
-        offset = (page - 1) * per_page
-
-        results = zoekt_extract_results(search_result, per_page: per_page, offset: offset)
-        items, total_count = yield_each_zoekt_search_result(results, preload_method, total_count, &blk)
+        items, total_count = yield_each_zoekt_search_result(
+          search_results[page - 1],
+          preload_method,
+          total_count,
+          &blk
+        )
 
+        offset = (page - 1) * per_page
         Kaminari.paginate_array(items, total_count: total_count, limit: per_page, offset: offset)
       end
 
       def yield_each_zoekt_search_result(response, preload_method, total_count)
+        return [[], total_count] if total_count == 0
+
         project_ids = response.pluck(:project_id).uniq # rubocop:disable CodeReuse/ActiveRecord
         projects = Project.with_route.id_in(project_ids)
         projects = projects.public_send(preload_method) if preload_method # rubocop:disable GitlabSecurity/PublicSend
diff --git a/ee/lib/search/zoekt/cache.rb b/ee/lib/search/zoekt/cache.rb
new file mode 100644
index 0000000000000..ff614f8372ab0
--- /dev/null
+++ b/ee/lib/search/zoekt/cache.rb
@@ -0,0 +1,95 @@
+# frozen_string_literal: true
+
+module Search
+  module Zoekt
+    class Cache
+      MAX_PAGES = 10
+      EXPIRES_IN = 5.minutes
+
+      attr_reader :current_user, :query, :project_ids, :per_page, :current_page, :max_per_page
+
+      def initialize(query, current_user:, project_ids:, per_page:, page:, max_per_page:)
+        @query = query
+        @current_user = current_user
+        @project_ids = project_ids
+        @per_page = per_page
+        @current_page = page
+        @max_per_page = max_per_page
+      end
+
+      def enabled?
+        return false unless valid_arguments?
+
+        Feature.enabled?(:zoekt_cache_search_responses, current_user, type: :ops)
+      end
+
+      def fetch
+        return yield page_limit unless enabled?
+
+        search_results, total_count = read_cache
+
+        unless search_results
+          search_results, total_count = yield page_limit
+
+          update_cache!(search_results: search_results, total_count: total_count)
+        end
+
+        [search_results, total_count]
+      end
+
+      def cache_key(page: current_page)
+        user_id = current_user&.id || 0
+        # We need to use {user_id} as part of the key for Redis Cluster support
+        "cache:zoekt:{#{user_id}}/#{search_fingerprint}/#{per_page}/#{page}"
+      end
+
+      private
+
+      def valid_arguments?
+        project_ids.is_a?(Array) && project_ids.present? && per_page <= max_per_page
+      end
+
+      def with_redis(&block)
+        Gitlab::Redis::Cache.with(&block) # rubocop:disable CodeReuse/ActiveRecord -- this has nothing to do with AR
+      end
+
+      def page_limit
+        return [current_page, MAX_PAGES].max if enabled?
+
+        current_page
+      end
+
+      def search_fingerprint
+        OpenSSL::Digest.hexdigest('SHA256', "#{query}-#{project_ids.sort.join(',')}")
+      end
+
+      def read_cache
+        data = with_redis do |redis|
+          redis.get(cache_key)
+        end
+
+        return unless data
+
+        Marshal.load(data) # rubocop:disable Security/MarshalLoad -- We're loading data we saved below (similar to Rails.cache)
+      end
+
+      def update_cache!(search_results:, total_count:)
+        return unless search_results && total_count > 0
+
+        with_redis do |redis|
+          redis.multi do |pipeline|
+            (0..MAX_PAGES).each do |page_idx|
+              result = search_results[page_idx]
+              next unless result
+
+              cached_result = [{ page_idx => result }, total_count]
+
+              key = cache_key(page: page_idx + 1)
+              pipeline.set(key, Marshal.dump(cached_result), ex: EXPIRES_IN)
+            end
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/ee/spec/lib/gitlab/zoekt/search_results_spec.rb b/ee/spec/lib/gitlab/zoekt/search_results_spec.rb
index ad5d4309970e9..2536190c6e2fb 100644
--- a/ee/spec/lib/gitlab/zoekt/search_results_spec.rb
+++ b/ee/spec/lib/gitlab/zoekt/search_results_spec.rb
@@ -29,6 +29,22 @@
       expect(results.blobs_count).to eq 5
     end
 
+    it 'instantiates zoekt cache with correct arguments' do
+      query = 'use.*egex'
+      results = described_class.new(user, query, limit_project_ids, node_id: node_id)
+
+      expect(Search::Zoekt::Cache).to receive(:new).with(
+        query,
+        current_user: user,
+        page: 1,
+        per_page: described_class::DEFAULT_PER_PAGE,
+        project_ids: limit_project_ids,
+        max_per_page: described_class::DEFAULT_PER_PAGE * 2
+      ).and_call_original
+
+      results.objects('blobs')
+    end
+
     it 'correctly handles pagination' do
       per_page = 2
 
diff --git a/ee/spec/lib/search/zoekt/cache_spec.rb b/ee/spec/lib/search/zoekt/cache_spec.rb
new file mode 100644
index 0000000000000..14530d67613af
--- /dev/null
+++ b/ee/spec/lib/search/zoekt/cache_spec.rb
@@ -0,0 +1,139 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Search::Zoekt::Cache, :clean_gitlab_redis_cache, feature_category: :global_search do
+  let(:default_options) { { per_page: 20, max_per_page: 40 } }
+  let(:query) { 'foo' }
+  let_it_be(:user1) { build(:user, id: 1) }
+  let(:page) { 1 }
+  let(:project_ids) { [3, 2, 1] }
+  let(:search_results) { { 0 => { project_id: 1 }, 1 => { project_id: 2 }, 2 => { project_id: 3 } } }
+  let(:total_count) { 3 }
+  let(:response) { [search_results, total_count] }
+
+  subject(:cache) do
+    described_class.new(query, **default_options.merge(current_user: user1, project_ids: project_ids, page: page))
+  end
+
+  before do
+    stub_const("#{described_class.name}::MAX_PAGES", 2)
+  end
+
+  describe '#cache_key' do
+    let(:uniq_examples) do
+      [
+        { current_user: build(:user, id: 1), query: 'foo', project_ids: [3, 2], page: 1 },
+        { current_user: build(:user, id: 1), query: 'foo', project_ids: [3, 2], page: 1, per_page: 10 },
+        { current_user: build(:user, id: 1), query: 'bar', project_ids: [3, 2], page: 1 },
+        { current_user: build(:user, id: 1), query: 'foo', project_ids: [2], page: 1 },
+        { current_user: build(:user, id: 1), query: 'foo', project_ids: [3, 2], page: 2 },
+        { current_user: build(:user, id: 2), query: 'foo', project_ids: [3, 2], page: 1 },
+        { current_user: build(:user, id: nil), query: 'foo', project_ids: [3, 2], page: 1 }
+      ]
+    end
+
+    let(:duplicate_examples) do
+      [
+        { current_user: build(:user, id: 1), query: 'foo', project_ids: [1, 3, 2], page: 1, per_page: 20 },
+        { current_user: build(:user, id: 1), query: 'foo', project_ids: [2, 3, 1], page: 1, per_page: 20 }
+      ]
+    end
+
+    it 'returns unique cache keys for different search parameters' do
+      result = {}
+
+      uniq_examples.each do |e|
+        cache_key = described_class.new(e[:query], **default_options.merge(e.except(:query))).cache_key
+
+        result[cache_key] ||= []
+        result[cache_key] << e
+      end
+
+      result.each do |key, examples|
+        expect(examples.size).to eq(1), "#{key} has duplicate examples: #{examples}"
+      end
+    end
+
+    it 'returns the same cache key for equivalent searches' do
+      cache_keys = duplicate_examples.map do |e|
+        described_class.new(e[:query], **default_options.merge(e.except(:query))).cache_key
+      end
+
+      expect(cache_keys.uniq.size).to eq(1)
+    end
+  end
+
+  context 'when feature flag is disabled' do
+    before do
+      stub_feature_flags(zoekt_cache_search_responses: false)
+    end
+
+    describe '#enabled?' do
+      it 'returns false' do
+        expect(cache.enabled?).to be false
+      end
+    end
+
+    describe '#fetch' do
+      it 'does not use cache' do
+        expect(cache).not_to receive(:read_cache)
+        expect(response).to receive(:length)
+
+        data = cache.fetch do
+          response.length
+          response
+        end
+
+        expect(data).to eq(response)
+      end
+    end
+  end
+
+  context 'when feature flag is enabled' do
+    before do
+      stub_feature_flags(zoekt_cache_search_responses: true)
+    end
+
+    describe '#enabled?' do
+      it 'returns true' do
+        expect(cache.enabled?).to be true
+      end
+
+      context 'when project_ids is empty' do
+        let(:project_ids) { [] }
+
+        it 'returns false' do
+          expect(cache.enabled?).to be false
+        end
+      end
+    end
+
+    describe '#fetch' do
+      it 'reads and updates cache' do
+        expect(cache).to receive(:read_cache)
+        expect(cache).to receive(:update_cache!)
+
+        data = cache.fetch do |page_limit|
+          expect(page_limit).to eq(described_class::MAX_PAGES)
+          response
+        end
+
+        expect(data).to eq(response)
+      end
+
+      context 'when page is higher than the limit' do
+        let(:page) { 3 }
+
+        it 'sets the correct page limit' do
+          data = cache.fetch do |page_limit|
+            expect(page_limit).to eq(page)
+            response
+          end
+
+          expect(data).to eq(response)
+        end
+      end
+    end
+  end
+end
-- 
GitLab