From c92ed80147f7d0de40bf28429f5feebb65b26f1f Mon Sep 17 00:00:00 2001
From: Alexandru Croitor <acroitor@gitlab.com>
Date: Mon, 28 Aug 2023 13:11:59 +0000
Subject: [PATCH] Add model and DB tables to support vertex embeddings for docs

This MR adds the database table vertex_gitlab_docs to store Vertex
embeddings for GitLab documentation.

This also adds the text_embeddings method to the VertexAi::Client
class to call the Vertex AI endpoint for building text embeddings.

re https://gitlab.com/gitlab-org/gitlab/-/issues/420939
---
 .../embedding/vertex/gitlab_documentation.rb  |  35 ++++++
 ee/db/embedding/docs/vertex_gitlab_docs.yml   |  10 ++
 ...0230821103900_create_vertex_gitlab_docs.rb |  20 +++
 ..._index_on_version_to_vertex_gitlab_docs.rb |  15 +++
 ...embedding_is_null_to_vertex_gitlab_docs.rb |  15 +++
 .../schema_migrations/20230821103900          |   1 +
 .../schema_migrations/20230821113000          |   1 +
 .../schema_migrations/20230821113500          |   1 +
 ee/db/embedding/structure.sql                 |  31 +++++
 ee/lib/gitlab/llm/vertex_ai/client.rb         |  14 ++-
 .../model_configurations/text_embeddings.rb   |  23 ++++
 ee/spec/factories/embedding/gitlab_docs.rb    |  18 +++
 .../lib/gitlab/llm/vertex_ai/client_spec.rb   |   6 +
 .../text_embeddings_spec.rb                   |  35 ++++++
 .../vertex/gitlab_documentation_spec.rb       | 118 ++++++++++++++++++
 15 files changed, 342 insertions(+), 1 deletion(-)
 create mode 100644 ee/app/models/embedding/vertex/gitlab_documentation.rb
 create mode 100644 ee/db/embedding/docs/vertex_gitlab_docs.yml
 create mode 100644 ee/db/embedding/migrate/20230821103900_create_vertex_gitlab_docs.rb
 create mode 100644 ee/db/embedding/post_migrate/20230821113000_add_index_on_version_to_vertex_gitlab_docs.rb
 create mode 100644 ee/db/embedding/post_migrate/20230821113500_add_index_on_version_where_embedding_is_null_to_vertex_gitlab_docs.rb
 create mode 100644 ee/db/embedding/schema_migrations/20230821103900
 create mode 100644 ee/db/embedding/schema_migrations/20230821113000
 create mode 100644 ee/db/embedding/schema_migrations/20230821113500
 create mode 100644 ee/lib/gitlab/llm/vertex_ai/model_configurations/text_embeddings.rb
 create mode 100644 ee/spec/factories/embedding/gitlab_docs.rb
 create mode 100644 ee/spec/lib/gitlab/llm/vertex_ai/model_configurations/text_embeddings_spec.rb
 create mode 100644 ee/spec/models/embedding/vertex/gitlab_documentation_spec.rb

diff --git a/ee/app/models/embedding/vertex/gitlab_documentation.rb b/ee/app/models/embedding/vertex/gitlab_documentation.rb
new file mode 100644
index 0000000000000..46025865a6f99
--- /dev/null
+++ b/ee/app/models/embedding/vertex/gitlab_documentation.rb
@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+
+module Embedding
+  module Vertex
+    # GitLab documentation embeddings built with Vertex AI (table: vertex_gitlab_docs).
+    class GitlabDocumentation < ::Embedding::ApplicationRecord
+      self.table_name = 'vertex_gitlab_docs'
+      has_neighbors :embedding
+
+      scope :current, -> { where(version: get_current_version) }
+      scope :previous, -> { where("version < ?", get_current_version) }
+      scope :nil_embeddings_for_version, ->(version) { where(version: version, embedding: nil) }
+
+      scope :neighbor_for, ->(embedding, limit:) do
+        nearest_neighbors(:embedding, embedding, distance: 'cosine').limit(limit)
+      end
+
+      class << self
+        def current_version_cache_key
+          'vertex_gitlab_documentation:version:current'
+        end
+
+        # Current version read from Redis shared state; 0 when the key is not set.
+        def get_current_version
+          Gitlab::Redis::SharedState.with { |redis| redis.get(current_version_cache_key) }.to_i
+        end
+
+        # Stores +version+ (coerced with #to_i) as the current version.
+        def set_current_version!(version)
+          Gitlab::Redis::SharedState.with { |redis| redis.set(current_version_cache_key, version.to_i) }
+        end
+      end
+    end
+  end
+end
diff --git a/ee/db/embedding/docs/vertex_gitlab_docs.yml b/ee/db/embedding/docs/vertex_gitlab_docs.yml
new file mode 100644
index 0000000000000..8186de51add62
--- /dev/null
+++ b/ee/db/embedding/docs/vertex_gitlab_docs.yml
@@ -0,0 +1,10 @@
+---
+table_name: vertex_gitlab_docs
+classes:
+- Embedding::Vertex::GitlabDocumentation
+feature_categories:
+  - duo_chat
+description: GitLab documentation embeddings built with VertexAI
+introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/129917
+milestone: '16.4'
+gitlab_schema: gitlab_embedding
\ No newline at end of file
diff --git a/ee/db/embedding/migrate/20230821103900_create_vertex_gitlab_docs.rb b/ee/db/embedding/migrate/20230821103900_create_vertex_gitlab_docs.rb
new file mode 100644
index 0000000000000..5ccb6176e379e
--- /dev/null
+++ b/ee/db/embedding/migrate/20230821103900_create_vertex_gitlab_docs.rb
@@ -0,0 +1,20 @@
+# frozen_string_literal: true
+
+class CreateVertexGitlabDocs < Gitlab::Database::Migration[2.1]
+  enable_lock_retries!
+
+  def up
+    create_table :vertex_gitlab_docs do |t| # table behind Embedding::Vertex::GitlabDocumentation
+      t.timestamps_with_timezone null: false
+      t.integer :version, default: 0, null: false
+      t.vector :embedding, limit: 768 # vector(768); nullable, NULL rows get their own partial index
+      t.text :url, null: false, limit: 2048 # limit shows up as a char_length CHECK constraint
+      t.text :content, null: false, limit: 32768 # limit shows up as a char_length CHECK constraint
+      t.jsonb :metadata, null: false
+    end
+  end
+
+  def down
+    drop_table :vertex_gitlab_docs
+  end
+end
diff --git a/ee/db/embedding/post_migrate/20230821113000_add_index_on_version_to_vertex_gitlab_docs.rb b/ee/db/embedding/post_migrate/20230821113000_add_index_on_version_to_vertex_gitlab_docs.rb
new file mode 100644
index 0000000000000..f298846e99660
--- /dev/null
+++ b/ee/db/embedding/post_migrate/20230821113000_add_index_on_version_to_vertex_gitlab_docs.rb
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+class AddIndexOnVersionToVertexGitlabDocs < Gitlab::Database::Migration[2.1]
+  disable_ddl_transaction! # concurrent index operations cannot run inside a transaction
+
+  TABLE_NAME = :vertex_gitlab_docs
+  INDEX_NAME = 'index_vertex_gitlab_docs_on_version'
+
+  def up
+    add_concurrent_index(TABLE_NAME, :version, name: INDEX_NAME)
+  end
+
+  def down
+    remove_concurrent_index_by_name(TABLE_NAME, INDEX_NAME)
+  end
+end
diff --git a/ee/db/embedding/post_migrate/20230821113500_add_index_on_version_where_embedding_is_null_to_vertex_gitlab_docs.rb b/ee/db/embedding/post_migrate/20230821113500_add_index_on_version_where_embedding_is_null_to_vertex_gitlab_docs.rb
new file mode 100644
index 0000000000000..0eb8e0ae3160a
--- /dev/null
+++ b/ee/db/embedding/post_migrate/20230821113500_add_index_on_version_where_embedding_is_null_to_vertex_gitlab_docs.rb
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+class AddIndexOnVersionWhereEmbeddingIsNullToVertexGitlabDocs < Gitlab::Database::Migration[2.1]
+  disable_ddl_transaction! # concurrent index operations cannot run inside a transaction
+
+  TABLE_NAME = :vertex_gitlab_docs
+  INDEX_NAME = 'index_vertex_gitlab_docs_on_version_where_embedding_is_null'
+
+  def up
+    add_concurrent_index(TABLE_NAME, :version, where: 'embedding IS NULL', name: INDEX_NAME)
+  end
+
+  def down
+    remove_concurrent_index_by_name(TABLE_NAME, INDEX_NAME)
+  end
+end
diff --git a/ee/db/embedding/schema_migrations/20230821103900 b/ee/db/embedding/schema_migrations/20230821103900
new file mode 100644
index 0000000000000..bcae240e581b9
--- /dev/null
+++ b/ee/db/embedding/schema_migrations/20230821103900
@@ -0,0 +1 @@
+2fd9bf701a9830b2160e5190ab92c07264fccced574e8b5e69ac5f57d17ac5d9
\ No newline at end of file
diff --git a/ee/db/embedding/schema_migrations/20230821113000 b/ee/db/embedding/schema_migrations/20230821113000
new file mode 100644
index 0000000000000..760a364adf500
--- /dev/null
+++ b/ee/db/embedding/schema_migrations/20230821113000
@@ -0,0 +1 @@
+12dac3188d21a16766bd9576e0b56b092bd2715b4f631d8f2c23eb1776cd1760
\ No newline at end of file
diff --git a/ee/db/embedding/schema_migrations/20230821113500 b/ee/db/embedding/schema_migrations/20230821113500
new file mode 100644
index 0000000000000..0533cf67b5d6c
--- /dev/null
+++ b/ee/db/embedding/schema_migrations/20230821113500
@@ -0,0 +1 @@
+16ffca4983b73a4bec8dd319f8584d78ce66adae715d27ca146d3e1ac203fa4e
\ No newline at end of file
diff --git a/ee/db/embedding/structure.sql b/ee/db/embedding/structure.sql
index c93b6dc291a48..9e0636df33217 100644
--- a/ee/db/embedding/structure.sql
+++ b/ee/db/embedding/structure.sql
@@ -35,8 +35,32 @@ CREATE SEQUENCE tanuki_bot_mvc_id_seq
 
 ALTER SEQUENCE tanuki_bot_mvc_id_seq OWNED BY tanuki_bot_mvc.id;
 
+CREATE TABLE vertex_gitlab_docs (
+    id bigint NOT NULL,
+    created_at timestamp with time zone NOT NULL,
+    updated_at timestamp with time zone NOT NULL,
+    version integer DEFAULT 0 NOT NULL,
+    embedding vector(768),
+    url text NOT NULL,
+    content text NOT NULL,
+    metadata jsonb NOT NULL,
+    CONSTRAINT check_2e35a254ce CHECK ((char_length(url) <= 2048)),
+    CONSTRAINT check_93ca52e019 CHECK ((char_length(content) <= 32768))
+);
+
+CREATE SEQUENCE vertex_gitlab_docs_id_seq
+    START WITH 1
+    INCREMENT BY 1
+    NO MINVALUE
+    NO MAXVALUE
+    CACHE 1;
+
+ALTER SEQUENCE vertex_gitlab_docs_id_seq OWNED BY vertex_gitlab_docs.id;
+
 ALTER TABLE ONLY tanuki_bot_mvc ALTER COLUMN id SET DEFAULT nextval('tanuki_bot_mvc_id_seq'::regclass);
 
+ALTER TABLE ONLY vertex_gitlab_docs ALTER COLUMN id SET DEFAULT nextval('vertex_gitlab_docs_id_seq'::regclass);
+
 ALTER TABLE ONLY ar_internal_metadata
     ADD CONSTRAINT ar_internal_metadata_pkey PRIMARY KEY (key);
 
@@ -46,8 +70,15 @@ ALTER TABLE ONLY schema_migrations
 ALTER TABLE ONLY tanuki_bot_mvc
     ADD CONSTRAINT tanuki_bot_mvc_pkey PRIMARY KEY (id);
 
+ALTER TABLE ONLY vertex_gitlab_docs
+    ADD CONSTRAINT vertex_gitlab_docs_pkey PRIMARY KEY (id);
+
 CREATE UNIQUE INDEX index_tanuki_bot_mvc_on_chroma_id ON tanuki_bot_mvc USING btree (chroma_id);
 
 CREATE INDEX index_tanuki_bot_mvc_on_version ON tanuki_bot_mvc USING btree (version);
 
 CREATE INDEX index_tanuki_bot_mvc_on_version_where_embedding_is_null ON tanuki_bot_mvc USING btree (version) WHERE (embedding IS NULL);
+
+CREATE INDEX index_vertex_gitlab_docs_on_version ON vertex_gitlab_docs USING btree (version);
+
+CREATE INDEX index_vertex_gitlab_docs_on_version_where_embedding_is_null ON vertex_gitlab_docs USING btree (version) WHERE (embedding IS NULL);
diff --git a/ee/lib/gitlab/llm/vertex_ai/client.rb b/ee/lib/gitlab/llm/vertex_ai/client.rb
index 32e6d4ed4d1b1..057c9b937f310 100644
--- a/ee/lib/gitlab/llm/vertex_ai/client.rb
+++ b/ee/lib/gitlab/llm/vertex_ai/client.rb
@@ -80,11 +80,23 @@ def code_completion(content:, **options)
           )
         end
 
+        # Requests text embeddings for the given input from Vertex AI
+        # (model configuration: ModelConfigurations::TextEmbeddings).
+        #
+        # @param content [String] input string
+        # @param options [Hash] additional options to pass to the request
+        def text_embeddings(content:, **options)
+          model_config = ModelConfigurations::TextEmbeddings.new
+          config = Configuration.new(model_config: model_config)
+
+          request(content: content, config: config, **options)
+        end
+
         private
 
         attr_reader :logger
 
-        retry_methods_with_exponential_backoff :chat, :text, :code, :messages_chat, :code_completion
+        retry_methods_with_exponential_backoff :chat, :text, :code, :messages_chat, :code_completion, :text_embeddings
 
         def request(content:, config:, **options)
           logger.info(message: "Performing request to Vertex", config: config)
diff --git a/ee/lib/gitlab/llm/vertex_ai/model_configurations/text_embeddings.rb b/ee/lib/gitlab/llm/vertex_ai/model_configurations/text_embeddings.rb
new file mode 100644
index 0000000000000..eea759b5fb9a7
--- /dev/null
+++ b/ee/lib/gitlab/llm/vertex_ai/model_configurations/text_embeddings.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Llm
+    module VertexAi
+      module ModelConfigurations
+        # Model configuration for the Vertex AI text embeddings model.
+        class TextEmbeddings < Base
+          # Model name; appears in the request URL as `models/textembedding-gecko:predict`.
+          NAME = 'textembedding-gecko'
+
+          # Wraps the input in the request body sent to the model.
+          #
+          # @param content [String] text to build embeddings for
+          # @return [Hash] `{ instances: [{ content: content }] }`
+          def payload(content)
+            { instances: [{ content: content }] }
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/ee/spec/factories/embedding/gitlab_docs.rb b/ee/spec/factories/embedding/gitlab_docs.rb
new file mode 100644
index 0000000000000..bdc19923b5f6f
--- /dev/null
+++ b/ee/spec/factories/embedding/gitlab_docs.rb
@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+
+FactoryBot.define do
+  # Builds Embedding::Vertex::GitlabDocumentation records with a constant 768-dimension embedding.
+  factory :vertex_gitlab_documentation, class: 'Embedding::Vertex::GitlabDocumentation' do
+    url { 'http://example.com/path/to/a/doc' }
+    content { 'Some text' }
+    embedding { [0.3] * 768 }
+
+    sequence(:metadata) do |index|
+      {
+        info: "Description for #{index}",
+        source: "path/to/a/doc_#{index}.md",
+        source_type: 'doc'
+      }
+    end
+  end
+end
diff --git a/ee/spec/lib/gitlab/llm/vertex_ai/client_spec.rb b/ee/spec/lib/gitlab/llm/vertex_ai/client_spec.rb
index e7ee61d0445aa..357e86583171b 100644
--- a/ee/spec/lib/gitlab/llm/vertex_ai/client_spec.rb
+++ b/ee/spec/lib/gitlab/llm/vertex_ai/client_spec.rb
@@ -193,6 +193,12 @@
     it_behaves_like 'forwarding the request correctly'
   end
 
+  describe '#text_embeddings' do # reuses the shared examples included by the preceding describe
+    subject(:response) { client.text_embeddings(content: 'anything', **options) }
+
+    it_behaves_like 'forwarding the request correctly'
+  end
+
   describe '#request' do
     let(:url) { 'https://example.com/api' }
     let(:config) do
diff --git a/ee/spec/lib/gitlab/llm/vertex_ai/model_configurations/text_embeddings_spec.rb b/ee/spec/lib/gitlab/llm/vertex_ai/model_configurations/text_embeddings_spec.rb
new file mode 100644
index 0000000000000..f108608d6df06
--- /dev/null
+++ b/ee/spec/lib/gitlab/llm/vertex_ai/model_configurations/text_embeddings_spec.rb
@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Gitlab::Llm::VertexAi::ModelConfigurations::TextEmbeddings, feature_category: :ai_abstraction_layer do
+  subject(:model_config) { described_class.new }
+
+  let(:host) { 'example-env.com' }
+  let(:project) { 'cllm' }
+
+  before do
+    stub_application_setting(vertex_ai_host: host, vertex_ai_project: project)
+  end
+
+  describe '#payload' do
+    let(:content) { 'some content' }
+
+    # The input is wrapped as the only entry of the `instances` list.
+    it 'returns default payload' do
+      expect(model_config.payload(content)).to eq({ instances: [{ content: content }] })
+    end
+  end
+
+  describe '#url' do
+    # The host and project in the expected URL are the values stubbed
+    # in the application settings above.
+    let(:expected_url) do
+      'https://example-env.com/v1/projects/cllm/locations/us-central1/publishers/google/models/textembedding-gecko:predict'
+    end
+
+    it 'returns correct url replacing default value' do
+      expect(model_config.url).to eq(expected_url)
+    end
+  end
+end
diff --git a/ee/spec/models/embedding/vertex/gitlab_documentation_spec.rb b/ee/spec/models/embedding/vertex/gitlab_documentation_spec.rb
new file mode 100644
index 0000000000000..4a900b4a0c55a
--- /dev/null
+++ b/ee/spec/models/embedding/vertex/gitlab_documentation_spec.rb
@@ -0,0 +1,118 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Embedding::Vertex::GitlabDocumentation, :clean_gitlab_redis_shared_state, type: :model, feature_category: :duo_chat do
+  let(:version) { 111 }
+
+  describe 'scopes' do
+    describe '.neighbor_for' do
+      subject(:neighbors) { described_class.neighbor_for(question.embedding, limit: limit) }
+
+      let_it_be(:question) { build(:vertex_gitlab_documentation) }
+      let(:limit) { 10 }
+
+      it 'calls nearest_neighbors for question' do
+        create_list(:vertex_gitlab_documentation, 2)
+
+        expect(described_class).to receive(:nearest_neighbors)
+          .with(:embedding, question.embedding, distance: 'cosine').and_call_original.once
+
+        neighbors
+      end
+
+      context 'with a far away embedding' do
+        # The question embedding is all 0.3 (factory default): `near` points the same way,
+        # `far` points the opposite way, so `near` has the smaller cosine distance.
+        let_it_be(:far) { create(:vertex_gitlab_documentation, embedding: Array.new(768, -0.000999)) }
+        let_it_be(:near) { create(:vertex_gitlab_documentation, embedding: Array.new(768, 0.000333)) }
+
+        it 'returns all neighbors' do
+          expect(neighbors).to match_array([near, far])
+        end
+
+        context 'with a limit of one' do
+          let(:limit) { 1 }
+
+          it 'does not return the far neighbor' do
+            expect(neighbors).to contain_exactly(near)
+          end
+        end
+      end
+    end
+
+    describe '.current' do
+      let!(:current_records) { create_list(:vertex_gitlab_documentation, 5, version: version) }
+      let!(:previous_records) { create_list(:vertex_gitlab_documentation, 3, version: version - 1) }
+
+      # No version is stored in Redis here, so the current version is 0
+      # and none of the records created above match it.
+      it 'is empty' do
+        expect(described_class.current.count).to eq(0)
+      end
+
+      context 'when there are records matching the current version' do
+        before do
+          allow(described_class).to receive(:get_current_version).and_return(version)
+        end
+
+        # The scope applies no ORDER BY, so compare the records
+        # without depending on the order the database returns them in.
+        it 'returns matching records' do
+          expect(described_class.current).to match_array(current_records)
+        end
+      end
+    end
+
+    describe '.previous' do
+      let!(:current_records) { create_list(:vertex_gitlab_documentation, 5, version: version) }
+      let!(:previous_records) { create_list(:vertex_gitlab_documentation, 3, version: version - 1) }
+
+      # No version is stored in Redis here, so the current version is 0
+      # and no record has a version below it.
+      it 'is empty' do
+        expect(described_class.previous.count).to eq(0)
+      end
+
+      context 'when there are records matching the previous version' do
+        before do
+          allow(described_class).to receive(:get_current_version).and_return(version)
+        end
+
+        # The scope applies no ORDER BY, so compare the records
+        # without depending on the order the database returns them in.
+        it 'returns matching records' do
+          expect(described_class.previous).to match_array(previous_records)
+        end
+      end
+    end
+  end
+
+  describe '.get_current_version' do
+    subject(:current_version) { described_class.get_current_version }
+
+    it 'returns 0' do
+      expect(current_version).to eq(0)
+    end
+
+    context 'when it exists in redis' do
+      before do
+        Gitlab::Redis::SharedState.with { |redis| redis.set(described_class.current_version_cache_key, version) }
+      end
+
+      it 'returns the value' do
+        expect(current_version).to eq(version)
+      end
+    end
+  end
+
+  describe '.set_current_version!' do
+    # Starts from 0: the spec is tagged :clean_gitlab_redis_shared_state and no version is set yet.
+    it 'updates the version in redis' do
+      expect { described_class.set_current_version!(version) }
+        .to change { described_class.get_current_version }
+        .from(0)
+        .to(version)
+    end
+  end
+end
-- 
GitLab