diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index bbb55a901010ae3c3699253593129aea5914cc3e..206b0525610873d84ae8a6497e80749f20647484 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -816,9 +816,9 @@ Settings.cron_jobs['sync_service_token_worker'] ||= {} Settings.cron_jobs['sync_service_token_worker']['cron'] ||= "#{rand(60)} #{rand(5..6)} * * * UTC" Settings.cron_jobs['sync_service_token_worker']['job_class'] = '::Ai::SyncServiceTokenWorker' - Settings.cron_jobs['llm_embedding_gitlab_documentation_create_empty_embeddings_records_worker'] ||= {} - Settings.cron_jobs['llm_embedding_gitlab_documentation_create_empty_embeddings_records_worker']['cron'] ||= '0 5 * * 1,2,3,4,5' - Settings.cron_jobs['llm_embedding_gitlab_documentation_create_empty_embeddings_records_worker']['job_class'] ||= 'Llm::Embedding::GitlabDocumentation::CreateEmptyEmbeddingsRecordsWorker' + Settings.cron_jobs['llm_embedding_gitlab_documentation_create_embeddings_records_worker'] ||= {} + Settings.cron_jobs['llm_embedding_gitlab_documentation_create_embeddings_records_worker']['cron'] ||= '0 5 * * 1,2,3,4,5' + Settings.cron_jobs['llm_embedding_gitlab_documentation_create_embeddings_records_worker']['job_class'] ||= 'Llm::Embedding::GitlabDocumentation::CreateEmbeddingsRecordsWorker' Settings.cron_jobs['llm_embedding_gitlab_documentation_cleanup_previous_versions_records_worker'] ||= {} Settings.cron_jobs['llm_embedding_gitlab_documentation_cleanup_previous_versions_records_worker']['cron'] ||= '0 0 * * *' Settings.cron_jobs['llm_embedding_gitlab_documentation_cleanup_previous_versions_records_worker']['job_class'] ||= 'Llm::Embedding::GitlabDocumentation::CleanupPreviousVersionsRecordsWorker' diff --git a/doc/development/ai_features/glossary.md b/doc/development/ai_features/glossary.md index 0a0dcdd4af73bf827aad12f5c6f13950475be7d9..21b9354c3ae1316cba9405c963d1ca297b6c9e61 100644 --- 
a/doc/development/ai_features/glossary.md +++ b/doc/development/ai_features/glossary.md @@ -43,7 +43,7 @@ multiple GitLab deployments, instances, and cells. We use it as an umbrella term embeddings are stored in the `vertex_gitlab_docs` database table in the `embeddings` database. The embeddings search is done in Postgres using the `vector` extension. The vertex embeddings database is updated based on the - latest version of GitLab documentation on daily basis by running `Llm::Embedding::GitlabDocumentation::CreateEmptyEmbeddingsRecordsWorker` as a cronjob. + latest version of GitLab documentation on a daily basis by running `Llm::Embedding::GitlabDocumentation::CreateEmbeddingsRecordsWorker` as a cronjob. - **Golden Questions**: a small subset of the types of questions we think a user should be able to ask GitLab Duo Chat. Used to generate data for Chat evaluation. [Questions for Chat Beta](https://gitlab.com/groups/gitlab-org/-/epics/10550#what-the-user-can-ask). diff --git a/ee/app/workers/all_queues.yml b/ee/app/workers/all_queues.yml index f4fb42e0c34d3456ad4906201e9a40a658c0de83..750f9c80ffcc5a5e18885b65e8329da2f4764f6c 100644 --- a/ee/app/workers/all_queues.yml +++ b/ee/app/workers/all_queues.yml @@ -426,6 +426,15 @@ :weight: 1 :idempotent: true :tags: [] +- :name: cronjob:llm_embedding_gitlab_documentation_create_embeddings_records + :worker_name: Llm::Embedding::GitlabDocumentation::CreateEmbeddingsRecordsWorker + :feature_category: :duo_chat + :has_external_dependencies: false + :urgency: :throttled + :resource_boundary: :unknown + :weight: 1 + :idempotent: true + :tags: [] - :name: cronjob:llm_embedding_gitlab_documentation_create_empty_embeddings_records :worker_name: Llm::Embedding::GitlabDocumentation::CreateEmptyEmbeddingsRecordsWorker :feature_category: :duo_chat diff --git a/ee/app/workers/llm/embedding/gitlab_documentation/create_embeddings_records_worker.rb b/ee/app/workers/llm/embedding/gitlab_documentation/create_embeddings_records_worker.rb 
new file mode 100644 index 0000000000000000000000000000000000000000..bd372de9eea2990f4f1372353d1188131796d861 --- /dev/null +++ b/ee/app/workers/llm/embedding/gitlab_documentation/create_embeddings_records_worker.rb @@ -0,0 +1,111 @@ +# frozen_string_literal: true + +module Llm + module Embedding + module GitlabDocumentation + class CreateEmbeddingsRecordsWorker + include ApplicationWorker + include CronjobQueue # rubocop:disable Scalability/CronWorkerContext -- CreateDbEmbeddingsPerDocFileWorker is queued by this worker, but it requires no context + include Gitlab::ExclusiveLeaseHelpers + include EmbeddingsWorkerContext + + idempotent! + data_consistency :always # rubocop: disable SidekiqLoadBalancing/WorkerDataConsistency -- This worker runs as a cron job + feature_category :duo_chat + urgency :throttled + sidekiq_options retry: 3 + + def perform + return unless Gitlab::Saas.feature_available?(FEATURE_NAME) + return unless ::Feature.enabled?(:ai_duo_chat_switch, type: :ops) + return unless ::License.feature_available?(:ai_chat) # license check + + embeddings_sources = extract_embedding_sources + + files.each do |filename| + content = File.read(filename) + source = filename.gsub(Rails.root.to_s, '') + + next unless embeddable?(content) + + current_md5sum = extract_md5sum(embeddings_sources, source) + + # Create the digest by concatenating the file content and the model used, so that we generate new embeddings + # if either changes + new_md5sum = OpenSSL::Digest::SHA256.hexdigest( + content + ::Gitlab::Llm::VertexAi::ModelConfigurations::TextEmbeddings::NAME + ) + + # If the file digest did not change, then there's no need to rebuild its embeddings, just use them as is. 
+ next if new_md5sum == current_md5sum + + CreateDbEmbeddingsPerDocFileWorker.perform_async(filename, update_version) + + logger.info( + structured_payload( + message: 'Enqueued DB embeddings creation', + filename: filename, + new_version: update_version + ) + ) + end + + cleanup_embeddings_for_missing_files(embeddings_sources) + end + + private + + def extract_embedding_sources + embeddings_sources = Set.new + select_columns = "distinct version, metadata->>'source' as source, metadata->>'md5sum' as md5sum" + + MODEL.select(select_columns).each_batch do |batch| + data = batch.map do |em| + { version: em.version, source: em.source, md5sum: em.md5sum }.with_indifferent_access + end + + embeddings_sources.merge(data) + end + + embeddings_sources.group_by { |em| em[:source] } + end + + def extract_md5sum(embeddings_sources, source) + embeddings_for_source = embeddings_sources.delete(source) + embedding = embeddings_for_source&.find { |embedding| embedding[:version] == MODEL.current_version } + + embedding&.dig('md5sum') + end + + def embeddable?(content) + return false if content.empty? 
+ return false if content.include?('This document was moved to [another location]') + + true + end + + def cleanup_embeddings_for_missing_files(embeddings_sources) + embeddings_sources.keys.each_slice(20) do |sources| + MODEL.for_sources(sources).each_batch(of: BATCH_SIZE) { |batch| batch.delete_all } + + logger.info( + structured_payload( + message: 'Deleting embeddings for missing files', + filename: sources, + new_version: MODEL.current_version + ) + ) + end + end + + def files + Dir[Rails.root.join("#{DOC_DIRECTORY}/**/*.md")] + end + + def update_version + @update_version ||= MODEL.current_version + 1 + end + end + end + end +end diff --git a/ee/app/workers/llm/embedding/gitlab_documentation/create_empty_embeddings_records_worker.rb b/ee/app/workers/llm/embedding/gitlab_documentation/create_empty_embeddings_records_worker.rb index 653dfa7be00b85002c6e0bdc3002771ad783bb95..682c543fd083ae87aba5de32ca58c87357fe70d3 100644 --- a/ee/app/workers/llm/embedding/gitlab_documentation/create_empty_embeddings_records_worker.rb +++ b/ee/app/workers/llm/embedding/gitlab_documentation/create_empty_embeddings_records_worker.rb @@ -3,6 +3,8 @@ module Llm module Embedding module GitlabDocumentation + # DEPRECATED: Remove this in favor of CreateEmbeddingsRecordsWorker + # see https://gitlab.com/gitlab-org/gitlab/-/issues/438337 class CreateEmptyEmbeddingsRecordsWorker include ApplicationWorker include CronjobQueue # rubocop:disable Scalability/CronWorkerContext @@ -15,96 +17,7 @@ class CreateEmptyEmbeddingsRecordsWorker urgency :throttled sidekiq_options retry: 3 - def perform - return unless Gitlab::Saas.feature_available?(FEATURE_NAME) - return unless ::Feature.enabled?(:ai_duo_chat_switch, type: :ops) - return unless ::License.feature_available?(:ai_chat) # license check - - embeddings_sources = extract_embedding_sources - - files.each do |filename| - content = File.read(filename) - source = filename.gsub(Rails.root.to_s, '') - - next unless embeddable?(content) - - 
current_md5sum = extract_md5sum(embeddings_sources, source) - - # Create the digest by concatenating the file content and the model used, so that we generate new embeddings - # if either change - new_md5sum = OpenSSL::Digest::SHA256.hexdigest( - content + ::Gitlab::Llm::VertexAi::ModelConfigurations::TextEmbeddings::NAME - ) - - # If the file digest did not change, then there's no need to rebuild its embeddings, just used them as is. - next if new_md5sum == current_md5sum - - CreateDbEmbeddingsPerDocFileWorker.perform_async(filename, update_version) - - logger.info( - structured_payload( - message: 'Enqueued DB embeddings creation', - filename: filename, - new_version: update_version - ) - ) - end - - cleanup_embeddings_for_missing_files(embeddings_sources) - end - - private - - def extract_embedding_sources - embeddings_sources = Set.new - select_columns = "distinct version, metadata->>'source' as source, metadata->>'md5sum' as md5sum" - - MODEL.select(select_columns).each_batch do |batch| - data = batch.map do |em| - { version: em.version, source: em.source, md5sum: em.md5sum }.with_indifferent_access - end - - embeddings_sources.merge(data) - end - - embeddings_sources.group_by { |em| em[:source] } - end - - def extract_md5sum(embeddings_sources, source) - embeddings_for_source = embeddings_sources.delete(source) - embedding = embeddings_for_source&.find { |embedding| embedding[:version] == MODEL.current_version } - - embedding&.dig('md5sum') - end - - def embeddable?(content) - return false if content.empty? 
- return false if content.include?('This document was moved to [another location]') - - true - end - - def cleanup_embeddings_for_missing_files(embeddings_sources) - embeddings_sources.keys.each_slice(20) do |sources| - MODEL.for_sources(sources).each_batch(of: BATCH_SIZE) { |batch| batch.delete_all } - - logger.info( - structured_payload( - message: 'Deleting embeddings for missing files', - filename: sources, - new_version: MODEL.current_version - ) - ) - end - end - - def files - Dir[Rails.root.join("#{DOC_DIRECTORY}/**/*.md")] - end - - def update_version - @update_version ||= MODEL.current_version + 1 - end + def perform; end end end end diff --git a/ee/spec/workers/llm/embedding/gitlab_documentation/create_empty_embeddings_records_worker_spec.rb b/ee/spec/workers/llm/embedding/gitlab_documentation/create_embeddings_records_worker_spec.rb similarity index 97% rename from ee/spec/workers/llm/embedding/gitlab_documentation/create_empty_embeddings_records_worker_spec.rb rename to ee/spec/workers/llm/embedding/gitlab_documentation/create_embeddings_records_worker_spec.rb index aad3ddaafef7c63f189331c653e2a663683ed267..e28b06e37dc979467234033d2876a9b13c4de285 100644 --- a/ee/spec/workers/llm/embedding/gitlab_documentation/create_empty_embeddings_records_worker_spec.rb +++ b/ee/spec/workers/llm/embedding/gitlab_documentation/create_embeddings_records_worker_spec.rb @@ -2,7 +2,7 @@ require 'spec_helper' -RSpec.describe Llm::Embedding::GitlabDocumentation::CreateEmptyEmbeddingsRecordsWorker, feature_category: :duo_chat do +RSpec.describe Llm::Embedding::GitlabDocumentation::CreateEmbeddingsRecordsWorker, feature_category: :duo_chat do it_behaves_like 'worker with data consistency', described_class, data_consistency: :always describe '#perform' do diff --git a/spec/workers/every_sidekiq_worker_spec.rb b/spec/workers/every_sidekiq_worker_spec.rb index dc2851bd562eb8a83915ac7e8e6c643fefdf183e..01548ac8bc7fe3f43e1133c2b673d47bd7b067b4 100644 --- 
a/spec/workers/every_sidekiq_worker_spec.rb +++ b/spec/workers/every_sidekiq_worker_spec.rb @@ -342,7 +342,10 @@ 'LdapGroupSyncWorker' => 3, 'Licenses::ResetSubmitLicenseUsageDataBannerWorker' => 13, 'Llm::Embedding::GitlabDocumentation::SetEmbeddingsOnTheRecordWorker' => 5, + # DEPRECATED: Remove this in favor of CreateEmbeddingsRecordsWorker + # see https://gitlab.com/gitlab-org/gitlab/-/issues/438337 'Llm::Embedding::GitlabDocumentation::CreateEmptyEmbeddingsRecordsWorker' => 3, + 'Llm::Embedding::GitlabDocumentation::CreateEmbeddingsRecordsWorker' => 3, 'Llm::Embedding::GitlabDocumentation::CreateDbEmbeddingsPerDocFileWorker' => 5, 'MailScheduler::IssueDueWorker' => 3, 'MailScheduler::NotificationServiceWorker' => 3,