diff --git a/.gitlab/ci/rails.gitlab-ci.yml b/.gitlab/ci/rails.gitlab-ci.yml index 96793b1f5d49d47334daa2b30a8713e4a4fa34e0..098c5a8a5e113ab1ce8176f341fb7a9f8bad685f 100644 --- a/.gitlab/ci/rails.gitlab-ci.yml +++ b/.gitlab/ci/rails.gitlab-ci.yml @@ -765,41 +765,46 @@ rspec system pg14-as-if-foss clusterwide-db: - .clusterwide-db - .rails:rules:clusterwide-db -rspec-ee unit gitlab-duo-chat pg14: +.rspec-ee-base-gitlab-duo: + extends: + - .rspec-ee-base-pg14 variables: REAL_AI_REQUEST: "true" - RSPEC_RETRY_RETRY_COUNT: 0 + +rspec-ee unit gitlab-duo-chat-zeroshot pg14: extends: - - .rspec-ee-base-pg14 - - .rails:rules:ee-gitlab-duo-chat-base - parallel: - matrix: - - DUO_RSPEC: ["lib/gitlab/llm/chain/agents/zero_shot/executor_real_requests_spec.rb", "support_specs/helpers/chat_qa_evaluation_helpers_spec.rb"] + - .rspec-ee-base-gitlab-duo + - .rails:rules:ee-gitlab-duo-chat-optional script: - !reference [.base-script, script] - - bundle exec rspec -Ispec -rspec_helper --failure-exit-code 0 --tag real_ai_request --color -- ee/spec/${DUO_RSPEC} + - rspec_paralellized_job "--tag zeroshot_executor" + +rspec-ee unit gitlab-duo-chat-qa-fast pg14: + extends: + - .rspec-ee-base-gitlab-duo + - .rails:rules:ee-gitlab-duo-chat-qa-fast + script: + - !reference [.base-script, script] + - rspec_paralellized_job "--tag fast_chat_qa_evaluation" rspec-ee unit gitlab-duo-chat-qa pg14: variables: - REAL_AI_REQUEST: "true" + QA_EVAL_REPORT_FILENAME: "qa_evaluation_report.md" RSPEC_RETRY_RETRY_COUNT: 0 extends: - - .rspec-ee-base-pg14 - - .rails:rules:ee-gitlab-duo-chat-base - parallel: - matrix: - - DUO_RSPEC: ["qa_epic_spec.rb", "qa_issue_spec.rb"] + - .rspec-ee-base-gitlab-duo + - .rails:rules:ee-gitlab-duo-chat-qa-full script: - !reference [.base-script, script] - source ./scripts/utils.sh - install_gitlab_gem - - bundle exec rspec -Ispec -rspec_helper --failure-exit-code 0 --tag real_ai_request --color -- ee/spec/lib/gitlab/llm/chain/agents/zero_shot/${DUO_RSPEC} + - bundle exec 
rspec -Ispec -rspec_helper --failure-exit-code 0 --color --tag chat_qa_evaluation -- ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_evaluation_spec.rb - ./scripts/duo_chat/reporter.rb artifacts: expire_in: 5d paths: - tmp/duo_chat/qa*.json - - "${DUO_RSPEC}.md" + - "${QA_EVAL_REPORT_FILENAME}" rspec-ee migration pg14: extends: diff --git a/.gitlab/ci/rules.gitlab-ci.yml b/.gitlab/ci/rules.gitlab-ci.yml index f9690b0247cba8bc05c9412e5c48b71b88fc7412..6a5bade71f47b9172e242bb13abc77149738b4ad 100644 --- a/.gitlab/ci/rules.gitlab-ci.yml +++ b/.gitlab/ci/rules.gitlab-ci.yml @@ -2125,8 +2125,28 @@ when: never - if: '$VERTEX_AI_CREDENTIALS == null' when: never + - <<: *if-fork-merge-request + when: never + +.rails:rules:ee-gitlab-duo-chat-optional: + rules: + - !reference [".rails:rules:ee-gitlab-duo-chat-base", rules] + - <<: *if-merge-request + changes: *backend-patterns + when: manual + allow_failure: true + +.rails:rules:ee-gitlab-duo-chat-qa-fast: + rules: + - !reference [".rails:rules:ee-gitlab-duo-chat-base", rules] - <<: *if-merge-request changes: *ai-patterns + +.rails:rules:ee-gitlab-duo-chat-qa-full: + rules: + - !reference [".rails:rules:ee-gitlab-duo-chat-optional", rules] + - <<: *if-default-branch-refs + changes: *setup-test-env-patterns when: manual allow_failure: true diff --git a/doc/development/ai_features/duo_chat.md b/doc/development/ai_features/duo_chat.md index dfaad73220fa346c4d3b35cb1fd32b0e1743df73..2fde672aa7ebce1b1acbb52334c331523fd2f87a 100644 --- a/doc/development/ai_features/duo_chat.md +++ b/doc/development/ai_features/duo_chat.md @@ -109,17 +109,28 @@ make sure a new fixture is generated and committed together with the change. ## Running the rspecs tagged with `real_ai_request` -The rspecs tagged with the metadata `real_ai_request` can be run in GitLab project's CI by triggering -`rspec-ee unit gitlab-duo-chat`. -The former runs with Vertex APIs enabled. 
The CI jobs are optional and allowed to fail to account for -the non-deterministic nature of LLM responses. +The following CI jobs for the GitLab project run the rspecs tagged with `real_ai_request`: + +- `rspec-ee unit gitlab-duo-chat-zeroshot`: + The job runs `ee/spec/lib/gitlab/llm/chain/agents/zero_shot/executor_real_requests_spec.rb`. + The job is optionally triggered and allowed to fail. + +- `rspec-ee unit gitlab-duo-chat-qa`: + The job runs the QA evaluation tests in + `ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_evaluation_spec.rb`. + The job is optionally triggered and allowed to fail. + +- `rspec-ee unit gitlab-duo-chat-qa-fast`: + The job runs a single QA evaluation test from `ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_evaluation_spec.rb`. + The job is always run and not allowed to fail. Although there's a chance that the QA test still might fail, + it is cheap and fast to run and intended to prevent a regression in the QA test helpers. ### Management of credentials and API keys for CI jobs All API keys required to run the rspecs should be [masked](../../ci/variables/index.md#mask-a-cicd-variable) The exception is GCP credentials as they contain characters that prevent them from being masked. -Because `rspec-ee unit gitlab-duo-chat` needs to run on MR branches, GCP credentials cannot be added as a protected variable +Because the CI jobs need to run on MR branches, GCP credentials cannot be added as a protected variable and must be added as a regular CI variable. For security, the GCP credentials and the associated project added to GitLab project's CI must not be able to access any production infrastructure and sandboxed. 
diff --git a/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/executor_real_requests_spec.rb b/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/executor_real_requests_spec.rb index a3e10446a1e4ebc3c3271752e9f839c35e6aba20..eed66007806264299515f364bcd5ba14ee25a5ea 100644 --- a/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/executor_real_requests_spec.rb +++ b/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/executor_real_requests_spec.rb @@ -7,7 +7,7 @@ let_it_be(:user) { create(:user) } - describe 'real requests', :real_ai_request, :saas do + describe 'real requests', :real_ai_request, :zeroshot_executor, :saas do using RSpec::Parameterized::TableSyntax let_it_be_with_reload(:group) { create(:group_with_plan, :public, plan: :ultimate_plan) } diff --git a/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_epic_spec.rb b/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_epic_spec.rb deleted file mode 100644 index f5e1431328543654cbf41a203c9d67ad618d5e4c..0000000000000000000000000000000000000000 --- a/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_epic_spec.rb +++ /dev/null @@ -1,62 +0,0 @@ -# frozen_string_literal: true - -require 'spec_helper' - -RSpec.describe 'GitLab Duo Chat QA Evaluation for Epic', :clean_gitlab_redis_chat, feature_category: :duo_chat do - include Gitlab::Routing.url_helpers - include ChatQaEvaluationHelpers - - describe 'evaluation', :real_ai_request, :saas do - let_it_be(:user) { create(:user) } - - include_context 'with sample production epics and issues' - - before do - stub_licensed_features(ai_features: true) - stub_ee_application_setting(should_check_namespace_plan: true) - stub_licensed_features(ai_tanuki_bot: true) - - stub_licensed_features(epics: true) - root_group = epic.group.root_ancestor - root_group.namespace_settings.update_attribute(:experiment_features_enabled, true) - root_group.add_owner(user) - end - - where(:question_template) do - [ - ["Summarize the comments into bullet points?"], - ["Summarize with bullet points"], - ["Can you 
create a simpler list of which questions a user should be able to ask according to this epic."], - ["Summarize this Epic."], - ["How much work is left to be done %<url>s?"], - ["How much work is left to be done in this epic?"], - ["Please summarize what the objective and next steps are for %<url>s"] - ] - end - - with_them do - where(:epic_id) do - [ - [822061], # https://gitlab.com/groups/gitlab-org/-/epics/10550 - [835460], # https://gitlab.com/groups/gitlab-org/-/epics/10694 - [854759] # https://gitlab.com/groups/gitlab-org/-/epics/10814 - ] - end - - with_them do - let(:epic) { Epic.find(epic_id) } - let(:context) { epic.to_json } - let(:url) { group_epic_url(epic.group, epic) } - let(:question) { format(question_template, { url: url }) } - - it 'answers the question correctly' do - evaluations = evaluate_without_reference(user, epic, question, context)[:evaluations] - - evaluations.each do |eval| - expect(eval[:response]).to match(/Grade: CORRECT/i) - end - end - end - end - end -end diff --git a/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_evaluation_spec.rb b/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_evaluation_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..c8a82780790d95c7ca55f92fdfa8732d2a10fd04 --- /dev/null +++ b/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_evaluation_spec.rb @@ -0,0 +1,155 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe 'GitLab Duo Chat QA Evaluation', :real_ai_request, :saas, :clean_gitlab_redis_chat, feature_category: :duo_chat do + include Gitlab::Routing.url_helpers + include DuoChatQaEvaluationHelpers + include DuoChatFixtureHelpers + + let_it_be(:user) { create(:user) } + + # These fixtures have been created using https://gitlab.com/gitlab-org/gitlab/-/snippets/3613745 + let_it_be(:epic_fixtures) { load_fixture('epics') } + let_it_be(:issue_fixtures) { load_fixture('issues') } + + before_all do + # link_reference_pattern is memoized for Issue + # and stubbed 
url (gitlab.com) is not used to derive the link reference pattern. + Issue.instance_variable_set(:@link_reference_pattern, nil) + + # Create epics and issues from the fixture data + (epic_fixtures + issue_fixtures).each { |issuable| create_users(issuable) } + epics = epic_fixtures.filter_map { |epic| restore_epic(epic) } + issues = issue_fixtures.filter_map { |issue| restore_issue(issue) } + + [ + issues.map { |issue| issue.project.group.root_ancestor }, + epics.map { |epic| epic.group.root_ancestor } + ].flatten.each do |group| + group.namespace_settings.update_attribute(:experiment_features_enabled, true) + group.add_owner(user) + end + + issues.map(&:project).each { |project| project.add_developer(user) } + end + + before do + # Note: In SaaS simulation mode, + # the url must be `https://gitlab.com` but the routing helper returns `localhost` + # and breaks GitLab ReferenceExtractor + stub_default_url_options(host: "gitlab.com", protocol: "https") + stub_ee_application_setting(should_check_namespace_plan: true) + stub_licensed_features(ai_features: true, ai_tanuki_bot: true, epics: true) + end + + shared_examples 'the questions are correctly answered' do + let(:test_cases) do + question_templates.flat_map do |template| + resource_ids.map do |resource_id| + resource = resource_model.find(resource_id) + + { + question: format(template, template_params.call(resource)), + issuable: resource, + context: resource.to_json + } + end + end + end + + it 'answers the questions correctly' do + test_results = batch_evaluate + + test_results.each do |result| + print_evaluation(result) + + result[:evaluations].each do |eval| + grading = eval[:response] + + # Skip if no grade (CORRECT or INCORRECT) is present in the response. + # (the LLM request failed for some reason or the LLM did not follow the instruction.) 
+ next unless grading.match(/Grade: CORRECT/i) || grading.match(/Grade: INCORRECT/i) + + expect(grading).to match(/Grade: CORRECT/i) + end + end + end + end + + # The following block is always run in the CI. + # The purpose of this test is to detect a regression when there is an interface update. + describe 'Fast QA evaluation', :fast_chat_qa_evaluation, :aggregate_failures do + let(:resource_model) { Issue } + let(:resource_ids) { [24652824] } # https://gitlab.com/gitlab-org/gitlab/-/issues/17800 + let(:template_params) { ->(_) { {} } } + let(:question_templates) { ["Summarize this issue"] } + + it_behaves_like 'the questions are correctly answered' + end + + context 'for issue questions', :chat_qa_evaluation, :aggregate_failures do + let(:resource_model) { Issue } + let(:template_params) { ->(issue) { { url: project_issue_url(issue.project, issue) } } } + + let(:resource_ids) do + [ + 24652824, # https://gitlab.com/gitlab-org/gitlab/-/issues/17800 + 113414743, # https://gitlab.com/gitlab-org/gitlab/-/issues/371038 + 128440335, # https://gitlab.com/gitlab-org/gitlab/-/issues/412831 + 129393876, # https://gitlab.com/gitlab-org/gitlab/-/issues/415547 + 130125924, # https://gitlab.com/gitlab-org/gitlab/-/issues/416800 + 130193114 # https://gitlab.com/gitlab-com/www-gitlab-com/-/issues/34345 + ] + end + + let(:question_templates) do + [ + "what is this issue about?", + "Summarize the comments into bullet points?", + "Summarize with bullet points", + "What are the unique use cases raised by commenters in this issue?", + "Could you summarize this issue", + "Summarize this Issue", + "%<url>s - Summarize this issue", + "What is the status of %<url>s?", + "Please summarize the latest activity and current status of the issue %<url>s", + "How can I improve the description of %<url>s " \ + "so that readers understand the value and problems to be solved?", + "Please rewrite the description of %<url>s so that readers " \ + "understand the value and problems to be solved. 
" \ + "Also add common \"jobs to be done\" or use cases which should be considered from a usability perspective.", + "Are there any open questions relating to this issue? %<url>s" + ] + end + + it_behaves_like 'the questions are correctly answered' + end + + context 'for epic questions', :chat_qa_evaluation, :aggregate_failures do + let(:resource_model) { Epic } + let(:template_params) { ->(epic) { { url: group_epic_url(epic.group, epic) } } } + + let(:resource_ids) do + [ + 822061, # https://gitlab.com/groups/gitlab-org/-/epics/10550 + 835460, # https://gitlab.com/groups/gitlab-org/-/epics/10694 + 854759 # https://gitlab.com/groups/gitlab-org/-/epics/10814 + ] + end + + let(:question_templates) do + [ + "Summarize the comments into bullet points?", + "Summarize with bullet points", + "Can you create a simpler list of which questions a user should be able to ask according to this epic.", + "How much work is left to be done %<url>s?", + "How much work is left to be done in this epic?", + "Please summarize what the objective and next steps are for %<url>s", + "Summarize this Epic." 
+ ] + end + + it_behaves_like 'the questions are correctly answered' + end +end diff --git a/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_issue_spec.rb b/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_issue_spec.rb deleted file mode 100644 index 0b2d526fa1308b93913b23539df20b434c6ddca2..0000000000000000000000000000000000000000 --- a/ee/spec/lib/gitlab/llm/chain/agents/zero_shot/qa_issue_spec.rb +++ /dev/null @@ -1,73 +0,0 @@ -# frozen_string_literal: true - -require 'spec_helper' - -RSpec.describe 'GitLab Duo Chat QA Evaluation for Issue', :clean_gitlab_redis_chat, feature_category: :duo_chat do - include Gitlab::Routing.url_helpers - include ChatQaEvaluationHelpers - - describe 'evaluation', :real_ai_request, :saas do - let_it_be(:user) { create(:user) } - - include_context 'with sample production epics and issues' - - before do - stub_licensed_features(ai_features: true) - stub_ee_application_setting(should_check_namespace_plan: true) - stub_licensed_features(ai_tanuki_bot: true) - - root_group = issue.project.group.root_ancestor - root_group.namespace_settings.update_attribute(:experiment_features_enabled, true) - root_group.add_owner(user) - issue.project.add_developer(user) - end - - where(:question_template) do - [ - ["what is this issue about?"], - ["Summarize the comments into bullet points?"], - ["Summarize with bullet points"], - ["What are the unique use cases raised by commenters in this issue?"], - ["Could you summarize this issue"], - ["Summarize this Issue"], - ["%<url>s - Summarize this issue"], - ["What is the status of %<url>s?"], - ["Please summarize the latest activity and current status of the issue %<url>s"], - ["How can I improve the description of %<url>s " \ - "so that readers understand the value and problems to be solved?"], - ["Please rewrite the description of %<url>s so that readers " \ - "understand the value and problems to be solved. 
" \ - "Also add common \"jobs to be done\" or use cases which should be considered from a usability perspective."], - ["Are there any open questions relating to this issue? %<url>s"] - ] - end - - with_them do - where(:issue_id) do - [ - [24652824], # https://gitlab.com/gitlab-org/gitlab/-/issues/17800 - [113414743], # https://gitlab.com/gitlab-org/gitlab/-/issues/371038 - [128440335], # https://gitlab.com/gitlab-org/gitlab/-/issues/412831 - [129393876], # https://gitlab.com/gitlab-org/gitlab/-/issues/415547 - [130125924], # https://gitlab.com/gitlab-org/gitlab/-/issues/416800 - [130193114] # https://gitlab.com/gitlab-com/www-gitlab-com/-/issues/34345 - ] - end - - with_them do - let(:issue) { Issue.find(issue_id) } - let(:context) { issue.to_json } - let(:url) { project_issue_url(issue.project, issue) } - let(:question) { format(question_template, { url: url }) } - - it 'answers the question correctly' do - evaluations = evaluate_without_reference(user, issue, question, context)[:evaluations] - - evaluations.each do |eval| - expect(eval[:response]).to match(/Grade: CORRECT/i) - end - end - end - end - end -end diff --git a/ee/spec/support/helpers/duo_chat_fixture_helpers.rb b/ee/spec/support/helpers/duo_chat_fixture_helpers.rb index 830a8c808437d2ebee4310be0fc1c7555c57fcd9..1c40b87ae717c3f34adc4f7fdb83d60cf641c989 100644 --- a/ee/spec/support/helpers/duo_chat_fixture_helpers.rb +++ b/ee/spec/support/helpers/duo_chat_fixture_helpers.rb @@ -5,7 +5,7 @@ def restore_epic(epic) return if Epic.exists?(epic[:data][:id]) # Create ancestor epics - restore_epic(epics.find { |e| e[:data][:id] == epic[:data][:parent_id] }) if epic[:data][:parent_id] + restore_epic(epic_fixtures.find { |e| e[:data][:id] == epic[:data][:parent_id] }) if epic[:data][:parent_id] # Create the epic's group and the group's direct ancestors. 
root_group = epic[:namespace_hierarchy].first @@ -32,10 +32,12 @@ def restore_epic(epic) create_labels(epic) # Create epic - create(:epic, **epic[:data].except(:start_date_sourcing_epic_id, :due_date_sourcing_epic_id)) + created_epic = create(:epic, **epic[:data].except(:start_date_sourcing_epic_id, :due_date_sourcing_epic_id)) # Create notes epic[:notes].each { |note_attrs| create(:note, **note_attrs) } + + created_epic end # rubocop: disable Metrics/AbcSize @@ -94,10 +96,12 @@ def restore_issue(issue) issue_attrs = issue[:data].merge!({ work_item_type_id: WorkItems::Type.default_by_type(:issue).id }) - create(:issue, **issue_attrs) + created_issue = create(:issue, **issue_attrs) # Create notes issue[:notes].each { |note_attrs| create(:note, **note_attrs) } + + created_issue end # rubocop: enable Metrics/AbcSize @@ -134,5 +138,6 @@ def load_fixture(object_type) .select { |f| f.match(/.json/) } .map { |f| File.join(fixture_path, f) } .map { |f| Gitlab::Json.parse(File.read(f)) } + .map(&:deep_symbolize_keys) end end diff --git a/ee/spec/support/helpers/chat_qa_evaluation_helpers.rb b/ee/spec/support/helpers/duo_chat_qa_evaluation_helpers.rb similarity index 65% rename from ee/spec/support/helpers/chat_qa_evaluation_helpers.rb rename to ee/spec/support/helpers/duo_chat_qa_evaluation_helpers.rb index c88f1d54012fd81363f1bbf196bc9af4703850e4..eab1a2478601722611d22870934c8c0e64dba68d 100644 --- a/ee/spec/support/helpers/chat_qa_evaluation_helpers.rb +++ b/ee/spec/support/helpers/duo_chat_qa_evaluation_helpers.rb @@ -1,8 +1,10 @@ # frozen_string_literal: true -module ChatQaEvaluationHelpers +module DuoChatQaEvaluationHelpers TMP_REPORT_PATH = "tmp/duo_chat" ANTHROPIC_TIMEOUT = 50.seconds + NUM_THREADS = 10 # Arbitrarily chosen. Adjust as needed. 
+ THREAD_START_DELAY = 2 PROMPT = <<~PROMPT @@ -41,12 +43,20 @@ module ChatQaEvaluationHelpers Assistant: PROMPT - def evaluate_without_reference(user, resource, question, context) - response = chat(user, resource, { content: question, cache_response: false, request_id: "12345" }) + # This method runs the given question through the GitLab Duo Chat service + # then asks LLMs (Claude and Vertex as of now) to grade GitLab Duo Chat's response using the given context. + # + # @param user [User] The current user authorized to read `issuable` and use GitLab Duo Chat + # @param issuable [Issue, Epic] The issuable to be used as GitLab Duo Chat's context + # @param question [String] The question the user is asking to GitLab Duo Chat + # @param context [String] The context that will be used by the LLMs during evaluation. + # The context will usually be a JSON serialization of the issuable being asked about. + def evaluate_without_reference(user, issuable:, question:, context:) + response = chat(user, issuable, { content: question, cache_response: false, request_id: SecureRandom.uuid }) result = { question: question, - resource: resource.to_reference(full: true), + resource: issuable.to_reference(full: true), answer: response[:response_modifier].response_body, tools_used: response[:tools_used], evaluations: [] @@ -61,13 +71,42 @@ def evaluate_without_reference(user, resource, question, context) result[:evaluations].push(evaluate_with_claude(user, test_prompt)) result[:evaluations].push(evaluate_with_vertex(user, test_prompt)) - print_evaluation(result) - save_evaluation(result) - result end - def save_evaluation(result) + def batch_evaluate + test_results = Queue.new + test_queue = Queue.new + test_cases.each { |test_case| test_queue << test_case } + test_queue.close # A closed, drained queue makes `pop` return nil instead of blocking. + + (1..NUM_THREADS).map do |_| + sleep(THREAD_START_DELAY) # Do not start all threads immediately. + + Thread.new do + while (test_case = test_queue.pop) 
+ resource = test_case[:issuable].to_reference(full: true) + question = test_case[:question] + puts "Sending the evaluation request for '#{question}' with (#{resource})" + + Sidekiq::Worker.skipping_transaction_check do + Sidekiq::Testing.fake! do + test_results << evaluate_without_reference(user, **test_case) + end + rescue Net::ReadTimeout => _error + # A few requests may fail after exceeding the timeout threshold. Ignore them. + end + end + end + end.each(&:join) + + test_results = Array.new(test_results.size) { test_results.pop } + save_evaluations(test_results) + + test_results + end + + def save_evaluations(result) save_path = File.join(ENV.fetch('CI_PROJECT_DIR', ''), TMP_REPORT_PATH) file_path = File.join(save_path, "qa_#{Time.current.to_i}.json") FileUtils.mkdir_p(File.dirname(file_path)) diff --git a/ee/spec/support/shared_contexts/duo_chat_evaluation_shared_context.rb b/ee/spec/support/shared_contexts/duo_chat_evaluation_shared_context.rb deleted file mode 100644 index c06a46fb022e24e25b355b10a49865d1fd7d0abf..0000000000000000000000000000000000000000 --- a/ee/spec/support/shared_contexts/duo_chat_evaluation_shared_context.rb +++ /dev/null @@ -1,25 +0,0 @@ -# frozen_string_literal: true - -RSpec.shared_context 'with sample production epics and issues' do - include DuoChatFixtureHelpers - - let_it_be(:epics) { load_fixture('epics').map(&:deep_symbolize_keys) } - let_it_be(:issues) { load_fixture('issues').map(&:deep_symbolize_keys) } - - before do - # Note: In SaaS simulation mode, - # the url must be `https://gitlab.com` but the routing helper returns `localhost` - # and breaks GitLab ReferenceExtractor - stub_default_url_options(host: "gitlab.com", protocol: "https") - - # link_reference_pattern is memoized for Issue - # and stubbed url (gitlab.com) is not used to derive the link reference pattern. 
- Issue.instance_variable_set(:@link_reference_pattern, nil) - end - - before_all do - (epics + issues).each { |issuable| create_users(issuable) } - epics.each { |epic| restore_epic(epic) } - issues.each { |issue| restore_issue(issue) } - end -end diff --git a/ee/spec/support_specs/helpers/chat_qa_evaluation_helpers_spec.rb b/ee/spec/support_specs/helpers/chat_qa_evaluation_helpers_spec.rb deleted file mode 100644 index 6e9951c3f37dbd18b1fb17a08e519daeccc31c2b..0000000000000000000000000000000000000000 --- a/ee/spec/support_specs/helpers/chat_qa_evaluation_helpers_spec.rb +++ /dev/null @@ -1,62 +0,0 @@ -# frozen_string_literal: true - -require 'spec_helper' - -RSpec.describe ChatQaEvaluationHelpers, feature_category: :duo_chat do - include described_class - - describe 'evaluation without reference answer', :clean_gitlab_redis_chat, :real_ai_request, :saas do - let_it_be_with_reload(:group) { create(:group_with_plan, :public, plan: :ultimate_plan) } - let_it_be(:project) { create(:project, :repository, group: group) } - let_it_be(:user) { create(:user) } - let_it_be(:issue) do - create(:issue, project: project, title: 'A testing issue for AI reliability', - description: 'This issue is about evaluating reliability of various AI providers.') - end - - let(:question) { "Summarize this issue" } - - before_all do - group.add_owner(user) - end - - before do - stub_licensed_features(ai_features: true) - stub_ee_application_setting(should_check_namespace_plan: true) - - group.namespace_settings.update!( - experiment_features_enabled: true - ) - - stub_licensed_features(ai_tanuki_bot: true) - end - - context 'when the qa evaluation helper is fed the correct issue data' do - it 'evaluates as correct' do - result = evaluate_without_reference(user, issue, question, issue.to_json) - - result[:evaluations].each do |eval| - expect(eval[:response]).to match(/Grade: CORRECT/i) - end - - expect(result[:tools_used]).to match([Gitlab::Llm::Chain::Tools::IssueIdentifier::Executor, - 
Gitlab::Llm::Chain::Tools::JsonReader::Executor]) - end - end - - context 'when the qa evaluation helper is fed an incorrect issue data' do - # Duo chat answers the question based on `issue` - # The evaluator's given the context `issue` with different title and description - it 'evaluates as incorrect' do - modified_issue_context = issue.attributes - modified_issue_context["title"] = "Cloud provider's reliability" - modified_issue_context["description"] = 'This issue is about the reliability of various cloud providers.' - - evaluations = evaluate_without_reference(user, issue, question, modified_issue_context.to_json)[:evaluations] - evaluations.each do |eval| - expect(eval[:response]).to match(/Grade: INCORRECT/i) - end - end - end - end -end diff --git a/scripts/duo_chat/reporter.rb b/scripts/duo_chat/reporter.rb index 686a49164a707e6369ba6bba54fa04b9f73b3ca7..e72a393694f10534dd86bbd710bfd471f7e99fbe 100755 --- a/scripts/duo_chat/reporter.rb +++ b/scripts/duo_chat/reporter.rb @@ -5,7 +5,7 @@ require 'json' class Reporter - IDENTIFIABLE_NOTE_TAG = 'gitlab-org/ai-powered/ai-framework:duo-chat-qa-evaluation-' + IDENTIFIABLE_NOTE_TAG = 'gitlab-org/ai-powered/ai-framework:duo-chat-qa-evaluation' GRADE_TO_EMOJI_MAPPING = { correct: ":white_check_mark:", @@ -25,7 +25,7 @@ def run .merge_request_notes(ci_project_id, merge_request_iid) .auto_paginate .select do |note| - note.body.include? note_identifier_tag + note.body.include? 
IDENTIFIABLE_NOTE_TAG end note = report_notes.max_by { |note| Time.parse(note.created_at) } @@ -47,17 +47,13 @@ def run private def report_filename - "#{ENV['DUO_RSPEC']}.md" + ENV['QA_EVAL_REPORT_FILENAME'] end def artifact_path File.join(ENV['CI_PROJECT_DIR'], report_filename) end - def note_identifier_tag - "#{IDENTIFIABLE_NOTE_TAG}#{ENV['DUO_RSPEC']}" - end - def com_gitlab_client @com_gitlab_client ||= Gitlab.client( endpoint: "https://gitlab.com/api/v4", @@ -67,7 +63,7 @@ def com_gitlab_client def report_note report = <<~MARKDOWN - <!-- #{note_identifier_tag} --> + <!-- #{IDENTIFIABLE_NOTE_TAG} --> ## GitLab Duo Chat QA evaluation @@ -105,7 +101,7 @@ def report_note if report.length > 1000000 return <<~MARKDOWN - <!-- #{note_identifier_tag} --> + <!-- #{IDENTIFIABLE_NOTE_TAG} --> ## GitLab Duo Chat QA evaluation @@ -125,7 +121,7 @@ def report_note def report_data @report_data ||= Dir[File.join(ENV['CI_PROJECT_DIR'], "tmp/duo_chat/qa*.json")] - .map { |file| JSON.parse(File.read(file)) } + .flat_map { |file| JSON.parse(File.read(file)) } end def eval_content