Add label table

Remove <?xml> tag

Add label table
c8bf16f1 · Mark Chao · GitLab · 0bad7ee0 · c8bf16f1 · c8bf16f1
--- a/ee/lib/gitlab/llm/anthropic/completions/categorize_question.rb
+++ b/ee/lib/gitlab/llm/anthropic/completions/categorize_question.rb
@@ -5,20 +5,30 @@ module Llm
    module Anthropic
      module Completions
        class CategorizeQuestion < Gitlab::Llm::Completions::Base
-          SCHEMA_URL = 'iglu:com.gitlab/ai_question_category/jsonschema/1-0-0'
+          SCHEMA_URL = 'iglu:com.gitlab/ai_question_category/jsonschema/1-1-0'
+
+          private_class_method def self.load_xml(filename)
+            File.read(File.join(File.dirname(__FILE__), '..', '..', 'fixtures', filename)).tr("\n", '')
+          end
+
+          LLM_MATCHING_CATEGORIES_XML = load_xml('categories.xml') # mandatory category definition
+          LLM_MATCHING_LABELS_XML = load_xml('labels.xml') # boolean attribute definitions

          REQUIRED_KEYS = %w[detailed_category category].freeze
-          OPTIONAL_KEYS = [].freeze
+          OPTIONAL_KEYS = (
+            %w[language] +
+              Hash.from_xml(LLM_MATCHING_LABELS_XML)
+                  .dig('root', 'label').pluck('type') # rubocop:disable CodeReuse/ActiveRecord -- Array#pluck
+          ).freeze
          PERMITTED_KEYS = REQUIRED_KEYS + OPTIONAL_KEYS

          def execute
            @ai_client = ::Gitlab::Llm::Anthropic::Client.new(user, tracking_context: tracking_context)
-            response = response_for(user, options)
+            @storage = ::Gitlab::Llm::ChatStorage.new(user)
+            @messages = @storage.messages_up_to(options[:message_id])
            @logger = Gitlab::Llm::Logger.build

-            result = process_response(response, user)
-
-            if result
+            if track(user, attributes_from_llm)
              ResponseModifiers::CategorizeQuestion.new(nil)
            else
              ResponseModifiers::CategorizeQuestion.new(error: 'Event not tracked')
@@ -27,10 +37,7 @@ def execute

          private

-          def response_for(user, options)
-            template = ai_prompt_class.new(user, options)
-            request(template)
-          end
+          attr_reader :messages

          def request(template)
            @ai_client.complete(
@@ -38,28 +45,31 @@ def request(template)
            )&.dig("completion").to_s.strip
          end

-          def process_response(response, user)
-            json = Gitlab::Json.parse(response)
-
-            return false unless json
+          def attributes_from_llm
+            template = ai_prompt_class.new(messages, options)
+            data = Gitlab::Json.parse(request(template)) || {}

-            track(user, json)
+            # Turn array of matched label strings into boolean attributes
+            labels = data.delete('labels')
+            labels&.each { |label| data[label] = true }

+            data
          rescue JSON::ParserError
            error_message = "JSON has an invalid format."
            @logger.error(message: "Error", class: self.class.to_s, error: error_message)
-
-            false
+            {}
          end

-          def track(user, json)
-            unless contains_categories?(json)
+          def track(user, attributes)
+            return false if attributes.empty?
+
+            unless contains_categories?(attributes)
              error_message = 'Response did not contain defined categories'
              @logger.error(message: "Error", class: self.class.to_s, error: error_message)
              return false
            end

-            context = SnowplowTracker::SelfDescribingJson.new(SCHEMA_URL, json.slice(*PERMITTED_KEYS))
+            context = SnowplowTracker::SelfDescribingJson.new(SCHEMA_URL, attributes.slice(*PERMITTED_KEYS))

            Gitlab::Tracking.event(
              self.class.to_s,
@@ -70,9 +80,9 @@ def track(user, json)
            )
          end

-          def contains_categories?(json)
+          def contains_categories?(hash)
            REQUIRED_KEYS.each do |key|
-              return false unless json.has_key?(key)
+              return false unless hash.has_key?(key)
            end
          end
        end

--- a/ee/lib/gitlab/llm/fixtures/categories.xml
+++ b/ee/lib/gitlab/llm/fixtures/categories.xml
-<?xml version="1.0" encoding="UTF-8"?><root><row>
+<root><row>
  <category>Documentation about GitLab</category>

  <detailed_category>Question about GitLab Duo such as "what can you do?", "how can you help me?"</detailed_category>

--- a/ee/lib/gitlab/llm/fixtures/labels.xml
+++ b/ee/lib/gitlab/llm/fixtures/labels.xml
+<root>
+<label><type>contains_rejection</type><desc>User indicates that the answer they received is incorrect, incomplete, or unsatisfactory</desc></label>
+<label><type>contains_rejection_previous_answer_incorrect</type><desc>User indicates that the answer they received is incorrect</desc></label>
+<label><type>contains_rejection_previous_answer_incomplete</type><desc>User indicates that the answer they received is incomplete</desc></label>
+<label><type>contains_rejection_previous_answer_unsatisfactory</type><desc>User indicates that the answer they received is unsatisfactory</desc></label>
+<label><type>is_follow_up_question</type><desc>The user asks a follow-up question to seek additional information or clarification in response to a previous answer</desc></label>
+<label><type>contains_clarification</type><desc>The user seems to not have understood the previous answer and asks a follow-up question to clarify this</desc></label>
+<label><type>contains_intellectual_property</type><desc>User has input intellectual property such as copyrighted material, trademarks or company secrets</desc></label>
+<label><type>contains_credentials</type><desc>User has input credentials such as usernames, passwords, tokens, and other things that can be used to authenticate to digital systems</desc></label>
+<label><type>contains_code</type><desc>Question contains code written in a programming language</desc></label>
+<label><type>contains_personal_information</type><desc>User has input personally identifiable information (PII) such as names, email addresses, phone numbers, or credit card numbers</desc></label>
+<label><type>compares_two_things</type><desc>User has requested chat to compare two things against each other</desc></label>
+<label><type>compares_more_than_two_things</type><desc>User has requested chat to compare more than two things against each other</desc></label>
+<label><type>requests_answer_in_certain_form</type><desc>User requests the chat to return the answer in a certain form, for example: short, long, bulleted list, containing a code snippet, formal, informal</desc></label>
+<label><type>contains_request_to_format_the_answer</type><desc>User requests the chat to return the answer in a certain format, for example: XML, JSON, HTML, markdown</desc></label>
+<label><type>is_related_to_gitlab</type><desc>User's question is related to GitLab, GitLab features, or how to use GitLab</desc></label>
+<label><type>is_related_to_gitlab_data</type><desc>User's question is related to data in GitLab, such as the content of an issue, epic, code file, MR, or pipeline</desc></label>
+<label><type>is_related_to_devsecops</type><desc>User's question relates to DevOps or DevSecOps, for example continuous integration and continuous deployment (CI/CD) pipelines, security testing tools, code scanning and review, threat modeling, security training for development teams, and automated compliance checks</desc></label>
+<label><type>is_poorly_formulated</type><desc>The user has composed a question that is poorly formulated and/or ambiguous</desc></label>
+</root>
--- a/ee/lib/gitlab/llm/templates/categorize_question.rb
+++ b/ee/lib/gitlab/llm/templates/categorize_question.rb
@@ -6,41 +6,48 @@ module Templates
      class CategorizeQuestion
        include Gitlab::Utils::StrongMemoize

-        def initialize(user, params = {})
-          @user = user
-          @params = params
-        end
+        PROMPT = ERB.new(<<~PROMPT)
+          \n\nHuman: You are helpful assistant, ready to give as accurate answer as possible in JSON format.

-        def to_prompt
-          prompt = <<~PROMPT
-            \n\nHuman: You are helpful assistant, ready to give as accurate answer as possible in JSON format.
+          Based on the information below (user input, <% if previous_answer %>previous answer, <% end %>categories, labels, language), classify user input's category, detailed_category, labels. There may be multiple labels. Don't provide clarification or explanation. Always return only a JSON hash, e.g.:
+          <example>{"category": "Write, improve, or explain code", "detailed_category": "What are the potential security risks in this code?", "labels": ["contains_credentials", "contains_rejection_previous_answer_incorrect"], "language": "en"}</example>
+          <example>{"category": "Documentation about GitLab", "detailed_category": "Documentation about GitLab", "labels": [], "language": "ja"}</example>

-            Given categories below (formatted with XML) return category and detailed_category of question below. Question is prefixed by "q".
+          <% if previous_answer %>
+          Previous answer:
+          <answer><%= previous_answer %></answer>
+          <% end %>

-            Categories XML:
-            %<categories>s
+          User input:
+          <input><%= question %></input>

-            q: %<question>s
+          Categories:
+          <%= ::Gitlab::Llm::Anthropic::Completions::CategorizeQuestion::LLM_MATCHING_CATEGORIES_XML %>

-            Return category and detailed category, always using JSON format. Example of said JSON:
-            "{"category": "Write, improve, or explain code", "detailed_category": "What are the potential security risks in this code?" }".
+          Labels:
+          <%= ::Gitlab::Llm::Anthropic::Completions::CategorizeQuestion::LLM_MATCHING_LABELS_XML %>

-            Always return only JSON structure.
+          Assistant:
+        PROMPT

-            Assistant:
-            JSON:
-          PROMPT
+        def initialize(messages, params = {})
+          @messages = messages
+          @params = params
+        end

-          format(prompt, question: params[:question], categories: categories_parsed_file)
+        def to_prompt
+          previous_message = messages[-2]
+          previous_answer = previous_message&.assistant? ? previous_message.content : nil
+
+          PROMPT.result_with_hash(
+            question: params[:question],
+            previous_answer: previous_answer
+          )
        end

        private

-        attr_reader :user, :params
-
-        def categories_parsed_file
-          File.read(File.join(File.dirname(__FILE__), '..', 'fixtures', 'categories.xml'))
-        end
+        attr_reader :params, :messages
      end
    end
  end

--- a/ee/spec/lib/gitlab/llm/anthropic/completions/categorize_question_spec.rb
+++ b/ee/spec/lib/gitlab/llm/anthropic/completions/categorize_question_spec.rb
@@ -6,27 +6,40 @@
  describe '#execute' do
    let(:user) { build(:user) }
    let(:ai_client) { ::Gitlab::Llm::Anthropic::Client.new(nil) }
-    let(:response) {  { 'completion' => answer.to_s } }
+    let(:response) {  { 'completion' => llm_analysis_response.to_s } }
+    let(:llm_analysis_response) do
+      {
+        detailed_category: "Summarize issue",
+        category: 'Summarize something',
+        labels: %w[contains_code is_related_to_gitlab],
+        language: 'en',
+        extra: 'foo'
+      }.to_json
+    end

    let(:prompt_message) do
      build(:ai_message, :categorize_question, user: user, resource: user, request_id: 'uuid')
    end

-    let(:options) { { question: 'What is the pipeline?' } }
+    let(:message_id) { '<message_id>' }
+    let(:options) { { question: 'What is the pipeline?', message_id: message_id } }
+    let(:template_class) { ::Gitlab::Llm::Templates::CategorizeQuestion }
+    let(:prompt) { '<prompt>' }

    subject(:categorize_action) do
-      described_class.new(prompt_message, ::Gitlab::Llm::Templates::CategorizeQuestion, **options).execute
+      described_class.new(prompt_message, template_class, **options).execute
    end

    before do
+      allow_next_instance_of(template_class) do |template|
+        allow(template).to receive(:to_prompt).and_return(prompt)
+      end
      allow_next_instance_of(::Gitlab::Llm::Anthropic::Client) do |ai_client|
-        allow(ai_client).to receive(:complete).and_return(response)
+        allow(ai_client).to receive(:complete).with(prompt: prompt).and_return(response)
      end
    end

    context 'with valid response' do
-      let(:answer) { { detailed_category: "Summarize issue", category: 'Summarize something' }.to_json }
-
      it 'tracks event' do
        expect(categorize_action.errors).to be_empty

@@ -37,14 +50,20 @@
          user: user,
          context: [{
            schema: described_class::SCHEMA_URL,
-            data: { 'detailed_category' => "Summarize issue", 'category' => 'Summarize something' }
+            data: {
+              'detailed_category' => "Summarize issue",
+              'category' => 'Summarize something',
+              'contains_code' => true,
+              "is_related_to_gitlab" => true,
+              'language' => 'en'
+            }
          }]
        )
      end
    end

    context 'with incomplete response' do
-      let(:answer) { { category: 'Summarize something' }.to_json }
+      let(:llm_analysis_response) { { category: 'Summarize something' }.to_json }

      it 'does not track event' do
        expect(categorize_action.errors).to include('Event not tracked')
@@ -54,13 +73,13 @@
          action: 'ai_question_category',
          property: 'uuid',
          user: user,
-          context: []
+          context: anything
        )
      end
    end

    context 'with invalid response' do
-      let(:answer) { "invalid" }
+      let(:llm_analysis_response) { "invalid" }

      it 'does not track event' do
        expect(categorize_action.errors).to include('Event not tracked')
@@ -70,7 +89,7 @@
          action: 'ai_question_category',
          property: 'uuid',
          user: user,
-          context: []
+          context: anything
        )
      end
    end

--- a/ee/spec/lib/gitlab/llm/templates/categorize_question_spec.rb
+++ b/ee/spec/lib/gitlab/llm/templates/categorize_question_spec.rb
@@ -3,10 +3,10 @@
 require 'spec_helper'

 RSpec.describe Gitlab::Llm::Templates::CategorizeQuestion, feature_category: :duo_chat do
-  let(:user) { build(:user) }
+  let(:messages) { [] }
  let(:question) { 'what is the issue' }

-  subject { described_class.new(user, { question: question }) }
+  subject { described_class.new(messages, { question: question }) }

  describe '#to_prompt' do
    it 'includes question' do
@@ -15,10 +15,34 @@
      expect(prompt).to include(question)
    end

-    it 'includes xml part' do
+    it 'includes xmls' do
      prompt = subject.to_prompt

-      expect(prompt).to include('<?xml version="1.0" encoding="UTF-8"?><root><row>')
+      expect(prompt).to include("Categories:\n<root>")
+      expect(prompt).to include("Labels:\n<root>")
+    end
+
+    context 'when previous answer is absent' do
+      it 'does not include previous answer' do
+        prompt = subject.to_prompt
+
+        expect(prompt).not_to include("Previous answer:\n<answer>")
+      end
+    end
+
+    context 'when previous answer is present' do
+      let(:messages) do
+        [
+          instance_double(Gitlab::Llm::ChatMessage, assistant?: true, content: '<LLM answer>'),
+          instance_double(Gitlab::Llm::ChatMessage, assistant?: false, content: '<user input>')
+        ]
+      end
+
+      it 'includes previous answer' do
+        prompt = subject.to_prompt
+
+        expect(prompt).to include("Previous answer:\n<answer>")
+      end
    end
  end
 end