From bbcdfe3d75c4328fed11f4fd6544e24980723565 Mon Sep 17 00:00:00 2001
From: Brett Walker <bwalker@gitlab.com>
Date: Wed, 25 Jan 2023 14:23:35 -0600
Subject: [PATCH] Remove escaped spans when absolutely not needed

Fixes a situation where a user name ending with `_`
and refernced with `\_` during autocomplete
would not be properly detected.

Changelog: fixed
---
 .../output_example_snapshots/html.yml         | 22 +++----
 .../filter/markdown_post_escape_filter.rb     | 65 +++++++++++++++----
 lib/banzai/filter/sanitization_filter.rb      |  1 +
 .../lib/banzai/pipeline/full_pipeline_spec.rb | 36 +++++++++-
 .../pipeline/plain_markdown_pipeline_spec.rb  | 13 ++--
 5 files changed, 108 insertions(+), 29 deletions(-)

diff --git a/glfm_specification/output_example_snapshots/html.yml b/glfm_specification/output_example_snapshots/html.yml
index f82e9d18150fa..cc5892f6f3bd0 100644
--- a/glfm_specification/output_example_snapshots/html.yml
+++ b/glfm_specification/output_example_snapshots/html.yml
@@ -423,7 +423,7 @@
   canonical: |
     <p>## foo</p>
   static: |-
-    <p data-sourcepos="1:1-1:28" dir="auto"><span>#</span># foo</p>
+    <p data-sourcepos="1:1-1:28" dir="auto"><span data-escaped-char>#</span># foo</p>
   wysiwyg: |-
     <p>## foo</p>
 04_02_00__leaf_blocks__atx_headings__005:
@@ -534,11 +534,11 @@
     <h1>foo #</h1>
   static: |-
     <h3 data-sourcepos="1:1-1:33" dir="auto">
-    <a id="user-content-foo-" class="anchor" href="#foo-" aria-hidden="true"></a>foo <span>#</span>##</h3>
+    <a id="user-content-foo-" class="anchor" href="#foo-" aria-hidden="true"></a>foo <span data-escaped-char>#</span>##</h3>
     <h2 data-sourcepos="2:1-2:32" dir="auto">
-    <a id="user-content-foo--1" class="anchor" href="#foo--1" aria-hidden="true"></a>foo #<span>#</span>#</h2>
+    <a id="user-content-foo--1" class="anchor" href="#foo--1" aria-hidden="true"></a>foo #<span data-escaped-char>#</span>#</h2>
     <h1 data-sourcepos="3:1-3:29" dir="auto">
-    <a id="user-content-foo--2" class="anchor" href="#foo--2" aria-hidden="true"></a>foo <span>#</span>
+    <a id="user-content-foo--2" class="anchor" href="#foo--2" aria-hidden="true"></a>foo <span data-escaped-char>#</span>
     </h1>
   wysiwyg: |-
     <h3>foo ###</h3>
@@ -4785,7 +4785,7 @@
   canonical: |
     <p>!&quot;#$%&amp;'()*+,-./:;&lt;=&gt;?@[\]^_`{|}~</p>
   static: |-
-    <p data-sourcepos="1:1-1:295" dir="auto"><span>!</span>"<span>#</span><span>$</span><span>%</span><span>&amp;</span>'()*+,-./:;&lt;=&gt;?<span>@</span>[\]<span>^</span><span>_</span>`<span>{</span>|<span>}</span><span>~</span></p>
+    <p data-sourcepos="1:1-1:295" dir="auto"><span data-escaped-char>!</span>"<span data-escaped-char>#</span><span data-escaped-char>$</span><span data-escaped-char>%</span><span data-escaped-char>&amp;</span>'()*+,-./:;&lt;=&gt;?<span data-escaped-char>@</span>[\]<span data-escaped-char>^</span>_`{|}<span data-escaped-char>~</span></p>
   wysiwyg: |-
     <p>!"#$%&amp;'()*+,-./:;&lt;=&gt;?@[\]^_`{|}~</p>
 06_02_00__inlines__backslash_escapes__002:
@@ -4810,9 +4810,9 @@
     `not code`
     1. not a list
     * not a list
-    <span>#</span> not a heading
+    <span data-escaped-char>#</span> not a heading
     [foo]: /url "not a reference"
-    <span>&amp;</span>ouml; not a character entity</p>
+    <span data-escaped-char>&amp;</span>ouml; not a character entity</p>
   wysiwyg: |-
     <p>*not emphasized*
     &lt;br/&gt; not a tag
@@ -5929,7 +5929,7 @@
   canonical: |
     <p>foo <em>_</em></p>
   static: |-
-    <p data-sourcepos="1:1-1:29" dir="auto">foo <em><span>_</span></em></p>
+    <p data-sourcepos="1:1-1:29" dir="auto">foo <em>_</em></p>
   wysiwyg: |-
     <p>foo <em>_</em></p>
 06_05_00__inlines__emphasis_and_strong_emphasis__100:
@@ -5950,7 +5950,7 @@
   canonical: |
     <p>foo <strong>_</strong></p>
   static: |-
-    <p data-sourcepos="1:1-1:31" dir="auto">foo <strong><span>_</span></strong></p>
+    <p data-sourcepos="1:1-1:31" dir="auto">foo <strong>_</strong></p>
   wysiwyg: |-
     <p>foo <strong>_</strong></p>
 06_05_00__inlines__emphasis_and_strong_emphasis__103:
@@ -6639,7 +6639,7 @@
   canonical: |
     <p>[bar][foo!]</p>
   static: |-
-    <p data-sourcepos="1:1-1:33" dir="auto">[bar][foo<span>!</span>]</p>
+    <p data-sourcepos="1:1-1:33" dir="auto">[bar][foo<span data-escaped-char>!</span>]</p>
   wysiwyg: |-
     <p>[bar][foo!]</p>
     <pre>[foo!]: /url</pre>
@@ -7043,7 +7043,7 @@
   canonical: |
     <p>!<a href="/url" title="title">foo</a></p>
   static: |-
-    <p data-sourcepos="1:1-1:28" dir="auto"><span>!</span><a href="/url" title="title">foo</a></p>
+    <p data-sourcepos="1:1-1:28" dir="auto"><span data-escaped-char>!</span><a href="/url" title="title">foo</a></p>
   wysiwyg: |-
     <p>!<a target="_blank" rel="noopener noreferrer nofollow" href="/url" title="title">foo</a></p>
     <pre>[foo]: /url "title"</pre>
diff --git a/lib/banzai/filter/markdown_post_escape_filter.rb b/lib/banzai/filter/markdown_post_escape_filter.rb
index 8c0bd62f80af9..4d37fba33aa1a 100644
--- a/lib/banzai/filter/markdown_post_escape_filter.rb
+++ b/lib/banzai/filter/markdown_post_escape_filter.rb
@@ -7,11 +7,11 @@ class MarkdownPostEscapeFilter < HTML::Pipeline::Filter
       LITERAL_KEYWORD   = MarkdownPreEscapeFilter::LITERAL_KEYWORD
       LITERAL_REGEX     = %r{#{LITERAL_KEYWORD}-(.*?)-#{LITERAL_KEYWORD}}.freeze
       NOT_LITERAL_REGEX = %r{#{LITERAL_KEYWORD}-((%5C|\\).+?)-#{LITERAL_KEYWORD}}.freeze
-      SPAN_REGEX        = %r{<span>(.*?)</span>}.freeze
+      SPAN_REGEX        = %r{<span data-escaped-char>(.*?)</span>}.freeze
 
-      XPATH_A         = Gitlab::Utils::Nokogiri.css_to_xpath('a').freeze
-      XPATH_LANG_TAG  = Gitlab::Utils::Nokogiri.css_to_xpath('pre').freeze
-      XPATH_CODE_SPAN = Gitlab::Utils::Nokogiri.css_to_xpath('code > span').freeze
+      XPATH_A            = Gitlab::Utils::Nokogiri.css_to_xpath('a').freeze
+      XPATH_LANG_TAG     = Gitlab::Utils::Nokogiri.css_to_xpath('pre').freeze
+      XPATH_ESCAPED_CHAR = Gitlab::Utils::Nokogiri.css_to_xpath('span[data-escaped-char]').freeze
 
       def call
         return doc unless result[:escaped_literals]
@@ -22,7 +22,7 @@ def call
         @doc = parse_html(new_html)
 
         remove_spans_in_certain_attributes
-        remove_spans_in_code
+        remove_unnecessary_escapes
 
         doc
       end
@@ -57,7 +57,7 @@ def add_spans(html)
           escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:token] == last_match_token }
           escaped_char = escaped_item ? escaped_item[:char] : ::Regexp.last_match(1)
 
-          "<span>#{escaped_char}</span>"
+          "<span data-escaped-char>#{escaped_char}</span>"
         end
 
         html
@@ -75,14 +75,55 @@ def remove_spans_in_certain_attributes
         end
       end
 
-      # Any `<span>` that makes it into a `<code>` element is from the math processing,
-      # convert back to the escaped character, such as `\$`
-      def remove_spans_in_code
-        doc.xpath(XPATH_CODE_SPAN).each do |node|
-          escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:char] == node.content && item[:latex] }
+      def remove_unnecessary_escapes
+        doc.xpath(XPATH_ESCAPED_CHAR).each do |node|
+          escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:char] == node.content }
+
+          next unless escaped_item
+
+          if node.parent.name == 'code'
+            # For any `data-escaped-char` that makes it into a `<code>` element,
+            # convert back to the escaped character, such as `\$`. Usually this would
+            # only happen for dollar math
+            content = +escaped_item[:escaped]
+          elsif escaped_item[:latex] && !escaped_item[:reference]
+            # Character only used in latex, since it's outside of a code block we can
+            # transform into the regular character
+            content = +escaped_item[:char]
+          else
+            # Escaped reference character, so leave as is. This is so that our normal
+            # reference processing can be short-circuited by escaping the reference,
+            # like \@username
+            next
+          end
+
+          merge_adjacent_text_nodes(node, content)
+        end
+      end
+
+      def text_node?(node)
+        node.is_a?(Nokogiri::XML::Text)
+      end
 
-          node.replace(escaped_item[:escaped]) if escaped_item
+      # Merge directly adjacent text nodes and replace existing node with
+      # the merged content. For example, the document could be
+      #   #(Text "~c_bug"), #(Element:0x57724 { name = "span" }, children = [ #(Text "_")] })]
+      # Our reference processing requires a single string of text to match against. So even if it was
+      #   #(Text "~c_bug"), #(Text "_")
+      # it wouldn't match.  Merging together will give
+      #   #(Text "~c_bug_")
+      def merge_adjacent_text_nodes(node, content)
+        if text_node?(node.previous)
+          content.prepend(node.previous.content)
+          node.previous.remove
         end
+
+        if text_node?(node.next)
+          content.concat(node.next.content)
+          node.next.remove
+        end
+
+        node.replace(content)
       end
     end
   end
diff --git a/lib/banzai/filter/sanitization_filter.rb b/lib/banzai/filter/sanitization_filter.rb
index de9b846efe9be..b119c2ffccf5a 100644
--- a/lib/banzai/filter/sanitization_filter.rb
+++ b/lib/banzai/filter/sanitization_filter.rb
@@ -20,6 +20,7 @@ def customize_allowlist(allowlist)
 
         # Allow the 'data-sourcepos' from CommonMark on all elements
         allowlist[:attributes][:all].push('data-sourcepos')
+        allowlist[:attributes][:all].push('data-escaped-char')
 
         # Remove any `style` properties not required for table alignment
         allowlist[:transformers].push(self.class.remove_unsafe_table_style)
diff --git a/spec/lib/banzai/pipeline/full_pipeline_spec.rb b/spec/lib/banzai/pipeline/full_pipeline_spec.rb
index c1d5f16b5621b..ca05a353d47a9 100644
--- a/spec/lib/banzai/pipeline/full_pipeline_spec.rb
+++ b/spec/lib/banzai/pipeline/full_pipeline_spec.rb
@@ -3,6 +3,8 @@
 require 'spec_helper'
 
 RSpec.describe Banzai::Pipeline::FullPipeline, feature_category: :team_planning do
+  using RSpec::Parameterized::TableSyntax
+
   describe 'References' do
     let(:project) { create(:project, :public) }
     let(:issue)   { create(:issue, project: project) }
@@ -157,14 +159,44 @@
       markdown = "\\#{issue.to_reference}"
       output = described_class.to_html(markdown, project: project)
 
-      expect(output).to include("<span>#</span>#{issue.iid}")
+      expect(output).to include("<span data-escaped-char>#</span>#{issue.iid}")
     end
 
     it 'converts user reference with escaped underscore because of italics' do
       markdown = '_@test\__'
       output = described_class.to_html(markdown, project: project)
 
-      expect(output).to include('<em>@test<span>_</span></em>')
+      expect(output).to include('<em>@test_</em>')
+    end
+
+    context 'when a reference (such as a label name) is autocompleted with characters that require escaping' do
+      # Labels are fairly representative of the type of characters that can be in a reference
+      # and aligns with the testing in spec/frontend/gfm_auto_complete_spec.js
+      where(:valid, :label_name, :markdown) do
+        # These are currently not supported
+        # true   | 'a~bug'      | '~"a\~bug"'
+        # true   | 'b~~bug~~'   | '~"b\~\~bug\~\~"'
+
+        true   | 'c_bug_'     | '~c_bug\_'
+        true   | 'c_bug_'     | 'Label ~c_bug\_ and _more_ text'
+        true   | 'd _bug_'    | '~"d \_bug\_"'
+        true   | 'e*bug*'     | '~"e\*bug\*"'
+        true   | 'f *bug*'    | '~"f \*bug\*"'
+        true   | 'f *bug*'    | 'Label ~"f \*bug\*" **with** more text'
+        true   | 'g`bug`'     | '~"g\`bug\`" '
+        true   | 'h `bug`'    | '~"h \`bug\`"'
+      end
+
+      with_them do
+        it 'detects valid escaped reference' do
+          create(:label, name: label_name, project: project)
+
+          result = Banzai::Pipeline::FullPipeline.call(markdown, project: project)
+
+          expect(result[:output].css('a').first.attr('class')).to eq 'gfm gfm-label has-tooltip gl-link gl-label-link'
+          expect(result[:output].css('a').first.content).to eq label_name
+        end
+      end
     end
   end
 
diff --git a/spec/lib/banzai/pipeline/plain_markdown_pipeline_spec.rb b/spec/lib/banzai/pipeline/plain_markdown_pipeline_spec.rb
index 0e4a4e4492e85..e7c15ed9cf6c2 100644
--- a/spec/lib/banzai/pipeline/plain_markdown_pipeline_spec.rb
+++ b/spec/lib/banzai/pipeline/plain_markdown_pipeline_spec.rb
@@ -15,10 +15,15 @@
       result = described_class.call(markdown, project: project)
       output = result[:output].to_html
 
-      Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.pluck(:char).each do |char|
-        char = '&amp;' if char == '&'
-
-        expect(output).to include("<span>#{char}</span>")
+      Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.each do |item|
+        char = item[:char] == '&' ? '&amp;' : item[:char]
+
+        if item[:reference]
+          expect(output).to include("<span data-escaped-char>#{char}</span>")
+        else
+          expect(output).not_to include("<span data-escaped-char>#{char}</span>")
+          expect(output).to include(char)
+        end
       end
 
       expect(result[:escaped_literals]).to be_truthy
-- 
GitLab