From bbcdfe3d75c4328fed11f4fd6544e24980723565 Mon Sep 17 00:00:00 2001 From: Brett Walker <bwalker@gitlab.com> Date: Wed, 25 Jan 2023 14:23:35 -0600 Subject: [PATCH] Remove escaped spans when absolutely not needed Fixes a situation where a user name ending with `_` and refernced with `\_` during autocomplete would not be properly detected. Changelog: fixed --- .../output_example_snapshots/html.yml | 22 +++---- .../filter/markdown_post_escape_filter.rb | 65 +++++++++++++++---- lib/banzai/filter/sanitization_filter.rb | 1 + .../lib/banzai/pipeline/full_pipeline_spec.rb | 36 +++++++++- .../pipeline/plain_markdown_pipeline_spec.rb | 13 ++-- 5 files changed, 108 insertions(+), 29 deletions(-) diff --git a/glfm_specification/output_example_snapshots/html.yml b/glfm_specification/output_example_snapshots/html.yml index f82e9d18150fa..cc5892f6f3bd0 100644 --- a/glfm_specification/output_example_snapshots/html.yml +++ b/glfm_specification/output_example_snapshots/html.yml @@ -423,7 +423,7 @@ canonical: | <p>## foo</p> static: |- - <p data-sourcepos="1:1-1:28" dir="auto"><span>#</span># foo</p> + <p data-sourcepos="1:1-1:28" dir="auto"><span data-escaped-char>#</span># foo</p> wysiwyg: |- <p>## foo</p> 04_02_00__leaf_blocks__atx_headings__005: @@ -534,11 +534,11 @@ <h1>foo #</h1> static: |- <h3 data-sourcepos="1:1-1:33" dir="auto"> - <a id="user-content-foo-" class="anchor" href="#foo-" aria-hidden="true"></a>foo <span>#</span>##</h3> + <a id="user-content-foo-" class="anchor" href="#foo-" aria-hidden="true"></a>foo <span data-escaped-char>#</span>##</h3> <h2 data-sourcepos="2:1-2:32" dir="auto"> - <a id="user-content-foo--1" class="anchor" href="#foo--1" aria-hidden="true"></a>foo #<span>#</span>#</h2> + <a id="user-content-foo--1" class="anchor" href="#foo--1" aria-hidden="true"></a>foo #<span data-escaped-char>#</span>#</h2> <h1 data-sourcepos="3:1-3:29" dir="auto"> - <a id="user-content-foo--2" class="anchor" href="#foo--2" aria-hidden="true"></a>foo <span>#</span> + <a id="user-content-foo--2" class="anchor" href="#foo--2" aria-hidden="true"></a>foo <span data-escaped-char>#</span> </h1> wysiwyg: |- <h3>foo ###</h3> @@ -4785,7 +4785,7 @@ canonical: | <p>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</p> static: |- - <p data-sourcepos="1:1-1:295" dir="auto"><span>!</span>"<span>#</span><span>$</span><span>%</span><span>&</span>'()*+,-./:;<=>?<span>@</span>[\]<span>^</span><span>_</span>`<span>{</span>|<span>}</span><span>~</span></p> + <p data-sourcepos="1:1-1:295" dir="auto"><span data-escaped-char>!</span>"<span data-escaped-char>#</span><span data-escaped-char>$</span><span data-escaped-char>%</span><span data-escaped-char>&</span>'()*+,-./:;<=>?<span data-escaped-char>@</span>[\]<span data-escaped-char>^</span>_`{|}<span data-escaped-char>~</span></p> wysiwyg: |- <p>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</p> 06_02_00__inlines__backslash_escapes__002: @@ -4810,9 +4810,9 @@ `not code` 1. not a list * not a list - <span>#</span> not a heading + <span data-escaped-char>#</span> not a heading [foo]: /url "not a reference" - <span>&</span>ouml; not a character entity</p> + <span data-escaped-char>&</span>ouml; not a character entity</p> wysiwyg: |- <p>*not emphasized* <br/> not a tag @@ -5929,7 +5929,7 @@ canonical: | <p>foo <em>_</em></p> static: |- - <p data-sourcepos="1:1-1:29" dir="auto">foo <em><span>_</span></em></p> + <p data-sourcepos="1:1-1:29" dir="auto">foo <em>_</em></p> wysiwyg: |- <p>foo <em>_</em></p> 06_05_00__inlines__emphasis_and_strong_emphasis__100: @@ -5950,7 +5950,7 @@ canonical: | <p>foo <strong>_</strong></p> static: |- - <p data-sourcepos="1:1-1:31" dir="auto">foo <strong><span>_</span></strong></p> + <p data-sourcepos="1:1-1:31" dir="auto">foo <strong>_</strong></p> wysiwyg: |- <p>foo <strong>_</strong></p> 06_05_00__inlines__emphasis_and_strong_emphasis__103: @@ -6639,7 +6639,7 @@ canonical: | <p>[bar][foo!]</p> static: |- - <p data-sourcepos="1:1-1:33" dir="auto">[bar][foo<span>!</span>]</p> + <p data-sourcepos="1:1-1:33" dir="auto">[bar][foo<span data-escaped-char>!</span>]</p> wysiwyg: |- <p>[bar][foo!]</p> <pre>[foo!]: /url</pre> @@ -7043,7 +7043,7 @@ canonical: | <p>!<a href="/url" title="title">foo</a></p> static: |- - <p data-sourcepos="1:1-1:28" dir="auto"><span>!</span><a href="/url" title="title">foo</a></p> + <p data-sourcepos="1:1-1:28" dir="auto"><span data-escaped-char>!</span><a href="/url" title="title">foo</a></p> wysiwyg: |- <p>!<a target="_blank" rel="noopener noreferrer nofollow" href="/url" title="title">foo</a></p> <pre>[foo]: /url "title"</pre> diff --git a/lib/banzai/filter/markdown_post_escape_filter.rb b/lib/banzai/filter/markdown_post_escape_filter.rb index 8c0bd62f80af9..4d37fba33aa1a 100644 --- a/lib/banzai/filter/markdown_post_escape_filter.rb +++ b/lib/banzai/filter/markdown_post_escape_filter.rb @@ -7,11 +7,11 @@ class MarkdownPostEscapeFilter < HTML::Pipeline::Filter LITERAL_KEYWORD = MarkdownPreEscapeFilter::LITERAL_KEYWORD LITERAL_REGEX = %r{#{LITERAL_KEYWORD}-(.*?)-#{LITERAL_KEYWORD}}.freeze NOT_LITERAL_REGEX = %r{#{LITERAL_KEYWORD}-((%5C|\\).+?)-#{LITERAL_KEYWORD}}.freeze - SPAN_REGEX = %r{<span>(.*?)</span>}.freeze + SPAN_REGEX = %r{<span data-escaped-char>(.*?)</span>}.freeze - XPATH_A = Gitlab::Utils::Nokogiri.css_to_xpath('a').freeze - XPATH_LANG_TAG = Gitlab::Utils::Nokogiri.css_to_xpath('pre').freeze - XPATH_CODE_SPAN = Gitlab::Utils::Nokogiri.css_to_xpath('code > span').freeze + XPATH_A = Gitlab::Utils::Nokogiri.css_to_xpath('a').freeze + XPATH_LANG_TAG = Gitlab::Utils::Nokogiri.css_to_xpath('pre').freeze + XPATH_ESCAPED_CHAR = Gitlab::Utils::Nokogiri.css_to_xpath('span[data-escaped-char]').freeze def call return doc unless result[:escaped_literals] @@ -22,7 +22,7 @@ def call @doc = parse_html(new_html) remove_spans_in_certain_attributes - remove_spans_in_code + remove_unnecessary_escapes doc end @@ -57,7 +57,7 @@ def add_spans(html) escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:token] == last_match_token } escaped_char = escaped_item ? escaped_item[:char] : ::Regexp.last_match(1) - "<span>#{escaped_char}</span>" + "<span data-escaped-char>#{escaped_char}</span>" end html @@ -75,14 +75,55 @@ def remove_spans_in_certain_attributes end end - # Any `<span>` that makes it into a `<code>` element is from the math processing, - # convert back to the escaped character, such as `\$` - def remove_spans_in_code - doc.xpath(XPATH_CODE_SPAN).each do |node| - escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:char] == node.content && item[:latex] } + def remove_unnecessary_escapes + doc.xpath(XPATH_ESCAPED_CHAR).each do |node| + escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:char] == node.content } + + next unless escaped_item + + if node.parent.name == 'code' + # For any `data-escaped-char` that makes it into a `<code>` element, + # convert back to the escaped character, such as `\$`. Usually this would + # only happen for dollar math + content = +escaped_item[:escaped] + elsif escaped_item[:latex] && !escaped_item[:reference] + # Character only used in latex, since it's outside of a code block we can + # transform into the regular character + content = +escaped_item[:char] + else + # Escaped reference character, so leave as is. This is so that our normal + # reference processing can be short-circuited by escaping the reference, + # like \@username + next + end + + merge_adjacent_text_nodes(node, content) + end + end + + def text_node?(node) + node.is_a?(Nokogiri::XML::Text) + end - node.replace(escaped_item[:escaped]) if escaped_item + # Merge directly adjacent text nodes and replace existing node with + # the merged content. For example, the document could be + # #(Text "~c_bug"), #(Element:0x57724 { name = "span" }, children = [ #(Text "_")] })] + # Our reference processing requires a single string of text to match against. So even if it was + # #(Text "~c_bug"), #(Text "_") + # it wouldn't match. Merging together will give + # #(Text "~c_bug_") + def merge_adjacent_text_nodes(node, content) + if text_node?(node.previous) + content.prepend(node.previous.content) + node.previous.remove end + + if text_node?(node.next) + content.concat(node.next.content) + node.next.remove + end + + node.replace(content) end end end diff --git a/lib/banzai/filter/sanitization_filter.rb b/lib/banzai/filter/sanitization_filter.rb index de9b846efe9be..b119c2ffccf5a 100644 --- a/lib/banzai/filter/sanitization_filter.rb +++ b/lib/banzai/filter/sanitization_filter.rb @@ -20,6 +20,7 @@ def customize_allowlist(allowlist) # Allow the 'data-sourcepos' from CommonMark on all elements allowlist[:attributes][:all].push('data-sourcepos') + allowlist[:attributes][:all].push('data-escaped-char') # Remove any `style` properties not required for table alignment allowlist[:transformers].push(self.class.remove_unsafe_table_style) diff --git a/spec/lib/banzai/pipeline/full_pipeline_spec.rb b/spec/lib/banzai/pipeline/full_pipeline_spec.rb index c1d5f16b5621b..ca05a353d47a9 100644 --- a/spec/lib/banzai/pipeline/full_pipeline_spec.rb +++ b/spec/lib/banzai/pipeline/full_pipeline_spec.rb @@ -3,6 +3,8 @@ require 'spec_helper' RSpec.describe Banzai::Pipeline::FullPipeline, feature_category: :team_planning do + using RSpec::Parameterized::TableSyntax + describe 'References' do let(:project) { create(:project, :public) } let(:issue) { create(:issue, project: project) } @@ -157,14 +159,44 @@ markdown = "\\#{issue.to_reference}" output = described_class.to_html(markdown, project: project) - expect(output).to include("<span>#</span>#{issue.iid}") + expect(output).to include("<span data-escaped-char>#</span>#{issue.iid}") end it 'converts user reference with escaped underscore because of italics' do markdown = '_@test\__' output = described_class.to_html(markdown, project: project) - expect(output).to include('<em>@test<span>_</span></em>') + expect(output).to include('<em>@test_</em>') + end + + context 'when a reference (such as a label name) is autocompleted with characters that require escaping' do + # Labels are fairly representative of the type of characters that can be in a reference + # and aligns with the testing in spec/frontend/gfm_auto_complete_spec.js + where(:valid, :label_name, :markdown) do + # These are currently not supported + # true | 'a~bug' | '~"a\~bug"' + # true | 'b~~bug~~' | '~"b\~\~bug\~\~"' + + true | 'c_bug_' | '~c_bug\_' + true | 'c_bug_' | 'Label ~c_bug\_ and _more_ text' + true | 'd _bug_' | '~"d \_bug\_"' + true | 'e*bug*' | '~"e\*bug\*"' + true | 'f *bug*' | '~"f \*bug\*"' + true | 'f *bug*' | 'Label ~"f \*bug\*" **with** more text' + true | 'g`bug`' | '~"g\`bug\`" ' + true | 'h `bug`' | '~"h \`bug\`"' + end + + with_them do + it 'detects valid escaped reference' do + create(:label, name: label_name, project: project) + + result = Banzai::Pipeline::FullPipeline.call(markdown, project: project) + + expect(result[:output].css('a').first.attr('class')).to eq 'gfm gfm-label has-tooltip gl-link gl-label-link' + expect(result[:output].css('a').first.content).to eq label_name + end + end end end diff --git a/spec/lib/banzai/pipeline/plain_markdown_pipeline_spec.rb b/spec/lib/banzai/pipeline/plain_markdown_pipeline_spec.rb index 0e4a4e4492e85..e7c15ed9cf6c2 100644 --- a/spec/lib/banzai/pipeline/plain_markdown_pipeline_spec.rb +++ b/spec/lib/banzai/pipeline/plain_markdown_pipeline_spec.rb @@ -15,10 +15,15 @@ result = described_class.call(markdown, project: project) output = result[:output].to_html - Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.pluck(:char).each do |char| - char = '&' if char == '&' - - expect(output).to include("<span>#{char}</span>") + Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.each do |item| + char = item[:char] == '&' ? '&' : item[:char] + + if item[:reference] + expect(output).to include("<span data-escaped-char>#{char}</span>") + else + expect(output).not_to include("<span data-escaped-char>#{char}</span>") + expect(output).to include(char) + end end expect(result[:escaped_literals]).to be_truthy -- GitLab