From dd35c3ddf6dce7a69cc116fe6165dad68b8e9251 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Tue, 2 Aug 2016 17:51:17 +0200
Subject: [PATCH] Improve AutolinkFilter#text_parse performance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By using clever XPath queries we can quite significantly improve the
performance of this method. The actual improvement depends a bit on the
amount of links used but in my tests the new implementation is usually
around 8 times faster than the old one. This was measured using the
following benchmark:

    require 'benchmark/ips'

    text = '<p>' + Note.select("string_agg(note, '') AS note").limit(50).take[:note] + '</p>'
    document = Nokogiri::HTML.fragment(text)
    filter = Banzai::Filter::AutolinkFilter.new(document, autolink: true)

    puts "Input size: #{(text.bytesize.to_f / 1024 / 1024).round(2)} MB"

    filter.rinku_parse

    Benchmark.ips(time: 15) do |bench|
      bench.report 'text_parse' do
        filter.text_parse
      end

      bench.report 'text_parse_fast' do
        filter.text_parse_fast
      end

      bench.compare!
    end

Here the "text_parse_fast" method is the new implementation and
"text_parse" the old one. The input size was around 180 MB. Running this
benchmark outputs the following:

    Input size: 181.16 MB
    Calculating -------------------------------------
              text_parse     1.000  i/100ms
         text_parse_fast     9.000  i/100ms
    -------------------------------------------------
              text_parse     13.021  (±15.4%) i/s -    188.000
         text_parse_fast    112.741  (± 3.5%) i/s -      1.692k

    Comparison:
         text_parse_fast:      112.7 i/s
              text_parse:       13.0 i/s - 8.66x slower

Again the production timings may (and most likely will) vary depending
on the input being processed.
---
 CHANGELOG                            |  1 +
 lib/banzai/filter/autolink_filter.rb | 15 +++++++++------
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index c099c63ce86dd..47e959dea6871 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -11,6 +11,7 @@ v 8.11.0 (unreleased)
   - Add support for using RequestStore within Sidekiq tasks via SIDEKIQ_REQUEST_STORE env variable
   - Optimize maximum user access level lookup in loading of notes
   - Add "No one can push" as an option for protected branches. !5081
+  - Improve performance of AutolinkFilter#text_parse by using XPath
   - Environments have an url to link to
   - Limit git rev-list output count to one in forced push check
   - Clean up unused routes (Josef Strzibny)
diff --git a/lib/banzai/filter/autolink_filter.rb b/lib/banzai/filter/autolink_filter.rb
index 9ed45707515c1..799b83b106936 100644
--- a/lib/banzai/filter/autolink_filter.rb
+++ b/lib/banzai/filter/autolink_filter.rb
@@ -31,6 +31,14 @@ class AutolinkFilter < HTML::Pipeline::Filter
       # Text matching LINK_PATTERN inside these elements will not be linked
       IGNORE_PARENTS = %w(a code kbd pre script style).to_set
 
+      # The XPath query to use for finding text nodes to parse.
+      TEXT_QUERY = %Q(descendant-or-self::text()[
+        not(#{IGNORE_PARENTS.map { |p| "ancestor::#{p}" }.join(' or ')})
+        and contains(., '://')
+        and not(starts-with(., 'http'))
+        and not(starts-with(., 'ftp'))
+      ])
+
       def call
         return doc if context[:autolink] == false
 
@@ -66,16 +74,11 @@ def rinku_parse
       # Autolinks any text matching LINK_PATTERN that Rinku didn't already
       # replace
       def text_parse
-        search_text_nodes(doc).each do |node|
+        doc.xpath(TEXT_QUERY).each do |node|
           content = node.to_html
 
-          next if has_ancestor?(node, IGNORE_PARENTS)
           next unless content.match(LINK_PATTERN)
 
-          # If Rinku didn't link this, there's probably a good reason, so we'll
-          # skip it too
-          next if content.start_with?(*%w(http https ftp))
-
           html = autolink_filter(content)
 
           next if html == content
-- 
GitLab