diff --git a/Gemfile b/Gemfile index 28c8c2bdf4b08839c85e442bcf9fad396e4d417a..b7b5aff6983be8d96b42212aee5653c962ccad0e 100644 --- a/Gemfile +++ b/Gemfile @@ -261,6 +261,8 @@ gem 'rouge', '~> 4.3.0', feature_category: :shared gem 'truncato', '~> 0.7.12', feature_category: :team_planning gem 'nokogiri', '~> 1.16', feature_category: :shared gem 'gitlab-glfm-markdown', '~> 0.0.21', feature_category: :markdown +gem 'tanuki_emoji', '~> 0.9', feature_category: :markdown +gem 'unicode-emoji', '~> 3.6', feature_category: :markdown # Calendar rendering gem 'icalendar', '~> 2.10.1', feature_category: :system_access @@ -376,7 +378,6 @@ gem 'terser', '1.0.2' # rubocop:todo Gemfile/MissingFeatureCategory gem 'click_house-client', path: 'gems/click_house-client', require: 'click_house/client' # rubocop:todo Gemfile/MissingFeatureCategory gem 'addressable', '~> 2.8' # rubocop:todo Gemfile/MissingFeatureCategory -gem 'tanuki_emoji', '~> 0.9' # rubocop:todo Gemfile/MissingFeatureCategory gem 'gon', '~> 6.4.0' # rubocop:todo Gemfile/MissingFeatureCategory gem 'request_store', '~> 1.5.1' # rubocop:todo Gemfile/MissingFeatureCategory gem 'base32', '~> 0.3.0' # rubocop:todo Gemfile/MissingFeatureCategory diff --git a/Gemfile.checksum b/Gemfile.checksum index a76298177c68d377d91afc345b0fa9a1651b1ef7..f42856e73493be0ecba8d7b595be86c9bc23d39a 100644 --- a/Gemfile.checksum +++ b/Gemfile.checksum @@ -746,6 +746,8 @@ {"name":"unf_ext","version":"0.0.8.2","platform":"x64-mingw32","checksum":"f7e4c01774c91eb22e30d53dfc40ffbbb5a175f785c8f6f1be17ad96a0b29ed0"}, {"name":"unf_ext","version":"0.0.8.2","platform":"x86-mingw32","checksum":"6d44c13c98924bebd15ebdd4ed196ead403a0770ac03304570873349fda2a208"}, {"name":"unicode-display_width","version":"2.4.2","platform":"ruby","checksum":"6a10205d1a19ca790c4e53064ba93f09d9eb234bf6bd135d9deb6001c21428be"}, +{"name":"unicode-emoji","version":"3.6.0","platform":"ruby","checksum":"9d333b0bec74bbf0992b77ce59c809b762f0d1b4018ea454f524491fabc8e5ec"}, 
+{"name":"unicode-version","version":"1.4.0","platform":"ruby","checksum":"56409a354a042df2e9acd547f58f81c6ee24839560b47f4907b1c097b6677563"}, {"name":"unicode_utils","version":"1.4.0","platform":"ruby","checksum":"b922d0cf2313b6b7136ada6645ce7154ffc86418ca07d53b058efe9eb72f2a40"}, {"name":"uniform_notifier","version":"1.16.0","platform":"ruby","checksum":"99b39ee4a0864e3b49f375b5e5803eb26d35ed6eb1719c96407573a87bc4dbb5"}, {"name":"unleash","version":"3.2.2","platform":"ruby","checksum":"0f6e56498de920de66a01bceffb93933693ade646bb853fc70eb16bd1026b93b"}, diff --git a/Gemfile.lock b/Gemfile.lock index 4bec0c8babf28cdbe4929f7c015e19d657fc1f33..7f7fa57c07f4e43d1fd3b8b1584c4a576e2be3f1 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1876,6 +1876,9 @@ GEM unf_ext unf_ext (0.0.8.2) unicode-display_width (2.4.2) + unicode-emoji (3.6.0) + unicode-version (~> 1.0) + unicode-version (1.4.0) unicode_utils (1.4.0) uniform_notifier (1.16.0) unleash (3.2.2) @@ -2306,6 +2309,7 @@ DEPENDENCIES tty-prompt (~> 0.23) typhoeus (~> 1.4.0) undercover (~> 0.5.0) + unicode-emoji (~> 3.6) unleash (~> 3.2.2) valid_email (~> 0.1) validates_hostname (~> 1.0.13) diff --git a/Gemfile.next.checksum b/Gemfile.next.checksum index 0a4a89226bfaf45fdf5087f4419a45ee3e36976e..0750eac6608a48137fa46ba9ebafc160e603026d 100644 --- a/Gemfile.next.checksum +++ b/Gemfile.next.checksum @@ -761,6 +761,8 @@ {"name":"unf_ext","version":"0.0.8.2","platform":"x64-mingw32","checksum":"f7e4c01774c91eb22e30d53dfc40ffbbb5a175f785c8f6f1be17ad96a0b29ed0"}, {"name":"unf_ext","version":"0.0.8.2","platform":"x86-mingw32","checksum":"6d44c13c98924bebd15ebdd4ed196ead403a0770ac03304570873349fda2a208"}, {"name":"unicode-display_width","version":"2.4.2","platform":"ruby","checksum":"6a10205d1a19ca790c4e53064ba93f09d9eb234bf6bd135d9deb6001c21428be"}, +{"name":"unicode-emoji","version":"3.6.0","platform":"ruby","checksum":"9d333b0bec74bbf0992b77ce59c809b762f0d1b4018ea454f524491fabc8e5ec"}, 
+{"name":"unicode-version","version":"1.4.0","platform":"ruby","checksum":"56409a354a042df2e9acd547f58f81c6ee24839560b47f4907b1c097b6677563"}, {"name":"unicode_utils","version":"1.4.0","platform":"ruby","checksum":"b922d0cf2313b6b7136ada6645ce7154ffc86418ca07d53b058efe9eb72f2a40"}, {"name":"uniform_notifier","version":"1.16.0","platform":"ruby","checksum":"99b39ee4a0864e3b49f375b5e5803eb26d35ed6eb1719c96407573a87bc4dbb5"}, {"name":"unleash","version":"3.2.2","platform":"ruby","checksum":"0f6e56498de920de66a01bceffb93933693ade646bb853fc70eb16bd1026b93b"}, diff --git a/Gemfile.next.lock b/Gemfile.next.lock index ad21cdea93eda0d5cf554ecba57304591d52bab0..164ca3f6ae61e7f0fa85d7ade58c163ee182c958 100644 --- a/Gemfile.next.lock +++ b/Gemfile.next.lock @@ -1903,6 +1903,9 @@ GEM unf_ext unf_ext (0.0.8.2) unicode-display_width (2.4.2) + unicode-emoji (3.6.0) + unicode-version (~> 1.0) + unicode-version (1.4.0) unicode_utils (1.4.0) uniform_notifier (1.16.0) unleash (3.2.2) @@ -2333,6 +2336,7 @@ DEPENDENCIES tty-prompt (~> 0.23) typhoeus (~> 1.4.0) undercover (~> 0.5.0) + unicode-emoji (~> 3.6) unleash (~> 3.2.2) valid_email (~> 0.1) validates_hostname (~> 1.0.13) diff --git a/lib/banzai/filter/emoji_filter.rb b/lib/banzai/filter/emoji_filter.rb index 498cca0e516acf9c2be184cddf9b23b1e2609ca6..33be7e588224e538011035bde3c94c2e93cde792 100644 --- a/lib/banzai/filter/emoji_filter.rb +++ b/lib/banzai/filter/emoji_filter.rb @@ -21,10 +21,8 @@ def call content = node.to_html - next unless content.include?(':') || emoji_unicode_pattern_untrusted.match?(content) - html = emoji_unicode_element_unicode_filter(content) - html = emoji_name_element_unicode_filter(html) + html = emoji_name_element_unicode_filter(html) if content.include?(':') next if html == content @@ -43,9 +41,13 @@ def emoji_name_element_unicode_filter(text) Gitlab::Utils::Gsub .gsub_with_limit(text, emoji_pattern, limit: Banzai::Filter::FILTER_ITEM_LIMIT) do |match_data| emoji = 
TanukiEmoji.find_by_alpha_code(match_data[0]) - @emoji_count += 1 if emoji - Gitlab::Emoji.gl_emoji_tag(emoji) if emoji + if emoji + @emoji_count += 1 + Gitlab::Emoji.gl_emoji_tag(emoji) + else + match_data[0] + end end end @@ -55,11 +57,16 @@ def emoji_name_element_unicode_filter(text) # # Returns a String with unicode emoji replaced with gl-emoji unicode. def emoji_unicode_element_unicode_filter(text) - emoji_unicode_pattern_untrusted.replace_gsub(text, limit: Banzai::Filter::FILTER_ITEM_LIMIT) do |match| - emoji = TanukiEmoji.find_by_codepoints(match[1]) - @emoji_count += 1 if emoji - - Gitlab::Emoji.gl_emoji_tag(emoji) if emoji + Gitlab::Utils::Gsub + .gsub_with_limit(text, emoji_unicode_pattern, limit: Banzai::Filter::FILTER_ITEM_LIMIT) do |match_data| + emoji = TanukiEmoji.find_by_codepoints(match_data[0]) + + if emoji + @emoji_count += 1 + Gitlab::Emoji.gl_emoji_tag(emoji) + else + match_data[0] + end end end @@ -68,12 +75,10 @@ def self.emoji_pattern @emoji_pattern ||= TanukiEmoji.index.alpha_code_pattern end - # Build an unstrusted regexp that matches all valid unicode emojis names. - def self.emoji_unicode_pattern_untrusted - return @emoji_unicode_pattern_untrusted if @emoji_unicode_pattern_untrusted - - source = TanukiEmoji.index.codepoints_pattern.source - @emoji_unicode_pattern_untrusted = Gitlab::UntrustedRegexp.new(source) + def self.emoji_unicode_pattern + # Use regex from unicode-emoji gem. This is faster than the built-in TanukiEmoji + # regex for large documents. 
+ Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT end private @@ -82,8 +87,8 @@ def emoji_pattern self.class.emoji_pattern end - def emoji_unicode_pattern_untrusted - self.class.emoji_unicode_pattern_untrusted + def emoji_unicode_pattern + self.class.emoji_unicode_pattern end end end diff --git a/spec/lib/banzai/filter/emoji_filter_spec.rb b/spec/lib/banzai/filter/emoji_filter_spec.rb index dd31aecc6d63ffa37fe94502ae3b506d41da772b..7db1c0a33e8c252b914783680749d1224a957b1f 100644 --- a/spec/lib/banzai/filter/emoji_filter_spec.rb +++ b/spec/lib/banzai/filter/emoji_filter_spec.rb @@ -99,6 +99,24 @@ expect(doc.to_html).to match(/^This deserves a <gl-emoji.+>, big time\.\z/) end + context 'when TanukiEmoji can not find the emoji' do + it 'alpha code is not replaced with tag' do + allow(TanukiEmoji).to receive(:find_by_alpha_code).and_return(nil) + + doc = filter(':smile:') + + expect(doc.css('gl-emoji').size).to eq 0 + end + + it 'unicode emoji is not replaced with tag' do + allow(TanukiEmoji).to receive(:find_by_codepoints).and_return(nil) + + doc = filter('👍') + + expect(doc.css('gl-emoji').size).to eq 0 + end + end + it 'ignores backref emoji in footnote references' do footnote = <<~HTML <p>↩ Test<sup data-sourcepos="1:9-1:12" class="footnote-ref"><a href="#fn-1" id="fnref-1" data-footnote-ref>1</a></sup></p> @@ -138,6 +156,24 @@ end end + context 'when using TanukiEmoji' do + # the regex doesn't find emoji components, and they are not really meant to be used + # by themselves, so ignore them. 
+ let(:exclude_components) { "🏻🏼🏽🏾🏿" } + it 'finds all unicode emoji codepoints with regex' do + TanukiEmoji.index.all.each do |emoji| # rubocop:disable Rails/FindEach -- not a Rails model + next if exclude_components.include?(emoji.codepoints) + + expect(described_class.emoji_unicode_pattern.match?(emoji.codepoints)).to be_truthy + + emoji.codepoints_alternates.each do |alternate| + expect(described_class.emoji_unicode_pattern.match?(alternate)).to be_truthy + end + end + end + end + context 'and protects against pathological number of emojis' do it 'limit keeps it from timing out' do expect do