From 8f86ef32dff18c0b6499bc7d934e222990451c32 Mon Sep 17 00:00:00 2001 From: David Phillips Date: Sat, 14 Sep 2019 16:22:07 +1200 Subject: URL_Title: Allow entities and wchars mixed in titles This patch defers the HTML entity decoding until after the raw bytes from the HTML document are translated through charsets. Previously, entities were used as already decoded by the HTML parser into UTF-8, which meant that non-UTF-8-encoded strings from documents could become mixed with UTF-8 characters, making the subsequent character encoding transformation impossible to perform correctly. --- Plugin/URL_Title.pm | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm index 0a62782..60e7691 100644 --- a/Plugin/URL_Title.pm +++ b/Plugin/URL_Title.pm @@ -4,6 +4,7 @@ use strict; use warnings; use HTTP::Tiny; use HTML::Parser; +use HTML::Entities; use utf8; use Encode; @@ -25,8 +26,9 @@ sub configure { return $self; } +# Globals set by HTML parser, used by get_title my $title; -my $charset; +my $charset = "utf8"; # charset default to utf8, if not specified in HTML sub start_handler { @@ -34,7 +36,8 @@ sub start_handler my $attr = shift; my $self = shift; if ($tag eq "title") { - $self->handler(text => sub { $title = shift; }, "dtext"); + # Note: NOT dtext. leave entities until after decoding text. 
See comment below + $self->handler(text => sub { $title = shift; }, "text"); $self->handler(end => sub { shift->eof if shift eq "title"; }, "tagname,self"); } elsif ($tag eq "meta") { @@ -86,18 +89,21 @@ sub get_title $p->parse($html); return (undef, undef, "Error parsing HTML: $!") if $!; - if ($charset and lc($charset) ne "utf-8") { - my $dc = Encode::find_encoding($charset); - return (undef, undef, "Error: Unknown encoding $charset") unless $dc; - $title = $dc->decode($title); - } else { - # fall back on a guess of UTF-8 FIXME is this non-standard - utf8::decode($title); - } + # Decode raw bytes from document's title + my $dc = Encode::find_encoding($charset); + return (undef, undef, "Error: Unknown encoding $charset") unless $dc; + $title = $dc->decode($title); + + # Finally, collapse entities into the characters they represent. Note this + # must be done instead of pulling dtext from the HTML parser, else you can + # end up with a bad mix of encodings (see documents with wchars mixed with + # entities representing more wchars in titles) + decode_entities($title); + + # Normalise and trim whitespace for tidiness $title =~ s/\s+/ /g; $title =~ s/(^\s+|\s+$)//g; - utf8::upgrade($title); return (undef, undef, "Error: No title") unless $title; my $shorturl = $url; -- cgit v1.1