diff options
author | David Phillips <david@sighup.nz> | 2019-09-14 16:22:07 +1200 |
---|---|---|
committer | David Phillips <david@sighup.nz> | 2019-09-14 16:22:27 +1200 |
commit | 8f86ef32dff18c0b6499bc7d934e222990451c32 (patch) | |
tree | 478751c61c4c07c6ab9a8748468a59862acf2aa0 | |
parent | ea6db8e62753a011321da89439c650f70bd1d032 (diff) | |
download | idalius-8f86ef32dff18c0b6499bc7d934e222990451c32.tar.xz |
URL_Title: Allow entities and wchars mixed in titles
This patch defers HTML entity decoding until after the raw bytes from the
HTML document have been decoded using the document's charset. Previously, the
title text was taken with entities already decoded by the HTML parser into
UTF-8, which meant that non-UTF-8-encoded strings from documents could become
mixed with UTF-8 characters, making the subsequent character encoding
transformation impossible to perform correctly.
-rw-r--r-- | Plugin/URL_Title.pm | 28 |
1 files changed, 17 insertions, 11 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm index 0a62782..60e7691 100644 --- a/Plugin/URL_Title.pm +++ b/Plugin/URL_Title.pm @@ -4,6 +4,7 @@ use strict; use warnings; use HTTP::Tiny; use HTML::Parser; +use HTML::Entities; use utf8; use Encode; @@ -25,8 +26,9 @@ sub configure { return $self; } +# Globals set by HTML parser, used by get_title my $title; -my $charset; +my $charset = "utf8"; # charset default to utf8, if not specified in HTML sub start_handler { @@ -34,7 +36,8 @@ sub start_handler my $attr = shift; my $self = shift; if ($tag eq "title") { - $self->handler(text => sub { $title = shift; }, "dtext"); + # Note: NOT dtext. leave entities until after decoding text. See comment below + $self->handler(text => sub { $title = shift; }, "text"); $self->handler(end => sub { shift->eof if shift eq "title"; }, "tagname,self"); } elsif ($tag eq "meta") { @@ -86,18 +89,21 @@ sub get_title $p->parse($html); return (undef, undef, "Error parsing HTML: $!") if $!; - if ($charset and lc($charset) ne "utf-8") { - my $dc = Encode::find_encoding($charset); - return (undef, undef, "Error: Unknown encoding $charset") unless $dc; - $title = $dc->decode($title); - } else { - # fall back on a guess of UTF-8 FIXME is this non-standard - utf8::decode($title); - } + # Decode raw bytes from document's title + my $dc = Encode::find_encoding($charset); + return (undef, undef, "Error: Unknown encoding $charset") unless $dc; + $title = $dc->decode($title); + + # Finally, collapse entities into the characters they represent. Note this + # must be done instead of pulling dtext from the HTML parser, else you can + # end up with a bad mix of encodings (see documents with wchars mixed with + # entities representing more wchars in titles) + decode_entities($title); + + # Normalise and trim whitespace for tidiness $title =~ s/\s+/ /g; $title =~ s/(^\s+|\s+$)//g; - utf8::upgrade($title); return (undef, undef, "Error: No title") unless $title; my $shorturl = $url; |