aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: David Phillips <david@sighup.nz>, 2019-09-14 16:22:07 +1200
committer: David Phillips <david@sighup.nz>, 2019-09-14 16:22:27 +1200
commit: 8f86ef32dff18c0b6499bc7d934e222990451c32 (patch)
tree: 478751c61c4c07c6ab9a8748468a59862acf2aa0
parent: ea6db8e62753a011321da89439c650f70bd1d032 (diff)
download: idalius-8f86ef32dff18c0b6499bc7d934e222990451c32.tar.xz
URL_Title: Allow entities and wchars mixed in titles
This patch defers HTML entity decoding until after the raw bytes from the HTML document have been decoded according to the document's charset. Previously, the title was taken with its entities already decoded by the HTML parser into UTF-8, which meant that non-UTF-8-encoded bytes from the document could end up mixed with UTF-8 characters, making the subsequent character encoding transformation impossible to perform correctly.
-rw-r--r-- Plugin/URL_Title.pm 28
1 files changed, 17 insertions, 11 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm
index 0a62782..60e7691 100644
--- a/Plugin/URL_Title.pm
+++ b/Plugin/URL_Title.pm
@@ -4,6 +4,7 @@ use strict;
use warnings;
use HTTP::Tiny;
use HTML::Parser;
+use HTML::Entities;
use utf8;
use Encode;
@@ -25,8 +26,9 @@ sub configure {
return $self;
}
+# Globals set by HTML parser, used by get_title
my $title;
-my $charset;
+my $charset = "utf8"; # charset default to utf8, if not specified in HTML
sub start_handler
{
@@ -34,7 +36,8 @@ sub start_handler
my $attr = shift;
my $self = shift;
if ($tag eq "title") {
- $self->handler(text => sub { $title = shift; }, "dtext");
+ # Note: NOT dtext. leave entities until after decoding text. See comment below
+ $self->handler(text => sub { $title = shift; }, "text");
$self->handler(end => sub { shift->eof if shift eq "title"; },
"tagname,self");
} elsif ($tag eq "meta") {
@@ -86,18 +89,21 @@ sub get_title
$p->parse($html);
return (undef, undef, "Error parsing HTML: $!") if $!;
- if ($charset and lc($charset) ne "utf-8") {
- my $dc = Encode::find_encoding($charset);
- return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
- $title = $dc->decode($title);
- } else {
- # fall back on a guess of UTF-8 FIXME is this non-standard
- utf8::decode($title);
- }
+ # Decode raw bytes from document's title
+ my $dc = Encode::find_encoding($charset);
+ return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
+ $title = $dc->decode($title);
+
+ # Finally, collapse entities into the characters they represent. Note this
+ # must be done instead of pulling dtext from the HTML parser, else you can
+ # end up with a bad mix of encodings (see documents with wchars mixed with
+ # entities representing more wchars in titles)
+ decode_entities($title);
+
+ # Normalise and trim whitespace for tidiness
$title =~ s/\s+/ /g;
$title =~ s/(^\s+|\s+$)//g;
- utf8::upgrade($title);
return (undef, undef, "Error: No title") unless $title;
my $shorturl = $url;