URL_Title: Allow entities and wchars mixed in titles

This patch defers HTML entity decoding until after the raw bytes from the HTML document have been decoded according to the document's charset. Previously, the HTML parser decoded entities into UTF-8 characters up front, which meant that non-UTF-8-encoded bytes from the document could end up mixed with UTF-8 characters in the same string, making the subsequent character encoding transformation impossible to perform correctly.
author: David Phillips <david@sighup.nz> 2019-09-14 16:22:07 +1200
committer: David Phillips <david@sighup.nz> 2019-09-14 16:22:27 +1200
commit: 8f86ef32dff18c0b6499bc7d934e222990451c32 (patch)
tree: 478751c61c4c07c6ab9a8748468a59862acf2aa0
parent: ea6db8e62753a011321da89439c650f70bd1d032 (diff)
download: idalius-8f86ef32dff18c0b6499bc7d934e222990451c32.tar.xz
1 files changed, 17 insertions, 11 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm
index 0a62782..60e7691 100644
--- a/Plugin/URL_Title.pm
+++ b/Plugin/URL_Title.pm
@@ -4,6 +4,7 @@ use strict;
 use warnings;
 use HTTP::Tiny;
 use HTML::Parser;
+use HTML::Entities;
 use utf8;
 use Encode;
 
@@ -25,8 +26,9 @@ sub configure {
 	return $self;
 }
 
+# Globals set by HTML parser, used by get_title
 my $title;
-my $charset;
+my $charset = "utf8"; # charset default to utf8, if not specified in HTML
 
 sub start_handler
 {
@@ -34,7 +36,8 @@ sub start_handler
 	my $attr = shift;
 	my $self = shift;
 	if ($tag eq "title") {
-		$self->handler(text => sub { $title = shift; }, "dtext");
+		# Note: NOT dtext. leave entities until after decoding text. See comment below
+		$self->handler(text => sub { $title = shift; }, "text");
 		$self->handler(end  => sub { shift->eof if shift eq "title"; },
 		                    "tagname,self");
 	} elsif ($tag eq "meta") {
@@ -86,18 +89,21 @@ sub get_title
 	$p->parse($html);
 	return (undef, undef, "Error parsing HTML: $!") if $!;
 
-	if ($charset and lc($charset) ne "utf-8") {
-		my $dc = Encode::find_encoding($charset);
-		return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
-		$title = $dc->decode($title);
-	} else {
-		# fall back on a guess of UTF-8 FIXME is this non-standard
-		utf8::decode($title);
-	}
+	# Decode raw bytes from document's title
+	my $dc = Encode::find_encoding($charset);
+	return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
+	$title = $dc->decode($title);
+
+	# Finally, collapse entities into the characters they represent. Note this
+	# must be done instead of pulling dtext from the HTML parser, else you can
+	# end up with a bad mix of encodings (see documents with wchars mixed with
+	# entities representing more wchars in titles)
+	decode_entities($title);
+
+	# Normalise and trim whitespace for tidiness
 	$title =~ s/\s+/ /g;
 	$title =~ s/(^\s+|\s+$)//g;
 
-	utf8::upgrade($title);
 	return (undef, undef, "Error: No title") unless $title;
 
 	my $shorturl = $url;
author	David Phillips <david@sighup.nz>	2019-09-14 16:22:07 +1200
committer	David Phillips <david@sighup.nz>	2019-09-14 16:22:27 +1200
commit	8f86ef32dff18c0b6499bc7d934e222990451c32 (patch)
tree	478751c61c4c07c6ab9a8748468a59862acf2aa0
parent	ea6db8e62753a011321da89439c650f70bd1d032 (diff)
download	idalius-8f86ef32dff18c0b6499bc7d934e222990451c32.tar.xz