aboutsummaryrefslogtreecommitdiff
path: root/Plugin
diff options
context:
space:
mode:
Diffstat (limited to 'Plugin')
-rw-r--r--Plugin/URL_Title.pm28
1 files changed, 17 insertions, 11 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm
index 0a62782..60e7691 100644
--- a/Plugin/URL_Title.pm
+++ b/Plugin/URL_Title.pm
@@ -4,6 +4,7 @@ use strict;
use warnings;
use HTTP::Tiny;
use HTML::Parser;
+use HTML::Entities;
use utf8;
use Encode;
@@ -25,8 +26,9 @@ sub configure {
return $self;
}
+# Globals set by HTML parser, used by get_title
my $title;
-my $charset;
+my $charset = "utf8"; # charset default to utf8, if not specified in HTML
sub start_handler
{
@@ -34,7 +36,8 @@ sub start_handler
my $attr = shift;
my $self = shift;
if ($tag eq "title") {
- $self->handler(text => sub { $title = shift; }, "dtext");
+ # Note: NOT dtext. leave entities until after decoding text. See comment below
+ $self->handler(text => sub { $title = shift; }, "text");
$self->handler(end => sub { shift->eof if shift eq "title"; },
"tagname,self");
} elsif ($tag eq "meta") {
@@ -86,18 +89,21 @@ sub get_title
$p->parse($html);
return (undef, undef, "Error parsing HTML: $!") if $!;
- if ($charset and lc($charset) ne "utf-8") {
- my $dc = Encode::find_encoding($charset);
- return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
- $title = $dc->decode($title);
- } else {
- # fall back on a guess of UTF-8 FIXME is this non-standard
- utf8::decode($title);
- }
+ # Decode raw bytes from document's title
+ my $dc = Encode::find_encoding($charset);
+ return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
+ $title = $dc->decode($title);
+
+ # Finally, collapse entities into the characters they represent. Note this
+ # must be done instead of pulling dtext from the HTML parser, else you can
+ # end up with a bad mix of encodings (see documents with wchars mixed with
+ # entities representing more wchars in titles)
+ decode_entities($title);
+
+ # Normalise and trim whitespace for tidiness
$title =~ s/\s+/ /g;
$title =~ s/(^\s+|\s+$)//g;
- utf8::upgrade($title);
return (undef, undef, "Error: No title") unless $title;
my $shorturl = $url;