diff options
-rw-r--r-- | Plugin/URL_Title.pm | 28 |
1 files changed, 17 insertions, 11 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm index 0a62782..60e7691 100644 --- a/Plugin/URL_Title.pm +++ b/Plugin/URL_Title.pm @@ -4,6 +4,7 @@ use strict; use warnings; use HTTP::Tiny; use HTML::Parser; +use HTML::Entities; use utf8; use Encode; @@ -25,8 +26,9 @@ sub configure { return $self; } +# Globals set by HTML parser, used by get_title my $title; -my $charset; +my $charset = "utf8"; # charset default to utf8, if not specified in HTML sub start_handler { @@ -34,7 +36,8 @@ sub start_handler my $attr = shift; my $self = shift; if ($tag eq "title") { - $self->handler(text => sub { $title = shift; }, "dtext"); + # Note: NOT dtext. leave entities until after decoding text. See comment below + $self->handler(text => sub { $title = shift; }, "text"); $self->handler(end => sub { shift->eof if shift eq "title"; }, "tagname,self"); } elsif ($tag eq "meta") { @@ -86,18 +89,21 @@ sub get_title $p->parse($html); return (undef, undef, "Error parsing HTML: $!") if $!; - if ($charset and lc($charset) ne "utf-8") { - my $dc = Encode::find_encoding($charset); - return (undef, undef, "Error: Unknown encoding $charset") unless $dc; - $title = $dc->decode($title); - } else { - # fall back on a guess of UTF-8 FIXME is this non-standard - utf8::decode($title); - } + # Decode raw bytes from document's title + my $dc = Encode::find_encoding($charset); + return (undef, undef, "Error: Unknown encoding $charset") unless $dc; + $title = $dc->decode($title); + + # Finally, collapse entities into the characters they represent. Note this + # must be done instead of pulling dtext from the HTML parser, else you can + # end up with a bad mix of encodings (see documents with wchars mixed with + # entities representing more wchars in titles) + decode_entities($title); + + # Normalise and trim whitespace for tidiness $title =~ s/\s+/ /g; $title =~ s/(^\s+|\s+$)//g; - utf8::upgrade($title); return (undef, undef, "Error: No title") unless $title; my $shorturl = $url; |