From a21d52d884ac0435888c087bc0ba71a44a3f05b1 Mon Sep 17 00:00:00 2001
From: David Phillips
Date: Wed, 19 Jun 2019 20:46:07 +1200
Subject: URL_Title: Extract charset from HTML tag if present

While scanning for the <title> element, also record the charset attribute
of any <meta> tag and use it to decode the title. If the page declares no
charset, the title is decoded as UTF-8 as a best guess.

The recorded charset lives in a file-scoped variable, so it is cleared
before each page is parsed; otherwise a charset seen on one page would be
applied to the next page that does not declare one.

---
 Plugin/URL_Title.pm | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm
index 6582a95..5299e1f 100644
--- a/Plugin/URL_Title.pm
+++ b/Plugin/URL_Title.pm
@@ -5,6 +5,7 @@ use warnings;
 use HTTP::Tiny;
 use HTML::Parser;
 use utf8;
+use Encode;
 
 use IdaliusConfig qw/assert_scalar/;
 
@@ -25,14 +26,22 @@ sub configure {
 }
 
 my $title;
+my $charset;
 
 sub start_handler
 {
-	return if shift ne "title";
+	my $tag = shift;
+	my $attr = shift;
 	my $self = shift;
-	$self->handler(text => sub { $title = shift; }, "dtext");
-	$self->handler(end => sub { shift->eof if shift eq "title"; },
-		"tagname,self");
+	if ($tag eq "title") {
+		$self->handler(text => sub { $title = shift; }, "dtext");
+		$self->handler(end => sub { shift->eof if shift eq "title"; },
+			"tagname,self");
+	} elsif ($tag eq "meta") {
+		if ($attr->{charset}) {
+			$charset = $attr->{charset};
+		}
+	}
 }
 
 sub get_title
@@ -70,14 +79,23 @@ sub get_title
 	}
 
 	my $html = $response->{content};
-	utf8::decode($html);
 
 	$title = "";
+	# Forget any charset recorded while parsing a previous page.
+	$charset = undef;
 	my $p = HTML::Parser->new(api_version => 3);
-	$p->handler( start => \&start_handler, "tagname,self");
+	$p->handler( start => \&start_handler, "tagname,attr,self" );
 	$p->parse($html);
 	return (undef, undef, "Error parsing HTML: $!") if $!;
 
+	if ($charset) {
+		my $dc = Encode::find_encoding($charset);
+		return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
+		$title = $dc->decode($title);
+	} else {
+		# No charset declared: fall back on a guess of UTF-8 (FIXME: is this non-standard?)
+		utf8::decode($title);
+	}
 	$title =~ s/\s+/ /g;
 	$title =~ s/(^\s+|\s+$)//g;
 
-- 
cgit v1.1