diff options
author | David Phillips <david@sighup.nz> | 2019-06-19 20:46:07 +1200 |
---|---|---|
committer | David Phillips <david@sighup.nz> | 2019-06-19 20:46:07 +1200 |
commit | a21d52d884ac0435888c087bc0ba71a44a3f05b1 (patch) | |
tree | 0c9f154985930dfa7c0ae42dd2282a083079ab0c /Plugin/URL_Title.pm | |
parent | a1ce9696c929aeaf1738cf281f4241d100525341 (diff) | |
download | idalius-a21d52d884ac0435888c087bc0ba71a44a3f05b1.tar.xz |
URL_Title: Extract charset from HTML tag if present
Diffstat (limited to 'Plugin/URL_Title.pm')
-rw-r--r-- | Plugin/URL_Title.pm | 28 |
1 files changed, 22 insertions, 6 deletions
```diff
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm
index 6582a95..5299e1f 100644
--- a/Plugin/URL_Title.pm
+++ b/Plugin/URL_Title.pm
@@ -5,6 +5,7 @@ use warnings;
 use HTTP::Tiny;
 use HTML::Parser;
 use utf8;
+use Encode;

 use IdaliusConfig qw/assert_scalar/;

@@ -25,14 +26,22 @@ sub configure {
 }

 my $title;
+my $charset;

 sub start_handler
 {
-	return if shift ne "title";
+	my $tag = shift;
+	my $attr = shift;
 	my $self = shift;
-	$self->handler(text => sub { $title = shift; }, "dtext");
-	$self->handler(end => sub { shift->eof if shift eq "title"; },
-		"tagname,self");
+	if ($tag eq "title") {
+		$self->handler(text => sub { $title = shift; }, "dtext");
+		$self->handler(end => sub { shift->eof if shift eq "title"; },
+			"tagname,self");
+	} elsif ($tag eq "meta") {
+		if ($attr->{charset}) {
+			$charset = $attr->{charset};
+		}
+	}
 }

 sub get_title
@@ -70,14 +79,21 @@ sub get_title
 	}

 	my $html = $response->{content};
-	utf8::decode($html);

 	$title = "";

 	my $p = HTML::Parser->new(api_version => 3);
-	$p->handler( start => \&start_handler, "tagname,self");
+	$p->handler( start => \&start_handler, "tagname,attr,self" );
 	$p->parse($html);
 	return (undef, undef, "Error parsing HTML: $!") if $!;
+	if ($charset) {
+		my $dc = Encode::find_encoding($charset);
+		return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
+		$title = $dc->decode($title);
+	} else {
+		# fall back on a guess of UTF-8 FIXME is this non-standard
+		utf8::decode($title);
+	}

 	$title =~ s/\s+/ /g;
 	$title =~ s/(^\s+|\s+$)//g;

```