aboutsummaryrefslogtreecommitdiff
path: root/Plugin
diff options
context:
space:
mode:
author: David Phillips <david@sighup.nz> 2019-06-19 20:46:07 +1200
committer: David Phillips <david@sighup.nz> 2019-06-19 20:46:07 +1200
commit: a21d52d884ac0435888c087bc0ba71a44a3f05b1 (patch)
tree: 0c9f154985930dfa7c0ae42dd2282a083079ab0c /Plugin
parent: a1ce9696c929aeaf1738cf281f4241d100525341 (diff)
download: idalius-a21d52d884ac0435888c087bc0ba71a44a3f05b1.tar.xz
URL_Title: Extract charset from HTML tag if present
Diffstat (limited to 'Plugin')
-rw-r--r-- Plugin/URL_Title.pm 28
1 files changed, 22 insertions, 6 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm
index 6582a95..5299e1f 100644
--- a/Plugin/URL_Title.pm
+++ b/Plugin/URL_Title.pm
@@ -5,6 +5,7 @@ use warnings;
use HTTP::Tiny;
use HTML::Parser;
use utf8;
+use Encode;
use IdaliusConfig qw/assert_scalar/;
@@ -25,14 +26,22 @@ sub configure {
}
my $title;
+my $charset;
sub start_handler
{
- return if shift ne "title";
+ my $tag = shift;
+ my $attr = shift;
my $self = shift;
- $self->handler(text => sub { $title = shift; }, "dtext");
- $self->handler(end => sub { shift->eof if shift eq "title"; },
- "tagname,self");
+ if ($tag eq "title") {
+ $self->handler(text => sub { $title = shift; }, "dtext");
+ $self->handler(end => sub { shift->eof if shift eq "title"; },
+ "tagname,self");
+ } elsif ($tag eq "meta") {
+ if ($attr->{charset}) {
+ $charset = $attr->{charset};
+ }
+ }
}
sub get_title
@@ -70,14 +79,21 @@ sub get_title
}
my $html = $response->{content};
- utf8::decode($html);
$title = "";
my $p = HTML::Parser->new(api_version => 3);
- $p->handler( start => \&start_handler, "tagname,self");
+ $p->handler( start => \&start_handler, "tagname,attr,self" );
$p->parse($html);
return (undef, undef, "Error parsing HTML: $!") if $!;
+ if ($charset) {
+ my $dc = Encode::find_encoding($charset);
+ return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
+ $title = $dc->decode($title);
+ } else {
+ # fall back on a guess of UTF-8 FIXME is this non-standard
+ utf8::decode($title);
+ }
$title =~ s/\s+/ /g;
$title =~ s/(^\s+|\s+$)//g;