diff options
| author | David Phillips <david@sighup.nz> | 2019-06-19 20:46:07 +1200 | 
|---|---|---|
| committer | David Phillips <david@sighup.nz> | 2019-06-19 20:46:07 +1200 | 
| commit | a21d52d884ac0435888c087bc0ba71a44a3f05b1 (patch) | |
| tree | 0c9f154985930dfa7c0ae42dd2282a083079ab0c | |
| parent | a1ce9696c929aeaf1738cf281f4241d100525341 (diff) | |
| download | idalius-a21d52d884ac0435888c087bc0ba71a44a3f05b1.tar.xz | |
URL_Title: Extract charset from HTML tag if present
| -rw-r--r-- | Plugin/URL_Title.pm | 28 | 
1 files changed, 22 insertions, 6 deletions
```diff
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm
index 6582a95..5299e1f 100644
--- a/Plugin/URL_Title.pm
+++ b/Plugin/URL_Title.pm
@@ -5,6 +5,7 @@ use warnings;
 use HTTP::Tiny;
 use HTML::Parser;
 use utf8;
+use Encode;
 
 use IdaliusConfig qw/assert_scalar/;
 
@@ -25,14 +26,22 @@ sub configure {
 }
 
 my $title;
+my $charset;
 
 sub start_handler
 {
-	return if shift ne "title";
+	my $tag = shift;
+	my $attr = shift;
 	my $self = shift;
-	$self->handler(text => sub { $title = shift; }, "dtext");
-	$self->handler(end  => sub { shift->eof if shift eq "title"; },
-	                    "tagname,self");
+	if ($tag eq "title") {
+		$self->handler(text => sub { $title = shift; }, "dtext");
+		$self->handler(end  => sub { shift->eof if shift eq "title"; },
+		                    "tagname,self");
+	} elsif ($tag eq "meta") {
+		if ($attr->{charset}) {
+			$charset = $attr->{charset};
+		}
+	}
 }
 
 sub get_title
@@ -70,14 +79,21 @@ sub get_title
 	}
 
 	my $html = $response->{content};
-	utf8::decode($html);
 
 	$title = "";
 	my $p = HTML::Parser->new(api_version => 3);
-	$p->handler( start => \&start_handler, "tagname,self");
+	$p->handler( start => \&start_handler, "tagname,attr,self" );
 	$p->parse($html);
 	return (undef, undef, "Error parsing HTML: $!") if $!;
 
+	if ($charset) {
+		my $dc = Encode::find_encoding($charset);
+		return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
+		$title = $dc->decode($title);
+	} else {
+		# fall back on a guess of UTF-8 FIXME is this non-standard
+		utf8::decode($title);
+	}
 	$title =~ s/\s+/ /g;
 	$title =~ s/(^\s+|\s+$)//g;
 
```
