aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Phillips <david@sighup.nz>2019-09-14 18:08:48 +1200
committerDavid Phillips <david@sighup.nz>2019-09-14 18:08:48 +1200
commit675510557f7499251ac8db3f19b1ca9a692c3f08 (patch)
tree53f1289db0d411e4fb1be003c0aec73ad20ac615
parent8f86ef32dff18c0b6499bc7d934e222990451c32 (diff)
downloadidalius-675510557f7499251ac8db3f19b1ca9a692c3f08.tar.xz
URL_Title: use Content-Type header or its http-equiv
This patch adds the ability for URL_Title to fall back on the Content-Type meta http-equiv tag, or failing that, the Content-Type HTTP header itself. This should improve correctness when dealing with HTML documents other than HTML5.
-rw-r--r--Plugin/URL_Title.pm21
1 files changed, 20 insertions, 1 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm
index 60e7691..14b963d 100644
--- a/Plugin/URL_Title.pm
+++ b/Plugin/URL_Title.pm
@@ -28,7 +28,8 @@ sub configure {
# Globals set by HTML parser, used by get_title
my $title;
-my $charset = "utf8"; # charset default to utf8, if not specified in HTML
+my $charset;
+my $content_type; # Content-Type header, picked from HTTP or http-equiv
sub start_handler
{
@@ -43,6 +44,10 @@ sub start_handler
} elsif ($tag eq "meta") {
if ($attr->{charset}) {
$charset = $attr->{charset};
+ } elsif ( $attr->{"http-equiv"}
+ && lc($attr->{"http-equiv"}) eq "content-type"
+ && $attr->{content}) {
+ $content_type = $attr->{"content"};
}
}
}
@@ -89,6 +94,20 @@ sub get_title
$p->parse($html);
return (undef, undef, "Error parsing HTML: $!") if $!;
+ # Pick out charset from the following in order of precedence:
+ # 1. <meta charset="…"> (stored in $charset already, if present)
+ # 2. <meta http-equiv="Content-Type">
+ # 3. Content-Type HTTP header
+ # 4. Default to "utf8"
+ if (!$charset) {
+ $content_type //= $response->{headers}->{"content-type"};
+ if ($content_type =~ m/;\s*charset=(\S+)/) {
+ $charset = $1;
+ } else {
+ $charset = "utf8";
+ }
+ }
+
# Decode raw bytes from document's title
my $dc = Encode::find_encoding($charset);
return (undef, undef, "Error: Unknown encoding $charset") unless $dc;