diff options
author | David Phillips <david@sighup.nz> | 2019-09-14 18:08:48 +1200 |
---|---|---|
committer | David Phillips <david@sighup.nz> | 2019-09-14 18:08:48 +1200 |
commit | 675510557f7499251ac8db3f19b1ca9a692c3f08 (patch) | |
tree | 53f1289db0d411e4fb1be003c0aec73ad20ac615 | |
parent | 8f86ef32dff18c0b6499bc7d934e222990451c32 (diff) | |
download | idalius-675510557f7499251ac8db3f19b1ca9a692c3f08.tar.xz |
URL_Title: use Content-Type header or its http-equiv
This patch adds the ability for URL_Title to fall back on the Content-Type
meta http-equiv tag, or failing that, the Content-Type HTTP header itself.
This should improve correctness when dealing with HTML documents other than
HTML5.
-rw-r--r-- | Plugin/URL_Title.pm | 21 |
1 files changed, 20 insertions, 1 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm index 60e7691..14b963d 100644 --- a/Plugin/URL_Title.pm +++ b/Plugin/URL_Title.pm @@ -28,7 +28,8 @@ sub configure { # Globals set by HTML parser, used by get_title my $title; -my $charset = "utf8"; # charset default to utf8, if not specified in HTML +my $charset; +my $content_type; # Content-Type header, picked from HTTP or http-equiv sub start_handler { @@ -43,6 +44,10 @@ sub start_handler } elsif ($tag eq "meta") { if ($attr->{charset}) { $charset = $attr->{charset}; + } elsif ( $attr->{"http-equiv"} + && lc($attr->{"http-equiv"}) eq "content-type" + && $attr->{content}) { + $content_type = $attr->{"content"}; } } } @@ -89,6 +94,20 @@ sub get_title $p->parse($html); return (undef, undef, "Error parsing HTML: $!") if $!; + # Pick out charset from the following in order of precedence: + # 1. <meta charset="…"> (stored in $charset already, if present) + # 2. <meta http-equiv="Content-Type"> + # 3. Content-Type HTTP header + # 4. Default to "utf8" + if (!$charset) { + $content_type //= $response->{headers}->{"content-type"}; + if ($content_type =~ m/;\s*charset=(\S+)/) { + $charset = $1; + } else { + $charset = "utf8"; + } + } + # Decode raw bytes from document's title my $dc = Encode::find_encoding($charset); return (undef, undef, "Error: Unknown encoding $charset") unless $dc; |