From 675510557f7499251ac8db3f19b1ca9a692c3f08 Mon Sep 17 00:00:00 2001 From: David Phillips Date: Sat, 14 Sep 2019 18:08:48 +1200 Subject: URL_Title: use Content-Type header or its http-equiv This patch adds the ability for URL_Title to fall back on the Content-Type meta http-equiv tag, or failing that, the Content-Type HTTP header itself. This should improve correctness when dealing with HTML documents other than HTML5. --- Plugin/URL_Title.pm | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'Plugin') diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm index 60e7691..14b963d 100644 --- a/Plugin/URL_Title.pm +++ b/Plugin/URL_Title.pm @@ -28,7 +28,8 @@ sub configure { # Globals set by HTML parser, used by get_title my $title; -my $charset = "utf8"; # charset default to utf8, if not specified in HTML +my $charset; +my $content_type; # Content-Type header, picked from HTTP or http-equiv sub start_handler { @@ -43,6 +44,10 @@ sub start_handler } elsif ($tag eq "meta") { if ($attr->{charset}) { $charset = $attr->{charset}; + } elsif ( $attr->{"http-equiv"} + && lc($attr->{"http-equiv"}) eq "content-type" + && $attr->{content}) { + $content_type = $attr->{"content"}; } } } @@ -89,6 +94,20 @@ sub get_title $p->parse($html); return (undef, undef, "Error parsing HTML: $!") if $!; + # Pick out charset from the following in order of precedence: + # 1. <meta charset="..."> (stored in $charset already, if present) + # 2. <meta http-equiv="Content-Type" content="..."> + # 3. Content-Type HTTP header + # 4. Default to "utf8" + if (!$charset) { + $content_type //= $response->{headers}->{"content-type"}; + if ($content_type =~ m/;\s*charset=(\S+)/) { + $charset = $1; + } else { + $charset = "utf8"; + } + } + # Decode raw bytes from document's title my $dc = Encode::find_encoding($charset); return (undef, undef, "Error: Unknown encoding $charset") unless $dc; -- cgit v1.1