From 675510557f7499251ac8db3f19b1ca9a692c3f08 Mon Sep 17 00:00:00 2001
From: David Phillips <david@sighup.nz>
Date: Sat, 14 Sep 2019 18:08:48 +1200
Subject: URL_Title: use Content-Type header or its http-equiv

This patch adds the ability for URL_Title to fall back on the Content-Type
meta http-equiv tag, or failing that, the Content-Type HTTP header itself.
This should improve correctness when dealing with HTML documents other than
HTML5.
---
 Plugin/URL_Title.pm | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'Plugin')

diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm
index 60e7691..14b963d 100644
--- a/Plugin/URL_Title.pm
+++ b/Plugin/URL_Title.pm
@@ -28,7 +28,8 @@ sub configure {
 
 # Globals set by HTML parser, used by get_title
 my $title;
-my $charset = "utf8"; # charset default to utf8, if not specified in HTML
+my $charset;
+my $content_type; # Content-Type header, picked from HTTP or http-equiv
 
 sub start_handler
 {
@@ -43,6 +44,10 @@ sub start_handler
 	} elsif ($tag eq "meta") {
 		if ($attr->{charset}) {
 			$charset = $attr->{charset};
+		} elsif (   $attr->{"http-equiv"}
+			     && lc($attr->{"http-equiv"}) eq "content-type"
+			     && $attr->{content}) {
+			$content_type = $attr->{"content"};
 		}
 	}
 }
@@ -89,6 +94,20 @@ sub get_title
 	$p->parse($html);
 	return (undef, undef, "Error parsing HTML: $!") if $!;
 
+	# Pick out charset from the following in order of precedence:
+	# 1. <meta charset="…"> (stored in $charset already, if present)
+	# 2. <meta http-equiv="Content-Type">
+	# 3. Content-Type HTTP header
+	# 4. Default to "utf8"
+	if (!$charset) {
+		$content_type //= $response->{headers}->{"content-type"};
+		if ($content_type =~ m/;\s*charset=(\S+)/) {
+			$charset = $1;
+		} else {
+			$charset = "utf8";
+		}
+	}
+
 	# Decode raw bytes from document's title
 	my $dc = Encode::find_encoding($charset);
 	return (undef, undef, "Error: Unknown encoding $charset") unless $dc;
-- 
cgit v1.1