aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: David Phillips <david@yeah.nah.nz> 2020-06-27 12:01:25 +1200
committer: David Phillips <david@yeah.nah.nz> 2020-06-27 12:28:52 +1200
commit: cd62457f66c80ff5f0e8643910f094e54cea06e2 (patch)
tree: db2a67e373025360af4e53562e80a3846b4afb47
parent: 93637014e5deb7305a75f9bfec6c3701a4b814a7 (diff)
download: idalius-cd62457f66c80ff5f0e8643910f094e54cea06e2.tar.xz
URL_Title: Add URL blacklist regex ability
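This patch adds a new configuration option, "blacklist", to the URL_Title module so that the bot configuration may declare a list of regular expressions. Each expression is matched, anchored and case-insensitively, against the domain (host part) of a URL in order to determine if that URL is blacklisted.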
-rw-r--r-- Plugin/URL_Title.pm 27
1 files changed, 18 insertions, 9 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm
index 9076c44..e6d13a3 100644
--- a/Plugin/URL_Title.pm
+++ b/Plugin/URL_Title.pm
@@ -25,6 +25,7 @@ sub configure {
IdaliusConfig::assert_scalar($config, $self, "url_len");
die "url_len must be positive" if $config->{url_len} <= 0;
+ IdaliusConfig::assert_list($config, $self, "blacklist");
# debug_dump is an optional parameter to dump base64(gzip(page_data)) into
# the log - defaults to disabled and not checked here
@@ -71,6 +72,22 @@ sub get_title
}
return (undef, "No URL found in that string", undef) unless $url;
+ my $shorturl = $url;
+ # remove http(s):// to avoid triggering other poorly configured bots
+ $shorturl =~ s,^https?://,,g;
+ $shorturl =~ s,/$,,g;
+
+ # truncate URL without http(s):// to configured length if needed
+ $shorturl = (substr $shorturl, 0, $config->{url_len}) . "…" if length ($shorturl) > $config->{url_len};
+
+ # perform domain blacklisting
+ my ($domain) = $url =~ m/https?:\/\/([^:\/]+)/ig;
+ foreach my $pattern (@{$config->{blacklist}}) {
+ if ($domain =~ /^$pattern$/i) {
+ return (undef, "domain blacklisted by '$pattern'", undef);
+ }
+ }
+
# FIXME add more XML-based formats that we can theoretically extract titles from
# FIXME factor out accepted formats and response match into accepted formats array
my %headers = (
@@ -102,7 +119,7 @@ sub get_title
my $html = $response->{content};
if ($response->{headers}->{"content-encoding"} &&
- $response->{headers}->{"content-encoding"} == "gzip") {
+ $response->{headers}->{"content-encoding"} eq "gzip") {
my $new_html;
gunzip \$html => \$new_html or return (undef, undef, "Error: gzip decompression failed: $!");
$html = $new_html;
@@ -157,14 +174,6 @@ sub get_title
return (undef, undef, "Error: No title") unless $title;
- my $shorturl = $url;
- # remove http(s):// to avoid triggering other poorly configured bots
- $shorturl =~ s,^https?://,,g;
- $shorturl =~ s,/$,,g;
-
- # truncate URL without http(s):// to configured length if needed
- $shorturl = (substr $shorturl, 0, $config->{url_len}) . "…" if length ($shorturl) > $config->{url_len};
-
my $composed_title = "$title ($shorturl)";
return $composed_title;
}