From cd62457f66c80ff5f0e8643910f094e54cea06e2 Mon Sep 17 00:00:00 2001 From: David Phillips Date: Sat, 27 Jun 2020 12:01:25 +1200 Subject: URL_Title: Add URL blacklist regex ability This patch adds a new configuration option to the URL_Title module so that the bot configuration may declare a list of regular expressions to match on a URL in order to determine if it is blacklisted. --- Plugin/URL_Title.pm | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm index 9076c44..e6d13a3 100644 --- a/Plugin/URL_Title.pm +++ b/Plugin/URL_Title.pm @@ -25,6 +25,7 @@ sub configure { IdaliusConfig::assert_scalar($config, $self, "url_len"); die "url_len must be positive" if $config->{url_len} <= 0; + IdaliusConfig::assert_list($config, $self, "blacklist"); # debug_dump is an optional parameter to dump base64(gzip(page_data)) into # the log - defaults to disabled and not checked here @@ -71,6 +72,22 @@ sub get_title } return (undef, "No URL found in that string", undef) unless $url; + my $shorturl = $url; + # remove http(s):// to avoid triggering other poorly configured bots + $shorturl =~ s,^https?://,,g; + $shorturl =~ s,/$,,g; + + # truncate URL without http(s):// to configured length if needed + $shorturl = (substr $shorturl, 0, $config->{url_len}) . "…" if length ($shorturl) > $config->{url_len}; + + # perform domain blacklisting + my ($domain) = $url =~ m/https?:\/\/([^:\/]+)/ig; + foreach my $pattern (@{$config->{blacklist}}) { + if ($domain =~ /^$pattern$/i) { + return (undef, "domain blacklisted by '$pattern'", undef); + } + } + # FIXME add more XML-based formats that we can theoretically extract titles from # FIXME factor out accepted formats and response match into accepted formats array my %headers = ( @@ -102,7 +119,7 @@ sub get_title my $html = $response->{content}; if ($response->{headers}->{"content-encoding"} && - $response->{headers}->{"content-encoding"} == "gzip") { + $response->{headers}->{"content-encoding"} eq "gzip") { my $new_html; gunzip \$html => \$new_html or return (undef, undef, "Error: gzip decompression failed: $!"); $html = $new_html; @@ -157,14 +174,6 @@ sub get_title return (undef, undef, "Error: No title") unless $title; - my $shorturl = $url; - # remove http(s):// to avoid triggering other poorly configured bots - $shorturl =~ s,^https?://,,g; - $shorturl =~ s,/$,,g; - - # truncate URL without http(s):// to configured length if needed - $shorturl = (substr $shorturl, 0, $config->{url_len}) . "…" if length ($shorturl) > $config->{url_len}; - my $composed_title = "$title ($shorturl)"; return $composed_title; } -- cgit v1.1