diff options
author | David Phillips <david@yeah.nah.nz> | 2020-06-27 12:01:25 +1200 |
---|---|---|
committer | David Phillips <david@yeah.nah.nz> | 2020-06-27 12:28:52 +1200 |
commit | cd62457f66c80ff5f0e8643910f094e54cea06e2 (patch) | |
tree | db2a67e373025360af4e53562e80a3846b4afb47 /Plugin/URL_Title.pm | |
parent | 93637014e5deb7305a75f9bfec6c3701a4b814a7 (diff) | |
download | idalius-cd62457f66c80ff5f0e8643910f094e54cea06e2.tar.xz |
URL_Title: Add URL blacklist regex ability
This patch adds a new configuration option to the URL_Title module so that
the bot configuration may declare a list of regular expressions to match on a
URL in order to determine if it is blacklisted.
Diffstat (limited to 'Plugin/URL_Title.pm')
-rw-r--r-- | Plugin/URL_Title.pm | 27 |
1 file changed, 18 insertions, 9 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm index 9076c44..e6d13a3 100644 --- a/Plugin/URL_Title.pm +++ b/Plugin/URL_Title.pm @@ -25,6 +25,7 @@ sub configure { IdaliusConfig::assert_scalar($config, $self, "url_len"); die "url_len must be positive" if $config->{url_len} <= 0; + IdaliusConfig::assert_list($config, $self, "blacklist"); # debug_dump is an optional parameter to dump base64(gzip(page_data)) into # the log - defaults to disabled and not checked here @@ -71,6 +72,22 @@ sub get_title } return (undef, "No URL found in that string", undef) unless $url; + my $shorturl = $url; + # remove http(s):// to avoid triggering other poorly configured bots + $shorturl =~ s,^https?://,,g; + $shorturl =~ s,/$,,g; + + # truncate URL without http(s):// to configured length if needed + $shorturl = (substr $shorturl, 0, $config->{url_len}) . "…" if length ($shorturl) > $config->{url_len}; + + # perform domain blacklisting + my ($domain) = $url =~ m/https?:\/\/([^:\/]+)/ig; + foreach my $pattern (@{$config->{blacklist}}) { + if ($domain =~ /^$pattern$/i) { + return (undef, "domain blacklisted by '$pattern'", undef); + } + } + # FIXME add more XML-based formats that we can theoretically extract titles from # FIXME factor out accepted formats and response match into accepted formats array my %headers = ( @@ -102,7 +119,7 @@ sub get_title my $html = $response->{content}; if ($response->{headers}->{"content-encoding"} && - $response->{headers}->{"content-encoding"} == "gzip") { + $response->{headers}->{"content-encoding"} eq "gzip") { my $new_html; gunzip \$html => \$new_html or return (undef, undef, "Error: gzip decompression failed: $!"); $html = $new_html; @@ -157,14 +174,6 @@ sub get_title return (undef, undef, "Error: No title") unless $title; - my $shorturl = $url; - # remove http(s):// to avoid triggering other poorly configured bots - $shorturl =~ s,^https?://,,g; - $shorturl =~ s,/$,,g; - - # truncate URL without http(s):// to configured length if needed - 
$shorturl = (substr $shorturl, 0, $config->{url_len}) . "…" if length ($shorturl) > $config->{url_len}; - my $composed_title = "$title ($shorturl)"; return $composed_title; } |