diff options
| author | David Phillips <david@yeah.nah.nz> | 2020-06-27 12:01:25 +1200 | 
|---|---|---|
| committer | David Phillips <david@yeah.nah.nz> | 2020-06-27 12:28:52 +1200 | 
| commit | cd62457f66c80ff5f0e8643910f094e54cea06e2 (patch) | |
| tree | db2a67e373025360af4e53562e80a3846b4afb47 | |
| parent | 93637014e5deb7305a75f9bfec6c3701a4b814a7 (diff) | |
| download | idalius-cd62457f66c80ff5f0e8643910f094e54cea06e2.tar.xz | |
URL_Title: Add URL blacklist regex ability
This patch adds a new configuration option to the URL_Title module so that
the bot configuration may declare a list of regular expressions to match on a
URL in order to determine if it is blacklisted.
| -rw-r--r-- | Plugin/URL_Title.pm | 27 | 
1 files changed, 18 insertions, 9 deletions
diff --git a/Plugin/URL_Title.pm b/Plugin/URL_Title.pm index 9076c44..e6d13a3 100644 --- a/Plugin/URL_Title.pm +++ b/Plugin/URL_Title.pm @@ -25,6 +25,7 @@ sub configure {  	IdaliusConfig::assert_scalar($config, $self, "url_len");  	die "url_len must be positive" if $config->{url_len} <= 0; +	IdaliusConfig::assert_list($config, $self, "blacklist");  	# debug_dump is an optional parameter to dump base64(gzip(page_data)) into  	# the log - defaults to disabled and not checked here @@ -71,6 +72,22 @@ sub get_title  	}  	return (undef, "No URL found in that string", undef) unless $url; +	my $shorturl = $url; +	# remove http(s):// to avoid triggering other poorly configured bots +	$shorturl =~ s,^https?://,,g; +	$shorturl =~ s,/$,,g; + +	# truncate URL without http(s):// to configured length if needed +	$shorturl = (substr $shorturl, 0, $config->{url_len}) . "…" if length ($shorturl) > $config->{url_len}; + +	# perform domain blacklisting +	my ($domain) = $url =~ m/https?:\/\/([^:\/]+)/ig; +	foreach my $pattern (@{$config->{blacklist}}) { +		if ($domain =~ /^$pattern$/i) { +			return (undef, "domain blacklisted by '$pattern'", undef); +		} +	} +  	# FIXME add more XML-based formats that we can theoretically extract titles from  	# FIXME factor out accepted formats and response match into accepted formats array  	my %headers = ( @@ -102,7 +119,7 @@ sub get_title  	my $html = $response->{content};  	if ($response->{headers}->{"content-encoding"} && -	    $response->{headers}->{"content-encoding"} == "gzip") { +	    $response->{headers}->{"content-encoding"} eq "gzip") {  		my $new_html;  		gunzip \$html => \$new_html or return (undef, undef, "Error: gzip decompression failed: $!");  		$html = $new_html; @@ -157,14 +174,6 @@ sub get_title  	return (undef, undef, "Error: No title") unless $title; -	my $shorturl = $url; -	# remove http(s):// to avoid triggering other poorly configured bots -	$shorturl =~ s,^https?://,,g; -	$shorturl =~ s,/$,,g; - -	# truncate URL without http(s):// to configured length if needed -	$shorturl = (substr $shorturl, 0, $config->{url_len}) . "…" if length ($shorturl) > $config->{url_len}; -  	my $composed_title = "$title ($shorturl)";  	return $composed_title;  }  | 
