1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
package Plugin::URL_Title;
use strict;
use warnings;
use HTTP::Tiny;
use HTML::Parser;
use utf8;
# Plugin settings, shared by the subs in this file. Replaced wholesale on
# every call to configure().
my %config;

# configure($self, $cmdref, $cref)
#
# Stores a shallow copy of the hash referenced by $cref as the plugin
# configuration. The second argument ($cmdref) is accepted but not used.
#
# Returns $self unchanged.
sub configure {
    my ($self, undef, $cref) = @_;

    %config = %{$cref};

    return $self;
}
# Text of the <title> element being extracted. Written by the handlers that
# start_handler() installs; read by message().
my $title;

# start_handler($tagname, $parser)
#
# HTML::Parser "start" callback; message() registers it with the argspec
# "tagname,self", so it receives the tag name followed by the parser object.
#
# Every tag other than <title> is ignored. On <title> it clears $title and
# installs two further handlers on the parser:
#   text - appends each decoded text chunk ("dtext") to $title;
#   end  - calls eof() on the parser once </title> is seen. Per the
#          HTML::Parser documentation, eof() inside a handler aborts parsing,
#          so the rest of the document is not scanned.
#
# The return value is not meaningful.
sub start_handler {
    my ($tagname, $parser) = @_;
    return if $tagname ne "title";

    # Start from a clean slate so nothing from an earlier parse leaks in.
    $title = "";

    # HTML::Parser may deliver one run of text as several text events (unless
    # unbroken_text is enabled), so accumulate the chunks rather than keeping
    # only the last one.
    $parser->handler(text => sub { $title .= shift; }, "dtext");

    $parser->handler(
        end => sub {
            my ($end_tag, $p) = @_;
            $p->eof if $end_tag eq "title";
        },
        "tagname,self",
    );

    return;
}
# message($self, $logger, $me, $who, $where, $raw_what, $what, $irc)
#
# Looks for the first http(s) URL in $what, fetches it, extracts the
# document's <title> and returns "<title> (<shortened url>)".
#
# Parameters used here:
#   $logger - code ref, called with a single diagnostic string
#   $what   - message text that is searched for a URL
# $self, $me, $who, $where, $raw_what and $irc are accepted but not used.
#
# Configuration read from %config:
#   url_on  - when numerically 0 the plugin does nothing
#   url_len - maximum length of the displayed URL (scheme removed); longer
#             URLs are cut and suffixed with an ellipsis. When not defined,
#             the URL is not truncated.
#
# Returns the composed string, or nothing (bare return) when the plugin is
# disabled, no URL is found, the request fails, the content type is not
# HTML/SVG, or no title text is found.
sub message {
    my ($self, $logger, $me, $who, $where, $raw_what, $what, $irc) = @_;

    return if ($config{url_on} == 0);

    # Character set drawn from RFC 3986, section 2
    my $url;
    if ($what =~ /(https?:\/\/[a-z0-9\-\._~:\/\?#\[\]@\!\$&'()\*\+,;=%]+)/i) {
        $url = $1;
    }
    return unless $url;

    # FIXME add more XML-based formats that we can theoretically extract titles from
    # FIXME factor out accepted formats and response match into accepted formats array
    # Only the first 64 KiB are requested; the title is expected near the top.
    my $http = HTTP::Tiny->new(
        default_headers => {
            'Range'  => "bytes=0-65536",
            'Accept' => 'text/html, image/svg+xml',
        },
        timeout => 3,
    );
    my $response = $http->get($url);
    if (!$response->{success}) {
        $logger->("Something broke: $response->{reason}");
        return;
    }

    # The header may be absent, and HTTP::Tiny hands back an array ref when a
    # header is repeated; normalise to a plain string before matching.
    my $content_type = $response->{headers}->{"content-type"};
    $content_type = "" unless defined $content_type;
    $content_type = join(", ", @{$content_type}) if ref($content_type) eq 'ARRAY';
    if ($content_type !~ m,(text/html|image/svg\+xml),) {
        $logger->("I don't think I can parse titles from $content_type - stopping here");
        return;
    }

    my $html = $response->{content};
    utf8::decode($html);

    $title = "";
    my $p = HTML::Parser->new(api_version => 3);
    $p->handler(start => \&start_handler, "tagname,self");
    # No "$!" check after parse(): errno is only meaningful immediately after
    # a failed system call, so testing it here would abort on stale values
    # left behind by unrelated earlier calls.
    $p->parse($html);

    # Collapse internal whitespace runs and trim both ends.
    $title =~ s/\s+/ /g;
    $title =~ s/(^\s+|\s+$)//g;
    utf8::upgrade($title);
    # Test length, not truth, so a page titled "0" is still reported.
    return unless length $title;

    my $shorturl = $url;
    # remove http(s):// to avoid triggering other poorly configured bots
    # (case-insensitive, like the URL match above)
    $shorturl =~ s,^https?://,,i;
    $shorturl =~ s,/$,,;

    # truncate URL without http(s):// to configured length if needed
    my $max_len = $config{url_len};
    if (defined $max_len && length($shorturl) > $max_len) {
        # U+2026 HORIZONTAL ELLIPSIS, written as an escape so the marker
        # survives re-encoding of this source file.
        $shorturl = substr($shorturl, 0, $max_len) . "\x{2026}";
    }

    return "$title ($shorturl)";
}
1;
|