aboutsummaryrefslogtreecommitdiff
path: root/Plugin/URL_Title.pm
blob: 6faa68975c26ef8b790696f58ebbe6e3121b637a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package Plugin::URL_Title;

use strict;
use warnings;
use HTTP::Tiny;
use HTML::Parser;
use utf8;

use IdaliusConfig qw/assert_scalar/;

my $config;

sub configure {
	my $self = shift;
	my $cmdref = shift;
	shift; # run_command
	$config = shift;

	IdaliusConfig::assert_scalar($config, $self, "url_len");
	die "url_len must be positive" if $config->{url_len} <= 0;

	$cmdref->($self, "title of", sub { $self->get_title_cmd(@_); });

	return $self;
}

my $title;

sub start_handler
{
	return if shift ne "title";
	my $self = shift;
	$self->handler(text => sub { $title = shift; }, "dtext");
	$self->handler(end  => sub { shift->eof if shift eq "title"; },
	                    "tagname,self");
}

sub get_title
{
	my ($what) = @_;
	my $url;

	# Drawn from RFC 3986Β§2
	if ($what =~ /(https?:\/\/[a-z0-9\-\._~:\/\?#\[\]@\!\$&'()\*\+,;=%]+)/i) {
		$url = $1;
	}
	return (undef, "No URL found in that string") unless $url;

	# FIXME add more XML-based formats that we can theoretically extract titles from
	# FIXME factor out accepted formats and response match into accepted formats array
	my $http = HTTP::Tiny->new((default_headers => {'Range' => "bytes=0-65536", 'Accept' => 'text/html, image/svg+xml'}, timeout => 3));

	my $response = $http->get($url);

	if (!$response->{success}) {
		if ($response->{status} == 599) {
			chomp $response->{content};
			return (undef, "Error: HTTP client: $response->{content}");
		} else {
			return (undef, "Error: HTTP $response->{status} ($response->{reason})");
		}
	}

	if (not $response->{headers}->{"content-type"}) {
		return (undef, "No content-type in reponse header, not continuing");
	}

	if (!($response->{headers}->{"content-type"} =~ m,(text/html|image/svg\+xml),)) {
		return (undef, "I don't think I can parse titles from $response->{headers}->{'content-type'} - stopping here");
	}

	my $html = $response->{content};
	utf8::decode($html);

	$title = "";
	my $p = HTML::Parser->new(api_version => 3);
	$p->handler( start => \&start_handler, "tagname,self");
	$p->parse($html);
	return (undef, "Error parsing HTML: $!") if $!;

	$title =~ s/\s+/ /g;
	$title =~ s/(^\s+|\s+$)//g;

	utf8::upgrade($title);
	return (undef, "Error: No title") unless $title;

	my $shorturl = $url;
	# remove http(s):// to avoid triggering other poorly configured bots
	$shorturl =~ s,^https?://,,g;
	$shorturl =~ s,/$,,g;

	# truncate URL without http(s):// to configured length if needed
	$shorturl = (substr $shorturl, 0, $config->{url_len}) . "…" if length ($shorturl) > $config->{url_len};

	my $composed_title = "$title ($shorturl)";
	return $composed_title;
}

sub get_title_cmd
{
	my ($self, $irc, $logger, $who, $where, $ided, $rest, $no_reenter, @arguments) = @_;

	my ($title, $error) = get_title($rest);
	$logger->($error) if $error;

	return $error if $error;
	$no_reenter->();
	return $title if $title;

}

sub on_message
{
	my ($self, $logger, $who, $where, $raw_what, $what, $irc) = @_;

	my ($title, $error) = get_title($what);
	$logger->($error) if $error;

	return $title if $title;
}

sub on_action {
	on_message(@_);
}
1;