package Plugin::URL_Title; use strict; use warnings; use HTTP::Tiny; use HTML::Parser; use HTML::Entities; use utf8; use Encode; use IO::Uncompress::Gunzip qw/gunzip/; # For optional debug page dumping use IO::Compress::Gzip qw/gzip/; use MIME::Base64 qw/encode_base64/; use IdaliusConfig qw/assert_scalar/; my $config; sub configure { my $self = shift; my $cmdref = shift; shift; # run_command $config = shift; IdaliusConfig::assert_scalar($config, $self, "url_len"); die "url_len must be positive" if $config->{url_len} <= 0; # debug_dump is an optional parameter to dump base64(gzip(page_data)) into # the log - defaults to disabled and not checked here $cmdref->($self, "title of", sub { $self->get_title_cmd(@_); }); return $self; } # Globals set by HTML parser, used by get_title my $title; my $charset; my $content_type; # Content-Type header, picked from HTTP or http-equiv sub start_handler { my $tag = shift; my $attr = shift; my $self = shift; if ($tag eq "title") { # Note: NOT dtext. leave entities until after decoding text. See comment below $self->handler(text => sub { $title = shift; }, "text"); $self->handler(end => sub { shift->eof if shift eq "title"; }, "tagname,self"); } elsif ($tag eq "meta") { if ($attr->{charset}) { $charset = $attr->{charset}; } elsif ( $attr->{"http-equiv"} && lc($attr->{"http-equiv"}) eq "content-type" && $attr->{content}) { $content_type = $attr->{"content"}; } } } sub get_title { my ($logger, $what) = @_; my $url; # Drawn from RFC 3986§2 if ($what =~ /(https?:\/\/[a-z0-9\-\._~:\/\?#\[\]@\!\$&'()\*\+,;=%]+)/i) { $url = $1; } return (undef, "No URL found in that string", undef) unless $url; # FIXME add more XML-based formats that we can theoretically extract titles from # FIXME factor out accepted formats and response match into accepted formats array my %headers = ( "Range" => "bytes=0-65536", "Accept" => "text/html, image/svg+xml", "Acceot-Encoding" => "gzip" ); my $http = HTTP::Tiny->new((default_headers => \%headers, timeout => 3, agent => 'Mozilla/5.0 (compatible; idalius IRC bot)')); my $response = $http->get($url); if (!$response->{success}) { if ($response->{status} == 599) { chomp $response->{content}; return (undef, undef, "Error: HTTP client: $response->{content}"); } else { return (undef, undef, "Error: HTTP $response->{status} ($response->{reason})"); } } if (not $response->{headers}->{"content-type"}) { return (undef, undef, "No content-type in reponse header, not continuing"); } if (!($response->{headers}->{"content-type"} =~ m,(text/html|image/svg\+xml),)) { return (undef, undef, "I don't think I can parse titles from $response->{headers}->{'content-type'} - stopping here"); } my $html = $response->{content}; if ($response->{headers}->{"content-encoding"} && $response->{headers}->{"content-encoding"} == "gzip") { my $new_html; gunzip \$html => \$new_html or return (undef, undef, "Error: gzip decompression failed: $!"); $html = $new_html; } if ($config->{debug_dump}) { my $zipped; if (gzip \$html => \$zipped) { my $encoded = encode_base64($zipped, ""); $logger->("debug_dump for $url: $encoded"); } else { $logger->("debug_dump gzip error: $!"); } } $charset = undef; $content_type = undef; $title = undef; my $p = HTML::Parser->new(api_version => 3); $p->handler( start => \&start_handler, "tagname,attr,self" ); $p->parse($html); return (undef, undef, "Error parsing HTML: $!") if $!; # Pick out charset from the following in order of precedence: # 1. (stored in $charset already, if present) # 2. # 3. Content-Type HTTP header # 4. Default to "utf8" if (!$charset) { $content_type //= $response->{headers}->{"content-type"}; if ($content_type =~ m/;\s*charset=(\S+)/) { $charset = $1; } else { $charset = "utf8"; } } # Decode raw bytes from document's title my $dc = Encode::find_encoding($charset); return (undef, undef, "Error: Unknown encoding $charset") unless $dc; $title = $dc->decode($title); # Finally, collapse entities into the characters they represent. Note this # must be done instead of pulling dtext from the HTML parser, else you can # end up with a bad mix of encodings (see documents with wchars mixed with # entities representing more wchars in titles) decode_entities($title); # Normalise and trim whitespace for tidiness $title =~ s/\s+/ /g; $title =~ s/(^\s+|\s+$)//g; return (undef, undef, "Error: No title") unless $title; my $shorturl = $url; # remove http(s):// to avoid triggering other poorly configured bots $shorturl =~ s,^https?://,,g; $shorturl =~ s,/$,,g; # truncate URL without http(s):// to configured length if needed $shorturl = (substr $shorturl, 0, $config->{url_len}) . "…" if length ($shorturl) > $config->{url_len}; my $composed_title = "$title ($shorturl)"; return $composed_title; } sub get_title_cmd { my ($self, $irc, $logger, $who, $where, $ided, $rest, $no_reenter, @arguments) = @_; my ($title, $warning, $error) = get_title($logger, $rest); $logger->($error) if $error; return $error if $error; return $warning if $warning; $no_reenter->(); return $title if $title; } sub on_message { my ($self, $logger, $who, $where, $raw_what, $what, $irc) = @_; my ($title, $warning, $error) = get_title($logger, $what); # Log only errors, not warnings $logger->($error) if $error; return $title if $title; } sub on_action { on_message(@_); } 1;