#!/usr/bin/perl -w
# $Id: p0rn-bot,v 1.12 2004/11/20 21:04:16 mitch Exp $
#
# automatically register pages with p0rn-proxy
#
# 2004 (C) by Christian Garbs <mitch@cgarbs.de>
# Licensed under GNU GPL.  See COPYING for details.

use strict;
use AppConfig qw(:expand);
use HTML::Parser;
use LWP::UserAgent;
use P0rn::Static;
use URI;

=head1 NAME

p0rn-bot - register pages with p0rn-proxy automatically

=head1 SYNOPSIS

B<p0rn-bot>

S<[ B<--depth> I<depth> ]>
S<[ B<--help> ]>
S<[ B<--proxy> I<proxy> ]>
S<[ B<--version> ]>

B<p0rn-proxy>
S<[ B<-c> I<configfile> ]>
S<[ B<-h> ]>
S<[ B<-P> I<proxy> ]>
S<[ B<-V> ]>

=head1 OVERVIEW

p0rn-bot is a script that automates p0rn-proxy handling.  It starts
with a given page and then follows all links recursivly.  All pages
encountered are registered with p0rn-proxy either as thumnails sites
or as galleries for download.

=head1 DESCRIPTION

Be sure to have L<p0rn-proxy(1)> running.  Then start B<p0rn-bot> with
the URL of a known thumbnail gallery page.  Now wait and see p0rn-bot
do it's work.

When p0rn-bot is finished, the pages are marked for download.  Run
L<p0rn-download(1)> to actually get them.

=head2 Switches

=over 5

=item B<--depth> I<depth> | B<-d> I<depth>

This sets how 'deep' links are being followed.  0 will only look at
the given start URL, 1 will visit the start URL plus all sites that
are linked on it and so forth...

Default depth is 2.

=item B<--help> | B<-h>

This prints a short help text and exits.

=item B<--proxy> I<proxy> | B<-P> I<proxy>

This gives the address where the p0rn-proxy is running.  It is given
in the form 'http://hostname:portnumber'.

Default is to use 'http://localhost:8080' as this is the default port
that p0rn-proxy listens to.

=item B<--version> | B<-V>

This prints the current version of japana and exits.

=back

=head1 SEE ALSO

L<p0rn-proxy(1)>

=head1 MODULES NEEDED

 use AppConfig;
 use HTML::Parser;
 use HTTP::Daemon;
 use LWP::UserAgent;
 use URI;

These modules can be obtained from L<http://www.cpan.org>.

=head1 BUGS

Please report bugs by mail to <F<p0rn-bugs@cgarbs.de>>.

=head1 AUTHOR

p0rn-bot was written by Christian Garbs <F<mitch@cgarbs.de>>.

=head1 AVAILABILITY

Look for updates at L<http://www.cgarbs.de/p0rn-comfort.en.html>.

=head1 COPYRIGHT

p0rn-bot is licensed under the GNU GPL.

=cut

######[ Global Variables ]
#

my $CVSVERSION = do { my @r = (q$Revision: 1.12 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
my ($status, $base, $ua, $parser);
my @pics;
my @pages;
my @todo;
my ($count_pic, $count_thumb, $count_empty) = (0, 0, 0);

######[ Subroutines ]
#

sub debug(@)
# print debugging message
{
    warn "@_\n";
}

sub mark_down($)
# add link to download list
{
    $ua->get("${Static::PROXY_CTRL_BASE}/${Static::PROXY_MARK_DOWNLOAD}/$_[0]");
}

sub mark_thumb($)
# add link to thumbnail list
{
    $ua->get("${Static::PROXY_CTRL_BASE}/${Static::PROXY_MARK_THUMB}/$_[0]");
}

sub unmark_thumb($)
# remove link from thumbnail list
{
    $ua->get("${Static::PROXY_CTRL_BASE}/${Static::PROXY_UNMARK_THUMB}/$_[0]");
}

sub start_handler
# callback for HTML::Parser to react on link tags
{
    return unless shift eq "a";
    my $attr = shift;
    my $link = $attr->{href};

    return unless defined $link;
    $link =~ s/\#.*$//;
    return if $link eq $base;
    return if $link =~ m,^$Static::PROXY_CTRL_BASE,;
    return if $link =~ /^javascript:/;
    return if $link =~ /^mailto:/;
    return if $link =~ /^\s*$/;
    
    $link = URI->new_abs($link, $base)->as_string;

    if ($link =~ /\.(jpe?g|wmv|mpe?g|avi)$/i) {
	push @pics, $link;
    } else {
	push @pages, $link;
    }
}

sub parse_page($$)
# parse an HTML page
{
    my ($response, $urls) = (@_);
    
    $base = $response->base;
    @pics = ();
    @pages = ();
    $parser->parse($response->content);

    $status .= $base . ' ' . @pics . '/' . @pages . ' ';

    if (@pics + @pages > 0) {
	if (@pics > @pages) {
	    $status .= 'PICTURE';
	    mark_down($base);
	    $count_pic++;
	} else {
	    $status .= 'thumbnail';
	    mark_thumb($base);
	    push @{$urls}, (@pages);
	    $count_thumb++;
	}
    } else {
	$status .= "empty";
	$count_empty++;
    }
}

sub check_url($$);
sub check_url($$)
# fetch an HTML page and process it
{
    my ($url, $recurse) = @_;
    my @urls;

    return unless defined $url;
    $recurse = 0 unless defined $recurse;
    
    $status = sprintf "[%02d] ", $recurse;
    my $response = $ua->get($url);
    
    if ($response->is_success) {
	parse_page( $response, \@urls );
    } else {
	$status .= "$url failed " . $response->code;
    }
    
    debug $status;

    if ($recurse) {
	# dedupe
	my %urls = map { $_ => 0 } @urls;
	foreach $url (keys %urls) {
	    push @todo, [$url, $recurse - 1];
	}
    }
}

sub check_proxy($)
# check for a running p0rn-proxy
{
    my $response = $ua->get("${Static::PROXY_CTRL_BASE}/${Static::PROXY_GET_VERSION}/_nouri_");
    if ($response->is_success) {
	foreach (split /\n/, $response->content) {
	    if ( /^VERSION: (.*)$/) {
		print "found p0rn-proxy $1\n";
		last;
	    }
	}
    } else {
	print "no p0rn-proxy found at '$_[0]'\n";
	exit 1;
    }
}

sub print_usage()
# prints a short help text and exits
{
    print << "EOF";

Usage: p0rn-bot [options] start_url
Supported options (long and short forms):
  -d, --depth         : set link recursion depth
  -h, --help          : print usage and exit
  -P, --proxy         : set proxy to use
  -v, --version       : print version number and exit
EOF
;
    exit 0;
}

######[ Main program ]
#
   
print "this is p0rn-bot ${Static::VERSION}/${CVSVERSION}\n";

# define configuration options
my $config = AppConfig->new( { CASE => 1 } );
$config->define( 'depth|d=s',         { DEFAULT => '2' } );
$config->define( 'proxy|P=s',         { DEFAULT => 'http://localhost:8080' } );
$config->define( 'version|V!' );
$config->define( 'help|h!' );

# override configuration with command line arguments
$config->getopt( qw(no_ignore_case), [ @ARGV ] );
my $starturl = shift;

# if we are to just print the version number, then quit now
exit if $config->version();
		 
# if we are to just print help, then do it now
print_usage() if $config->help();

# create UserAgent
$ua = LWP::UserAgent->new;
$ua->agent("p0rn-bot ${Static::VERSION}/${CVSVERSION}");
$ua->timeout(10);
$ua->proxy('http', $config->proxy());

# check settings
check_proxy($config->proxy());
die "no url given\n" unless (defined $starturl and $starturl ne '');

# create HTML Parser
$parser = HTML::Parser->new(api_version => 3);
$parser->handler( start => \&start_handler, "tagname,attr");

# unhide start URL
unmark_thumb($starturl);

# go
my $count = 0;
push @todo, [$starturl, $config->depth()];
while (@todo) {
    $count ++;
    my ($url, $depth) = @{shift @todo};
    check_url($url, $depth);
    print "(done/queued $count/".scalar @todo.")\n" unless $count % 25;
}

# finished
print "finished.  pic/thumb/empty: ${count_pic}/${count_thumb}/${count_empty}\n";
