#!/usr/bin/perl -w

=head1 NAME

ygspider - mirror the "Files" section of a Yahoo! Group

=head1 SYNOPSIS

ygspider [options] [group] [username] [password]

Where C<username> is your Yahoo! username and C<password> is your
password.

=head1 OPTIONS

=over

=item B<--help>

Generate a help message and exit.

=item B<--root> dir

Save mirrored files in this directory.  Default is a subdirectory of
the current directory named C<groups.yahoo.com>/C<group> where the final
C<group> is the name of the group.

=item B<--group> groupname

Spider this group.  This is an alternative to the positional argument.

=item B<--user> username

Specify a Yahoo! username to use for authentication.  This is an
alternative to the positional argument.

=item B<--password> password

Specify the password for the Yahoo! account.  This is an alternative
to the positional argument.

=back

=head1 AUTHOR

Caleb Epstein E<lt>caleb dot epstein at gmail dot comE<gt>

=head1 VERSION

$Id$

=cut

use strict;
use WWW::Mechanize;
use Getopt::Long;
use File::Basename;
use File::Spec::Functions;
use Pod::Usage;

# Settings gathered from the command line.
my ($group, $login, $passwd);
my $root;
my $progname = basename $0;

# Parse named options.  --help prints the usage text and exits with
# status 0: asking for help is not an error.  (pod2usage exits on its
# own, so no explicit exit is needed after it.)  Unknown options make
# GetOptions return false, in which case usage is shown with status 2.
GetOptions ("group=s" => \$group,
	    "user=s" => \$login,
	    "password=s" => \$passwd,
	    "root=s" => \$root,
	    "help!" => sub { pod2usage (-exitval => 0, -verbose => 1) })
  or pod2usage (2);

# Anything not supplied as a named option is taken from the remaining
# positional arguments, in the fixed order: group, username, password.
foreach my $setting (\$group, \$login, \$passwd) {
   next if defined $$setting;
   pod2usage (2) unless scalar @ARGV;
   $$setting = shift @ARGV;
}

# Default download location: ./groups.yahoo.com/<group>, made absolute.
$root = File::Spec->rel2abs ("groups.yahoo.com/$group") unless defined $root;

print "$progname: mirroring $group files using username $login:\n";

# Unbuffered STDOUT so progress lines appear as they are printed.
$| = 1;

# URLs already handled by mirror(), so no link is followed twice.
my %VISITED;

# mkdir_p ($dir)
#
# Create directory $dir together with any missing parent directories,
# like "mkdir -p".  Does nothing for an undefined or empty argument.
# Dies with a message naming the offending path and the OS error if a
# component cannot be created.
sub mkdir_p {
   my $dir = shift;

   return unless defined $dir and length $dir;

   # Split with File::Spec so the path is taken apart the same way
   # catdir puts it back together.
   my @parts = File::Spec->splitdir ($dir);
   my @sofar;

   foreach my $part (@parts) {
      push (@sofar, $part);
      my $path = File::Spec->catdir (@sofar);
      next if -d $path;

      # If mkdir fails but the directory now exists, something else
      # created it between the -d test and the call; that is fine.
      mkdir ($path, 0777) or -d $path
	or die "mkdir $path: $!\n";
   }

   return;
}

# mirror ($mech)
#
# Walk the links of the page currently loaded in $mech (a
# WWW::Mechanize object).  Links that point into the group's "Files"
# area or at a grp.yahoofs.com host are processed once each (tracked in
# %VISITED):
#
#   * file links are downloaded below $root, unless the target file
#     already exists;
#   * every other matching link is treated as a directory page and
#     mirrored recursively.
#
# After each fetch the browser is stepped back so that $mech shows the
# same page on return as it did on entry.  Dies on fetch or write errors.
sub mirror {
   my $mech = shift;

   # Snapshot of the current page's links; the page itself changes
   # while we fetch and step back.
   my @LINKS = @{$mech->links};

   foreach my $link (@LINKS) {
      my $url = $link->url_abs ();

      # \Q..\E keeps regex metacharacters in the group name literal.
      next unless $url =~ m!(?: /group/\Q$group\E/files/ |
				grp\.yahoofs\.com )!x;
      next if exists $VISITED{$url};

      $VISITED{$url} = 1;

      if ($url =~ m@http://.+\.grp\.yahoofs\.com/v1/[\w_-]{94}/(.+)$@) {
	 my $relpath = $1;

	 # The path comes from the remote page: never let ".." climb
	 # out of the mirror root.
	 if (grep { $_ eq '..' } split m!/!, $relpath) {
	    warn "SKIP: $url (path contains '..')\n";
	    next;
	 }

	 my $filename = catfile ($root, $relpath);
	 next if -e $filename;
	 my $dir = dirname $filename;
	 mkdir_p ($dir) unless -d $dir;
	 print "FILE: $url -> $filename\n";

	 # Fetch by absolute URL; follow() by link index is documented
	 # as deprecated in WWW::Mechanize.
	 $mech->get ($url) or die "get $url failed\n";

	 open (my $fh, '>', $filename) or die "open $filename: $!\n";
	 binmode ($fh);		# downloaded files may be binary
	 print {$fh} $mech->content () or die "write $filename: $!\n";
	 close ($fh) or die "close $filename: $!\n";
	 $mech->back ();
      } else {
	 print "DIR: $url\n";
	 $mech->get ($url) or die "get $url failed\n";
	 mirror ($mech);
	 $mech->back ();
      }
   }
}

# Browser object.  The onerror handler turns any WWW::Mechanize error
# into a fatal die.
my $mech = WWW::Mechanize->new (agent => "Mozilla",
				onerror => sub { die @_ })
  or die "$progname: cannot create WWW::Mechanize object\n";

my $url = "http://groups.yahoo.com/group/$group/files/";

print "ROOT: $url\n";

# Fetch main page, but we will need to authenticate
$mech->get ($url)
  or die "$progname: cannot fetch $url\n";

# Follow login link
$mech->follow_link (url_regex => qr/login\.yahoo\.com/)
  or die "$progname: no login link found on $url\n";

# Submit login form
$mech->submit_form (form_name => 'login_form',
		    fields =>
		    { login => $login,
		      passwd => $passwd })
  or die "$progname: login form submission failed\n";

# Re-fetch main page and start mirroring
$mech->get ($url)
  or die "$progname: cannot fetch $url after login\n";

mirror ($mech);
