#!/usr/bin/perl -w 
#############################################################
#  This script takes a path as input and traverses shn/flac file sets recursively downloading info text and 
# checksum files from db.etree.org into any fileset with a shnid in the directory name.
# relies on command line "unzip" program, from www.info-zip.org
# On Win32: ftp://tug.ctan.org/tex-archive/tools/zip/info-zip/WIN32/unz552xN.exe
# Other platforms: ftp://tug.ctan.org/tex-archive/tools/zip/info-zip/
#  - run with --help for the manual, including info on options
#  - originally written by Dr. Unclear, distributed under GPL license
#############################################################

use strict;
use warnings;
use Getopt::Long;
use LWP::Simple;
use Cwd;
use File::Find;
use File::Spec;
use File::Copy;

my ($unzip, $wdir, @dirs, $absdir, $options, @existing_files, $opt_pattern, $single_dir, $opt_wav, $opt_del, $opt_id, $help_me, $get_etree, $etree_zip_url, $shnid, $basename, $id_etree,%rename_map, $orig_name, $new_name);

#########################################
# Set path to unzip program (www.info-zip.org)
# If unzip is already in your path, comment the next line out with a # in front
#$unzip = 'c:\temp\zip with spaces\unzip.exe';
#########################################

$single_dir = $opt_wav = $help_me = $opt_id = $opt_del = 0;
$options = GetOptions ("single|s" => \$single_dir,
                       "wav|w" => \$opt_wav,
                       "help|h" => \$help_me,
                       "delete|d" => \$opt_del,
                       "identify|i" => \$opt_id,
                       "pattern|p=s" => \$opt_pattern); 

&helpme if $help_me ==1;
                       
# Get path from the command line argument
my $path = shift                       
   or die "usage: $0 [options] (/)path/to/verify/\nUse $0 --help for more detailed help.";

# check if the above files exist
if (($unzip) && (! -f $unzip)) { die "unzip path not set properly.  Please see --help for more information." };

# if path to unzip isn't set explicitly, then rely on the fact it's in the path
$unzip = 'unzip' if (!$unzip);

print "Using \"$unzip\"...\n";

# if directory match pattern is not set as option, use a default
if (!$opt_pattern) {
   $opt_pattern = '^gd\d{2,4}';
} else {
   print "Looking for directories matching pattern: $opt_pattern\n\n";
}

# Set the original working dir so we can find our way home
$wdir = getcwd;

# Unless we've opted to use a single directory, read matching directories into an array
if ($single_dir == 0) {
   find( sub { push @dirs, $File::Find::name if -d && /$opt_pattern/ }, $path );
   print "No matching directories found!\n" if (!@dirs);
} else {
   # we've opted to use just one directory, so the array will contain just the path we fed the script
   push (@dirs, $path);
}

# loop through each matching directory and find ffp and md5 files
foreach (@dirs) {
   # set the absolute path from the relative path, for use in command line functions
	#$absdir = File::PathConvert::rel2abs($_, $wdir) ;
   $absdir = File::Spec->rel2abs($_, $wdir);
   
   # does this seed have a clearly identifiable shnid in the directory name?
   $shnid = '';
   # TODO: add opt_pattern before the shnid regex
   next unless ($absdir =~ m#\.(\d{2,6})\.#);
   # ignore leading zeros on the shnid
   $1 =~ /^0*([1-9]\d{1,5})/ ;
   $shnid = $1;
   
   print "=============================================\nWorking with $absdir:\nshnid: $shnid\n\n";
   chdir $absdir or next ">>> Can't chdir to $absdir:$!\n";

   &deletefiles($absdir) if $opt_del == 1;
   
   #DL the seed's etree text and checksum files as a zip
   $etree_zip_url = "http://db.etree.org/shn_downloadzip.php?shnid=$shnid";
   print "Downloading files from db.etree.org...   ";
   $_ = get($etree_zip_url);
   my $zip_file = "${shnid}.zip";
   open ZIP, ">", $zip_file or die $!;
   binmode(ZIP);
   print ZIP $_;
   close ZIP or die $!;
   print "[SUCCESS]\n";
   
   # Unzip into position
   # TODO : make this option cleaner
   # TODO: get rid of the caution from unzip when no wav md5 file is present
   if ($opt_wav ==1) {
      $_ = `"$unzip" -j ${shnid}.zip "*.txt" "*.md5"`;
   } else {
      $_ = `"$unzip" -j ${shnid}.zip "*.txt" "*.md5" -x "*wav*.md5"`;
   }
   unlink("${shnid}.zip");
   
   #### Rename files
   # TODO: move into a subroutine that is called only if we find a matching basename
   # TODO: How do we know we're not renaming existing files?
   # First get the base name for files from the info text file
   find( sub { $basename = $_ if -f && /^gd\d\d.+\.\d{1,6}\.txt$/ }, $absdir );
   $basename =~ /(.+?)\.txt/ ;
   $basename = $1;
   print "basename is $basename\n";

   # if option identify is set, then add ".etree" to filenames
   if ($opt_id == 1) {
      $id_etree = '.etree';
   } else {
      $id_etree = '';
   }
   
   # db.etree default file name suck.  This is a map of files to rename.
   %rename_map = (
   'flac-md5.md5' => "${basename}${id_etree}.md5", 
   'ffp.md5' => "${basename}${id_etree}.ffp", 
   'shn-md5.md5' => "${basename}${id_etree}.md5",
   'tot.md5' => "${basename}${id_etree}.md5",
   'all.md5' => "${basename}${id_etree}.md5",
   'all 3.md5' => "${basename}${id_etree}.md5",
   'both.md5' => "${basename}${id_etree}.md5",
   'wav.md5' => "${basename}${id_etree}.wav.md5",
   'wav-md5.md5' => "${basename}${id_etree}.wav.md5",
   'st5.md5' => "${basename}${id_etree}.st5",
   'cfp.md5' => "${basename}${id_etree}.cfp");
   
   # Rename checksum files we have that match our rename map
   foreach $orig_name (keys %rename_map) {
      $new_name = $rename_map{$orig_name};
      if (-f "$absdir/$orig_name" && ! -f "$absdir/$new_name") {
         move("$absdir/$orig_name","$absdir/$new_name");
      } elsif (-f "$absdir/$orig_name" && -f "$absdir/$new_name") {
         print "$absdir/$new_name already exists, leaving as $orig_name\n";
      }
   }
   print "\nDone with seed.\n";
}

######################################333
## SUBROUTINES
sub deletefiles {
	# empty the container arrays out
   my ($absdir) = @_  ;
	my @existing_files;
	# add all checksum and text files under this directory to an array
   find( sub { push @existing_files, $File::Find::name if -f && /\.cfp$|\.st5$|\.md5$|\.ffp$|\.txt$/i && $_ !~ /shntool/}, $absdir );
   
   # delete existing text and  checksum files
   foreach (@existing_files) {
          print "Deleting $_ ... \n";
          unlink($_); 
   }
}

sub helpme {
   my $help = <<HELPTEXT;
This script is designed to traverse a directory
containing shn or flac filesets and downloads
checksum and info text files from db.etree.org 
for any fileset with a discernable shnid.  
Why? It is recommended before any file set is seeded that 
you verify the file set against checksums from 
db.etree.org to make absolutely sure you're not 
seeding a file set with errors.  This script aims 
to facilitate that checking process.

By default, the script will traverse the input path 
looking for directories that start with the "gdYY*", 
where YY is meant to be a year designator (gdYY and 
gdYYYY directories will both work).  You can
override the default with the --pattern= option 
(see below).

This script should run on Windows, Mac OS X, Linux,
Cygwin, or just about any OS with perl installed.

OPTIONS:
---------------------------------------------
Option flags are taken prior to the path in 
command line usage.  Current options include:

  --single, -s: use this flag if you're looking 
  to verify just a single flac/shn fileset.  
  When this flag is set, the path inputed is not 
  traversed recursively.

  --wav, -w: by default, this script will skip
  wav md5s.  If you include this option, wav md5s
  will not be skipped.
  
  --delete, -d: by default, your existing text and
  checksum files are preserved.  Adding this flag
  deletes your existing text and checksum files before
  extracting the new ones from db.etree.org.  Note,
  any text file with "shntool" in the name is 
  preserved, even with this option.

  --pattern=regex, -p:  use this flag to override 
  the default directory pattern matching.  Uses 
  standard regular expressions.  See example below. 
  
DEPENDENCIES:
--------------------------------------------
This script relies on info-zip's "unzip" tool. You
may already have this tool on your computer, so 
open a command prompt/terminal and type "unzip"
to see if you do.

If not, this tool can be downloaded from:
http://www.info-zip.org/

On Windows 32-bit platforms, you can download 
and extract:
ftp://tug.ctan.org/tex-archive/tools/zip/info-zip/WIN32/unz552xN.exe
See the website for other platforms.

Once downloaded and extracted, find the "unzip" program
itself.  On Win 32 this is "unzip.exe".  Then
edit this script and modify the location of unzip to 
match the location on your computer. 

USAGE:
---------------------------------------------
Example - traverse a directory tree updating
checksums from db.etree.org that exist under the path:
  perl etree-verify.pl (/)path/to/yourdir/
  
Example - find directories starting with paf followed 
by a two digit year:
  perl etree-verify.pl --pattern=^paf\\\\d\\\\d
  
Example - test a single seed, and include wav 
md5s, delete existing checksums and info texts:
  perl etree-verify.pl --single --wav -- delete path/to/gdYYseed/
  
CREDITS:
Originally written by Dr. Unclear, distributed 
under the GPL license.
HELPTEXT
print $help;
exit 0;
}

