#!/usr/pkg/bin/perl
# $Id: listdup,v 1.4 2006/11/10 11:40:59 abs Exp $

use strict;
use warnings;
use Getopt::Std;
use Digest::SHA1;

my ( %bysize, %opt, $minsize, @dups );

$minsize = 0;

if ( !getopts( 'hnrs:E:I:vl:q', \%opt ) || $opt{h} || !@ARGV ) {
    print "Usage: listdup [opts]
opts:	-h	 This help
	-n	 No action - do not make any changes
	-l log	 Record deletions in log
	-r	 Remove duplicate files
	-s size  Set minimum size to check. Can take k or m suffix
	-I regex Only include files that match regex
	-E regex Exclude files that match regex
	-q	 Quiet - no output unless errors
	-v	 Verbose
";
    exit(1);
}

if ( $opt{s} ) {
    if    ( $opt{s} =~ /(.*)k/i ) { $minsize = $1 * 1024; }
    elsif ( $opt{s} =~ /(.*)m/i ) { $minsize = $1 * 1024 * 1024; }
    else                          { $minsize = $opt{s}; }
}

# List duplicate files (currently just checks size & md5's match)

verbose("Scan @ARGV and calculate file sizes\n");
foreach my $path (@ARGV) {
    -e $path || die("Unable to access $path: $!");
    add($path);
}

foreach my $size ( sort { $a <=> $b } keys %bysize ) {
    if ( @{ $bysize{$size} } > 1 ) {
        my %bysha1;
        verbose( "size $size", join( "\n  ", '', sort @{ $bysize{$size} } ),
            "\n" );
        foreach my $filename ( @{ $bysize{$size} } ) {
            open( FILE, $filename ) || die("Unable to read $filename: $!");
            my $sha1 = Digest::SHA1->new;
            $sha1->addfile(*FILE);
            my $digest = $sha1->b64digest;
            close(FILE);
            push( @{ $bysha1{$digest} }, $filename );
        }
        foreach my $sha1 ( keys %bysha1 ) {
            if ( @{ $bysha1{$sha1} } > 1 ) {
                verbose( "  sha1 $sha1\n\t",
                    join( "\n\t", sort @{ $bysha1{$sha1} } ), "\n" );
                my @files = sort @{ $bysha1{$sha1} };
                push( @dups, { size => -s $files[0], files => \@files } );
            }
        }
        verbose("\n");
    }
}
verbose("\n");

if ( $opt{l} ) {
    open( LOG, ">$opt{l}" ) || die("Unable to write $opt{l}: $!");
}
foreach my $ent ( sort { $a->{size} <=> $b->{size} } @dups ) {
    if ( $ent->{size} > 1024 ) {
        progress( sprintf( "%dK\n", $ent->{size} / 1024 ) );
    }
    else { progress("$ent->{size}\n"); }
    my @remove;
    foreach my $file ( @{ $ent->{files} } ) {
        push( @remove, $file );
    }
    if ( $opt{r} ) {
        if ( @remove == @{ $ent->{files} } ) {
            progress( "\tkeep    ", shift @remove, "\n" );
        }
        foreach my $file (@remove) {
            progress("\tremove  $file\n");
            if ( !$opt{n} ) {
                unlink($file) || die("Unlink($file) failed: $!");
                print LOG $file if $opt{l};
            }
        }
    }
    else { progress( join( "\n", @remove ), "\n\n" ); }
}
close(LOG) if $opt{l};

exit;

sub add {
    my ($path) = @_;

    -l $path && return;
    if ( -d $path ) {
        my (@list);

        if ( !opendir( DIR, $path ) ) { die("Unable to opendir($path): $!"); }
        @list = grep( $_ ne '.' && $_ ne '..' && $_ ne '.svn', readdir(DIR) );
        closedir(DIR);
        foreach (@list) { add("$path/$_"); }
    }
    else {
        return
          if ( $opt{I} && $path !~ /$opt{I}/ )
          || ( $opt{E} && $path =~ /$opt{E}/ );
        if ( -s $path > $minsize ) { push( @{ $bysize{ -s $path } }, $path ); }
    }
}

sub progress { print @_ unless $opt{q}; }

sub verbose { print @_ if $opt{v}; }

