Stories
Slash Boxes
Comments
NOTE: use Perl; is on undef hiatus. You can read content, but you can't post it. More info will be forthcoming forthcomingly.

All the Perl that's Practical to Extract and Report

use Perl Log In

Log In

[ Create a new account ]

ajt (2546)

ajt
  (email not shown publicly)
http://www.iredale.net/

UK based. Perl, XML/HTTP, SAP, Debian hacker.

  • CPAN: ATRICKETT [cpan.org]
  • PerlMonks: ajt [perlmonks.org]
  • Local LUG: AdamTrickett [lug.org.uk]
  • Debian Administration: ajt [debian-adm...ration.org]
  • LinkedIn: drajt [linkedin.com]

Journal of ajt (2546)

Monday July 31, 2006
07:45 AM

Finding Duplicate Files #1

[ #30485 ]

Here is try number 1 (warts and all):

#!/usr/bin/perl

#
# $Id: fdf.pl,v 1.1.1.1 2006-07-29 14:26:57 adam Exp $
#

use strict;
use warnings;
use File::Find;
use File::Glob;
use Digest::SHA;
use Getopt::Std;

# Command-line switches as filled in by getopts():
#   -d debug, -v verbose, -h help (flags)
#   -g <glob>, -l <bytes>, -u <bytes>, -o <file> (take a value)
# NOTE(review): -g is parsed and echoed in the verbose banner below, but
# nothing in this part of the script applies it to the scan - confirm intent.
my %options;
my $VERSION = 0.1;

# getopts() returns false when it meets an unknown switch or a switch that is
# missing its argument (it has already warned on STDERR by then). Stop here
# rather than scanning with half-parsed options.
if ( !getopts( 'dvg:l:u:ho:', \%options ) ) {
    show_usage();
    die "\nERROR: Invalid command line options.\n\n";
}

if ( $options{'h'} ) {
    show_usage();
    exit;
}

# Whatever getopts() left in @ARGV is the list of directories to scan.
my @start_dirs = @ARGV;

if ( scalar @start_dirs < 1 ) {
    show_usage();
    die "\nERROR: No start directory provided.\n\n";
}

foreach my $start_dir (@start_dirs) {
    die "Unable to locate start Directory: $start_dir\n" unless -d $start_dir;
}

# The limits are only cross-checked when both were supplied (and are non-zero).
die
    "Upper Limit ($options{'u'}) is less than the Lower Limit ($options{'l'}).\n"
    if ( ( $options{'u'} && $options{'l'} )
    && ( $options{'u'} < $options{'l'} ) );

# Debug output is a superset of verbose output.
if ( $options{'d'} ) {
    $options{'v'} = 1;
}

# Progress chatter goes to STDERR so that it never mixes with the report.
if ( $options{'v'} ) {
    print {*STDERR} "Find Duplicate Files v$VERSION (verbose mode)\n";
    print {*STDERR} "         Search GLOB: $options{'g'}\n" if $options{'g'};
    print {*STDERR} "   Minimum file size: $options{'l'}\n" if $options{'l'};
    print {*STDERR} "   Maximum file size: $options{'u'}\n" if $options{'u'};
    print {*STDERR} "     Output Log File: $options{'o'}\n" if $options{'o'};

    foreach my $start_dir (@start_dirs) {
        print {*STDERR} "Finding all files in: $start_dir\n";
    }
    print {*STDERR} "\nThis may take a while...\n\n";
}

# State shared with pass_one(), which File::Find calls back without arguments:
#   %size_by_files  file size in bytes => array ref of the files of that size
#   $sub_total      bytes in files whose size matched an earlier file
our %size_by_files;
our $sub_total = 0;

# Make File::Find examine every entry instead of trusting directory link
# counts, which not every filesystem maintains.
$File::Find::dont_use_nlink = 1;
find( \&pass_one, @start_dirs );

if ( $options{'v'} ) {
    print {*STDERR}
        "Pass 1 complete. Possibility of $sub_total bytes of duplication.\n",
        "Now calculating checksums. This may take a little while longer...\n\n";
}

# Despite the variable's name, pass_two() returns a hash keyed by checksum:
# digest => array ref of the files that produced it.
my $dupes_by_size = pass_two( \%size_by_files );

# The report goes to STDOUT unless -o names a log file.
my $output;
my $report = \*STDOUT;

if ( $options{'o'} ) {
    open $output, '>', $options{'o'}
        or die "Unable to write to log file $options{'o'}: $!\n";
    $report = $output;
}

# One group per checksum shared by at least two files: a "digest<TAB>file"
# line for each member, then a blank line.
foreach my $key ( sort keys %{$dupes_by_size} ) {
    my @files = @{ $dupes_by_size->{$key} };
    next if scalar @files < 2;

    foreach my $file (@files) {
        print {$report} "$key\t$file\n";
    }
    print {$report} "\n";
}

# Buffered write errors only surface at close, so check it.
if ($output) {
    close $output
        or die "Unable to finish writing log file $options{'o'}: $!\n";
}

exit;

# File::Find callback for pass 1: group candidate files by size.
#
# Takes no arguments; File::Find supplies the entry through $_ (name inside
# the directory being visited) and $File::Find::name (path including the
# start directory). Readable, non-empty plain files whose size lies within
# the optional -l / -u limits are appended to $size_by_files{<size>}.
# Every file after the first of a given size adds its size to $sub_total.
# Returns nothing.
sub pass_one {
    my $filename = $File::Find::name;

    # File::Find has chdir()'d into the directory being visited, so the entry
    # has to be tested through $_. Testing $File::Find::name here would
    # resolve a relative start directory a second time and match nothing.
    # The full path is still what gets recorded for later use.
    return unless -f $_ && -r _;

    # "_" reuses the stat data from the -f test above.
    my $size = -s _;
    return unless $size > 0;

    # A limit that is unset (or 0) is treated as "no limit".
    return if $options{'l'} && $size < $options{'l'};
    return if $options{'u'} && $size > $options{'u'};

    # A size that has been seen before makes this file a potential duplicate.
    if ( $size_by_files{$size} ) {
        $sub_total += $size;
    }
    push @{ $size_by_files{$size} }, $filename;

    return;
}

# Pass 2: checksum every file that shares its size with another file.
#
# Parameters:
#   $file_list - hash ref, file size => array ref of file names
# Returns:
#   hash ref, hex digest => array ref of the files that produced it. Digests
#   seen only once are included too; the caller filters on group size.
# Side effects:
#   Warns about files that could not be checksummed and skips them. With -d
#   prints "digest<TAB>file<TAB>size" per file to STDERR; with -v prints a
#   summary to STDERR.
sub pass_two {
    my $file_list  = shift;
    my $dupe_total = 0;
    my %dupe_files;

    # Sizes are numbers, so walk them in numeric order.
    foreach my $size ( sort { $a <=> $b } keys %{$file_list} ) {
        my @files = @{ $file_list->{$size} };

        # A size held by a single file cannot have a duplicate.
        next if scalar @files < 2;

        foreach my $file (@files) {

            # Digest::SHA->new() without an argument uses SHA-1; "b" reads
            # the file in binary mode. The eval guards against a file that
            # vanished or became unreadable since pass 1.
            my $digest = eval {
                Digest::SHA->new()->addfile( $file, "b" )->hexdigest;
            };

            if ( !$digest ) {
                my $reason = $@ || "unknown error\n";
                warn "Unable to checksum $file, skipping: $reason";
                next;
            }

            # Every file after the first with a given digest is a duplicate.
            if ( $dupe_files{$digest} ) {
                $dupe_total += $size;
            }
            push @{ $dupe_files{$digest} }, $file;

            if ( $options{'d'} ) {
                print {*STDERR} "$digest\t$file\t$size\n";
            }
        }
    }

    if ( $options{'v'} ) {
        print {*STDERR}
            "Pass 2 complete. $dupe_total bytes of duplicates found.\n";
        if ( $options{'o'} ) {
            print {*STDERR} "Results logged to $options{'o'}\n\n";
        }
        else {
            print {*STDERR} "Results are show below:\n\n";
        }
    }
    return \%dupe_files;
}

# Print the usage summary, copyright and licence notice to the currently
# selected output handle. Takes no arguments and returns nothing.
#
# NOTE(review): the option string passed to getopts() also accepts -g <glob>,
# but nothing visible in this script acts on it, so it is left undocumented
# here - confirm whether it should be implemented or removed.
sub show_usage {

    print <<"USAGE";

This is Find Duplicate Files version $VERSION

Usage:
    fdf [ -l <bytes> ] [ -u <bytes> ] [ -v ] [ -d ] [ -o <output_log> ] <path_to_scan> ...

Options:
    -l  Lower limit of files size to scan, in bytes
    -u  Upper limit of files size to scan, in bytes
    -v  Verbose mode (sent to *STDERR)
    -d  Debug mode: implies -v and lists every checksum (sent to *STDERR)
    -o  Output log
    -h  This usage note

Copyright:
    Copyright Adam John Trickett / iredale consulting 2006

Licence:
    OSI Certified Open Source Software.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public Licence as published
by the Free Software Foundation; either version 2 of the Licence,
or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public Licence for more details.

You should have received a copy of the GNU General Public Licence
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111, USA.

USAGE

    return;
}

The Fine Print: The following comments are owned by whoever posted them. We are not responsible for them in any way.
 Full
 Abbreviated
 Hidden
More | Login | Reply
Loading... please wait.