find-dups.sh

 1: #!/bin/sh -
 2: # This script compares the MD5 checksum for all plain files
 3: # in the current directory, and reports any duplicates found.
 4: # (Of course, hard links will be reported as duplicates too,
 5: # as will all zero length files.)
 6: # This works by creating an array "md5", indexed by the MD5 sum,
 7: # and containing the filename.  Since not all shells support
 8: # arrays dynamically named variable are used for the same effect.
 9: #
10: # This highly portable script was adapted from one by
11: # Peter Seebach in his book "Beginning Portable Shell Scripting"
12: # (C) 2008 by Apress, page 113.
13: # Adapted 11/2009 by Wayne Pollock, Tampa Florida USA
14: # $Id: find-dups.sh,v 1.0 2009/11/11 19:09:27 wpollock Exp $
15: #
16: # TODO: Add command line argument of directory to process,
17: #       A recursive option,
18: #       a help option
19: 
20: # Set PATH to find all POSIX utilities:
21: PATH=$(getconf PATH)
22: 
23: # Initialize MD5SUM if not already set:
24: : ${MD5SUM="md5sum"}
25: 
26: # Only find plain files, and only in the current directory:
27: find . ! -name . -prune -type f -print | while IFS= read FILE
28: do
29:     sum=`"$MD5SUM" "$FILE" | awk '{print $1}'`
30: 
31:     # Make a reference to the array element md5_$sum:
32:     eval assoc=\$md5_$sum
33: 
34:     # Check if a previous file with this sum was found:
35:     if test -z "$assoc"
36:     then
37:         # Put filename into array element md5_$sum:
38:         eval md5_$sum=\$FILE
39:     else
40:         # Display current and previous file names with same MD5 sum:
41:         printf '   Duplicates: "%s" and "%s"\n' "$FILE" "$assoc"
42:     fi
43: done