#!/bin/sh -
# find-dups.sh
#
# This script compares the MD5 checksum of all plain files
# in the current directory, and reports any duplicates found.
# (Of course, hard links will be reported as duplicates too,
# as will all zero length files.)
#
# This works by creating an array "md5", indexed by the MD5 sum,
# and containing the filename.  Since not all shells support
# arrays, dynamically named variables are used for the same effect.
#
# Environment:
#   MD5SUM  Checksum command to run (default: md5sum).  It is given
#           the file's data on standard input and must print a
#           hexadecimal checksum as the first field of its output.
#
# Output:
#   One line on stdout per duplicate found, naming the file and the
#   first file seen with the same checksum.  Files that cannot be
#   checksummed are reported on stderr and skipped.
#
# Limitation: file names containing a newline are not handled,
# because the list of files is read one line at a time.
#
# This highly portable script was adapted from one by
# Peter Seebach in his book "Beginning Portable Shell Scripting"
# (C) 2008 by Apress, page 113.
# Adapted 11/2009 by Wayne Pollock, Tampa Florida USA
# $Id: find-dups.sh,v 1.0 2009/11/11 19:09:27 wpollock Exp $
#
# TODO: Add command line argument of directory to process,
#       a recursive option,
#       a help option

# Put the directories holding the POSIX utilities first in PATH.
# The caller's PATH is kept after them, because md5sum is not a POSIX
# utility and may be installed elsewhere.  If getconf fails, PATH is
# left untouched rather than being emptied.
posix_path=$(getconf PATH 2>/dev/null) || posix_path=
if test -n "$posix_path"
then
    PATH=$posix_path${PATH:+:$PATH}
fi
unset posix_path

# Initialize MD5SUM if not already set:
: "${MD5SUM=md5sum}"

# find_dups - report plain files in the current directory that have
# the same checksum as a file seen earlier.
# Globals:   MD5SUM (read)
# Arguments: none
# Outputs:   duplicate reports on stdout, warnings on stderr
find_dups() {
    # Only find plain files, and only in the current directory.
    # The loop runs in the pipeline's subshell, so the md5_* variables
    # it creates vanish when the loop ends.
    find . ! -name . -prune -type f -print |
    while IFS= read -r FILE    # -r: keep backslashes in names intact
    do
        # Hash the data from stdin rather than passing the name as an
        # argument: GNU md5sum marks its output line with a leading
        # backslash when the file name needs escaping.
        sum=$("$MD5SUM" < "$FILE" | awk 'NR == 1 {print $1}')

        # The sum becomes part of a variable name handed to eval, so
        # accept only a non-empty string of hexadecimal digits.  An
        # empty sum (e.g. unreadable file) must not be treated as a
        # checksum, or all such files would look like duplicates.
        case $sum in
            '' | *[!0123456789abcdefABCDEF]*)
                printf 'find-dups: cannot checksum "%s"; skipped\n' \
                    "$FILE" >&2
                continue
                ;;
        esac

        # Make a reference to the array element md5_$sum:
        eval "assoc=\$md5_$sum"

        # Check if a previous file with this sum was found:
        if test -z "$assoc"
        then
            # Put filename into array element md5_$sum:
            eval "md5_$sum=\$FILE"
        else
            # Display current and previous file names with same MD5 sum:
            printf ' Duplicates: "%s" and "%s"\n' "$FILE" "$assoc"
        fi
    done
}

find_dups