#!/bin/sh

# dupfind: report duplicate and empty files found under a specified directory

# Without a directory argument there is nothing to scan: print a short
# usage text and stop.
if [ -z "$1" ]; then
	printf 'Usage: %s /path [ working directory ]\n\nfinds a list of all duplicate files in /path\n' "$0"
	exit
fi

echo "$0 starting"

# Checksum command, deliberately expanded unquoted later so that the
# option is split off.  Its output is parsed further down with the hash
# expected in the first field of every line.
SUM="md5 -n" # netbsd
#SUM=md5sum  # cygwin

# Probe whether date(1) accepts "+%s" (seconds since the epoch).
#   gdate=1  timing information is available; stime holds the start time
#   gdate=0  elapsed/remaining estimates are skipped later on
if date "+%s" > /dev/null 2>&1; then
	gdate=1
	stime=$(date "+%s") # seconds since epoch
else
	gdate=0
	# The message used to be quoted together with the echo, which made
	# the shell look for a command of that name instead of printing it.
	echo "oops, not running gnu date..."
fi

# Decide how progress lines are drawn.
#   ce  sequence that clears to the end of the line (empty if unavailable)
#   el  terminator for a progress line, later used as a printf format:
#       '\r' redraws in place, '\n' emits one line per update
# tput without a capability name only reports a usage error, so ask for
# the capability itself: "el" is the terminfo name, "ce" the termcap
# name.  In-place redrawing is only used when stderr (where the progress
# is written) is a terminal.
if [ -t 2 ] && ce=$(tput el 2> /dev/null || tput ce 2> /dev/null); then
	el='\r'
else
	ce=''
	el='\n'
fi

# Directory to scan (first argument); it must already exist.
top="$1"
if [ ! -d "$top" ]; then
	echo "ERROR: Directory does not exist ($top)"
	exit 1
fi

# The script changes into the working directory below and only afterwards
# runs find on $top, so a relative $top has to be made absolute first.
case "$top" in
	/*) ;;
	*)
		# The "./" prefix keeps cd from consulting CDPATH.
		top=$(cd "./$top" && pwd) || {
			echo "ERROR: Cannot resolve directory ($1)"
			exit 1
		}
		;;
esac

# Working directory for the intermediate and result files: the second
# argument if given, otherwise a fixed subdirectory of $top.
if [ "x$2" = x ]; then
	#work="$top"/dupfind-`date +"%Y%m%d"`
	work="$top"/dupfind-work
else
	work="$2"
fi

# Create it if necessary and make it the current directory; every file
# the script writes from here on is named relative to it.
if ! mkdir -p "$work" || ! cd "$work"; then
	echo "Could not change to $work"
	exit 1
fi
echo "Working from $work"

# list_files DIR
#   Print the path of every regular file below DIR, one per line.
#   The current directory (the working directory by the time this is
#   called) is pruned, so the script's own scratch files are not listed
#   when the working directory lives inside DIR.  The prune only takes
#   effect when find reports that directory under the spelling pwd
#   prints; otherwise nothing is excluded.  When the current directory
#   is DIR itself nothing is pruned, because that would hide every file.
#   Runs in a subshell so its helper variables do not leak.
list_files() (
	here=$(pwd)
	if [ "$here" = "$1" ]; then
		find "$1" -type f -print
	else
		# -path takes a glob pattern: escape its special characters so
		# the directory name is matched literally.
		pat=$(printf '%s\n' "$here" | sed 's/[][*?\\]/\\&/g')
		find "$1" -path "$pat" -prune -o -type f -print
	fi
)

# get all files
echo "Finding all files in $top"
if [ -f allfiles.txt ]; then
	echo "allfiles.txt already exists. reusing this file"
else
	list_files "$top" > allfiles.txt 2> allfiles-errors.txt
fi

# Count the error lines find produced (if any) and the files it listed.
# The arithmetic expansion strips the padding some wc implementations
# put in front of the number.
numerrs=0
if [ -f allfiles-errors.txt ]; then
	numerrs=$(( $(wc -l < allfiles-errors.txt) ))
	if [ "$numerrs" -gt 0 ]; then
		echo "There were ($numerrs) errors finding all of the files. Please check allfiles-errors.txt"
	fi
fi
numfiles=$(( $(wc -l < allfiles.txt) ))
echo "Found $numfiles files"

# checksum_files MOD TOTAL
#   Read file names from stdin (one per line) and run $SUM on each one,
#   writing the checksum lines to stdout.  Names matching the exception
#   patterns are skipped.  Whatever $SUM prints on stderr is appended to
#   cksum-errs.txt.  After every MOD names a progress line, computed
#   against TOTAL names, is written to stderr.
#   Reads globals: SUM, ce, el, gdate, stime.
checksum_files() {
	mod=$1
	total=${2:-1}
	i=0 # current file count
	# IFS= and -r keep leading blanks and backslashes in names intact.
	while IFS= read -r f
	do
		i=$(( i + 1 ))
		case "$f" in
			# put file exceptions here.
			*Demonoid.com.txt|*/tor/Download*|*.bak|*~) : ;;
			# anything else will be checked
			# ($SUM is unquoted on purpose: it holds command and option;
			# errors are appended so earlier ones are not overwritten)
			*) $SUM "$f" 2>> cksum-errs.txt ;;
		esac
		# print the progress
		if [ $(( i % mod )) -eq 0 ]; then
			p=$(( i * 100 / total )) # percentage of names processed
			printf '%s%s%%' "$ce" "$p" >&2
			if [ "$gdate" = 1 ] && [ "$p" -gt 0 ]; then
				ctime=$(date "+%s")            # current time
				etime=$(( ctime - stime ))     # elapsed time
				ttime=$(( etime * 100 / p ))   # estimated total
				rtime=$(( ttime - etime ))     # remaining
				printf ' (%s seconds elapsed, ~%s seconds remain (est))' "$etime" "$rtime" >&2
			fi
			# $el holds a printf escape (\r or \n), hence used as format
			printf "$el" >&2
		fi
	done
}

if [ -f all-cksum.txt ]; then
	echo "all-cksum.txt already exists, reusing this file."
else
	# get checksum sum for all files
	mod=$(( numfiles / 100 + 1 )) # used to print progress ticks every percent done
	echo "Getting checksum for all files (status update every $mod files)"
	rm -f cksum-errs.txt # start clean; checksum_files only appends
	checksum_files "$mod" "$numfiles" < allfiles.txt > all-cksum.txt
	echo
	echo "Finished generating checksums"
fi
# -s rather than -f: only warn when something was actually logged.
if [ -s cksum-errs.txt ]; then
	echo "WARNING: Errors generated in checksum process, see cksum-errs.txt for details"
fi
# Lines carrying this checksum are reported as empty files: count them
# and, if there are any, copy them to empty.txt.
empty_sum=d41d8cd98f00b204e9800998ecf8427e
empty=$(grep -c "$empty_sum" all-cksum.txt)
if [ "$empty" -gt 0 ]; then
	echo "WARNING: You have $empty empty files. Check empty.txt for details."
	grep "$empty_sum" all-cksum.txt > empty.txt
fi


# list_dup_sums FILE
#   FILE holds checksum lines with the hash in the first field.  Print,
#   sorted, every hash that occurs on more than one line.  Hashes that
#   occur more than three times are additionally written to
#   ckdup-many.txt in the current directory.  A ckdup-many.txt left over
#   from an earlier run is removed first, so it never reports groups
#   that no longer exist.
list_dup_sums() {
	rm -f ckdup-many.txt
	awk '
	# put any checksum exceptions in here.
	/c347d69b388abbabaf2f894c4200465c/ { next } # common D .txt
	/d41d8cd98f00b204e9800998ecf8427e/ { next } # empty file
	/89da1f7e26ff45e440ec2d97dd9c0e9f/ { next } # common .VOB
	/ad6f3a281e2703e04bab1e8296a6c8ee|8848d64dca6761756ef349610d67f11e/ { next } # common TF .HTM
	/c87dfcfc3077fe5ae3fabe68eef41db7|2e1b8d937c0a0a452b9a1561b52213af|61fa3991bc2f904035b9a724065bf3fd/ { next } # common .SWF
	# everything else will be checked.
	{ h[$1]++ }
	END {
		for (m in h) {
			if (h[m] > 1) { print m }
			if (h[m] > 3) { print m > "ckdup-many.txt" }
		}
	}
	' "$1" | sort -u
}

# generate dup's
echo "Finding duplicate checksums"
list_dup_sums all-cksum.txt > ckdup.txt

# Number of hashes with more than three files (0 when none were found).
manydup=0
if [ -f ckdup-many.txt ]; then
	manydup=$(( $(wc -l < ckdup-many.txt) ))
fi
if [ "$manydup" -gt 0 ]; then
	echo "WARNING: There were a number of large dups. check ckdup-many.txt for details"
fi
dupcount=$(( $(wc -l < ckdup.txt) ))
echo "Found $dupcount duplicates, finding duplicate filenames."

# Found all the dups, print them all to a file
#for m in `cat ckdup.txt`
#do
#	echo "$m :"
#	grep $m all-cksum.txt
#	echo
#done > all-dups.txt
#

# report_dups DUPFILE SUMFILE
#   DUPFILE lists duplicate hashes, one per line; SUMFILE holds checksum
#   lines with the hash in the first field.  For every hash in DUPFILE
#   print "<hash>:", then each SUMFILE line carrying that hash, then an
#   empty line.  The order of the groups is whatever awk yields when
#   iterating its array.
report_dups() {
	awk '
	# First file: remember the duplicate hashes.  The file name is
	# compared instead of the field count so that an empty first file
	# or an odd checksum line cannot be mistaken for the other input.
	FILENAME == ARGV[1] {
		d[$1] = 1
		next
	}
	# Second file: collect the lines of every duplicate hash, one
	# newline-terminated entry per file.
	($1 in d) {
		f[$1] = f[$1] $0 "\n"
	}
	END {
		for (m in d) {
			printf("%s:\n%s\n", m, f[m])
		}
	}
	' "$1" "$2"
}

report_dups ckdup.txt all-cksum.txt > all-dups.txt

# Final summary: one line per result file, preceded by an empty line.
printf '%s\n' \
	"" \
	"$0 completed... Results stored in all-dups.txt" \
	"Work Directory: $work" \
	"Files Checked:  $numfiles - refer to allfiles.txt for details" \
	"File Errors:    $numerrs - refer to allfiles-errors.txt for details" \
	"Empty files:    $empty - refer to empty.txt for details" \
	"Duplicates:     $dupcount - refer to ckdup.txt for checksums" \
	"Many Dupes:     $manydup - refer to ckdup-many.txt for details"

# Total run time, only when the date probe at startup succeeded.
if [ "$gdate" = 1 ]; then
	etime=$(date "+%s")
	echo "$(( etime - stime )) seconds elapsed."
fi
