What is the fastest way to create a hash function which will be used to check if two files are equal?
Security is not very important.
Edit: I am sending a fi
The following code finds duplicate files; it comes from a personal project of mine that sorts pictures and removes duplicates. In my experience, running a fast hash such as CRC32 first and then MD5 or SHA-1 was actually slower and brought no improvement: most files of the same size really were duplicates, so hashing twice just cost more CPU time. This approach may not be right for every kind of project, but it is definitely true for image files. Here I run the MD5 or SHA-1 hash only on files that have the same size.
PS: It depends on Apache Commons Codec to generate the hashes efficiently.
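If you build with Maven, a dependency along these lines pulls it in (the version shown is just an example; any recent release works):

<dependency>
    <groupId>commons-codec</groupId>
    <artifactId>commons-codec</artifactId>
    <version>1.15</version> <!-- example version; any recent release works -->
</dependency>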
Sample usage (a fuller driver sketch follows the class): new DuplicateFileFinder("MD5").findDuplicateFilesList(filesList);
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.codec.digest.DigestUtils;
/**
 * Finds duplicate files using MD5/SHA-1 hashing, applied only to files that
 * share the same size.
 *
 * @author HemantSingh
 */
public class DuplicateFileFinder {

    private final HashProvider hashProvider;
    // Kept only for logging purposes.
    private final String hashingAlgo;

    public DuplicateFileFinder(String hashingAlgo) {
        this.hashingAlgo = hashingAlgo;
        if ("SHA1".equalsIgnoreCase(hashingAlgo)) {
            hashProvider = new Sha1HashProvider();
        } else if ("MD5".equalsIgnoreCase(hashingAlgo)) {
            hashProvider = new Md5HashProvider();
        } else {
            throw new IllegalArgumentException("Unsupported hashing algorithm: "
                    + hashingAlgo + ". Please use either SHA1 or MD5.");
        }
    }
    /**
     * Returns the duplicate files, grouped together.
     *
     * @param files
     *            all the files to check for duplicates
     * @return a list of groups, one group per set of identical files; e.g. if
     *         a.JPG exists in three copies, one element of the returned list
     *         is a list holding those three File references
     */
    public List<List<File>> findDuplicateFilesList(List<File> files) {
        // First, map each file size to the files of that size.
        Map<Long, List<File>> fileSizeMap = new HashMap<Long, List<File>>();
        // A Set, so a size shared by three or more files is still processed only once.
        Set<Long> potDuplicateFileSizes = new HashSet<Long>();
        for (File file : files) {
            Long fileLength = Long.valueOf(file.length());
            List<File> filesOfSameLength = fileSizeMap.get(fileLength);
            if (filesOfSameLength == null) {
                filesOfSameLength = new ArrayList<File>();
                fileSizeMap.put(fileLength, filesOfSameLength);
            } else {
                potDuplicateFileSizes.add(fileLength);
            }
            filesOfSameLength.add(file);
        }
        // If there are no potential duplicates, skip further processing.
        if (potDuplicateFileSizes.isEmpty()) {
            return Collections.emptyList();
        }
        System.out.println(potDuplicateFileSizes.size() + " file sizes will go through the "
                + hashingAlgo + " hash check to verify duplicates.");
        // Now scan the potential duplicates and eliminate false positives
        // (same size, different content) using the hash check.
        List<List<File>> finalListOfDuplicates = new ArrayList<List<File>>();
        for (Long fileSize : potDuplicateFileSizes) {
            List<File> potDupFiles = fileSizeMap.get(fileSize);
            Map<String, List<File>> trueDuplicateFiles = new HashMap<String, List<File>>();
            for (File file : potDupFiles) {
                try {
                    String hashHex = hashProvider.getHashHex(file);
                    List<File> duplicatesOfAFile = trueDuplicateFiles.get(hashHex);
                    if (duplicatesOfAFile == null) {
                        duplicatesOfAFile = new ArrayList<File>();
                        trueDuplicateFiles.put(hashHex, duplicatesOfAFile);
                    }
                    duplicatesOfAFile.add(file);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            for (List<File> sameHashFiles : trueDuplicateFiles.values()) {
                // It is a duplicate only if more than one copy exists.
                if (sameHashFiles.size() > 1) {
                    finalListOfDuplicates.add(sameHashFiles);
                    System.out.println("Duplicate sets found: " + finalListOfDuplicates.size());
                }
            }
        }
        return finalListOfDuplicates;
    }
    abstract static class HashProvider {
        abstract String getHashHex(File file) throws IOException;
    }

    static class Md5HashProvider extends HashProvider {
        String getHashHex(File file) throws IOException {
            // try-with-resources, so the stream is closed even if hashing fails
            try (InputStream in = new FileInputStream(file)) {
                return DigestUtils.md5Hex(in);
            }
        }
    }

    static class Sha1HashProvider extends HashProvider {
        String getHashHex(File file) throws IOException {
            try (InputStream in = new FileInputStream(file)) {
                return DigestUtils.sha1Hex(in);
            }
        }
    }
}
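For completeness, here is a minimal driver sketch. The directory path and the "keep the first copy" policy are hypothetical placeholders, and it assumes the folder contains only regular files:

import java.io.File;
import java.util.Arrays;
import java.util.List;

public class DuplicateFinderDemo {
    public static void main(String[] args) {
        // Hypothetical picture folder; replace with your own path.
        File dir = new File("/path/to/pictures");
        List<File> filesList = Arrays.asList(dir.listFiles());
        List<List<File>> duplicates =
                new DuplicateFileFinder("MD5").findDuplicateFilesList(filesList);
        for (List<File> copies : duplicates) {
            System.out.println("Identical files: " + copies);
            // e.g. keep copies.get(0) and delete the rest
        }
    }
}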