Finding duplicate files and removing them

谎友^ 2020-11-27 09:26

I am writing a Python program to find and remove duplicate files from a folder.

I have multiple copies of mp3 files, as well as some other files, and I am using the sha1 algorithm to detect duplicates.
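Hashing a single file with hashlib in chunks looks roughly like this (a minimal sketch; the name `file_sha1` is illustrative, and two files count as duplicates when their hex digests match):

    import hashlib

    def file_sha1(path, chunk_size=8192):
        """Return the SHA-1 hex digest of a file, read in chunks."""
        h = hashlib.sha1()
        with open(path, 'rb') as f:
            # iter() keeps calling f.read(chunk_size) until it returns b'' at EOF
            for chunk in iter(lambda: f.read(chunk_size), b''):
                h.update(chunk)
        return h.hexdigest()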

8 Answers
  •  忘掉有多难
    2020-11-27 09:59

        import hashlib
        import os
        import sys

        def read_chunk(fobj, chunk_size=2048):
            """Files can be huge, so read them in chunks of bytes."""
            while True:
                chunk = fobj.read(chunk_size)
                if not chunk:
                    return
                yield chunk

        def remove_duplicates(directory, hashfun=hashlib.sha512):
            """Hash every file in directory; delete any file whose digest was already seen."""
            unique = set()
            for filename in os.listdir(directory):
                filepath = os.path.join(directory, filename)
                if os.path.isfile(filepath):
                    hashobj = hashfun()
                    # Hash the file in chunks so it is never held in memory whole,
                    # and close it promptly via the context manager.
                    with open(filepath, 'rb') as fobj:
                        for chunk in read_chunk(fobj):
                            hashobj.update(chunk)
                    digest = hashobj.hexdigest()
                    if digest not in unique:
                        unique.add(digest)
                    else:
                        os.remove(filepath)

        try:
            remove_duplicates(sys.argv[1], hashlib.sha256)
        except IndexError:
            print("Please pass a path to a directory with "
                  "duplicate files as a parameter to the script.")
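Assuming the script is saved as, say, `remove_duplicates.py` (a hypothetical name), it can be run against a folder like this:

        python remove_duplicates.py /path/to/folder

Note that the first file seen with a given digest is kept, so which copy survives depends on the order in which `os.listdir` returns names.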
    
