From what I observe filecmp.dircmp is recursive, but inadequate for my needs, at least in py2. I want to compare two directories and all their contained files. Do
Here is a simple solution using a recursive function:
import filecmp
def same_folders(dcmp):
    """Return True if the two trees described by *dcmp* look identical.

    Args:
        dcmp: a ``filecmp.dircmp`` instance for the two directories.

    Returns:
        bool: False as soon as any level of the tree has an entry present
        on one side only, a common file that differs or could not be
        compared, or a common name whose type differs between the sides
        (e.g. a file versus a directory); True otherwise.

    Note:
        File equality is whatever ``dircmp`` decides; by default that is a
        shallow (``os.stat`` signature) comparison, so this is not a
        guaranteed byte-for-byte check.
    """
    # Checking only ``diff_files`` would miss missing/extra entries and
    # entries that could not be compared, so look at every "not equal"
    # bucket that dircmp exposes.
    if (dcmp.left_only or dcmp.right_only or dcmp.diff_files
            or dcmp.funny_files or dcmp.common_funny):
        return False
    # ``subdirs`` maps each common sub-directory name to its own dircmp.
    return all(same_folders(sub_dcmp) for sub_dcmp in dcmp.subdirs.values())
# Example usage: the two paths are placeholders and the boolean result is discarded.
same_folders(filecmp.dircmp('/tmp/archive1', '/tmp/archive2'))
Here is my solution: gist
def dirs_same_enough(dir1, dir2, report=False):
    """Use os.walk and filecmp.cmpfiles to decide if two dirs are 'same enough'.

    Two trees are considered the same when every directory has the same
    sub-directory names and file names on both sides, and every file has
    identical contents (``shallow=False`` comparison).

    Args:
        dir1, dir2: two directory paths.
        report: if True, print
            ``filecmp.dircmp(dir1, dir2).report_full_closure()`` before
            returning.

    Returns:
        bool
    """
    def report_and_exit(report, bool_):
        # Optionally print the full dircmp report, then pass the verdict through.
        if report:
            filecmp.dircmp(dir1, dir2).report_full_closure()
        return bool_

    # os.walk gives (root, dirs, files) with no guaranteed ordering, so sort
    # both walks (by root path) to line the directories up pairwise.
    walk1 = sorted(os.walk(dir1))
    walk2 = sorted(os.walk(dir2))

    if len(walk1) != len(walk2):
        return report_and_exit(report, False)

    for (p1, d1, fl1), (p2, d2, fl2) in zip(walk1, walk2):
        # Guard against pairing two directories that sit at different
        # places in their respective trees.
        if os.path.relpath(p1, dir1) != os.path.relpath(p2, dir2):
            return report_and_exit(report, False)
        if set(d1) != set(d2) or set(fl1) != set(fl2):
            return report_and_exit(report, False)
        # One cmpfiles call already compares every file of this directory.
        _same, diff, weird = filecmp.cmpfiles(p1, p2, fl1, shallow=False)
        if diff or weird:
            return report_and_exit(report, False)

    return report_and_exit(report, True)
Here's an alternative implementation of the comparison function using the filecmp module. It uses recursion instead of os.walk, so it is a little simpler. However, it does not recurse simply by using the common_dirs and subdirs attributes, since in that case we would be implicitly using the default "shallow" implementation of file comparison, which is probably not what you want. In the implementation below, when comparing files with the same name, we always compare only their contents.
import filecmp
import os.path
def are_dir_trees_equal(dir1, dir2):
    """
    Compare two directories recursively. Files in each directory are
    assumed to be equal if their names and contents are equal.

    Entries hidden or ignored by ``filecmp.dircmp``'s defaults are not
    taken into account.

    @param dir1: First directory path
    @param dir2: Second directory path

    @return: True if the directory trees are the same and
        there were no errors while accessing the directories or files,
        False otherwise.
    """
    dirs_cmp = filecmp.dircmp(dir1, dir2)
    # ``common_funny`` holds names present on both sides whose types differ
    # (e.g. file vs. directory) or that could not be stat'ed; such a pair
    # can never be equal.
    if dirs_cmp.left_only or dirs_cmp.right_only or dirs_cmp.common_funny:
        return False
    # Deep comparison of file contents; files that could not be compared end
    # up in ``errors``, which makes a separate shallow ``funny_files`` pass
    # unnecessary.
    _, mismatch, errors = filecmp.cmpfiles(
        dir1, dir2, dirs_cmp.common_files, shallow=False)
    if mismatch or errors:
        return False
    return all(
        are_dir_trees_equal(os.path.join(dir1, common_dir),
                            os.path.join(dir2, common_dir))
        for common_dir in dirs_cmp.common_dirs
    )
This will check if files are in the same locations and if their content is the same. It will not correctly validate for empty subfolders.
import filecmp
import glob
import os
# Example inputs: both paths point at the current directory.
path_1 = '.'
path_2 = '.'
def folders_equal(f1, f2):
    """Return True if both folders hold the same files with the same contents.

    Args:
        f1, f2: paths of the two folders to compare.

    Returns:
        bool: True when the sets of relative file paths found under *f1*
        and *f2* are identical and every pair of files compares equal by
        content; False otherwise.

    Note:
        Only regular files are considered, so empty sub-folders are not
        validated.
    """
    def relative_files(root):
        # Escape the root so glob metacharacters in the folder name are
        # taken literally; sort because glob gives no ordering guarantee.
        pattern = os.path.join(glob.escape(root), '**')
        return sorted(
            os.path.relpath(path, root)
            for path in glob.iglob(pattern, recursive=True)
            if os.path.isfile(path)
        )

    files_1 = relative_files(f1)
    files_2 = relative_files(f2)
    # Comparing the full lists catches missing/extra files, which zipping
    # the two listings together would silently truncate away.
    if files_1 != files_2:
        return False
    # shallow=False forces a real content comparison instead of trusting
    # matching os.stat signatures.
    return all(
        filecmp.cmp(os.path.join(f1, rel), os.path.join(f2, rel), shallow=False)
        for rel in files_1
    )
# Example call; the boolean result is discarded here.
folders_equal(path_1, path_2)
Another solution: compare the layout of dir1 and dir2, ignoring the contents of the files.
See gist here: https://gist.github.com/4164344
Edit: here's the code, in case the gist gets lost for some reason:
import os
def compare_dir_layout(dir1, dir2):
    """Print the files that exist in one directory tree but not in the other.

    Only the layout (relative paths) is compared; file contents are ignored.
    Two sections are printed: first the files under *dir1* that have no
    counterpart under *dir2*, then the reverse. Each missing file is
    reported as its path relative to the tree it was found in.

    Args:
        dir1, dir2: paths of the two directory trees.

    Returns:
        None. The result is written to standard output.
    """
    def _compare_dir_layout(dir1, dir2):
        # Walk dir1 and report every file with nothing at the same
        # relative path under dir2.
        for dirpath, dirnames, filenames in os.walk(dir1):
            dirnames.sort()  # visit sub-directories in a stable order
            # relpath strips only the leading dir1 prefix, unlike
            # str.replace, which would hit every occurrence.
            relative_dir = os.path.relpath(dirpath, dir1)
            for filename in sorted(filenames):
                relative_file = os.path.normpath(
                    os.path.join(relative_dir, filename))
                # os.path.join uses the platform separator rather than a
                # hard-coded backslash.
                if not os.path.exists(os.path.join(dir2, relative_file)):
                    print(relative_file)

    print('files in "' + dir1 + '" but not in "' + dir2 + '"')
    _compare_dir_layout(dir1, dir2)
    print('files in "' + dir2 + '" but not in "' + dir1 + '"')
    _compare_dir_layout(dir2, dir1)
# Example invocation with placeholder directory names.
compare_dir_layout('xxx', 'yyy')