    import sys
    import os


    def get_all_files(dir):
        path_f = []
        for dirs, subdirs, files in os.walk(dir):
            for f in files:
                if not f.startswith("."):
                    path = os.path.join(dirs, f)
                    path_f.append(path)
        return path_f


    def are_files_duplicates(file1, file2):
        return (os.path.basename(file1) == os.path.basename(file2)
                and os.path.getsize(file1) == os.path.getsize(file2))


    def find_duplicates(dir):
        if not os.path.exists(dir):
            print("No such directory exists")
            return None
        path_f = get_all_files(dir)
        for counter_1 in range(0, len(path_f)):
            for counter_2 in range(counter_1 + 1, len(path_f)):
                if are_files_duplicates(path_f[counter_1], path_f[counter_2]):
                    print("File {} is a duplicate of file {}".format(
                        path_f[counter_2], path_f[counter_1]))


    if __name__ == '__main__':
        try:
            find_duplicates(sys.argv[1])
        except IndexError:
            print("Specify the file name")

Right now find_duplicates runs in quadratic time: it compares every file with every other file. How can this be sped up?

  • Sort the list and compare adjacent entries. O(n log n) instead of O(n^2). - Harry
  • @Harry, thanks, I used your solution. - pinguin
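
A minimal sketch of the sort-based approach suggested in the comments, assuming the same (basename, size) criterion as are_files_duplicates(); the helper name find_duplicates_sorted is not from the original post:

    import os
    from itertools import groupby


    def find_duplicates_sorted(paths):
        # Uniqueness criterion taken from are_files_duplicates(): name + size.
        def key(path):
            return os.path.basename(path), os.path.getsize(path)

        # Sorting is O(n log n); afterwards files with an equal key are adjacent,
        # so one linear pass over the groups finds every duplicate set.
        for _, group in groupby(sorted(paths, key=key), key=key):
            group = list(group)
            if len(group) > 1:
                yield group  # all files sharing the same (name, size)

It can be fed the output of get_all_files() from the question, e.g. for group in find_duplicates_sorted(get_all_files(sys.argv[1])): print(group).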

2 answers

Files of the same size are NOT necessarily duplicates. For example, two files containing "1" and "2" have the same size but are not duplicates. To detect duplicates correctly, you need to compute a hash of the file contents.

    import os
    import hashlib


    def find_duplicates(folder):
        # hash of file contents -> {file name -> [paths with that name and hash]}
        duplicates = {}
        for dirs, subdirs, files in os.walk(folder):
            for name in files:
                file = os.path.join(dirs, name)
                with open(file, 'rb') as f:  # close the file after hashing
                    file_hash = hashlib.sha1(f.read()).digest()
                names = duplicates.setdefault(file_hash, {})
                names.setdefault(name, []).append(file)
        return duplicates


    if __name__ == '__main__':
        duplicates = find_duplicates('12')
        for file_hash in duplicates:
            for name in duplicates[file_hash]:
                paths = duplicates[file_hash][name]
                if len(paths) > 1:
                    print('Duplicate files: {}'.format(', '.join(paths)))

Output:

    Duplicate files: 12\1.txt, 12\dr\1.txt, 12\sr\1.txt
    Duplicate files: 12\3.txt, 12\sr\3.txt
    Duplicate files: 12\2.txt, 12\sr\2.txt

file structure:

    c:\SCR\lr2\12\:
        1.txt = "1"
        2.txt = "2"
        3.txt = "1"
    c:\SCR\lr2\12\dr\:
        1.txt = "1"
        3.txt = "5"
        4.txt = "3"
    c:\SCR\lr2\12\sr\:
        1.txt = "1"
        2.txt = "2"
        3.txt = "1"

    You can use a one-pass algorithm (O(n) time, O(n) memory) by building a mapping from a uniqueness criterion to the first file seen with that key, as in the sketch below.
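
    A minimal illustration of that mapping (the function name is mine, and it reuses the name-and-size criterion from the question):

        import os


        def report_duplicates_one_pass(paths):
            seen = {}  # uniqueness criterion -> first file seen with it
            for path in paths:
                k = (os.path.basename(path), os.path.getsize(path))
                if k in seen:
                    print(path, "is a duplicate of", seen[k])
                else:
                    seen[k] = path

    The generator further down generalizes this idea: the criterion becomes a key function argument and the traversal is done with os.scandir().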

    The are_files_duplicates() function from the question implies the following uniqueness criterion:

        key = lambda path: (os.path.basename(path), os.path.getsize(path))

    In many tasks, the file's contents can serve as the uniqueness criterion (a content hash), for example: key = md5sum.
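
    A possible md5sum helper is sketched below (the name and chunk size are my choice, not part of the original answer); it hashes the file in chunks so that large files are not read into memory at once, and open() accepts both plain paths and the os.DirEntry objects used by the code further down:

        import hashlib


        def md5sum(path, chunk_size=1 << 16):
            # Hash the file contents incrementally, 64 KiB at a time.
            h = hashlib.md5()
            with open(path, 'rb') as f:
                for chunk in iter(lambda: f.read(chunk_size), b''):
                    h.update(chunk)
            return h.digest()

    It can then be passed to the function below as find_duplicates(rootdir, key=md5sum).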

    To find all duplicates in a given directory tree (rootdir) according to a given criterion (key):

        import os
        import contextlib


        def find_duplicates(rootdir, *, key=None, suppress=contextlib.suppress, seen=None):
            if key is None:
                def key(e):
                    return e.name, e.stat(follow_symlinks=False).st_size
            if seen is None:
                seen = {}  # key -> entry
            for entry in os.scandir(rootdir):
                with suppress(OSError):  # ignore I/O errors
                    if entry.name.startswith('.'):
                        # skip dotted paths (both directories such as .git and files)
                        continue
                    elif entry.is_dir(follow_symlinks=False):  # directory
                        # recursive call
                        yield from find_duplicates(entry.path, key=key,
                                                   suppress=suppress, seen=seen)
                    elif (entry.is_file(follow_symlinks=False)  # ordinary file
                          and seen.setdefault(key(entry), entry) is not entry):
                        # found a duplicate
                        yield seen[key(entry)].path, entry.path

    Example of use:

        import sys

        for a, b in find_duplicates(sys.argv[1]):
            print(a, "<->", b)