#!/usr/bin/env python3
# Copyright 2017 vgm+dev@devys.org
# SPDX-License-Identifier: MIT

'''
Hard link files to a common directory containing md5hash as the filename.

Usage: SCRIPT SOURCEDIR DESTDIR < CHECKSUMS

CHECKSUMS is read from standard input, one entry per line, in the format
written by ``md5sum``: ``<hash> *<filename>`` (binary mode) or
``<hash>  <filename>`` (text mode), optionally prefixed with a backslash
when the filename contains escaped characters.

For every entry, SOURCEDIR/<filename> is hard linked to DESTDIR/<hash>.
When DESTDIR/<hash> already exists and has the same content, the source
file is replaced by a hard link to it; when the content differs, a clash
is reported and both files are left untouched.
'''

import os
import re
import sys


# Escape sequences recognised in the filename of a backslash-prefixed line.
_ESCAPES = {'\\': '\\', 'n': '\n', 'r': '\r'}


def are_same_files(file1, file2, chunk_size=16 * 1024):
    '''
    Return True when both paths hold the same content.

    Two paths that resolve to the same inode on the same device are
    reported equal without reading them. Otherwise the sizes are compared
    first and the contents are read in blocks of chunk_size bytes only
    when the sizes match.

    Raises OSError when either file cannot be stat'ed or opened.
    '''
    stat1, stat2 = os.stat(file1), os.stat(file2)
    if os.path.samestat(stat1, stat2):
        # Already hard linked together (or the very same path).
        return True
    if stat1.st_size != stat2.st_size:
        return False
    with open(file1, 'rb') as fin1, open(file2, 'rb') as fin2:
        while True:
            data1 = fin1.read(chunk_size)
            data2 = fin2.read(chunk_size)
            if data1 != data2:
                return False
            if not data1:
                # Both streams are exhausted without any difference.
                return True


def unescape_filename(name):
    '''
    Decode the escapes of a backslash-prefixed checksum line.

    The string is scanned once from left to right so that an escaped
    backslash followed by the letter "n" is not mistaken for an escaped
    newline. Unknown escape sequences are kept unchanged.
    '''
    return re.sub(r'\\(.)',
                  lambda match: _ESCAPES.get(match.group(1), match.group(0)),
                  name)


def parse_line(line):
    '''
    Split one checksum line into a (md5hash, filename) tuple.

    Both the binary (" *") and the text ("  ") separators are accepted.
    The filename is unescaped only when the line starts with a backslash.

    Raises ValueError when the line does not match the expected format.
    '''
    entry = line.rstrip('\n')
    escaped = entry.startswith('\\')
    if escaped:
        entry = entry[1:]
    md5hash, separator, rest = entry.partition(' ')
    # rest starts with the mode marker: ' ' for text, '*' for binary.
    if not md5hash or not separator or rest[:1] not in (' ', '*'):
        raise ValueError('malformed checksum line: %r' % line)
    filename = rest[1:]
    if not filename:
        raise ValueError('missing filename in checksum line: %r' % line)
    if escaped:
        filename = unescape_filename(filename)
    return md5hash, filename


def replace_with_link(target, path):
    '''
    Atomically replace path with a hard link to target.

    The link is first created under a temporary name next to path and
    then renamed over it, so path is never missing, even when creating
    the link fails.

    Raises OSError when the link cannot be created or renamed.
    '''
    if os.path.samefile(target, path):
        # Nothing to do; renaming one hard link over another link to the
        # same file would also leave the temporary name behind.
        return
    temporary = os.path.join(os.path.dirname(path),
                             '.hardlink-%d.tmp' % os.getpid())
    os.link(target, temporary)
    try:
        os.replace(temporary, path)
    except OSError:
        os.unlink(temporary)
        raise


def link_by_hash(sourcedir, destdir, lines):
    '''
    Hard link every file listed in lines from sourcedir into destdir.

    lines is an iterable of checksum lines as accepted by parse_line().
    destdir is created when it does not exist yet. Duplicates and
    clashes are reported on standard output.
    '''
    os.makedirs(destdir, exist_ok=True)
    for line in lines:
        md5hash, filename = parse_line(line)
        source = os.path.join(sourcedir, filename)
        dest = os.path.join(destdir, md5hash)
        try:
            # Attempt the link directly rather than testing for dest
            # first, which would leave a window between test and link.
            os.link(source, dest)
        except FileExistsError:
            pass
        else:
            continue
        if are_same_files(source, dest):
            print('duplicate detected:', source, dest)
            replace_with_link(dest, source)
        else:
            print('clash detected:', source, dest)


def main(argv=None):
    '''Run the script with argv (defaults to sys.argv) and sys.stdin.'''
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        program = argv[0] if argv else 'script'
        sys.exit('usage: %s SOURCEDIR DESTDIR < CHECKSUMS' % program)
    link_by_hash(argv[1], argv[2], sys.stdin)


if __name__ == '__main__':
    main()