1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
#!/usr/bin/env python3
# Copyright 2017 vg@devys.org
# SPDX-License-Identifier: MIT
'''
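Hard link files into a common directory, using each file's md5 hash as the
link name.

Usage: pass SOURCEDIR and DESTDIR as the two command-line arguments and feed
`md5sum --binary` output (one "<hash> *<filename>" line per file, with
filenames relative to SOURCEDIR) on standard input.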
'''
import sys
import os
# Command line: SOURCEDIR DESTDIR. Fail with a usage message rather than a
# bare IndexError traceback when an argument is missing.
if len(sys.argv) < 3:
    sys.exit('usage: %s SOURCEDIR DESTDIR < md5sum-output'
             % os.path.basename(sys.argv[0]))

# Directory that the filenames read from stdin are relative to.
sourcedir = sys.argv[1]
# Directory that receives one hard link per distinct hash.
destdir = sys.argv[2]
# exist_ok lets the same destination directory be reused across runs.
os.makedirs(destdir, exist_ok=True)
def are_same_files(file1, file2):
    '''
    Return True when file1 and file2 hold the same bytes.

    The check goes from cheapest to most expensive:

    1. Different sizes can never match.
    2. Two paths that are already hard links to one inode (link count above
       one, identical link count, device, rdev and inode) match without
       reading any data.
    3. Otherwise both files are read in 16 KiB chunks and compared.

    Raises OSError (from os.stat / open) if either path cannot be accessed.
    '''
    info_a = os.stat(file1)
    info_b = os.stat(file2)

    if info_a.st_size != info_b.st_size:
        return False

    # Fields that must agree for the two paths to name the same inode.
    identity_fields = ('st_nlink', 'st_dev', 'st_rdev', 'st_ino')
    already_linked = info_a.st_nlink > 1 and all(
        getattr(info_a, field) == getattr(info_b, field)
        for field in identity_fields
    )
    if already_linked:
        return True

    chunk_size = 16 * 1024
    with open(file1, 'rb') as left, open(file2, 'rb') as right:
        while True:
            left_chunk = left.read(chunk_size)
            right_chunk = right.read(chunk_size)
            if left_chunk != right_chunk:
                return False
            if not left_chunk:
                # Both streams hit EOF together without any mismatch.
                return True
# Characters allowed in the checksum field; anything else (notably a path
# separator) must never be used as a file name inside destdir.
_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')


def _unescape_md5sum_name(escaped):
    '''
    Undo the filename escaping md5sum applies to lines that start with a
    backslash: backslash-backslash -> backslash, backslash-n -> newline,
    backslash-r -> carriage return. Unknown escapes are kept verbatim.

    Decoding is done in a single left-to-right pass so that an escaped
    backslash followed by a literal "n" is not mistaken for a newline.
    '''
    decoded = []
    chars = iter(escaped)
    for char in chars:
        if char != '\\':
            decoded.append(char)
            continue
        follower = next(chars, '')
        if follower == 'n':
            decoded.append('\n')
        elif follower == 'r':
            decoded.append('\r')
        elif follower == '\\':
            decoded.append('\\')
        else:
            decoded.append('\\' + follower)
    return ''.join(decoded)


def _replace_with_hard_link(target, path):
    '''
    Make path a hard link to target without ever leaving path missing.

    The link is first created under a scratch name and then renamed over
    path, so a failing os.link (cross-device, link-count limit, ...) leaves
    the original file untouched.
    '''
    if os.path.samefile(target, path):
        # Already the same inode: nothing to do (and renaming one link of a
        # file over another is a no-op that would leave the scratch name).
        return
    scratch = '%s.%d.tmp' % (path, os.getpid())
    os.link(target, scratch)
    try:
        os.replace(scratch, path)
    except OSError:
        os.unlink(scratch)
        raise


# Each stdin line is md5sum output: "<hash> *<name>" (binary mode) or
# "<hash>  <name>" (text mode), optionally prefixed with a backslash when the
# name is escaped.
for line in sys.stdin:
    line = line.rstrip('\n')
    if not line:
        continue  # tolerate blank lines in the checksum list

    name_is_escaped = line.startswith('\\')
    if name_is_escaped:
        line = line[1:]

    md5hash, separator, remainder = line.partition(' ')
    filename = remainder[1:]
    if (not separator
            or remainder[:1] not in (' ', '*')
            or not filename
            or not md5hash
            or not _HEX_DIGITS.issuperset(md5hash)):
        raise ValueError('malformed checksum line: %r' % line)
    if name_is_escaped:
        filename = _unescape_md5sum_name(filename)

    source = os.path.join(sourcedir, filename)
    dest = os.path.join(destdir, md5hash)
    try:
        # First file seen with this hash becomes the canonical copy.
        os.link(source, dest)
    except FileExistsError:
        if are_same_files(source, dest):
            print('duplicate detected:', source, dest)
            _replace_with_hard_link(dest, source)
        else:
            # Same hash, different content: leave both files alone.
            print('clash detected:', source, dest)
|