aboutsummaryrefslogtreecommitdiffstats
path: root/scripts/link-to-hashes.py
blob: 224a410554ded842d87bc59eec121ad1ab41c8ae (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python3
# Copyright 2017 vg@devys.org
# SPDX-License-Identifier: MIT

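'''
Hard link files into a common directory, using each file's md5 hash as the
link name.

Usage: link-to-hashes.py SOURCEDIR DESTDIR < checksums

Standard input is read as md5sum-style lines ("<hash> *<filename>"), with
each filename taken relative to SOURCEDIR.
'''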


import sys
import os


# Command line: SOURCEDIR DESTDIR. Fail with a usage message rather than a
# bare IndexError traceback when an argument is missing; any extra arguments
# are ignored.
if len(sys.argv) < 3:
    sys.exit('usage: {} SOURCEDIR DESTDIR < checksums'.format(sys.argv[0]))
# Directory that the filenames read from stdin are relative to.
sourcedir = sys.argv[1]
# Directory receiving one hard link per distinct hash; created on demand.
destdir = sys.argv[2]
os.makedirs(destdir, exist_ok=True)


def are_same_files(file1, file2):
    '''
    Return True when file1 and file2 hold the same bytes, False otherwise.

    Files of different sizes are rejected immediately. Two paths that are
    hard links to one and the same inode are accepted without reading them.
    Anything else is compared chunk by chunk.
    '''
    info_a = os.stat(file1)
    info_b = os.stat(file2)

    if info_a.st_size != info_b.st_size:
        return False

    # Fields that must all agree for the two paths to be links to one inode.
    # all() over a generator stops at the first mismatch.
    identity_fields = ('st_nlink', 'st_dev', 'st_rdev', 'st_ino')
    if info_a.st_nlink > 1 and all(
            getattr(info_a, field) == getattr(info_b, field)
            for field in identity_fields):
        return True

    chunk_size = 16 * 1024
    with open(file1, 'rb') as left, open(file2, 'rb') as right:
        chunk_a = left.read(chunk_size)
        chunk_b = right.read(chunk_size)
        # Keep going until both files are exhausted.
        while chunk_a or chunk_b:
            if chunk_a != chunk_b:
                return False
            chunk_a = left.read(chunk_size)
            chunk_b = right.read(chunk_size)
    return True


def _unescape_md5sum_name(escaped):
    '''
    Decode a filename taken from an md5sum line that starts with a backslash.

    In such lines md5sum writes a backslash as "\\\\", a newline as "\\n" and
    (in newer coreutils) a carriage return as "\\r". The string is scanned
    left to right so that an escaped backslash followed by a literal "n" is
    not mistaken for an escaped newline. Unknown escapes are kept verbatim.
    '''
    decoded = {'n': '\n', 'r': '\r', '\\': '\\'}
    pieces = []
    chars = iter(escaped)
    for char in chars:
        if char != '\\':
            pieces.append(char)
            continue
        # Consume the character after the backslash ('' for a trailing one).
        following = next(chars, '')
        pieces.append(decoded.get(following, '\\' + following))
    return ''.join(pieces)


# Each stdin line is expected in md5sum binary-mode form: "<hash> *<filename>".
# A line without the " *" separator raises ValueError on the unpacking below.
for line in sys.stdin:
    md5hash, filename = line.rstrip('\n').split(' *', maxsplit=1)
    if md5hash.startswith('\\'):
        # A leading backslash marks a line whose filename is escaped.
        md5hash = md5hash[1:]
        filename = _unescape_md5sum_name(filename)
    source = os.path.join(sourcedir, filename)
    dest = os.path.join(destdir, md5hash)
    if not os.path.exists(dest):
        # First file seen with this hash: it becomes the reference copy.
        os.link(source, dest)
    elif are_same_files(source, dest):
        print('duplicate detected:', source, dest)
        if not os.path.samefile(source, dest):
            # Replace source with a link to dest atomically: link under a
            # temporary name, then rename over source. Unlinking source first
            # would lose the file if the subsequent link failed.
            staging = source + '.link-to-hashes.tmp'
            os.link(dest, staging)
            try:
                os.replace(staging, source)
            except OSError:
                os.unlink(staging)
                raise
    else:
        # Same hash but different content: leave both files untouched.
        print('clash detected:', source, dest)