aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xscripts/link-to-hashes.py54
1 files changed, 54 insertions, 0 deletions
diff --git a/scripts/link-to-hashes.py b/scripts/link-to-hashes.py
new file mode 100755
index 0000000..224a410
--- /dev/null
+++ b/scripts/link-to-hashes.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# Copyright 2017 vg@devys.org
+# SPDX-License-Identifier: MIT
+
+'''
+Hard link files to a common directory containing md5hash as the filename.
+'''
+
+
+import sys
+import os
+
+
# Command line: first the directory the listed file names are relative to,
# then the directory that receives the hash-named hard links (created on
# demand, an already existing directory is fine).
sourcedir, destdir = sys.argv[1], sys.argv[2]
os.makedirs(destdir, exist_ok=True)
+
+
def are_same_files(file1, file2, chunk_size=16 * 1024):
    '''
    Return True when file1 and file2 have identical content.

    Both paths are examined with os.stat. Files of different size are
    reported as different without being read, and two names that resolve
    to the same device and inode are reported as identical without being
    read. Otherwise the files are compared chunk by chunk.

    file1, file2 -- paths of the files to compare.
    chunk_size   -- number of bytes read from each file per iteration.

    Raises OSError (e.g. FileNotFoundError) if either file cannot be
    stat'ed or opened.
    '''
    stat1, stat2 = os.stat(file1), os.stat(file2)
    if stat1.st_size != stat2.st_size:
        return False
    # Same device and inode: both names already designate one single file,
    # whatever its link count, so there is nothing to read.
    if os.path.samestat(stat1, stat2):
        return True
    with open(file1, 'rb') as fin1, open(file2, 'rb') as fin2:
        while True:
            data1 = fin1.read(chunk_size)
            data2 = fin2.read(chunk_size)
            if data1 != data2:
                return False
            if not data1:
                # Both streams are exhausted (data2 equals data1 here).
                return True
+
+
# Characters md5sum may emit after a backslash inside an escaped file name.
_MD5SUM_ESCAPES = {'\\': '\\', 'n': '\n', 'r': '\r'}


def _unescape_filename(name):
    '''Undo the backslash escaping md5sum applies to a file name.'''
    pieces = []
    chars = iter(name)
    for char in chars:
        if char == '\\':
            # Consume the escaped character together with its backslash so
            # that an escaped backslash followed by "n" is not mistaken for
            # an escaped newline.
            following = next(chars, '')
            pieces.append(_MD5SUM_ESCAPES.get(following, '\\' + following))
        else:
            pieces.append(char)
    return ''.join(pieces)


def _parse_md5sum_line(line):
    '''
    Split one line of md5sum output into (md5hash, filename).

    Accepts both the binary ("hash *name") and the text ("hash  name")
    form. A line starting with a backslash has its file name unescaped.
    Raises ValueError when the line has no hash, mode marker or name.
    '''
    line = line.rstrip('\n')
    escaped = line.startswith('\\')
    if escaped:
        line = line[1:]
    md5hash, _, rest = line.partition(' ')
    # rest must be the one-character mode marker followed by the name.
    if not md5hash or rest[:1] not in ('*', ' ') or len(rest) < 2:
        raise ValueError('not an md5sum line: {!r}'.format(line))
    filename = rest[1:]
    if escaped:
        filename = _unescape_filename(filename)
    return md5hash, filename


def _replace_with_link(target, path):
    '''Atomically turn path into a hard link to target.'''
    # Link under a temporary name first, then rename over path, so that
    # path never disappears if linking fails or the script is interrupted.
    temp = '{}.link-{}'.format(path, os.getpid())
    os.link(target, temp)
    try:
        os.replace(temp, path)
    except OSError:
        os.unlink(temp)
        raise


# Each stdin line is one md5sum entry. The first file seen for a hash is
# linked into destdir under that hash; later files with the same hash and
# the same content are replaced by a hard link to it; files with the same
# hash but different content are only reported.
for line in sys.stdin:
    md5hash, filename = _parse_md5sum_line(line)
    source = os.path.join(sourcedir, filename)
    dest = os.path.join(destdir, md5hash)
    if not os.path.exists(dest):
        os.link(source, dest)
    elif are_same_files(source, dest):
        print('duplicate detected:', source, dest)
        # Already the same inode: nothing to relink (and renaming one hard
        # link over another link of the same file would be a no-op).
        if not os.path.samefile(source, dest):
            _replace_with_link(dest, source)
    else:
        print('clash detected:', source, dest)