aboutsummaryrefslogtreecommitdiffstats
path: root/scripts
diff options
context:
space:
mode:
author VG <vg@devys.org> 2017-06-03 18:20:42 +0200
committer VG <vg@devys.org> 2017-06-03 18:20:42 +0200
commit 40bc663ec6cec90f31565036623d2cbb586c1454 (patch)
tree 1d62e2f02ac9f3fae3ff11a07ae6ff0264becfad /scripts
parent ca5ca7b540d89d638e5edfd835f3fbf0f1d531a2 (diff)
download scripts-40bc663ec6cec90f31565036623d2cbb586c1454.tar.gz
scripts-40bc663ec6cec90f31565036623d2cbb586c1454.tar.bz2
scripts-40bc663ec6cec90f31565036623d2cbb586c1454.zip
add a script which links filename to their md5 hash
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/link-to-hashes.py54
1 files changed, 54 insertions, 0 deletions
diff --git a/scripts/link-to-hashes.py b/scripts/link-to-hashes.py
new file mode 100755
index 0000000..224a410
--- /dev/null
+++ b/scripts/link-to-hashes.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# Copyright 2017 vg@devys.org
+# SPDX-License-Identifier: MIT
+
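+'''
+Hard link files into a common directory, using each file's md5 hash as the
+name of the link.
+
+Usage: link-to-hashes.py SOURCEDIR DESTDIR < MD5SUMS
+
+MD5SUMS holds one "<md5hash> *<filename>" line per file, as printed by
+"md5sum -b"; file names are taken relative to SOURCEDIR.  A file whose
+content is already present in DESTDIR is replaced by a hard link to it.
+'''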
+
+
+import sys
+import os
+
+
+# Both directories are mandatory: fail with a usage hint rather than with an
+# IndexError traceback when one of them is missing.
+if len(sys.argv) < 3:
+    sys.exit('usage: {} SOURCEDIR DESTDIR < MD5SUMS'.format(sys.argv[0]))
+
+# Directory the file names read from stdin are relative to.
+sourcedir = sys.argv[1]
+# Directory receiving one hard link per distinct hash; created on demand.
+destdir = sys.argv[2]
+os.makedirs(destdir, exist_ok=True)
+
+
+def are_same_files(file1, file2):
+    '''
+    Return True if file1 and file2 hold exactly the same bytes.
+
+    Both paths are stat()ed first, so the cheap answers are tried before any
+    data is read: two names for the same device/inode pair are equal by
+    definition, and files of different sizes cannot be equal.  Only then are
+    the contents compared chunk by chunk.
+
+    Raises OSError if either path cannot be stat()ed or opened.
+    '''
+    stat1, stat2 = os.stat(file1), os.stat(file2)
+    # Same device and inode: this is one single file (hard link, identical
+    # path, symlink, ...), whatever its link count is.
+    if os.path.samestat(stat1, stat2):
+        return True
+    if stat1.st_size != stat2.st_size:
+        return False
+    chunk_size = 16 * 1024
+    with open(file1, 'rb') as fin1, open(file2, 'rb') as fin2:
+        while True:
+            data1 = fin1.read(chunk_size)
+            data2 = fin2.read(chunk_size)
+            if data1 != data2:
+                return False
+            if not data1:
+                # Equal and empty chunks: both files ended together.
+                return True
+
+
+# Escape sequences md5sum uses inside the file name of a line flagged with a
+# leading backslash.
+_MD5SUM_ESCAPES = {'\\': '\\', 'n': '\n', 'r': '\r'}
+
+
+def _unescape_md5sum_name(name):
+    '''Decode an escaped md5sum file name in a single left-to-right pass.'''
+    decoded = []
+    chars = iter(name)
+    for char in chars:
+        if char == '\\':
+            escaped = next(chars, '')
+            # Unknown sequences are kept verbatim instead of being dropped.
+            char = _MD5SUM_ESCAPES.get(escaped, '\\' + escaped)
+        decoded.append(char)
+    return ''.join(decoded)
+
+
+def _parse_md5sum_line(line):
+    '''
+    Split one line of md5sum output into (md5hash, filename).
+
+    Accepts both the binary ("hash *name") and the text ("hash  name")
+    flavours, and decodes the name of lines flagged with a leading backslash.
+    Returns None when the line does not look like md5sum output.
+    '''
+    line = line.rstrip('\n')
+    escaped = line.startswith('\\')
+    if escaped:
+        line = line[1:]
+    md5hash, _, rest = line.partition(' ')
+    # The hash becomes a file name inside destdir: insist on plain hex digits
+    # so that a crafted line cannot designate a path outside of it.
+    if not md5hash or set(md5hash) - set('0123456789abcdefABCDEF'):
+        return None
+    # rest[0] is the mode flag ('*' or ' '); the file name follows it.
+    if rest[:1] not in ('*', ' ') or len(rest) < 2:
+        return None
+    filename = rest[1:]
+    if escaped:
+        filename = _unescape_md5sum_name(filename)
+    return md5hash, filename
+
+
+def _replace_by_link(target, path):
+    '''
+    Atomically replace path with a hard link to target.
+
+    The link is created under a temporary name next to path and then renamed
+    over it, so path never goes missing, even when linking fails.  path and
+    target must not already be the same file: renaming a hard link over
+    another link to the same inode does nothing and would leave the
+    temporary name behind.
+    '''
+    tmp = os.path.join(os.path.dirname(path),
+                       '.link-to-hashes.{}.tmp'.format(os.getpid()))
+    os.link(target, tmp)
+    try:
+        os.replace(tmp, path)
+    except OSError:
+        os.unlink(tmp)
+        raise
+
+
+for line in sys.stdin:
+    parsed = _parse_md5sum_line(line)
+    if parsed is None:
+        # Blank lines are ignored silently, anything else is reported.
+        if line.strip():
+            print('skipping malformed line:', repr(line), file=sys.stderr)
+        continue
+    md5hash, filename = parsed
+    source = os.path.join(sourcedir, filename)
+    dest = os.path.join(destdir, md5hash)
+    if not os.path.exists(dest):
+        os.link(source, dest)
+    elif os.path.samefile(source, dest):
+        # Already linked (e.g. the script is run a second time): nothing to do.
+        continue
+    elif are_same_files(source, dest):
+        print('duplicate detected:', source, dest)
+        _replace_by_link(dest, source)
+    else:
+        # Same hash but different content: leave both files untouched.
+        print('clash detected:', source, dest)