#!/usr/bin/env python3
# Copyright 2017 vg@devys.org
# SPDX-License-Identifier: MIT
#
# Provenance: scripts/link-to-hashes.py as introduced by commit
# 40bc663ec6cec90f31565036623d2cbb586c1454 (VG, Sat, 3 Jun 2017),
# "add a script which links filename to their md5 hash".

'''
Hard link files to a common directory containing md5hash as the filename.

Usage: link-to-hashes.py SOURCEDIR DESTDIR < CHECKSUMS

CHECKSUMS is read from standard input, one ``md5sum``-style line per file
(``HASH *NAME`` for binary mode or ``HASH  NAME`` for text mode).  Each NAME
is resolved relative to SOURCEDIR and hard linked as DESTDIR/HASH.  When
DESTDIR/HASH already exists with identical content, the source file is
replaced by a hard link to it; when the content differs, a clash is reported
and nothing is modified.
'''


import os
import re
import sys


# Size of the chunks read when comparing two files byte by byte.
CHUNK_SIZE = 16 * 1024

# Escape sequences md5sum emits in file names on lines starting with '\'.
MD5SUM_ESCAPES = {'n': '\n', 'r': '\r', '\\': '\\'}


def are_same_files(file1, file2):
    '''
    Return True when file1 and file2 hold the same content.

    Two paths referring to the same inode on the same device are reported
    as identical without reading any data.  Otherwise the sizes are compared
    first and the contents are then compared chunk by chunk.

    Raises OSError if either path cannot be stat'ed or opened.
    '''
    stat1, stat2 = os.stat(file1), os.stat(file2)
    if os.path.samestat(stat1, stat2):
        return True
    if stat1.st_size != stat2.st_size:
        return False
    with open(file1, 'rb') as fin1, open(file2, 'rb') as fin2:
        while True:
            data1 = fin1.read(CHUNK_SIZE)
            data2 = fin2.read(CHUNK_SIZE)
            if data1 != data2:
                return False
            if not data1:
                # Both streams are exhausted (data2 == data1 == b'').
                return True


def parse_md5sum_line(line):
    '''
    Split one md5sum output line into a (md5hash, filename) tuple.

    Both the binary-mode (``HASH *NAME``) and text-mode (``HASH  NAME``)
    layouts are accepted.  A leading backslash marks a line whose file name
    was escaped by md5sum; in that case ``\\\\`` , ``\\n`` and ``\\r`` are
    turned back into a backslash, a newline and a carriage return.

    Raises ValueError when the line does not have that layout or when the
    hash is not purely hexadecimal (it is later used as a path component).
    '''
    text = line.rstrip('\n')
    escaped = text.startswith('\\')
    if escaped:
        text = text[1:]
    # The hash never contains a space, so the first space ends it; the next
    # character is the mode marker (' ' or '*') and the rest is the name.
    md5hash, _, rest = text.partition(' ')
    if len(rest) < 2 or rest[0] not in ' *':
        raise ValueError('not a md5sum line: {!r}'.format(line))
    if not re.fullmatch(r'[0-9A-Fa-f]+', md5hash):
        raise ValueError('invalid hash in line: {!r}'.format(line))
    filename = rest[1:]
    if escaped:
        # Decode in a single pass so that an escaped backslash followed by
        # a plain 'n' is not mistaken for an escaped newline.
        filename = re.sub(
            r'\\(.)',
            lambda match: MD5SUM_ESCAPES.get(match.group(1), match.group(0)),
            filename,
        )
    return md5hash, filename


def link_to_hash(source, dest):
    '''
    Make source and dest two hard links to the same file.

    Returns one of:
      'linked'    -- dest did not exist and now links to source;
      'already'   -- source and dest were already the same file;
      'duplicate' -- dest held the same content, source now links to it;
      'clash'     -- dest holds different content, nothing was changed.

    Raises OSError when a file cannot be read or a link cannot be created.
    '''
    try:
        # Attempt the link directly instead of testing for existence first,
        # which would leave a window between the test and the link.
        os.link(source, dest)
        return 'linked'
    except FileExistsError:
        pass
    if os.path.samefile(source, dest):
        # Already deduplicated by a previous run: nothing to do.
        return 'already'
    if not are_same_files(source, dest):
        print('clash detected:', source, dest)
        return 'clash'
    print('duplicate detected:', source, dest)
    # Link under a temporary name and rename over the source, so the source
    # name never disappears even if creating the link fails.
    temp = os.path.join(
        os.path.dirname(source),
        '.link-to-hashes-{}.tmp'.format(os.getpid()),
    )
    os.link(dest, temp)
    try:
        os.replace(temp, source)
    except OSError:
        os.unlink(temp)
        raise
    return 'duplicate'


def main(argv=None, stdin=None):
    '''
    Run the script and return its exit status.

    argv defaults to sys.argv and stdin to sys.stdin; stdin may be any
    iterable of lines.  Blank lines are ignored and malformed lines are
    reported on stderr and skipped.
    '''
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    if len(argv) < 3:
        print('usage: {} SOURCEDIR DESTDIR < CHECKSUMS'.format(
            argv[0] if argv else 'link-to-hashes.py'), file=sys.stderr)
        return 2
    sourcedir, destdir = argv[1], argv[2]
    os.makedirs(destdir, exist_ok=True)
    for line in stdin:
        if not line.strip():
            continue
        try:
            md5hash, filename = parse_md5sum_line(line)
        except ValueError as err:
            print('skipping line:', err, file=sys.stderr)
            continue
        link_to_hash(
            os.path.join(sourcedir, filename),
            os.path.join(destdir, md5hash),
        )
    return 0


if __name__ == '__main__':
    sys.exit(main())