diff --git a/README.md b/README.md index d6ba483..f124fdb 100644 --- a/README.md +++ b/README.md @@ -39,22 +39,23 @@ Run `chkbit -u PATH` to create/update the chkbit index. chkbit will - create a `.chkbit` index in every subdirectory of the path it was given. -- update the index with md5 hashes for every file. +- update the index with md5/sha512 hashes for every file. - report damage for files that failed the integrity check since the last run (check the exit status). Run `chkbit PATH` to verify only. ``` -usage: chkbit.py [-h] [-u] [-f] [-i] [-w N] [-q] [-v] [PATH [PATH ...]] +usage: chkbit.py [-h] [-u] [--algo ALGO] [-f] [-i] [-w N] [-q] [-v] [PATH ...] Checks the data integrity of your files. See https://github.com/laktak/chkbit-py positional arguments: PATH directories to check -optional arguments: +options: -h, --help show this help message and exit -u, --update update indices (without this chkbit will only verify files) + --algo ALGO hash algorithm: md5, sha512 -f, --force force update of damaged items -i, --verify-index verify files in the index only (will not report new files) -w N, --workers N number of workers to use, default=5 @@ -112,12 +113,28 @@ The disadvantage is obviously that you get hidden `.chkbit` files in your conten chkbit operates on files. -When run for the first time it records a md5 hash of the file contents as well as the file modification time. +When run for the first time it records a hash of the file contents as well as the file modification time. When you run it again it first checks the modification time, -- if the time changed (because you made an edit) it records a new md5 hash. -- otherwise it will compare the current md5 to the recorded value and report an error if they do not match. +- if the time changed (because you made an edit) it records a new hash. +- otherwise it will compare the current hash to the recorded value and report an error if they do not match. + +### I wish to use a stronger hash algorithm + +chkbit now supports sha512. You can specify it with `--algo sha512`. + +Note that existing index files will use the hash that they were created with. If you wish to update all hashes you need to delete your existing indexes first. + +### How can I delete the index files? + +List them with + +``` +find . -name .chkbit +``` + +and add `-delete` to delete. ### Can I test if chkbit is working correctly? diff --git a/chkbit/context.py b/chkbit/context.py new file mode 100644 index 0000000..e5bb3ce --- /dev/null +++ b/chkbit/context.py @@ -0,0 +1,13 @@ +import hashlib + + +class Context: + def __init__(self, verify_index, update, force, hash_algo): + + self.verify_index = verify_index + self.update = update + self.force = force + self.hash_algo = hash_algo + + if hash_algo not in ["md5", "sha512"]: + raise Exception(f"{hash_algo} is unknown.") diff --git a/chkbit/hashfile.py b/chkbit/hashfile.py index 0e1519c..e8f5b00 100644 --- a/chkbit/hashfile.py +++ b/chkbit/hashfile.py @@ -1,18 +1,25 @@ import hashlib -BLOCKSIZE = 2 ** 10 * 128 # kb +BLOCKSIZE = 2**10 * 128 # kb -def hashfile(path): - md5 = hashlib.md5() +def hashfile(path, hash_algo=None): + + if not hash_algo or hash_algo == "md5": + h = hashlib.md5() + elif hash_algo == "sha512": + h = hashlib.sha512() + else: + raise Exception(f"{hash_algo} is unknown.") + with open(path, "rb") as f: while True: buf = f.read(BLOCKSIZE) if len(buf) <= 0: break - md5.update(buf) - return md5.hexdigest() + h.update(buf) + return h.hexdigest() def hashtext(text): diff --git a/chkbit/index.py b/chkbit/index.py index 491947e..8d22102 100644 --- a/chkbit/index.py +++ b/chkbit/index.py @@ -58,12 +58,22 @@ class Index: self.log(stat, os.path.join(self.path, name)) # calc new hashes for this index - def update(self): + def update(self, context): for name in self.files: if self.should_ignore(name): self._log(Stat.SKIP, name) continue - self.new[name] = self._calc_file(name) + + a = context.hash_algo + # check previously used hash + if name in self.old: + old = self.old[name] + if "md5" in old: + a = "md5" # legacy structure + self.old[name] = {"mod": old["mod"], "a": a, "h": old["md5"]} + elif "a" in old: + a = old["a"] + self.new[name] = self._calc_file(name, a) # check/update the index (old vs new) def check_fix(self, force): @@ -77,7 +87,7 @@ class Index: b = self.new[name] amod = a["mod"] bmod = b["mod"] - if a["md5"] == b["md5"]: + if a["h"] == b["h"]: # ok, if the content stays the same the mod time does not matter self._log(Stat.OK, name) if amod != bmod: @@ -101,11 +111,11 @@ class Index: self._log(Stat.WARN_OLD, name) self._setmod() - def _calc_file(self, name): + def _calc_file(self, name, a): path = os.path.join(self.path, name) info = os.stat(path) mtime = int(info.st_mtime * 1000) - return {"mod": mtime, "md5": hashfile(path)} + return {"mod": mtime, "a": a, "h": hashfile(path, a)} def save(self): if self.modified: @@ -114,7 +124,7 @@ class Index: data["idx_hash"] = hashtext(text) with open(self.idx_file, "w", encoding="utf-8") as f: - json.dump(data, f) + json.dump(data, f, separators=(",", ":")) self.modified = False return True else: @@ -129,7 +139,11 @@ class Index: if "data" in data: # extract old format from js version for item in json.loads(data["data"]): - self.old[item["name"]] = {"mod": item["mod"], "md5": item["md5"]} + self.old[item["name"]] = { + "mod": item["mod"], + "a": "md5", + "h": item["md5"], + } elif "idx" in data: self.old = data["idx"] text = json.dumps(self.old, separators=(",", ":")) diff --git a/chkbit/indexthread.py b/chkbit/indexthread.py index a714e75..bd132ef 100644 --- a/chkbit/indexthread.py +++ b/chkbit/indexthread.py @@ -6,11 +6,11 @@ from chkbit import Index, Stat class IndexThread: - def __init__(self, idx, args, res_queue, todo_queue): + def __init__(self, idx, context, res_queue, todo_queue): self.idx = idx - self.verify_index_only = args.verify_index - self.update = args.update and not self.verify_index_only - self.force = args.force + self.verify_index_only = context.verify_index + self.update = context.update and not self.verify_index_only + self.context = context self.todo_queue = todo_queue self.res_queue = res_queue self.t = threading.Thread(target=self.run) @@ -40,10 +40,10 @@ class IndexThread: if e.load() or not self.verify_index_only: # calc the new hashes - e.update() + e.update(self.context) # compare - e.check_fix(self.force) + e.check_fix(self.context.force) # save if update is set if self.update: diff --git a/chkbit/main.py b/chkbit/main.py index 4cdc401..96afc9b 100644 --- a/chkbit/main.py +++ b/chkbit/main.py @@ -4,7 +4,7 @@ import time import argparse import queue import threading -from chkbit import IndexThread, Stat +from chkbit import Context, IndexThread, Stat STATUS_CODES = """ Status codes: @@ -63,6 +63,13 @@ class Main: help="update indices (without this chkbit will only verify files)", ) + parser.add_argument( + "--algo", + type=str, + default="md5", + help="hash algorithm: md5, sha512", + ) + parser.add_argument( "-f", "--force", action="store_true", help="force update of damaged items" ) @@ -90,6 +97,7 @@ class Main: action="store_true", help="quiet, don't show progress/information", ) + parser.add_argument( "-v", "--verbose", action="store_true", help="verbose output" ) @@ -120,9 +128,16 @@ class Main: for path in self.args.paths: todo_queue.put(path) + context = Context( + self.args.verify_index, + self.args.update, + self.args.force, + self.args.algo, + ) + # start indexing workers = [ - IndexThread(idx, self.args, self.res_queue, todo_queue) + IndexThread(idx, context, self.res_queue, todo_queue) for idx in range(self.args.workers) ]