add sha512, see #3

This commit is contained in:
Christian Zangl 2022-02-20 19:11:29 +01:00
parent 457d38b19b
commit d0c78ec3ee
No known key found for this signature in database
GPG Key ID: 6D468AC36E2A4B3D
6 changed files with 92 additions and 26 deletions

View File

@ -39,22 +39,23 @@ Run `chkbit -u PATH` to create/update the chkbit index.
chkbit will
- create a `.chkbit` index in every subdirectory of the path it was given.
- update the index with md5 hashes for every file.
- update the index with md5/sha512 hashes for every file.
- report damage for files that failed the integrity check since the last run (check the exit status).
Run `chkbit PATH` to verify only.
```
usage: chkbit.py [-h] [-u] [-f] [-i] [-w N] [-q] [-v] [PATH [PATH ...]]
usage: chkbit.py [-h] [-u] [--algo ALGO] [-f] [-i] [-w N] [-q] [-v] [PATH ...]
Checks the data integrity of your files. See https://github.com/laktak/chkbit-py
positional arguments:
PATH directories to check
optional arguments:
options:
-h, --help show this help message and exit
-u, --update update indices (without this chkbit will only verify files)
--algo ALGO hash algorithm: md5, sha512
-f, --force force update of damaged items
-i, --verify-index verify files in the index only (will not report new files)
-w N, --workers N number of workers to use, default=5
@ -112,12 +113,28 @@ The disadvantage is obviously that you get hidden `.chkbit` files in your conten
chkbit operates on files.
When run for the first time it records a md5 hash of the file contents as well as the file modification time.
When run for the first time it records a hash of the file contents as well as the file modification time.
When you run it again it first checks the modification time,
- if the time changed (because you made an edit) it records a new md5 hash.
- otherwise it will compare the current md5 to the recorded value and report an error if they do not match.
- if the time changed (because you made an edit) it records a new hash.
- otherwise it will compare the current hash to the recorded value and report an error if they do not match.
### I wish to use a stronger hash algorithm
chkbit now supports sha512. You can specify it with `--algo sha512`.
Note that existing index files keep using the hash algorithm they were created with. If you wish to switch all hashes to a different algorithm, you need to delete your existing index files first.
### How can I delete the index files?
List them with
```
find . -name .chkbit
```
and append `-delete` to the command to delete them.
### Can I test if chkbit is working correctly?

13
chkbit/context.py Normal file
View File

@ -0,0 +1,13 @@
import hashlib
class Context:
    """Run-wide settings handed to the indexing workers.

    Args:
        verify_index: verify files in the index only.
        update: update indices (otherwise files are only verified).
        force: force update of damaged items.
        hash_algo: name of the hash algorithm, either "md5" or "sha512".

    Raises:
        ValueError: if ``hash_algo`` is not one of the supported algorithms.
    """

    def __init__(self, verify_index, update, force, hash_algo):
        # Validate first so a rejected algorithm never leaves a
        # half-initialized object behind.
        if hash_algo not in ("md5", "sha512"):
            # ValueError is a subclass of Exception, so existing
            # `except Exception` handlers keep working.
            raise ValueError(f"{hash_algo} is unknown.")

        self.verify_index = verify_index
        self.update = update
        self.force = force
        self.hash_algo = hash_algo

View File

@ -1,18 +1,25 @@
import hashlib
BLOCKSIZE = 2 ** 10 * 128 # kb
BLOCKSIZE = 2**10 * 128 # kb
def hashfile(path):
md5 = hashlib.md5()
def hashfile(path, hash_algo=None):
if not hash_algo or hash_algo == "md5":
h = hashlib.md5()
elif hash_algo == "sha512":
h = hashlib.sha512()
else:
raise Exception(f"{hash_algo} is unknown.")
with open(path, "rb") as f:
while True:
buf = f.read(BLOCKSIZE)
if len(buf) <= 0:
break
md5.update(buf)
return md5.hexdigest()
h.update(buf)
return h.hexdigest()
def hashtext(text):

View File

@ -58,12 +58,22 @@ class Index:
self.log(stat, os.path.join(self.path, name))
# calc new hashes for this index
def update(self):
def update(self, context):
for name in self.files:
if self.should_ignore(name):
self._log(Stat.SKIP, name)
continue
self.new[name] = self._calc_file(name)
a = context.hash_algo
# check previously used hash
if name in self.old:
old = self.old[name]
if "md5" in old:
a = "md5" # legacy structure
self.old[name] = {"mod": old["mod"], "a": a, "h": old["md5"]}
elif "a" in old:
a = old["a"]
self.new[name] = self._calc_file(name, a)
# check/update the index (old vs new)
def check_fix(self, force):
@ -77,7 +87,7 @@ class Index:
b = self.new[name]
amod = a["mod"]
bmod = b["mod"]
if a["md5"] == b["md5"]:
if a["h"] == b["h"]:
# ok, if the content stays the same the mod time does not matter
self._log(Stat.OK, name)
if amod != bmod:
@ -101,11 +111,11 @@ class Index:
self._log(Stat.WARN_OLD, name)
self._setmod()
def _calc_file(self, name):
def _calc_file(self, name, a):
path = os.path.join(self.path, name)
info = os.stat(path)
mtime = int(info.st_mtime * 1000)
return {"mod": mtime, "md5": hashfile(path)}
return {"mod": mtime, "a": a, "h": hashfile(path, a)}
def save(self):
if self.modified:
@ -114,7 +124,7 @@ class Index:
data["idx_hash"] = hashtext(text)
with open(self.idx_file, "w", encoding="utf-8") as f:
json.dump(data, f)
json.dump(data, f, separators=(",", ":"))
self.modified = False
return True
else:
@ -129,7 +139,11 @@ class Index:
if "data" in data:
# extract old format from js version
for item in json.loads(data["data"]):
self.old[item["name"]] = {"mod": item["mod"], "md5": item["md5"]}
self.old[item["name"]] = {
"mod": item["mod"],
"a": "md5",
"h": item["md5"],
}
elif "idx" in data:
self.old = data["idx"]
text = json.dumps(self.old, separators=(",", ":"))

View File

@ -6,11 +6,11 @@ from chkbit import Index, Stat
class IndexThread:
def __init__(self, idx, args, res_queue, todo_queue):
def __init__(self, idx, context, res_queue, todo_queue):
self.idx = idx
self.verify_index_only = args.verify_index
self.update = args.update and not self.verify_index_only
self.force = args.force
self.verify_index_only = context.verify_index
self.update = context.update and not self.verify_index_only
self.context = context
self.todo_queue = todo_queue
self.res_queue = res_queue
self.t = threading.Thread(target=self.run)
@ -40,10 +40,10 @@ class IndexThread:
if e.load() or not self.verify_index_only:
# calc the new hashes
e.update()
e.update(self.context)
# compare
e.check_fix(self.force)
e.check_fix(self.context.force)
# save if update is set
if self.update:

View File

@ -4,7 +4,7 @@ import time
import argparse
import queue
import threading
from chkbit import IndexThread, Stat
from chkbit import Context, IndexThread, Stat
STATUS_CODES = """
Status codes:
@ -63,6 +63,13 @@ class Main:
help="update indices (without this chkbit will only verify files)",
)
parser.add_argument(
"--algo",
type=str,
default="md5",
help="hash algorithm: md5, sha512",
)
parser.add_argument(
"-f", "--force", action="store_true", help="force update of damaged items"
)
@ -90,6 +97,7 @@ class Main:
action="store_true",
help="quiet, don't show progress/information",
)
parser.add_argument(
"-v", "--verbose", action="store_true", help="verbose output"
)
@ -120,9 +128,16 @@ class Main:
for path in self.args.paths:
todo_queue.put(path)
context = Context(
self.args.verify_index,
self.args.update,
self.args.force,
self.args.algo,
)
# start indexing
workers = [
IndexThread(idx, self.args, self.res_queue, todo_queue)
IndexThread(idx, context, self.res_queue, todo_queue)
for idx in range(self.args.workers)
]