diff --git a/Pipfile b/Pipfile index 2a7e7d5..9b913f5 100644 --- a/Pipfile +++ b/Pipfile @@ -7,6 +7,3 @@ name = "pypi" blake3 = ">=0.3.4" [dev-packages] - -[requires] -python_version = "3.11" diff --git a/Pipfile.lock b/Pipfile.lock index 0cc8f6b..2e4c3a8 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,12 +1,10 @@ { "_meta": { "hash": { - "sha256": "7d9212dcb9d58ac73c0d12a14a5102e0d3df649c4e95913e5bc8846bdb8d710a" + "sha256": "e8cd87a62cdc293b2cab0606525f4eb3bdfeb4f0373a64a5be10685b235d1078" }, "pipfile-spec": 6, - "requires": { - "python_version": "3.11" - }, + "requires": {}, "sources": [ { "name": "pypi", diff --git a/README.md b/README.md index b197d80..22341f6 100644 --- a/README.md +++ b/README.md @@ -22,16 +22,20 @@ Some cloud providers re-encode your videos or compress your images to save space ## Installation -``` -pip install --user chkbit -``` - -Or in its own environment: +The easiest way to install python CLI tools is with [pipx](https://pipx.pypa.io/latest/installation/). ``` pipx install chkbit ``` +You can also use pip: + +``` +pip install --user chkbit +``` + +**NOTE** version 3 now uses the blake3 hash algorithm by default as it is not only better but also faster than md5. + ## Usage Run `chkbit -u PATH` to create/update the chkbit index. @@ -39,13 +43,13 @@ Run `chkbit -u PATH` to create/update the chkbit index. chkbit will - create a `.chkbit` index in every subdirectory of the path it was given. -- update the index with md5/sha512/blake3 hashes for every file. +- update the index with blake3 (see --algo) hashes for every file. - report damage for files that failed the integrity check since the last run (check the exit status). Run `chkbit PATH` to verify only. ``` -usage: chkbit [-h] [-u] [--algo ALGO] [-f] [-i] [-s] [-w N] [-q] [-v] [PATH ...] +usage: chkbit [-h] [-u] [--algo ALGO] [-f] [-s] [--index-name NAME] [--ignore-name NAME] [-w N] [--plain] [-q] [-v] [PATH ...] Checks the data integrity of your files. See https://github.com/laktak/chkbit-py @@ -54,12 +58,14 @@ positional arguments: options: -h, --help show this help message and exit - -u, --update update indices (without this chkbit will only verify files) - --algo ALGO hash algorithm: md5, sha512, blake3 + -u, --update update indices (without this chkbit will verify files in readonly mode) + --algo ALGO hash algorithm: md5, sha512, blake3 (default: blake3) -f, --force force update of damaged items - -i, --verify-index verify files in the index only (will not report new files) -s, --skip-symlinks do not follow symlinks - -w N, --workers N number of workers to use, default=5 + --index-name NAME filename where chkbit stores its hashes (default: .chkbit) + --ignore-name NAME filename that chkbit reads its ignore list from (default: .chkbitignore) + -w N, --workers N number of workers to use (default: 5) + --plain show plain status instead of being fancy -q, --quiet quiet, don't show progress/information -v, --verbose verbose output @@ -74,7 +80,7 @@ Status codes: EXC: internal exception ``` -chkbit is set to use only 5 workers by default so it will not slow your system to a crawl. You can specify a higher number to make it a lot faster (requires about 128kB of memory per worker). +chkbit is set to use only 5 workers by default so it will not slow your system to a crawl. You can specify a higher number to make it a lot faster if the IO throughput can also keep up. ## Repair @@ -123,7 +129,7 @@ When you run it again it first checks the modification time, ### I wish to use a stronger hash algorithm -chkbit now supports sha512 and blake3. You can specify it with `--algo sha512` or `--algo blake3`. +chkbit now uses blake3 by default. You can also specify it with `--algo sha512` or `--algo md5`. Note that existing index files will use the hash that they were created with. If you wish to update all hashes you need to delete your existing indexes first. @@ -145,19 +151,30 @@ Create test and set the modified time: ``` $ echo foo1 > test; touch -t 201501010000 test $ chkbit -u . -add ./test -Processed 1 file(s). -Indices were updated. +new ./test + +Processed 1 file. +- 192.31 files/second +- 0.00 MB/second +- 1 directory was updated +- 1 file hash was added +- 0 file hashes were updated ``` -`add` indicates the file was added. + +`new` indicates a new file was added. Now update test with a new modified: ``` $ echo foo2 > test; touch -t 201501010001 test # update test & modified $ chkbit -u . upd ./test -Processed 1 file(s). -Indices were updated. + +Processed 1 file. +- 191.61 files/second +- 0.00 MB/second +- 1 directory was updated +- 0 file hashes were added +- 1 file hash was updated ``` `upd` indicates the file was updated. @@ -167,10 +184,13 @@ Now update test with the same modified to simulate damage: $ echo foo3 > test; touch -t 201501010001 test $ chkbit -u . DMG ./test -Processed 0 file(s). + +Processed 1 file. +- 173.93 files/second +- 0.00 MB/second chkbit detected damage in these files: ./test -error: detected 1 file(s) with damage! +error: detected 1 file with damage! ``` `DMG` indicates damage. diff --git a/chkbit.py b/chkbit.py deleted file mode 100755 index 6d188ff..0000000 --- a/chkbit.py +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python -import sys - -if sys.version_info < (3, 6): - sys.exit("Python < 3.6 is not supported") - -from chkbit.main import main - -main() diff --git a/chkbit/__init__.py b/chkbit/__init__.py index 3b8be30..915004f 100644 --- a/chkbit/__init__.py +++ b/chkbit/__init__.py @@ -1,4 +1,5 @@ +from chkbit.status import Status from chkbit.context import Context from chkbit.hashfile import hashfile, hashtext -from chkbit.index import Index, Stat -from chkbit.indexthread import IndexThread +from chkbit.index import Index +from chkbit.index_thread import IndexThread diff --git a/chkbit/context.py b/chkbit/context.py index 780cd8a..7abbf34 100644 --- a/chkbit/context.py +++ b/chkbit/context.py @@ -1,10 +1,35 @@ +import queue +from chkbit import Status + + class Context: - def __init__(self, verify_index, update, force, hash_algo, skip_symlinks): - self.verify_index = verify_index - self.update = update + def __init__( + self, + *, + num_workers=5, + force=False, + update=False, + hash_algo="blake3", + skip_symlinks=False, + index_filename=".chkbit", + ignore_filename=".chkbitignore", + ): + self.num_workers = num_workers self.force = force + self.update = update self.hash_algo = hash_algo self.skip_symlinks = skip_symlinks + self.index_filename = index_filename + self.ignore_filename = ignore_filename + + self.result_queue = queue.Queue() + self.hit_queue = queue.Queue() if hash_algo not in ["md5", "sha512", "blake3"]: raise Exception(f"{hash_algo} is unknown.") + + def log(self, stat: Status, path: str): + self.result_queue.put((0, stat, path)) + + def hit(self, *, cfiles: int = 0, cbytes: int = 0): + self.result_queue.put((1, cfiles, cbytes)) diff --git a/chkbit/hashfile.py b/chkbit/hashfile.py index 72bf96e..0d66d17 100644 --- a/chkbit/hashfile.py +++ b/chkbit/hashfile.py @@ -1,11 +1,12 @@ import hashlib +from typing import Callable BLOCKSIZE = 2**10 * 128 # kb -def hashfile(path, hash_algo=None): - if not hash_algo or hash_algo == "md5": +def hashfile(path: str, hash_algo: str, *, hit: Callable[[str], None]): + if hash_algo == "md5": h = hashlib.md5() elif hash_algo == "sha512": h = hashlib.sha512() @@ -14,14 +15,17 @@ def hashfile(path, hash_algo=None): h = blake3() else: - raise Exception(f"{hash_algo} is unknown.") + raise Exception(f"algo '{hash_algo}' is unknown.") with open(path, "rb") as f: while True: buf = f.read(BLOCKSIZE) - if len(buf) <= 0: + l = len(buf) + if l <= 0: break h.update(buf) + if hit: + hit(l) return h.hexdigest() diff --git a/chkbit/index.py b/chkbit/index.py index 8d22102..a09fc14 100644 --- a/chkbit/index.py +++ b/chkbit/index.py @@ -3,29 +3,14 @@ import os import subprocess import sys import json -from enum import Enum -from chkbit import hashfile, hashtext +from chkbit import hashfile, hashtext, Status VERSION = 2 # index version -INDEX = ".chkbit" -IGNORE = ".chkbitignore" - - -class Stat(Enum): - ERR_DMG = "DMG" - ERR_BITROT = "DMG" # legacy - ERR_IDX = "EIX" - WARN_OLD = "old" - NEW = "new" - UPDATE = "upd" - OK = "ok " - SKIP = "skp" - INTERNALEXCEPTION = "EXC" - FLAG_MOD = "fmod" class Index: - def __init__(self, path, files, *, log=None): + def __init__(self, context, path, files): + self.context = context self.path = path self.files = files self.old = {} @@ -34,15 +19,14 @@ class Index: self.load_ignore() self.updates = [] self.modified = True - self.log = log @property - def ignore_file(self): - return os.path.join(self.path, IGNORE) + def ignore_filepath(self): + return os.path.join(self.path, self.context.ignore_filename) @property - def idx_file(self): - return os.path.join(self.path, INDEX) + def index_filepath(self): + return os.path.join(self.path, self.context.index_filename) def should_ignore(self, name): for ignore in self.ignore: @@ -53,23 +37,23 @@ class Index: def _setmod(self): self.modified = True - def _log(self, stat, name): - if self.log: - self.log(stat, os.path.join(self.path, name)) + def _log(self, stat: Status, name: str): + self.context.log(stat, os.path.join(self.path, name)) # calc new hashes for this index - def update(self, context): + def update(self): for name in self.files: if self.should_ignore(name): - self._log(Stat.SKIP, name) + self._log(Status.SKIP, name) continue - a = context.hash_algo + a = self.context.hash_algo # check previously used hash if name in self.old: old = self.old[name] if "md5" in old: - a = "md5" # legacy structure + # legacy structure + a = "md5" self.old[name] = {"mod": old["mod"], "a": a, "h": old["md5"]} elif "a" in old: a = old["a"] @@ -79,7 +63,7 @@ class Index: def check_fix(self, force): for name in self.new.keys(): if not name in self.old: - self._log(Stat.NEW, name) + self._log(Status.NEW, name) self._setmod() continue @@ -89,14 +73,14 @@ class Index: bmod = b["mod"] if a["h"] == b["h"]: # ok, if the content stays the same the mod time does not matter - self._log(Stat.OK, name) + self._log(Status.OK, name) if amod != bmod: self._setmod() continue if amod == bmod: # damage detected - self._log(Stat.ERR_DMG, name) + self._log(Status.ERR_DMG, name) # replace with old so we don't loose the information on the next run # unless force is set if not force: @@ -105,17 +89,23 @@ class Index: self._setmod() elif amod < bmod: # ok, the file was updated - self._log(Stat.UPDATE, name) + self._log(Status.UPDATE, name) self._setmod() elif amod > bmod: - self._log(Stat.WARN_OLD, name) + self._log(Status.WARN_OLD, name) self._setmod() def _calc_file(self, name, a): path = os.path.join(self.path, name) info = os.stat(path) mtime = int(info.st_mtime * 1000) - return {"mod": mtime, "a": a, "h": hashfile(path, a)} + res = { + "mod": mtime, + "a": a, + "h": hashfile(path, a, hit=lambda l: self.context.hit(cbytes=l)), + } + self.context.hit(cfiles=1) + return res def save(self): if self.modified: @@ -123,7 +113,7 @@ class Index: text = json.dumps(self.new, separators=(",", ":")) data["idx_hash"] = hashtext(text) - with open(self.idx_file, "w", encoding="utf-8") as f: + with open(self.index_filepath, "w", encoding="utf-8") as f: json.dump(data, f, separators=(",", ":")) self.modified = False return True @@ -131,10 +121,10 @@ class Index: return False def load(self): - if not os.path.exists(self.idx_file): + if not os.path.exists(self.index_filepath): return False self.modified = False - with open(self.idx_file, "r", encoding="utf-8") as f: + with open(self.index_filepath, "r", encoding="utf-8") as f: data = json.load(f) if "data" in data: # extract old format from js version @@ -149,13 +139,13 @@ class Index: text = json.dumps(self.old, separators=(",", ":")) if data.get("idx_hash") != hashtext(text): self.modified = True - self._log(Stat.ERR_IDX, self.idx_file) + self._log(Status.ERR_IDX, self.index_filepath) return True def load_ignore(self): - if not os.path.exists(self.ignore_file): + if not os.path.exists(self.ignore_filepath): return - with open(self.ignore_file, "r", encoding="utf-8") as f: + with open(self.ignore_filepath, "r", encoding="utf-8") as f: text = f.read() self.ignore = list( diff --git a/chkbit/index_thread.py b/chkbit/index_thread.py new file mode 100644 index 0000000..6179edc --- /dev/null +++ b/chkbit/index_thread.py @@ -0,0 +1,69 @@ +import os +import sys +import time +import threading +from chkbit import Index, Status + + +class IndexThread: + def __init__(self, thread_no, context, input_queue): + self.thread_no = thread_no + self.update = context.update + self.context = context + self.input_queue = input_queue + self.t = threading.Thread(target=self._run) + self.t.daemon = True + self.t.start() + + def _process_root(self, parent): + files = [] + dirs = [] + + # load files and subdirs + for name in os.listdir(path=parent): + path = os.path.join(parent, name) + if name[0] == ".": + continue + if os.path.isdir(path): + if self.context.skip_symlinks and os.path.islink(path): + pass + else: + dirs.append(name) + elif os.path.isfile(path): + files.append(name) + + # load index + index = Index(self.context, parent, files) + index.load() + + # calc the new hashes + index.update() + + # compare + index.check_fix(self.context.force) + + # save if update is set + if self.update: + if index.save(): + self.context.log(Status.UPDATE_INDEX, "") + + # process subdirs + for name in dirs: + if not index.should_ignore(name): + self.input_queue.put(os.path.join(parent, name)) + else: + self.context.log(Status.SKIP, name + "/") + + def _run(self): + while True: + parent = self.input_queue.get() + if parent is None: + break + try: + self._process_root(parent) + except Exception as e: + self.context.log(Status.INTERNALEXCEPTION, f"{parent}: {e}") + self.input_queue.task_done() + + def join(self): + self.t.join() diff --git a/chkbit/indexthread.py b/chkbit/indexthread.py deleted file mode 100644 index 3b910d9..0000000 --- a/chkbit/indexthread.py +++ /dev/null @@ -1,71 +0,0 @@ -import os -import sys -import time -import threading -from chkbit import Index, Stat - - -class IndexThread: - def __init__(self, idx, context, res_queue, todo_queue): - self.idx = idx - self.verify_index_only = context.verify_index - self.update = context.update and not self.verify_index_only - self.context = context - self.todo_queue = todo_queue - self.res_queue = res_queue - self.t = threading.Thread(target=self.run) - self.t.daemon = True - self.t.start() - - def _log(self, stat, path): - if not self.verify_index_only or stat != Stat.NEW: - self.res_queue.put((self.idx, stat, path)) - - def _process_root(self, parent): - files = [] - dirs = [] - - # load files and subdirs - for name in os.listdir(path=parent): - path = os.path.join(parent, name) - if name[0] == ".": - continue - if os.path.isdir(path): - if self.context.skip_symlinks and os.path.islink(path): - pass - else: - dirs.append(name) - elif os.path.isfile(path): - files.append(name) - - # load index - e = Index(parent, files, log=self._log) - if e.load() or not self.verify_index_only: - # calc the new hashes - e.update(self.context) - - # compare - e.check_fix(self.context.force) - - # save if update is set - if self.update: - if e.save(): - self._log(Stat.FLAG_MOD, "") - - # process subdirs - for name in dirs: - if not e.should_ignore(name): - self.todo_queue.put(os.path.join(parent, name)) - else: - self._log(Stat.SKIP, name + "/") - - def run(self): - while True: - parent = self.todo_queue.get() - if parent is None: - break - try: - self._process_root(parent) - except Exception as e: - self._log(Stat.INTERNALEXCEPTION, f"{parent}: {e}") - self.todo_queue.task_done() diff --git a/chkbit/main.py b/chkbit/main.py deleted file mode 100644 index ec67fa7..0000000 --- a/chkbit/main.py +++ /dev/null @@ -1,189 +0,0 @@ -import os -import sys -import time -import argparse -import queue -import threading -from chkbit import Context, IndexThread, Stat - -STATUS_CODES = """ -Status codes: - DMG: error, data damage detected - EIX: error, index damaged - old: warning, file replaced by an older version - new: new file - upd: file updated - ok : check ok - skp: skipped (see .chkbitignore) - EXC: internal exception -""" - - -class Main: - def __init__(self): - self.stdscr = None - self.dmg_list = [] - self.err_list = [] - self.modified = False - self.verbose = False - self.total = 0 - self._parse_args() - - def _log(self, idx, stat, path): - if stat == Stat.FLAG_MOD: - self.modified = True - else: - if stat == Stat.ERR_DMG: - self.dmg_list.append(path) - elif stat == Stat.INTERNALEXCEPTION: - self.err_list.append(path) - elif stat in [Stat.OK, Stat.UPDATE, Stat.NEW]: - self.total += 1 - if self.verbose or not stat in [Stat.OK, Stat.SKIP]: - print(stat.value, path) - if not self.quiet and sys.stdout.isatty(): - print(self.total, end="\r") - - def _parse_args(self): - parser = argparse.ArgumentParser( - prog="chkbit", - description="Checks the data integrity of your files. See https://github.com/laktak/chkbit-py", - epilog=STATUS_CODES, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - parser.add_argument( - "paths", metavar="PATH", type=str, nargs="*", help="directories to check" - ) - - parser.add_argument( - "-u", - "--update", - action="store_true", - help="update indices (without this chkbit will only verify files)", - ) - - parser.add_argument( - "--algo", - type=str, - default="md5", - help="hash algorithm: md5, sha512, blake3", - ) - - parser.add_argument( - "-f", "--force", action="store_true", help="force update of damaged items" - ) - - parser.add_argument( - "-i", - "--verify-index", - action="store_true", - help="verify files in the index only (will not report new files)", - ) - - parser.add_argument( - "-s", "--skip-symlinks", action="store_true", help="do not follow symlinks" - ) - - parser.add_argument( - "-w", - "--workers", - metavar="N", - action="store", - type=int, - default=5, - help="number of workers to use, default=5", - ) - - parser.add_argument( - "-q", - "--quiet", - action="store_true", - help="quiet, don't show progress/information", - ) - - parser.add_argument( - "-v", "--verbose", action="store_true", help="verbose output" - ) - - self.args = parser.parse_args() - self.verbose = self.args.verbose - self.quiet = self.args.quiet - if not self.args.paths: - parser.print_help() - - def _res_worker(self): - while True: - item = self.res_queue.get() - if not item: - break - self._log(*item) - self.res_queue.task_done() - - def process(self): - self.res_queue = queue.Queue() - - # the todo queue is used to distribute the work - # to the index threads - todo_queue = queue.Queue() - - # put the initial paths into the queue - for path in self.args.paths: - todo_queue.put(path) - - context = Context( - self.args.verify_index, - self.args.update, - self.args.force, - self.args.algo, - self.args.skip_symlinks, - ) - - # start indexing - workers = [ - IndexThread(idx, context, self.res_queue, todo_queue) - for idx in range(self.args.workers) - ] - - # log the results from the workers - res_worker = threading.Thread(target=self._res_worker) - res_worker.daemon = True - res_worker.start() - - todo_queue.join() - self.res_queue.join() - - def print_result(self): - if not self.quiet: - print( - f"Processed {self.total} file(s){' in readonly mode' if not self.args.update else ''}." - ) - if self.modified: - print("Indices were updated.") - - if self.dmg_list: - print("chkbit detected damage in these files:", file=sys.stderr) - for err in self.dmg_list: - print(err, file=sys.stderr) - print( - f"error: detected {len(self.dmg_list)} file(s) with damage!", - file=sys.stderr, - ) - if self.err_list: - print("chkbit ran into errors:", file=sys.stderr) - for err in self.err_list: - print(err, file=sys.stderr) - - if self.dmg_list or self.err_list: - sys.exit(1) - - -def main(): - try: - m = Main() - if m.args.paths: - m.process() - m.print_result() - except KeyboardInterrupt: - print("abort") - sys.exit(1) diff --git a/chkbit/status.py b/chkbit/status.py new file mode 100644 index 0000000..30dfeb6 --- /dev/null +++ b/chkbit/status.py @@ -0,0 +1,13 @@ +from enum import Enum + + +class Status(Enum): + ERR_DMG = "DMG" + ERR_IDX = "EIX" + WARN_OLD = "old" + NEW = "new" + UPDATE = "upd" + OK = "ok " + SKIP = "skp" + INTERNALEXCEPTION = "EXC" + UPDATE_INDEX = "iup" diff --git a/cli/cli.py b/cli/cli.py new file mode 100644 index 0000000..99e998e --- /dev/null +++ b/cli/cli.py @@ -0,0 +1,61 @@ +import os +import sys + + +class CLI: + NO_COLOR = os.environ.get("NO_COLOR", "") + + class style: + reset = "\033[0m" + bold = "\033[01m" + disable = "\033[02m" + underline = "\033[04m" + reverse = "\033[07m" + strikethrough = "\033[09m" + invisible = "\033[08m" + + class esc: + up = "\033[A" + down = "\033[B" + right = "\033[C" + left = "\033[D" + + @staticmethod + def clear_line(opt=0): + # 0=to end, 1=from start, 2=all + return "\033[" + str(opt) + "K" + + @staticmethod + def write(*text): + for t in text: + sys.stdout.write(str(t)) + sys.stdout.flush() + + @staticmethod + def printline(*text): + CLI.write(*text, CLI.esc.clear_line(), "\n") + + # 4bit system colors + @staticmethod + def fg4(col): + # black=0,red=1,green=2,orange=3,blue=4,purple=5,cyan=6,lightgrey=7 + # darkgrey=8,lightred=9,lightgreen=10,yellow=11,lightblue=12,pink=13,lightcyan=14 + if CLI.NO_COLOR: + return "" + else: + return f"\033[{(30+col) if col<8 else (90-8+col)}m" + + # 8bit xterm colors + @staticmethod + def fg8(col): + if CLI.NO_COLOR: + return "" + else: + return f"\033[38;5;{col}m" + + @staticmethod + def bg8(col): + if CLI.NO_COLOR: + return "" + else: + return f"\033[48;5;{col}m" diff --git a/cli/main.py b/cli/main.py new file mode 100644 index 0000000..da10300 --- /dev/null +++ b/cli/main.py @@ -0,0 +1,326 @@ +import argparse +import os +import queue +import shutil +import sys +import threading +import time +from datetime import datetime, timedelta +from chkbit import Context, Status, IndexThread +from cli import CLI, Progress, RateCalc, sparkify + + +STATUS_CODES = """ +Status codes: + DMG: error, data damage detected + EIX: error, index damaged + old: warning, file replaced by an older version + new: new file + upd: file updated + ok : check ok + skp: skipped (see .chkbitignore) + EXC: internal exception +""" + +UPDATE_INTERVAL = timedelta(milliseconds=700) +MB = 1024 * 1024 + +CLI_BG = CLI.bg8(240) +CLI_SEP = "|" +CLI_SEP_FG = CLI.fg8(235) +CLI_FG1 = CLI.fg8(255) +CLI_FG2 = CLI.fg8(228) +CLI_FG3 = CLI.fg8(202) +CLI_OK_FG = CLI.fg4(2) +CLI_ALERT_FG = CLI.fg4(1) + + +class Main: + def __init__(self): + self.stdscr = None + self.dmg_list = [] + self.err_list = [] + self.num_idx_upd = 0 + self.num_new = 0 + self.num_upd = 0 + self.verbose = False + self.progress = Progress.Fancy + self.total = 0 + self.term_width = shutil.get_terminal_size()[0] + max_stat = int((self.term_width - 70) / 2) + self.fps = RateCalc(timedelta(seconds=1), max_stat=max_stat) + self.bps = RateCalc(timedelta(seconds=1), max_stat=max_stat) + + def _log(self, stat: Status, path: str): + if stat == Status.UPDATE_INDEX: + self.num_idx_upd += 1 + else: + if stat == Status.ERR_DMG: + self.total += 1 + self.dmg_list.append(path) + elif stat == Status.INTERNALEXCEPTION: + self.err_list.append(path) + elif stat in [Status.OK, Status.UPDATE, Status.NEW]: + self.total += 1 + if stat == Status.UPDATE: + self.num_upd += 1 + elif stat == Status.NEW: + self.num_new += 1 + + if self.verbose or not stat in [Status.OK, Status.SKIP]: + CLI.printline(stat.value, " ", path) + + def _res_worker(self, context: Context): + last = datetime.now() + while True: + try: + item = self.result_queue.get(timeout=0.2) + now = datetime.now() + if not item: + if self.progress == Progress.Fancy: + CLI.printline("") + break + t, *p = item + if t == 0: + self._log(*p) + last = datetime.min + else: + self.fps.push(now, p[0]) + self.bps.push(now, p[1]) + self.result_queue.task_done() + except queue.Empty: + now = datetime.now() + pass + if last + UPDATE_INTERVAL < now: + last = now + + if self.progress == Progress.Fancy: + stat_f = f"{self.fps.last} files/s" + stat_b = f"{int(self.bps.last/MB)} MB/s" + stat = f"[{'RW' if context.update else 'RO'}:{context.num_workers}] {self.total:>5} files $ {sparkify(self.fps.stats)} {stat_f:13} $ {sparkify(self.bps.stats)} {stat_b}" + stat = stat[: self.term_width - 1] + stat = stat.replace("$", CLI_SEP_FG + CLI_SEP + CLI_FG2, 1) + stat = stat.replace("$", CLI_SEP_FG + CLI_SEP + CLI_FG3, 1) + CLI.write( + CLI_BG, + CLI_FG1, + stat, + CLI.esc.clear_line(), + CLI.style.reset, + "\r", + ) + elif self.progress == Progress.Plain: + print(self.total, end="\r") + + def process(self, args): + # the input queue is used to distribute the work + # to the index threads + input_queue = queue.Queue() + + # put the initial paths into the queue + for path in args.paths: + input_queue.put(path) + + context = Context( + num_workers=args.workers, + force=args.force, + update=args.update, + hash_algo=args.algo, + skip_symlinks=args.skip_symlinks, + index_filename=args.index_name, + ignore_filename=args.ignore_name, + ) + self.result_queue = context.result_queue + + # start indexing + workers = [ + IndexThread(i, context, input_queue) for i in range(context.num_workers) + ] + + # log the results from the workers + res_worker = threading.Thread(target=self._res_worker, args=(context,)) + res_worker.daemon = True + res_worker.start() + + # wait for work to finish + input_queue.join() + + # signal workers to exit + for worker in workers: + input_queue.put(None) + + # signal res_worker to exit + self.result_queue.put(None) + + for worker in workers: + worker.join() + res_worker.join() + + return context + + def print_result(self, context): + def cprint(col, text): + if self.progress == Progress.Fancy: + CLI.printline(col, text, CLI.style.reset) + else: + print(text) + + def eprint(col, text): + if self.progress == Progress.Fancy: + CLI.write(col) + print(text, file=sys.stderr) + CLI.write(CLI.style.reset) + else: + print(text, file=sys.stderr) + + iunit = lambda x, u: f"{x} {u}{'s' if x!=1 else ''}" + iunit2 = lambda x, u1, u2: f"{x} {u2 if x!=1 else u1}" + + if self.progress != Progress.Quiet: + cprint( + CLI_OK_FG, + f"Processed {iunit(self.total, 'file')}{' in readonly mode' if not context.update else ''}.", + ) + + if self.progress == Progress.Fancy and self.total > 0: + elapsed = (datetime.now() - self.fps.start).total_seconds() + print(f"- {(self.fps.total+self.fps.current)/elapsed:.2f} files/second") + print(f"- {(self.bps.total+self.bps.current)/MB/elapsed:.2f} MB/second") + + if context.update: + if self.num_idx_upd: + cprint( + CLI_OK_FG, + f"- {iunit2(self.num_idx_upd, 'directory was', 'directories were')} updated\n" + + f"- {iunit2(self.num_new, 'file hash was', 'file hashes were')} added\n" + + f"- {iunit2(self.num_upd, 'file hash was', 'file hashes were')} updated", + ) + elif self.num_new + self.num_upd > 0: + cprint( + CLI_ALERT_FG, + f"No changes were made (specify -u to update):\n" + + f"- {iunit(self.num_new, 'file')} would have been added and\n" + + f"- {iunit(self.num_upd, 'file')} would have been updated.", + ) + + if self.dmg_list: + eprint(CLI_ALERT_FG, "chkbit detected damage in these files:") + for err in self.dmg_list: + print(err, file=sys.stderr) + n = len(self.dmg_list) + eprint( + CLI_ALERT_FG, + f"error: detected {iunit(n, 'file')} with damage!", + ) + + if self.err_list: + eprint(CLI_ALERT_FG, "chkbit ran into errors:") + for err in self.err_list: + print(err, file=sys.stderr) + + if self.dmg_list or self.err_list: + sys.exit(1) + + def run(self): + parser = argparse.ArgumentParser( + prog="chkbit", + description="Checks the data integrity of your files. See https://github.com/laktak/chkbit-py", + epilog=STATUS_CODES, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + parser.add_argument( + "paths", metavar="PATH", type=str, nargs="*", help="directories to check" + ) + + parser.add_argument( + "-u", + "--update", + action="store_true", + help="update indices (without this chkbit will verify files in readonly mode)", + ) + + parser.add_argument( + "--algo", + type=str, + default="blake3", + help="hash algorithm: md5, sha512, blake3 (default: blake3)", + ) + + parser.add_argument( + "-f", "--force", action="store_true", help="force update of damaged items" + ) + + parser.add_argument( + "-s", "--skip-symlinks", action="store_true", help="do not follow symlinks" + ) + + parser.add_argument( + "--index-name", + metavar="NAME", + type=str, + default=".chkbit", + help="filename where chkbit stores its hashes (default: .chkbit)", + ) + parser.add_argument( + "--ignore-name", + metavar="NAME", + type=str, + default=".chkbitignore", + help="filename that chkbit reads its ignore list from (default: .chkbitignore)", + ) + + parser.add_argument( + "-w", + "--workers", + metavar="N", + action="store", + type=int, + default=5, + help="number of workers to use (default: 5)", + ) + + parser.add_argument( + "--plain", + action="store_true", + help="show plain status instead of being fancy", + ) + + parser.add_argument( + "-q", + "--quiet", + action="store_true", + help="quiet, don't show progress/information", + ) + + parser.add_argument( + "-v", "--verbose", action="store_true", help="verbose output" + ) + + args = parser.parse_args() + + self.verbose = args.verbose + if args.quiet: + self.progress = Progress.Quiet + elif not sys.stdout.isatty(): + self.progress = Progress.Summary + elif args.plain: + self.progress = Progress.Plain + + if args.paths: + context = self.process(args) + self.print_result(context) + else: + parser.print_help() + + +def main(): + try: + Main().run() + except KeyboardInterrupt: + print("abort") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/cli/progress.py b/cli/progress.py new file mode 100644 index 0000000..9c12b02 --- /dev/null +++ b/cli/progress.py @@ -0,0 +1,8 @@ +from enum import Enum + + +class Progress(Enum): + Quiet = (0,) + Summary = (1,) + Plain = (2,) + Fancy = (3,) diff --git a/cli/rate_calc.py b/cli/rate_calc.py new file mode 100644 index 0000000..abc384c --- /dev/null +++ b/cli/rate_calc.py @@ -0,0 +1,28 @@ +from datetime import datetime, timedelta + + +class RateCalc: + def __init__(self, interval: timedelta, max_stat: int): + self.interval = interval + self.max_stat = max(max_stat, 10) + self.reset() + + def reset(self): + self.start = datetime.now() + self.updated = self.start + self.total = 0 + self.current = 0 + self.stats = [0] * self.max_stat + + @property + def last(self): + return self.stats[-1] + + def push(self, ts: datetime, value: int): + while self.updated + self.interval < ts: + self.stats.append(self.current) + self.stats = self.stats[-self.max_stat :] + self.total += self.current + self.current = 0 + self.updated += self.interval + self.current += value diff --git a/cli/sparklines.py b/cli/sparklines.py new file mode 100644 index 0000000..488b30b --- /dev/null +++ b/cli/sparklines.py @@ -0,0 +1,71 @@ +import math, os, re, string, sys + +""" +Copyright (c) 2021, Brandon Whaley , et al. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + + +spark_chars = "▁▂▃▄▅▆▇█" +"""Eight unicode characters of (nearly) steadily increasing height.""" + + +def sparkify(series, minimum=None, maximum=None, rows=1): + """Converts to a sparkline string. + + Example: + >>> sparkify([ 0.5, 1.2, 3.5, 7.3, 8.0, 12.5, float("nan"), 15.0, 14.2, 11.8, 6.1, + ... 1.9 ]) + u'▁▁▂▄▅▇ ██▆▄▂' + + >>> sparkify([1, 1, -2, 3, -5, 8, -13]) + u'▆▆▅▆▄█▁' + + Raises ValueError if input data cannot be converted to float. + Raises TypeError if series is not an iterable. + """ + series = [float(n) for n in series] + if all(not math.isfinite(n) for n in series): + return " " * len(series) + + minimum = min(filter(math.isfinite, series)) if minimum is None else minimum + maximum = max(filter(math.isfinite, series)) if maximum is None else maximum + data_range = maximum - minimum + if data_range == 0.0: + # Graph a baseline if every input value is equal. + return "".join([spark_chars[0] if math.isfinite(i) else " " for i in series]) + row_res = len(spark_chars) + resolution = row_res * rows + coefficient = (resolution - 1.0) / data_range + + def clamp(n): + return min(max(n, minimum), maximum) + + def spark_index(n): + """An integer from 0 to (resolution-1) proportional to the data range""" + return int(round((clamp(n) - minimum) * coefficient)) + + output = [] + for r in range(rows - 1, -1, -1): + row_out = [] + row_min = row_res * r + row_max = row_min + row_res - 1 + for n in series: + if not math.isfinite(n): + row_out.append(" ") + continue + i = spark_index(n) + if i < row_min: + row_out.append(" ") + elif i > row_max: + row_out.append(spark_chars[-1]) + else: + row_out.append(spark_chars[i % row_res]) + output.append("".join(row_out)) + return os.linesep.join(output) diff --git a/pyproject.toml b/pyproject.toml index 59b86bd..f580665 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "chkbit" -version = "2.4.0" +version = "3.0.0" description = "chkbit checks the data integrity of your files" authors = [ {name = "Christian Zangl", email = "laktak@cdak.net"}, @@ -16,4 +16,7 @@ license = {file = "LICENSE"} Homepage = "https://github.com/laktak/chkbit-py" [project.scripts] -chkbit = "chkbit.main:main" +chkbit = "cli.main:main" + +[tool.setuptools.packages.find] +include = ["chkbit","cli"]