From 48b957f1d51667801984b9d174e27632e7e5d60f Mon Sep 17 00:00:00 2001 From: ed Date: Wed, 13 Jul 2022 00:48:39 +0200 Subject: [PATCH] add -e2v (file integrity checker) --- README.md | 5 +- bin/mtag/image-noexif.py | 5 +- copyparty/__main__.py | 4 ++ copyparty/authsrv.py | 10 ++- copyparty/up2k.py | 142 +++++++++++++++++++++++++++++++++++++-- copyparty/util.py | 3 + 6 files changed, 159 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index bd0430c8..7d3bbfcf 100644 --- a/README.md +++ b/README.md @@ -663,8 +663,11 @@ through arguments: * `-e2t` enables metadata indexing on upload * `-e2ts` also scans for tags in all files that don't have tags yet * `-e2tsr` also deletes all existing tags, doing a full reindex +* `-e2v` verifies file integrity at startup, comparing hashes from the db +* `-e2vu` patches the database with the new hashes from the filesystem +* `-e2vp` panics and kills copyparty instead -the same arguments can be set as volume flags, in addition to `d2d`, `d2ds`, `d2t`, `d2ts` for disabling: +the same arguments can be set as volume flags, in addition to `d2d`, `d2ds`, `d2t`, `d2ts`, `d2v` for disabling: * `-v ~/music::r:c,e2dsa,e2tsr` does a full reindex of everything on startup * `-v ~/music::r:c,d2d` disables **all** indexing, even if any `-e2*` are on * `-v ~/music::r:c,d2t` disables all `-e2t*` (tags), does not affect `-e2d*` diff --git a/bin/mtag/image-noexif.py b/bin/mtag/image-noexif.py index bc009c17..0b0d5918 100644 --- a/bin/mtag/image-noexif.py +++ b/bin/mtag/image-noexif.py @@ -89,4 +89,7 @@ def main(): if __name__ == "__main__": - main() + try: + main() + except: + pass diff --git a/copyparty/__main__.py b/copyparty/__main__.py index 38fb9d10..72abfebe 100644 --- a/copyparty/__main__.py +++ b/copyparty/__main__.py @@ -394,6 +394,7 @@ def run_argparse(argv: list[str], formatter: Any, retry: bool) -> argparse.Names \033[36md2ts\033[35m disables metadata collection for existing files \033[36md2ds\033[35m disables onboot 
indexing, overrides -e2ds* \033[36md2t\033[35m disables metadata collection, overrides -e2t* + \033[36md2v\033[35m disables file verification, overrides -e2v* \033[36md2d\033[35m disables all database stuff, overrides -e2* \033[36mnohash=\\.iso$\033[35m skips hashing file contents if path matches *.iso \033[36mnoidx=\\.iso$\033[35m fully ignores the contents at paths matching *.iso @@ -586,6 +587,9 @@ def run_argparse(argv: list[str], formatter: Any, retry: bool) -> argparse.Names ap2.add_argument("-e2d", action="store_true", help="enable up2k database, making files searchable + enables upload deduplocation") ap2.add_argument("-e2ds", action="store_true", help="scan writable folders for new files on startup; sets -e2d") ap2.add_argument("-e2dsa", action="store_true", help="scans all folders on startup; sets -e2ds") + ap2.add_argument("-e2v", action="store_true", help="verify file integrity; rehash all files and compare with db") + ap2.add_argument("-e2vu", action="store_true", help="on hash mismatch: update the database with the new hash") + ap2.add_argument("-e2vp", action="store_true", help="on hash mismatch: panic and quit copyparty") ap2.add_argument("--hist", metavar="PATH", type=u, help="where to store volume data (db, thumbs)") ap2.add_argument("--no-hash", metavar="PTN", type=u, help="regex: disable hashing of matching paths during e2ds folder scans") ap2.add_argument("--no-idx", metavar="PTN", type=u, help="regex: disable indexing of matching paths during e2ds folder scans") diff --git a/copyparty/authsrv.py b/copyparty/authsrv.py index 92612529..a21700de 100644 --- a/copyparty/authsrv.py +++ b/copyparty/authsrv.py @@ -1008,7 +1008,7 @@ class AuthSrv(object): if ptn: vol.flags[vf] = re.compile(ptn) - for k in ["e2t", "e2ts", "e2tsr"]: + for k in ["e2t", "e2ts", "e2tsr", "e2v", "e2vu", "e2vp"]: if getattr(self.args, k): vol.flags[k] = True @@ -1030,7 +1030,7 @@ class AuthSrv(object): self._read_volflag(vol.flags, "mtp", self.args.mtp, True) # d2d drops all 
database features for a volume - for grp, rm in [["d2d", "e2d"], ["d2t", "e2t"]]: + for grp, rm in [["d2d", "e2d"], ["d2t", "e2t"], ["d2d", "e2v"]]: if not vol.flags.get(grp, False): continue @@ -1052,6 +1052,12 @@ class AuthSrv(object): vol.flags = {k: v for k, v in vol.flags.items() if not k.startswith(rm)} + for grp, rm in [["d2v", "e2v"]]: + if not vol.flags.get(grp, False): + continue + + vol.flags = {k: v for k, v in vol.flags.items() if not k.startswith(rm)} + # verify tags mentioned by -mt[mp] are used by -mte local_mtp = {} local_only_mtp = {} diff --git a/copyparty/up2k.py b/copyparty/up2k.py index eacb2bfd..82b05a93 100644 --- a/copyparty/up2k.py +++ b/copyparty/up2k.py @@ -9,6 +9,7 @@ import math import os import re import shutil +import signal import stat import subprocess as sp import threading @@ -434,10 +435,36 @@ class Up2k(object): if vac: need_vac[vol] = True - if "e2ts" not in vol.flags: - t = "online, idle" - else: + if "e2v" in vol.flags: + t = "online (integrity-check pending)" + elif "e2ts" in vol.flags: t = "online (tags pending)" + else: + t = "online, idle" + + self.volstate[vol.vpath] = t + + # file contents verification + for vol in vols: + if self.stop: + break + + if "e2v" not in vol.flags: + continue + + t = "online (verifying integrity)" + self.volstate[vol.vpath] = t + self.log("{} [{}]".format(t, vol.realpath)) + + nmod = self._verify_integrity(vol) + if nmod: + self.log("modified {} entries in the db".format(nmod), 3) + need_vac[vol] = True + + if "e2ts" in vol.flags: + t = "online (tags pending)" + else: + t = "online, idle" self.volstate[vol.vpath] = t @@ -736,7 +763,9 @@ class Up2k(object): self.log("file: {}".format(abspath)) try: - hashes = self._hashlist_from_file(abspath) + hashes = self._hashlist_from_file( + abspath, "a{}, ".format(self.pp.n) + ) except Exception as ex: self.log("hash: {} @ [{}]".format(repr(ex), abspath)) continue @@ -816,6 +845,106 @@ class Up2k(object): return n_rm + def _verify_integrity(self, vol: 
VFS) -> int: + """expensive; blocks database access until finished""" + ptop = vol.realpath + assert self.pp and self.mtag + + cur = self.cur[ptop] + rei = vol.flags.get("noidx") + reh = vol.flags.get("nohash") + e2vu = "e2vu" in vol.flags + e2vp = "e2vp" in vol.flags + + excl = [ + d[len(vol.vpath) :].lstrip("/") + for d in self.asrv.vfs.all_vols + if d != vol.vpath and (d.startswith(vol.vpath + "/") or not vol.vpath) + ] + qexa: list[str] = [] + pexa: list[str] = [] + for vpath in excl: + qexa.append("up.rd != ? and not up.rd like ?||'%'") + pexa.extend([vpath, vpath]) + + pex = tuple(pexa) + qex = " and ".join(qexa) + if qex: + qex = " where " + qex + + rewark: list[tuple[str, str, str, int, int]] = [] + + with self.mutex: + b_left = 0 + n_left = 0 + q = "select sz from up" + qex + for (sz,) in cur.execute(q, pex): + b_left += sz # sum() can overflow according to docs + n_left += 1 + + q = "select w, mt, sz, rd, fn from up" + qex + for w, mt, sz, drd, dfn in cur.execute(q, pex): + if self.stop: + return -1 + + n_left -= 1 + b_left -= sz + if drd.startswith("//") or dfn.startswith("//"): + rd, fn = s3dec(drd, dfn) + else: + rd = drd + fn = dfn + + abspath = os.path.join(ptop, rd, fn) + if rei and rei.search(abspath): + continue + + nohash = reh.search(abspath) if reh else False + + pf = "v{}, {:.0f}+".format(n_left, b_left / 1024 / 1024) + self.pp.msg = pf + abspath + + st = bos.stat(abspath) + sz2 = st.st_size + mt2 = int(st.st_mtime) + + if nohash: + w2 = up2k_wark_from_metadata(self.salt, sz2, mt2, rd, fn) + else: + if sz2 > 1024 * 1024 * 32: + self.log("file: {}".format(abspath)) + + try: + hashes = self._hashlist_from_file(abspath, pf) + except Exception as ex: + self.log("hash: {} @ [{}]".format(repr(ex), abspath)) + continue + + w2 = up2k_wark_from_hashlist(self.salt, sz2, hashes) + + if w == w2: + continue + + rewark.append((drd, dfn, w2, sz2, mt2)) + + t = "hash mismatch: {}\n db: {} ({} byte, {})\n fs: {} ({} byte, {})" + t = t.format(abspath, w, sz, 
mt, w2, sz2, mt2) + self.log(t, 1) + + if e2vp and rewark: + self.hub.retcode = 1 + os.kill(os.getpid(), signal.SIGTERM) + raise Exception("{} files have incorrect hashes".format(len(rewark))) + + if not e2vu: + return 0 + + for rd, fn, w, sz, mt in rewark: + q = "update up set w = ?, sz = ?, mt = ? where rd = ? and fn = ? limit 1" + cur.execute(q, (w, sz, int(mt), rd, fn)) + + return len(rewark) + def _build_tags_index(self, vol: VFS) -> tuple[int, int, bool]: ptop = vol.realpath with self.mutex: @@ -2225,14 +2354,15 @@ class Up2k(object): return wark - def _hashlist_from_file(self, path: str) -> list[str]: + def _hashlist_from_file(self, path: str, prefix: str = "") -> list[str]: fsz = bos.path.getsize(path) csz = up2k_chunksize(fsz) ret = [] with open(fsenc(path), "rb", 512 * 1024) as f: while fsz > 0: if self.pp: - self.pp.msg = "{} MB, {}".format(int(fsz / 1024 / 1024), path) + mb = int(fsz / 1024 / 1024) + self.pp.msg = "{}{} MB, {}".format(prefix, mb, path) hashobj = hashlib.sha512() rem = min(csz, fsz) diff --git a/copyparty/util.py b/copyparty/util.py index 0d5f964f..3e81cf30 100644 --- a/copyparty/util.py +++ b/copyparty/util.py @@ -137,6 +137,9 @@ IMPLICATIONS = [ ["e2tsr", "e2ts"], ["e2ts", "e2t"], ["e2t", "e2d"], + ["e2vu", "e2v"], + ["e2vp", "e2v"], + ["e2v", "e2d"], ]