add -e2v (file integrity checker)

This commit is contained in:
ed 2022-07-13 00:48:39 +02:00
parent 3683984c8d
commit 48b957f1d5
6 changed files with 159 additions and 10 deletions

View file

@ -663,8 +663,11 @@ through arguments:
* `-e2t` enables metadata indexing on upload * `-e2t` enables metadata indexing on upload
* `-e2ts` also scans for tags in all files that don't have tags yet * `-e2ts` also scans for tags in all files that don't have tags yet
* `-e2tsr` also deletes all existing tags, doing a full reindex * `-e2tsr` also deletes all existing tags, doing a full reindex
* `-e2v` verifies file integrity at startup, comparing hashes from the db
* `-e2vu` patches the database with the new hashes from the filesystem
* `-e2vp` panics and kills copyparty instead
the same arguments can be set as volume flags, in addition to `d2d`, `d2ds`, `d2t`, `d2ts` for disabling: the same arguments can be set as volume flags, in addition to `d2d`, `d2ds`, `d2t`, `d2ts`, `d2v` for disabling:
* `-v ~/music::r:c,e2dsa,e2tsr` does a full reindex of everything on startup * `-v ~/music::r:c,e2dsa,e2tsr` does a full reindex of everything on startup
* `-v ~/music::r:c,d2d` disables **all** indexing, even if any `-e2*` are on * `-v ~/music::r:c,d2d` disables **all** indexing, even if any `-e2*` are on
* `-v ~/music::r:c,d2t` disables all `-e2t*` (tags), does not affect `-e2d*` * `-v ~/music::r:c,d2t` disables all `-e2t*` (tags), does not affect `-e2d*`

View file

@ -89,4 +89,7 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
try:
main() main()
except:
pass

View file

@ -394,6 +394,7 @@ def run_argparse(argv: list[str], formatter: Any, retry: bool) -> argparse.Names
\033[36md2ts\033[35m disables metadata collection for existing files \033[36md2ts\033[35m disables metadata collection for existing files
\033[36md2ds\033[35m disables onboot indexing, overrides -e2ds* \033[36md2ds\033[35m disables onboot indexing, overrides -e2ds*
\033[36md2t\033[35m disables metadata collection, overrides -e2t* \033[36md2t\033[35m disables metadata collection, overrides -e2t*
\033[36md2v\033[35m disables file verification, overrides -e2v*
\033[36md2d\033[35m disables all database stuff, overrides -e2* \033[36md2d\033[35m disables all database stuff, overrides -e2*
\033[36mnohash=\\.iso$\033[35m skips hashing file contents if path matches *.iso \033[36mnohash=\\.iso$\033[35m skips hashing file contents if path matches *.iso
\033[36mnoidx=\\.iso$\033[35m fully ignores the contents at paths matching *.iso \033[36mnoidx=\\.iso$\033[35m fully ignores the contents at paths matching *.iso
@ -586,6 +587,9 @@ def run_argparse(argv: list[str], formatter: Any, retry: bool) -> argparse.Names
ap2.add_argument("-e2d", action="store_true", help="enable up2k database, making files searchable + enables upload deduplocation") ap2.add_argument("-e2d", action="store_true", help="enable up2k database, making files searchable + enables upload deduplocation")
ap2.add_argument("-e2ds", action="store_true", help="scan writable folders for new files on startup; sets -e2d") ap2.add_argument("-e2ds", action="store_true", help="scan writable folders for new files on startup; sets -e2d")
ap2.add_argument("-e2dsa", action="store_true", help="scans all folders on startup; sets -e2ds") ap2.add_argument("-e2dsa", action="store_true", help="scans all folders on startup; sets -e2ds")
ap2.add_argument("-e2v", action="store_true", help="verify file integrity; rehash all files and compare with db")
ap2.add_argument("-e2vu", action="store_true", help="on hash mismatch: update the database with the new hash")
ap2.add_argument("-e2vp", action="store_true", help="on hash mismatch: panic and quit copyparty")
ap2.add_argument("--hist", metavar="PATH", type=u, help="where to store volume data (db, thumbs)") ap2.add_argument("--hist", metavar="PATH", type=u, help="where to store volume data (db, thumbs)")
ap2.add_argument("--no-hash", metavar="PTN", type=u, help="regex: disable hashing of matching paths during e2ds folder scans") ap2.add_argument("--no-hash", metavar="PTN", type=u, help="regex: disable hashing of matching paths during e2ds folder scans")
ap2.add_argument("--no-idx", metavar="PTN", type=u, help="regex: disable indexing of matching paths during e2ds folder scans") ap2.add_argument("--no-idx", metavar="PTN", type=u, help="regex: disable indexing of matching paths during e2ds folder scans")

View file

@ -1008,7 +1008,7 @@ class AuthSrv(object):
if ptn: if ptn:
vol.flags[vf] = re.compile(ptn) vol.flags[vf] = re.compile(ptn)
for k in ["e2t", "e2ts", "e2tsr"]: for k in ["e2t", "e2ts", "e2tsr", "e2v", "e2vu", "e2vp"]:
if getattr(self.args, k): if getattr(self.args, k):
vol.flags[k] = True vol.flags[k] = True
@ -1030,7 +1030,7 @@ class AuthSrv(object):
self._read_volflag(vol.flags, "mtp", self.args.mtp, True) self._read_volflag(vol.flags, "mtp", self.args.mtp, True)
# d2d drops all database features for a volume # d2d drops all database features for a volume
for grp, rm in [["d2d", "e2d"], ["d2t", "e2t"]]: for grp, rm in [["d2d", "e2d"], ["d2t", "e2t"], ["d2d", "e2v"]]:
if not vol.flags.get(grp, False): if not vol.flags.get(grp, False):
continue continue
@ -1052,6 +1052,12 @@ class AuthSrv(object):
vol.flags = {k: v for k, v in vol.flags.items() if not k.startswith(rm)} vol.flags = {k: v for k, v in vol.flags.items() if not k.startswith(rm)}
for grp, rm in [["d2v", "e2v"]]:
if not vol.flags.get(grp, False):
continue
vol.flags = {k: v for k, v in vol.flags.items() if not k.startswith(rm)}
# verify tags mentioned by -mt[mp] are used by -mte # verify tags mentioned by -mt[mp] are used by -mte
local_mtp = {} local_mtp = {}
local_only_mtp = {} local_only_mtp = {}

View file

@ -9,6 +9,7 @@ import math
import os import os
import re import re
import shutil import shutil
import signal
import stat import stat
import subprocess as sp import subprocess as sp
import threading import threading
@ -434,10 +435,36 @@ class Up2k(object):
if vac: if vac:
need_vac[vol] = True need_vac[vol] = True
if "e2ts" not in vol.flags: if "e2v" in vol.flags:
t = "online, idle" t = "online (integrity-check pending)"
else: elif "e2ts" in vol.flags:
t = "online (tags pending)" t = "online (tags pending)"
else:
t = "online, idle"
self.volstate[vol.vpath] = t
# file contents verification
for vol in vols:
if self.stop:
break
if "e2v" not in vol.flags:
continue
t = "online (verifying integrity)"
self.volstate[vol.vpath] = t
self.log("{} [{}]".format(t, vol.realpath))
nmod = self._verify_integrity(vol)
if nmod:
self.log("modified {} entries in the db".format(nmod), 3)
need_vac[vol] = True
if "e2ts" in vol.flags:
t = "online (tags pending)"
else:
t = "online, idle"
self.volstate[vol.vpath] = t self.volstate[vol.vpath] = t
@ -736,7 +763,9 @@ class Up2k(object):
self.log("file: {}".format(abspath)) self.log("file: {}".format(abspath))
try: try:
hashes = self._hashlist_from_file(abspath) hashes = self._hashlist_from_file(
abspath, "a{}, ".format(self.pp.n)
)
except Exception as ex: except Exception as ex:
self.log("hash: {} @ [{}]".format(repr(ex), abspath)) self.log("hash: {} @ [{}]".format(repr(ex), abspath))
continue continue
@ -816,6 +845,106 @@ class Up2k(object):
return n_rm return n_rm
def _verify_integrity(self, vol: VFS) -> int:
    """
    rehash every file registered in this volume's db and compare the
    result against the stored wark; expensive, and blocks database
    access (holds self.mutex) until finished

    behavior is controlled by volume flags:
      noidx:  regex; matching paths are skipped entirely
      nohash: regex; matching paths get a metadata-based wark
              instead of a content hash
      e2vp:   on any mismatch, set a nonzero retcode, SIGTERM ourselves,
              and raise
      e2vu:   on mismatch, overwrite the db entry with the values
              observed on the filesystem

    files which cannot be stat'ed or hashed are logged and skipped;
    they are not counted as mismatches

    returns the number of db rows that were rewritten (0 unless e2vu),
    or -1 if the scan was interrupted by self.stop
    """
    ptop = vol.realpath
    assert self.pp and self.mtag

    cur = self.cur[ptop]
    rei = vol.flags.get("noidx")
    reh = vol.flags.get("nohash")
    e2vu = "e2vu" in vol.flags
    e2vp = "e2vp" in vol.flags

    # other volumes mounted below this one, as paths relative to this
    # volume; their rows are filtered out of the queries below
    excl = [
        d[len(vol.vpath) :].lstrip("/")
        for d in self.asrv.vfs.all_vols
        if d != vol.vpath and (d.startswith(vol.vpath + "/") or not vol.vpath)
    ]
    qexa: list[str] = []
    pexa: list[str] = []
    for vpath in excl:
        # NOTE(review): the LIKE pattern is a plain prefix match, so it
        # would also exclude sibling folders sharing the prefix, and any
        # % or _ in vpath acts as a wildcard -- confirm this is acceptable
        qexa.append("up.rd != ? and not up.rd like ?||'%'")
        pexa.extend([vpath, vpath])

    pex = tuple(pexa)
    qex = " and ".join(qexa)
    if qex:
        qex = " where " + qex

    # (rd, fn, new-wark, new-size, new-mtime) for each mismatching row;
    # rd/fn are kept in their raw db form so the update can find the row
    rewark: list[tuple[str, str, str, int, int]] = []

    with self.mutex:
        # first pass: totals for the progress indicator
        b_left = 0
        n_left = 0
        q = "select sz from up" + qex
        for (sz,) in cur.execute(q, pex):
            b_left += sz  # sum() can overflow according to docs
            n_left += 1

        q = "select w, mt, sz, rd, fn from up" + qex
        for w, mt, sz, drd, dfn in cur.execute(q, pex):
            if self.stop:
                return -1

            n_left -= 1
            b_left -= sz
            # a leading "//" marks names which need decoding through s3dec
            if drd.startswith("//") or dfn.startswith("//"):
                rd, fn = s3dec(drd, dfn)
            else:
                rd = drd
                fn = dfn

            abspath = os.path.join(ptop, rd, fn)
            if rei and rei.search(abspath):
                continue

            nohash = reh.search(abspath) if reh else False

            pf = "v{}, {:.0f}+".format(n_left, b_left / 1024 / 1024)
            self.pp.msg = pf + abspath

            # a file which is in the db but gone (or unreadable) on disk
            # must not abort verification of the remaining files
            try:
                st = bos.stat(abspath)
            except Exception as ex:
                self.log("stat: {} @ [{}]".format(repr(ex), abspath), 3)
                continue

            sz2 = st.st_size
            mt2 = int(st.st_mtime)

            if nohash:
                w2 = up2k_wark_from_metadata(self.salt, sz2, mt2, rd, fn)
            else:
                # announce big files since hashing them takes a while
                if sz2 > 1024 * 1024 * 32:
                    self.log("file: {}".format(abspath))

                try:
                    hashes = self._hashlist_from_file(abspath, pf)
                except Exception as ex:
                    self.log("hash: {} @ [{}]".format(repr(ex), abspath))
                    continue

                w2 = up2k_wark_from_hashlist(self.salt, sz2, hashes)

            if w == w2:
                continue

            rewark.append((drd, dfn, w2, sz2, mt2))

            t = "hash mismatch: {}\n  db: {} ({} byte, {})\n  fs: {} ({} byte, {})"
            t = t.format(abspath, w, sz, mt, w2, sz2, mt2)
            self.log(t, 1)

        if e2vp and rewark:
            self.hub.retcode = 1
            os.kill(os.getpid(), signal.SIGTERM)
            raise Exception("{} files have incorrect hashes".format(len(rewark)))

        if not e2vu:
            return 0

        # no "limit 1" here; sqlite only accepts LIMIT on UPDATE when it
        # was compiled with SQLITE_ENABLE_UPDATE_DELETE_LIMIT, and the
        # (rd, fn) filter already identifies the row
        q = "update up set w = ?, sz = ?, mt = ? where rd = ? and fn = ?"
        for rd, fn, w, sz, mt in rewark:
            cur.execute(q, (w, sz, int(mt), rd, fn))

        return len(rewark)
def _build_tags_index(self, vol: VFS) -> tuple[int, int, bool]: def _build_tags_index(self, vol: VFS) -> tuple[int, int, bool]:
ptop = vol.realpath ptop = vol.realpath
with self.mutex: with self.mutex:
@ -2225,14 +2354,15 @@ class Up2k(object):
return wark return wark
def _hashlist_from_file(self, path: str) -> list[str]: def _hashlist_from_file(self, path: str, prefix: str = "") -> list[str]:
fsz = bos.path.getsize(path) fsz = bos.path.getsize(path)
csz = up2k_chunksize(fsz) csz = up2k_chunksize(fsz)
ret = [] ret = []
with open(fsenc(path), "rb", 512 * 1024) as f: with open(fsenc(path), "rb", 512 * 1024) as f:
while fsz > 0: while fsz > 0:
if self.pp: if self.pp:
self.pp.msg = "{} MB, {}".format(int(fsz / 1024 / 1024), path) mb = int(fsz / 1024 / 1024)
self.pp.msg = "{}{} MB, {}".format(prefix, mb, path)
hashobj = hashlib.sha512() hashobj = hashlib.sha512()
rem = min(csz, fsz) rem = min(csz, fsz)

View file

@ -137,6 +137,9 @@ IMPLICATIONS = [
["e2tsr", "e2ts"], ["e2tsr", "e2ts"],
["e2ts", "e2t"], ["e2ts", "e2t"],
["e2t", "e2d"], ["e2t", "e2d"],
["e2vu", "e2v"],
["e2vp", "e2v"],
["e2v", "e2d"],
] ]