verify on-disk contents before dedup;

previously, it was assumed that the database and filesystem would
never desync, and that an upload could therefore safely be replaced
with a symlink to an existing copy on-disk, as long as said copy
still existed on-disk at all

this is fine if copyparty is the only software that makes changes to
the filesystem, but that is a shitty assumption to make in hindsight

add `--safe-dedup` which takes a "safety level"; at the default (50)
it will no longer blindly assume that the filesystem has not been
altered through other means: the file contents are now hashed and
compared against the database before any dedup takes place

deduplication can be much slower as a result, but it is definitely
worth it, since this avoids some potentially very unpleasant surprises

the previous behavior can be restored with `--safe-dedup 1`
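
in rough terms the new check behaves like the standalone sketch below; this is a simplified illustration with a made-up function name, not the actual up2k.py code (the real thing chunks the file via `_hashlist_from_file` and compares warks built by `up2k_wark_from_hashlist`, as seen in the diff further down):

import hashlib
import os

def dedup_candidate_ok(path: str, db_wark: str, db_size: int, level: int = 50) -> bool:
    # simplified stand-in for the new verification; "wark" is the file
    # identifier copyparty stores in its database
    st = os.stat(path)
    if st.st_size != db_size:
        return False          # a size mismatch always disqualifies the copy
    if level < 10:
        return True           # --safe-dedup 1: trust the database like before
    h = hashlib.sha512()      # stand-in for the per-chunk up2k hashing
    with open(path, "rb") as f:
        for buf in iter(lambda: f.read(512 * 1024), b""):
            h.update(buf)
    return h.hexdigest() == db_wark   # dedup only if the contents still match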
ed 2024-09-06 19:08:14 +00:00
parent 08848be784
commit 6e671c5245
5 changed files with 70 additions and 23 deletions


@@ -1978,7 +1978,7 @@ safety profiles:
* `--hardlink` creates hardlinks instead of symlinks when deduplicating uploads, which is less maintenance
* however note if you edit one file it will also affect the other copies
* `--vague-403` returns a "404 not found" instead of "401 unauthorized" which is a common enterprise meme
* `--nih` removes the server hostname from directory listings
* `-nih` removes the server hostname from directory listings
* option `-sss` is a shortcut for the above plus:
* `--no-dav` disables webdav support


@@ -992,6 +992,7 @@ def add_upload(ap):
ap2.add_argument("--reg-cap", metavar="N", type=int, default=38400, help="max number of uploads to keep in memory when running without \033[33m-e2d\033[0m; roughly 1 MiB RAM per 600")
ap2.add_argument("--no-fpool", action="store_true", help="disable file-handle pooling -- instead, repeatedly close and reopen files during upload (bad idea to enable this on windows and/or cow filesystems)")
ap2.add_argument("--use-fpool", action="store_true", help="force file-handle pooling, even when it might be dangerous (multiprocessing, filesystems lacking sparse-files support, ...)")
ap2.add_argument("--safe-dedup", metavar="N", type=int, default=50, help="how careful to be when deduplicating files; [\033[32m1\033[0m] = just verify the filesize, [\033[32m50\033[0m] = verify file contents have not been altered (volflag=safededup)")
ap2.add_argument("--hardlink", action="store_true", help="prefer hardlinks instead of symlinks when possible (within same filesystem) (volflag=hardlink)")
ap2.add_argument("--never-symlink", action="store_true", help="do not fallback to symlinks when a hardlink cannot be made (volflag=neversymlink)")
ap2.add_argument("--no-dedup", action="store_true", help="disable symlink/hardlink creation; copy file contents instead (volflag=copydupes)")


@@ -21,6 +21,7 @@ def vf_bmap() -> dict[str, str]:
"no_thumb": "dthumb",
"no_vthumb": "dvthumb",
"no_athumb": "dathumb",
"safe_dedup": "safededup",
}
for k in (
"dotsrch",
@@ -132,6 +133,7 @@ flagcats = {
"nodupe": "rejects existing files (instead of symlinking them)",
"hardlink": "does dedup with hardlinks instead of symlinks",
"neversymlink": "disables symlink fallback; full copy instead",
"safededup": "verify on-disk data before using it for dedup",
"copydupes": "disables dedup, always saves full copies of dupes",
"sparse": "force use of sparse files, mainly for s3-backed storage",
"daw": "enable full WebDAV write support (dangerous);\nPUT-operations will now \033[1;31mOVERWRITE\033[0;35m existing files",


@@ -1462,7 +1462,7 @@ class Up2k(object):
self.log("file: {}".format(abspath))
try:
hashes = self._hashlist_from_file(
hashes, _ = self._hashlist_from_file(
abspath, "a{}, ".format(self.pp.n)
)
except Exception as ex:
@@ -1711,7 +1711,7 @@ class Up2k(object):
self.log("file: {}".format(abspath))
try:
hashes = self._hashlist_from_file(abspath, pf)
hashes, _ = self._hashlist_from_file(abspath, pf)
except Exception as ex:
self.log("hash: {} @ [{}]".format(repr(ex), abspath))
continue
@@ -2670,6 +2670,9 @@ class Up2k(object):
rand = vfs.flags.get("rand") or cj.get("rand")
lost: list[tuple["sqlite3.Cursor", str, str]] = []
safe_dedup = vfs.flags.get("safededup") or 50
data_ok = safe_dedup < 10 or n4g
vols = [(ptop, jcur)] if jcur else []
if vfs.flags.get("xlink"):
vols += [(k, v) for k, v in self.cur.items() if k != ptop]
@@ -2677,7 +2680,7 @@ class Up2k(object):
# force upload time rather than last-modified
cj["lmod"] = int(time.time())
alts: list[tuple[int, int, dict[str, Any]]] = []
alts: list[tuple[int, int, dict[str, Any], "sqlite3.Cursor", str, str]] = []
for ptop, cur in vols:
allv = self.asrv.vfs.all_vols
cvfs = next((v for v in allv.values() if v.realpath == ptop), vfs)
@@ -2707,12 +2710,11 @@ class Up2k(object):
wark, st.st_size, dsize, st.st_mtime, dtime, dp_abs
)
self.log(t)
raise Exception("desync")
raise Exception()
except Exception as ex:
if n4g:
st = os.stat_result((0, -1, -1, 0, 0, 0, 0, 0, 0, 0))
else:
if str(ex) != "desync":
lost.append((cur, dp_dir, dp_fn))
continue
@@ -2736,18 +2738,42 @@ class Up2k(object):
if k in cj:
j[k] = cj[k]
# offset of 1st diff in vpaths
zig = (
n + 1
for n, (c1, c2) in enumerate(
zip(dp_dir + "\r", cj["prel"] + "\n")
)
if c1 != c2
)
score = (
(3 if st.st_dev == dev else 0)
+ (2 if dp_dir == cj["prel"] else 0)
(6969 if st.st_dev == dev else 0)
+ (3210 if dp_dir == cj["prel"] else next(zig))
+ (1 if dp_fn == cj["name"] else 0)
)
alts.append((score, -len(alts), j))
alts.append((score, -len(alts), j, cur, dp_dir, dp_fn))
if alts:
best = sorted(alts, reverse=True)[0]
job = best[2]
else:
job = None
inc_ap = djoin(cj["ptop"], cj["prel"], cj["name"])
for dupe in sorted(alts, reverse=True):
rj = dupe[2]
orig_ap = djoin(rj["ptop"], rj["prel"], rj["name"])
if data_ok or inc_ap == orig_ap:
data_ok = True
job = rj
break
else:
self.log("asserting contents of %s" % (orig_ap,))
dhashes, st = self._hashlist_from_file(orig_ap)
dwark = up2k_wark_from_hashlist(self.salt, st.st_size, dhashes)
if wark != dwark:
t = "will not dedup (fs index desync): fs=%s, db=%s, file: %s"
self.log(t % (dwark, wark, orig_ap))
lost.append(dupe[3:])
continue
data_ok = True
job = rj
break
if job and wark in reg:
# self.log("pop " + wark + " " + job["name"] + " handle_json db", 4)
@@ -2756,7 +2782,7 @@ class Up2k(object):
if lost:
c2 = None
for cur, dp_dir, dp_fn in lost:
t = "forgetting deleted file: /{}"
t = "forgetting desynced db entry: /{}"
self.log(t.format(vjoin(vjoin(vfs.vpath, dp_dir), dp_fn)))
self.db_rm(cur, dp_dir, dp_fn, cj["size"])
if c2 and c2 != cur:
@@ -2791,7 +2817,13 @@ class Up2k(object):
del reg[wark]
break
if st and not self.args.nw and not n4g and st.st_size != rj["size"]:
inc_ap = djoin(cj["ptop"], cj["prel"], cj["name"])
orig_ap = djoin(rj["ptop"], rj["prel"], rj["name"])
if self.args.nw or n4g or not st:
pass
elif st.st_size != rj["size"]:
t = "will not dedup (fs index desync): {}, size fs={} db={}, mtime fs={} db={}, file: {}"
t = t.format(
wark, st.st_size, rj["size"], st.st_mtime, rj["lmod"], path
@@ -2799,6 +2831,15 @@ class Up2k(object):
self.log(t)
del reg[wark]
elif inc_ap != orig_ap and not data_ok:
self.log("asserting contents of %s" % (orig_ap,))
dhashes, _ = self._hashlist_from_file(orig_ap)
dwark = up2k_wark_from_hashlist(self.salt, st.st_size, dhashes)
if wark != dwark:
t = "will not dedup (fs index desync): fs=%s, idx=%s, file: %s"
self.log(t % (dwark, wark, orig_ap))
del reg[wark]
if job or wark in reg:
job = job or reg[wark]
if (
@@ -4246,8 +4287,11 @@ class Up2k(object):
return wark
def _hashlist_from_file(self, path: str, prefix: str = "") -> list[str]:
fsz = bos.path.getsize(path)
def _hashlist_from_file(
self, path: str, prefix: str = ""
) -> tuple[list[str], os.stat_result]:
st = bos.stat(path)
fsz = st.st_size
csz = up2k_chunksize(fsz)
ret = []
suffix = " MB, {}".format(path)
@@ -4260,7 +4304,7 @@ class Up2k(object):
while fsz > 0:
# same as `hash_at` except for `imutex` / bufsz
if self.stop:
return []
return [], st
if self.pp:
mb = fsz // (1024 * 1024)
@@ -4281,7 +4325,7 @@ class Up2k(object):
digest = base64.urlsafe_b64encode(digest)
ret.append(digest.decode("utf-8"))
return ret
return ret, st
def _new_upload(self, job: dict[str, Any], vfs: VFS, depth: int) -> dict[str, str]:
pdir = djoin(job["ptop"], job["prel"])
@@ -4582,7 +4626,7 @@ class Up2k(object):
self.salt, inf.st_size, int(inf.st_mtime), rd, fn
)
else:
hashes = self._hashlist_from_file(abspath)
hashes, _ = self._hashlist_from_file(abspath)
if not hashes:
return False
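
the signature change above is what makes the verification cheap to wire in: `_hashlist_from_file` now also returns the `os.stat_result` it had to fetch anyway, so the dedup check gets the on-disk size without a second stat() while older callers simply discard it. a simplified stand-in (the real chunk size comes from `up2k_chunksize` and the digests are urlsafe-base64):

import hashlib
import os

def hashlist_from_file(path: str, chunksz: int = 1024 * 1024) -> tuple[list[str], os.stat_result]:
    # simplified stand-in for Up2k._hashlist_from_file
    st = os.stat(path)
    ret = []
    with open(path, "rb") as f:
        for buf in iter(lambda: f.read(chunksz), b""):
            ret.append(hashlib.sha512(buf).hexdigest())
    return ret, st

# callers that only need the hashes unpack with a throwaway:
#   hashes, _ = hashlist_from_file(abspath)
# the dedup check additionally uses st.st_size / st.st_mtime for the desync test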


@@ -126,7 +126,7 @@ class Cfg(Namespace):
ex = "ah_cli ah_gen css_browser hist js_browser js_other mime mimes no_forget no_hash no_idx nonsus_urls og_tpl og_ua"
ka.update(**{k: None for k in ex.split()})
ex = "hash_mt srch_time u2abort u2j u2sz"
ex = "hash_mt safe_dedup srch_time u2abort u2j u2sz"
ka.update(**{k: 1 for k in ex.split()})
ex = "au_vol mtab_age reg_cap s_thead s_tbody th_convt"