From 6e671c524521cc95a09cbe746b473ed924808812 Mon Sep 17 00:00:00 2001
From: ed
Date: Fri, 6 Sep 2024 19:08:14 +0000
Subject: [PATCH] verify on-disk contents before dedup;

previously, the assumption was made that the database and filesystem
would not desync, and that an upload could safely be substituted with
a symlink to an existing on-disk copy, assuming said copy still
existed at all

this is fine if copyparty is the only software that makes changes to
the filesystem, but that is a shitty assumption to make in hindsight

add `--safe-dedup` which takes a "safety level"; by default (50) it
will no longer blindly trust that the filesystem has not been altered
through other means, and file contents will now be hashed and
compared to the database before deduplicating (a simplified sketch of
this check follows below the diff)

deduplication can be much slower as a result, but this is definitely
worth it, as it avoids some potentially very unpleasant surprises

the previous behavior can be restored with `--safe-dedup 1`
---
 README.md             |  2 +-
 copyparty/__main__.py |  1 +
 copyparty/cfg.py      |  2 +
 copyparty/up2k.py     | 86 ++++++++++++++++++++++++++++++++-----------
 tests/util.py         |  2 +-
 5 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index e9ed663b..3adbe9d6 100644
--- a/README.md
+++ b/README.md
@@ -1978,7 +1978,7 @@ safety profiles:
   * `--hardlink` creates hardlinks instead of symlinks when deduplicating uploads, which is less maintenance
     * however note if you edit one file it will also affect the other copies
   * `--vague-403` returns a "404 not found" instead of "401 unauthorized" which is a common enterprise meme
-  * `--nih` removes the server hostname from directory listings
+  * `-nih` removes the server hostname from directory listings
 
 * option `-sss` is a shortcut for the above plus:
   * `--no-dav` disables webdav support
diff --git a/copyparty/__main__.py b/copyparty/__main__.py
index c20ca4b9..9fa3201f 100644
--- a/copyparty/__main__.py
+++ b/copyparty/__main__.py
@@ -992,6 +992,7 @@ def add_upload(ap):
     ap2.add_argument("--reg-cap", metavar="N", type=int, default=38400, help="max number of uploads to keep in memory when running without \033[33m-e2d\033[0m; roughly 1 MiB RAM per 600")
     ap2.add_argument("--no-fpool", action="store_true", help="disable file-handle pooling -- instead, repeatedly close and reopen files during upload (bad idea to enable this on windows and/or cow filesystems)")
     ap2.add_argument("--use-fpool", action="store_true", help="force file-handle pooling, even when it might be dangerous (multiprocessing, filesystems lacking sparse-files support, ...)")
+    ap2.add_argument("--safe-dedup", metavar="N", type=int, default=50, help="how careful to be when deduplicating files; [\033[32m1\033[0m] = just verify the filesize, [\033[32m50\033[0m] = verify file contents have not been altered (volflag=safededup)")
     ap2.add_argument("--hardlink", action="store_true", help="prefer hardlinks instead of symlinks when possible (within same filesystem) (volflag=hardlink)")
     ap2.add_argument("--never-symlink", action="store_true", help="do not fallback to symlinks when a hardlink cannot be made (volflag=neversymlink)")
     ap2.add_argument("--no-dedup", action="store_true", help="disable symlink/hardlink creation; copy file contents instead (volflag=copydupes)")
diff --git a/copyparty/cfg.py b/copyparty/cfg.py
index cd87dc2e..7c8f1e5d 100644
--- a/copyparty/cfg.py
+++ b/copyparty/cfg.py
@@ -21,6 +21,7 @@ def vf_bmap() -> dict[str, str]:
         "no_thumb": "dthumb",
         "no_vthumb": "dvthumb",
         "no_athumb": "dathumb",
+        "safe_dedup": "safededup",
     }
     for k in (
         "dotsrch",
@@ -132,6 +133,7 @@ flagcats = {
         "nodupe": "rejects existing files (instead of symlinking them)",
         "hardlink": "does dedup with hardlinks instead of symlinks",
         "neversymlink": "disables symlink fallback; full copy instead",
+        "safededup": "verify on-disk data before using it for dedup",
         "copydupes": "disables dedup, always saves full copies of dupes",
         "sparse": "force use of sparse files, mainly for s3-backed storage",
         "daw": "enable full WebDAV write support (dangerous);\nPUT-operations will now \033[1;31mOVERWRITE\033[0;35m existing files",
diff --git a/copyparty/up2k.py b/copyparty/up2k.py
index 263b9952..eebe7507 100644
--- a/copyparty/up2k.py
+++ b/copyparty/up2k.py
@@ -1462,7 +1462,7 @@ class Up2k(object):
             self.log("file: {}".format(abspath))
 
         try:
-            hashes = self._hashlist_from_file(
+            hashes, _ = self._hashlist_from_file(
                 abspath, "a{}, ".format(self.pp.n)
             )
         except Exception as ex:
@@ -1711,7 +1711,7 @@ class Up2k(object):
             self.log("file: {}".format(abspath))
 
         try:
-            hashes = self._hashlist_from_file(abspath, pf)
+            hashes, _ = self._hashlist_from_file(abspath, pf)
         except Exception as ex:
             self.log("hash: {} @ [{}]".format(repr(ex), abspath))
             continue
@@ -2670,6 +2670,9 @@ class Up2k(object):
         rand = vfs.flags.get("rand") or cj.get("rand")
         lost: list[tuple["sqlite3.Cursor", str, str]] = []
 
+        safe_dedup = vfs.flags.get("safededup") or 50
+        data_ok = safe_dedup < 10 or n4g
+
         vols = [(ptop, jcur)] if jcur else []
         if vfs.flags.get("xlink"):
             vols += [(k, v) for k, v in self.cur.items() if k != ptop]
@@ -2677,7 +2680,7 @@ class Up2k(object):
         # force upload time rather than last-modified
         cj["lmod"] = int(time.time())
 
-        alts: list[tuple[int, int, dict[str, Any]]] = []
+        alts: list[tuple[int, int, dict[str, Any], "sqlite3.Cursor", str, str]] = []
         for ptop, cur in vols:
             allv = self.asrv.vfs.all_vols
             cvfs = next((v for v in allv.values() if v.realpath == ptop), vfs)
@@ -2707,13 +2710,12 @@ class Up2k(object):
                         wark, st.st_size, dsize, st.st_mtime, dtime, dp_abs
                     )
                     self.log(t)
-                    raise Exception("desync")
+                    raise Exception()
             except Exception as ex:
                 if n4g:
                     st = os.stat_result((0, -1, -1, 0, 0, 0, 0, 0, 0, 0))
                 else:
-                    if str(ex) != "desync":
-                        lost.append((cur, dp_dir, dp_fn))
+                    lost.append((cur, dp_dir, dp_fn))
                     continue
 
             j = {
@@ -2736,18 +2738,42 @@ class Up2k(object):
                 if k in cj:
                     j[k] = cj[k]
 
+                # offset of 1st diff in vpaths
+                zig = (
+                    n + 1
+                    for n, (c1, c2) in enumerate(
+                        zip(dp_dir + "\r", cj["prel"] + "\n")
+                    )
+                    if c1 != c2
+                )
                 score = (
-                    (3 if st.st_dev == dev else 0)
-                    + (2 if dp_dir == cj["prel"] else 0)
+                    (6969 if st.st_dev == dev else 0)
+                    + (3210 if dp_dir == cj["prel"] else next(zig))
                     + (1 if dp_fn == cj["name"] else 0)
                 )
-                alts.append((score, -len(alts), j))
+                alts.append((score, -len(alts), j, cur, dp_dir, dp_fn))
 
-        if alts:
-            best = sorted(alts, reverse=True)[0]
-            job = best[2]
-        else:
-            job = None
+        job = None
+        inc_ap = djoin(cj["ptop"], cj["prel"], cj["name"])
+        for dupe in sorted(alts, reverse=True):
+            rj = dupe[2]
+            orig_ap = djoin(rj["ptop"], rj["prel"], rj["name"])
+            if data_ok or inc_ap == orig_ap:
+                data_ok = True
+                job = rj
+                break
+            else:
+                self.log("asserting contents of %s" % (orig_ap,))
+                dhashes, st = self._hashlist_from_file(orig_ap)
+                dwark = up2k_wark_from_hashlist(self.salt, st.st_size, dhashes)
+                if wark != dwark:
+                    t = "will not dedup (fs index desync): fs=%s, db=%s, file: %s"
+                    self.log(t % (dwark, wark, orig_ap))
+                    lost.append(dupe[3:])
+                    continue
+                data_ok = True
+                job = rj
+                break
 
         if job and wark in reg:
             # self.log("pop " + wark + " " + job["name"] + " handle_json db", 4)
@@ -2756,7 +2782,7 @@ class Up2k(object):
         if lost:
             c2 = None
             for cur, dp_dir, dp_fn in lost:
-                t = "forgetting deleted file: /{}"
+                t = "forgetting desynced db entry: /{}"
                 self.log(t.format(vjoin(vjoin(vfs.vpath, dp_dir), dp_fn)))
                 self.db_rm(cur, dp_dir, dp_fn, cj["size"])
                 if c2 and c2 != cur:
@@ -2791,7 +2817,13 @@ class Up2k(object):
                     del reg[wark]
                     break
 
-            if st and not self.args.nw and not n4g and st.st_size != rj["size"]:
+            inc_ap = djoin(cj["ptop"], cj["prel"], cj["name"])
+            orig_ap = djoin(rj["ptop"], rj["prel"], rj["name"])
+
+            if self.args.nw or n4g or not st:
+                pass
+
+            elif st.st_size != rj["size"]:
                 t = "will not dedup (fs index desync): {}, size fs={} db={}, mtime fs={} db={}, file: {}"
                 t = t.format(
                     wark, st.st_size, rj["size"], st.st_mtime, rj["lmod"], path
@@ -2799,6 +2831,15 @@ class Up2k(object):
                 self.log(t)
                 del reg[wark]
 
+            elif inc_ap != orig_ap and not data_ok:
+                self.log("asserting contents of %s" % (orig_ap,))
+                dhashes, _ = self._hashlist_from_file(orig_ap)
+                dwark = up2k_wark_from_hashlist(self.salt, st.st_size, dhashes)
+                if wark != dwark:
+                    t = "will not dedup (fs index desync): fs=%s, idx=%s, file: %s"
+                    self.log(t % (dwark, wark, orig_ap))
+                    del reg[wark]
+
             if job or wark in reg:
                 job = job or reg[wark]
                 if (
@@ -4246,8 +4287,11 @@ class Up2k(object):
 
         return wark
 
-    def _hashlist_from_file(self, path: str, prefix: str = "") -> list[str]:
-        fsz = bos.path.getsize(path)
+    def _hashlist_from_file(
+        self, path: str, prefix: str = ""
+    ) -> tuple[list[str], os.stat_result]:
+        st = bos.stat(path)
+        fsz = st.st_size
         csz = up2k_chunksize(fsz)
         ret = []
         suffix = " MB, {}".format(path)
@@ -4260,7 +4304,7 @@ class Up2k(object):
         while fsz > 0:
             # same as `hash_at` except for `imutex` / bufsz
             if self.stop:
-                return []
+                return [], st
 
             if self.pp:
                 mb = fsz // (1024 * 1024)
@@ -4281,7 +4325,7 @@ class Up2k(object):
             digest = base64.urlsafe_b64encode(digest)
             ret.append(digest.decode("utf-8"))
 
-        return ret
+        return ret, st
 
     def _new_upload(self, job: dict[str, Any], vfs: VFS, depth: int) -> dict[str, str]:
         pdir = djoin(job["ptop"], job["prel"])
@@ -4582,7 +4626,7 @@ class Up2k(object):
                 self.salt, inf.st_size, int(inf.st_mtime), rd, fn
            )
         else:
-            hashes = self._hashlist_from_file(abspath)
+            hashes, _ = self._hashlist_from_file(abspath)
             if not hashes:
                 return False
 
diff --git a/tests/util.py b/tests/util.py
index 2c04cf67..b6062e25 100644
--- a/tests/util.py
+++ b/tests/util.py
@@ -126,7 +126,7 @@ class Cfg(Namespace):
         ex = "ah_cli ah_gen css_browser hist js_browser js_other mime mimes no_forget no_hash no_idx nonsus_urls og_tpl og_ua"
         ka.update(**{k: None for k in ex.split()})
 
-        ex = "hash_mt srch_time u2abort u2j u2sz"
+        ex = "hash_mt safe_dedup srch_time u2abort u2j u2sz"
         ka.update(**{k: 1 for k in ex.split()})
 
         ex = "au_vol mtab_age reg_cap s_thead s_tbody th_convt"