From 2f021a0c2b1289ebb97838db51cf4590cf9bff00 Mon Sep 17 00:00:00 2001 From: ed Date: Tue, 12 Oct 2021 01:40:19 +0200 Subject: [PATCH] skip indexing files by regex --- README.md | 8 +++++--- copyparty/__main__.py | 6 ++++-- copyparty/authsrv.py | 11 ++++++++--- copyparty/up2k.py | 13 +++++++++---- tests/test_httpcli.py | 3 ++- tests/test_vfs.py | 3 ++- 6 files changed, 30 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 14db3f1d..52f32b72 100644 --- a/README.md +++ b/README.md @@ -596,12 +596,14 @@ note: * `e2tsr` is probably always overkill, since `e2ds`/`e2dsa` would pick up any file modifications and `e2ts` would then reindex those, unless there is a new copyparty version with new parsers and the release note says otherwise * the rescan button in the admin panel has no effect unless the volume has `-e2ds` or higher -to save some time, you can choose to only index filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash` or the volume-flag `:c,dhash`, this has the following consequences: +to save some time, you can provide a regex pattern for filepaths to only index by filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash \.iso$` or the volume-flag `:c,nohash=\.iso$`, this has the following consequences: * initial indexing is way faster, especially when the volume is on a network disk * makes it impossible to [file-search](#file-search) * if someone uploads the same file contents, the upload will not be detected as a dupe, so it will not get symlinked or rejected -if you set `--no-hash`, you can enable hashing for specific volumes using flag `:c,ehash` +similarly, you can fully ignore files/folders using `--no-idx [...]` and `:c,noidx=\.iso$` + +if you set `--no-hash [...]` globally, you can enable hashing for specific volumes using flag `:c,nohash=` ## upload rules @@ -851,7 +853,7 @@ below are some tweaks roughly ordered by usefulness: * `-q` disables logging and can help a bunch, even when combined with `-lo` to redirect logs to file * `--http-only` or `--https-only` (unless you want to support both protocols) will reduce the delay before a new connection is established * `--hist` pointing to a fast location (ssd) will make directory listings and searches faster when `-e2d` or `-e2t` is set -* `--no-hash` when indexing a network-disk if you don't care about the actual filehashes and only want the names/tags searchable +* `--no-hash .` when indexing a network-disk if you don't care about the actual filehashes and only want the names/tags searchable * `-j` enables multiprocessing (actual multithreading) and can make copyparty perform better in cpu-intensive workloads, for example: * huge amount of short-lived connections * really heavy traffic (downloads/uploads) diff --git a/copyparty/__main__.py b/copyparty/__main__.py index e2d0f50d..2bd4c97a 100644 --- a/copyparty/__main__.py +++ b/copyparty/__main__.py @@ -276,7 +276,8 @@ def run_argparse(argv, formatter): \033[36me2d\033[35m sets -e2d (all -e2* args can be set using ce2* volflags) \033[36md2t\033[35m disables metadata collection, overrides -e2t* \033[36md2d\033[35m disables all database stuff, overrides -e2* - \033[36mdhash\033[35m disables file hashing on initial scans, also ehash + \033[36mnohash=\\.iso$\033[35m skips hashing file contents if path matches *.iso + \033[36mnoidx=\\.iso$\033[35m fully ignores the contents at paths matching *.iso \033[36mhist=/tmp/cdb\033[35m puts thumbnails and indexes at that location \033[36mscan=60\033[35m scan for new files every 60sec, same as --re-maxage @@ -412,7 +413,8 @@ def run_argparse(argv, formatter): ap2.add_argument("-e2ds", action="store_true", help="enable up2k db-scanner, sets -e2d") ap2.add_argument("-e2dsa", action="store_true", help="scan all folders (for search), sets -e2ds") ap2.add_argument("--hist", metavar="PATH", type=u, help="where to store volume data (db, thumbs)") - ap2.add_argument("--no-hash", action="store_true", help="disable hashing during e2ds folder scans") + ap2.add_argument("--no-hash", metavar="PTN", type=u, help="regex: disable hashing of matching paths during e2ds folder scans") + ap2.add_argument("--no-idx", metavar="PTN", type=u, help="regex: disable indexing of matching paths during e2ds folder scans") ap2.add_argument("--re-int", metavar="SEC", type=int, default=30, help="disk rescan check interval") ap2.add_argument("--re-maxage", metavar="SEC", type=int, default=0, help="disk rescan volume interval, 0=off, can be set per-volume with the 'scan' volflag") ap2.add_argument("--srch-time", metavar="SEC", type=int, default=30, help="search deadline") diff --git a/copyparty/authsrv.py b/copyparty/authsrv.py index 312e1cf1..ced44e3e 100644 --- a/copyparty/authsrv.py +++ b/copyparty/authsrv.py @@ -865,9 +865,14 @@ class AuthSrv(object): if self.args.e2d or "e2ds" in vol.flags: vol.flags["e2d"] = True - if self.args.no_hash: - if "ehash" not in vol.flags: - vol.flags["dhash"] = True + for ga, vf in [["no_hash", "nohash"], ["no_idx", "noidx"]]: + if vf in vol.flags: + ptn = vol.flags.pop(vf) + else: + ptn = getattr(self.args, ga) + + if ptn: + vol.flags[vf] = re.compile(ptn) for k in ["e2t", "e2ts", "e2tsr"]: if getattr(self.args, k): diff --git a/copyparty/up2k.py b/copyparty/up2k.py index 92d70e7c..9d682e15 100644 --- a/copyparty/up2k.py +++ b/copyparty/up2k.py @@ -466,7 +466,8 @@ class Up2k(object): def _build_file_index(self, vol, all_vols): do_vac = False top = vol.realpath - nohash = "dhash" in vol.flags + rei = vol.flags.get("noidx") + reh = vol.flags.get("nohash") with self.mutex: cur, _ = self.register_vpath(top, vol.flags) @@ -483,7 +484,7 @@ class Up2k(object): n_add = n_rm = 0 try: - n_add = self._build_dir(dbw, top, set(excl), top, nohash, []) + n_add = self._build_dir(dbw, top, set(excl), top, rei, reh, []) n_rm = self._drop_lost(dbw[0], top) except: m = "failed to index volume [{}]:\n{}" @@ -496,7 +497,7 @@ class Up2k(object): return True, n_add or n_rm or do_vac - def _build_dir(self, dbw, top, excl, cdir, nohash, seen): + def _build_dir(self, dbw, top, excl, cdir, rei, reh, seen): rcdir = absreal(cdir) # a bit expensive but worth if rcdir in seen: m = "bailing from symlink loop,\n prev: {}\n curr: {}\n from: {}" @@ -511,6 +512,10 @@ class Up2k(object): g = statdir(self.log_func, not self.args.no_scandir, False, cdir) for iname, inf in sorted(g): abspath = os.path.join(cdir, iname) + if rei and rei.search(abspath): + continue + + nohash = reh.search(abspath) if reh else False lmod = int(inf.st_mtime) sz = inf.st_size if stat.S_ISDIR(inf.st_mode): @@ -518,7 +523,7 @@ class Up2k(object): continue # self.log(" dir: {}".format(abspath)) try: - ret += self._build_dir(dbw, top, excl, abspath, nohash, seen) + ret += self._build_dir(dbw, top, excl, abspath, rei, reh, seen) except: m = "failed to index subdir [{}]:\n{}" self.log(m.format(abspath, min_ex()), c=1) diff --git a/tests/test_httpcli.py b/tests/test_httpcli.py index 408d3a33..d10bac66 100644 --- a/tests/test_httpcli.py +++ b/tests/test_httpcli.py @@ -48,7 +48,8 @@ class Cfg(Namespace): mte="a", mth="", hist=None, - no_hash=False, + no_idx=None, + no_hash=None, css_browser=None, **{k: False for k in "e2d e2ds e2dsa e2t e2ts e2tsr".split()} ) diff --git a/tests/test_vfs.py b/tests/test_vfs.py index a6246ac9..73ebe3bb 100644 --- a/tests/test_vfs.py +++ b/tests/test_vfs.py @@ -23,7 +23,8 @@ class Cfg(Namespace): "mte": "a", "mth": "", "hist": None, - "no_hash": False, + "no_idx": None, + "no_hash": None, "css_browser": None, "no_voldump": True, "no_logues": False,