skip indexing files by regex

This commit is contained in:
ed 2021-10-12 01:40:19 +02:00
parent eb05cb6c6e
commit 2f021a0c2b
6 changed files with 30 additions and 14 deletions

View file

@ -596,12 +596,14 @@ note:
* `e2tsr` is probably always overkill, since `e2ds`/`e2dsa` would pick up any file modifications and `e2ts` would then reindex those, unless there is a new copyparty version with new parsers and the release note says otherwise * `e2tsr` is probably always overkill, since `e2ds`/`e2dsa` would pick up any file modifications and `e2ts` would then reindex those, unless there is a new copyparty version with new parsers and the release note says otherwise
* the rescan button in the admin panel has no effect unless the volume has `-e2ds` or higher * the rescan button in the admin panel has no effect unless the volume has `-e2ds` or higher
to save some time, you can choose to only index filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash` or the volume-flag `:c,dhash`, this has the following consequences: to save some time, you can provide a regex pattern for filepaths to only index by filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash \.iso$` or the volume-flag `:c,nohash=\.iso$`, this has the following consequences:
* initial indexing is way faster, especially when the volume is on a network disk * initial indexing is way faster, especially when the volume is on a network disk
* makes it impossible to [file-search](#file-search) * makes it impossible to [file-search](#file-search)
* if someone uploads the same file contents, the upload will not be detected as a dupe, so it will not get symlinked or rejected * if someone uploads the same file contents, the upload will not be detected as a dupe, so it will not get symlinked or rejected
if you set `--no-hash`, you can enable hashing for specific volumes using flag `:c,ehash` similarly, you can fully ignore files/folders using `--no-idx [...]` and `:c,noidx=\.iso$`
if you set `--no-hash [...]` globally, you can enable hashing for specific volumes using flag `:c,nohash=`
## upload rules ## upload rules
@ -851,7 +853,7 @@ below are some tweaks roughly ordered by usefulness:
* `-q` disables logging and can help a bunch, even when combined with `-lo` to redirect logs to file * `-q` disables logging and can help a bunch, even when combined with `-lo` to redirect logs to file
* `--http-only` or `--https-only` (unless you want to support both protocols) will reduce the delay before a new connection is established * `--http-only` or `--https-only` (unless you want to support both protocols) will reduce the delay before a new connection is established
* `--hist` pointing to a fast location (ssd) will make directory listings and searches faster when `-e2d` or `-e2t` is set * `--hist` pointing to a fast location (ssd) will make directory listings and searches faster when `-e2d` or `-e2t` is set
* `--no-hash` when indexing a network-disk if you don't care about the actual filehashes and only want the names/tags searchable * `--no-hash .` when indexing a network-disk if you don't care about the actual filehashes and only want the names/tags searchable
* `-j` enables multiprocessing (actual multithreading) and can make copyparty perform better in cpu-intensive workloads, for example: * `-j` enables multiprocessing (actual multithreading) and can make copyparty perform better in cpu-intensive workloads, for example:
* huge amount of short-lived connections * huge amount of short-lived connections
* really heavy traffic (downloads/uploads) * really heavy traffic (downloads/uploads)

View file

@ -276,7 +276,8 @@ def run_argparse(argv, formatter):
\033[36me2d\033[35m sets -e2d (all -e2* args can be set using ce2* volflags) \033[36me2d\033[35m sets -e2d (all -e2* args can be set using ce2* volflags)
\033[36md2t\033[35m disables metadata collection, overrides -e2t* \033[36md2t\033[35m disables metadata collection, overrides -e2t*
\033[36md2d\033[35m disables all database stuff, overrides -e2* \033[36md2d\033[35m disables all database stuff, overrides -e2*
\033[36mdhash\033[35m disables file hashing on initial scans, also ehash \033[36mnohash=\\.iso$\033[35m skips hashing file contents if path matches *.iso
\033[36mnoidx=\\.iso$\033[35m fully ignores the contents at paths matching *.iso
\033[36mhist=/tmp/cdb\033[35m puts thumbnails and indexes at that location \033[36mhist=/tmp/cdb\033[35m puts thumbnails and indexes at that location
\033[36mscan=60\033[35m scan for new files every 60sec, same as --re-maxage \033[36mscan=60\033[35m scan for new files every 60sec, same as --re-maxage
@ -412,7 +413,8 @@ def run_argparse(argv, formatter):
ap2.add_argument("-e2ds", action="store_true", help="enable up2k db-scanner, sets -e2d") ap2.add_argument("-e2ds", action="store_true", help="enable up2k db-scanner, sets -e2d")
ap2.add_argument("-e2dsa", action="store_true", help="scan all folders (for search), sets -e2ds") ap2.add_argument("-e2dsa", action="store_true", help="scan all folders (for search), sets -e2ds")
ap2.add_argument("--hist", metavar="PATH", type=u, help="where to store volume data (db, thumbs)") ap2.add_argument("--hist", metavar="PATH", type=u, help="where to store volume data (db, thumbs)")
ap2.add_argument("--no-hash", action="store_true", help="disable hashing during e2ds folder scans") ap2.add_argument("--no-hash", metavar="PTN", type=u, help="regex: disable hashing of matching paths during e2ds folder scans")
ap2.add_argument("--no-idx", metavar="PTN", type=u, help="regex: disable indexing of matching paths during e2ds folder scans")
ap2.add_argument("--re-int", metavar="SEC", type=int, default=30, help="disk rescan check interval") ap2.add_argument("--re-int", metavar="SEC", type=int, default=30, help="disk rescan check interval")
ap2.add_argument("--re-maxage", metavar="SEC", type=int, default=0, help="disk rescan volume interval, 0=off, can be set per-volume with the 'scan' volflag") ap2.add_argument("--re-maxage", metavar="SEC", type=int, default=0, help="disk rescan volume interval, 0=off, can be set per-volume with the 'scan' volflag")
ap2.add_argument("--srch-time", metavar="SEC", type=int, default=30, help="search deadline") ap2.add_argument("--srch-time", metavar="SEC", type=int, default=30, help="search deadline")

View file

@ -865,9 +865,14 @@ class AuthSrv(object):
if self.args.e2d or "e2ds" in vol.flags: if self.args.e2d or "e2ds" in vol.flags:
vol.flags["e2d"] = True vol.flags["e2d"] = True
if self.args.no_hash: for ga, vf in [["no_hash", "nohash"], ["no_idx", "noidx"]]:
if "ehash" not in vol.flags: if vf in vol.flags:
vol.flags["dhash"] = True ptn = vol.flags.pop(vf)
else:
ptn = getattr(self.args, ga)
if ptn:
vol.flags[vf] = re.compile(ptn)
for k in ["e2t", "e2ts", "e2tsr"]: for k in ["e2t", "e2ts", "e2tsr"]:
if getattr(self.args, k): if getattr(self.args, k):

View file

@ -466,7 +466,8 @@ class Up2k(object):
def _build_file_index(self, vol, all_vols): def _build_file_index(self, vol, all_vols):
do_vac = False do_vac = False
top = vol.realpath top = vol.realpath
nohash = "dhash" in vol.flags rei = vol.flags.get("noidx")
reh = vol.flags.get("nohash")
with self.mutex: with self.mutex:
cur, _ = self.register_vpath(top, vol.flags) cur, _ = self.register_vpath(top, vol.flags)
@ -483,7 +484,7 @@ class Up2k(object):
n_add = n_rm = 0 n_add = n_rm = 0
try: try:
n_add = self._build_dir(dbw, top, set(excl), top, nohash, []) n_add = self._build_dir(dbw, top, set(excl), top, rei, reh, [])
n_rm = self._drop_lost(dbw[0], top) n_rm = self._drop_lost(dbw[0], top)
except: except:
m = "failed to index volume [{}]:\n{}" m = "failed to index volume [{}]:\n{}"
@ -496,7 +497,7 @@ class Up2k(object):
return True, n_add or n_rm or do_vac return True, n_add or n_rm or do_vac
def _build_dir(self, dbw, top, excl, cdir, nohash, seen): def _build_dir(self, dbw, top, excl, cdir, rei, reh, seen):
rcdir = absreal(cdir) # a bit expensive but worth rcdir = absreal(cdir) # a bit expensive but worth
if rcdir in seen: if rcdir in seen:
m = "bailing from symlink loop,\n prev: {}\n curr: {}\n from: {}" m = "bailing from symlink loop,\n prev: {}\n curr: {}\n from: {}"
@ -511,6 +512,10 @@ class Up2k(object):
g = statdir(self.log_func, not self.args.no_scandir, False, cdir) g = statdir(self.log_func, not self.args.no_scandir, False, cdir)
for iname, inf in sorted(g): for iname, inf in sorted(g):
abspath = os.path.join(cdir, iname) abspath = os.path.join(cdir, iname)
if rei and rei.search(abspath):
continue
nohash = reh.search(abspath) if reh else False
lmod = int(inf.st_mtime) lmod = int(inf.st_mtime)
sz = inf.st_size sz = inf.st_size
if stat.S_ISDIR(inf.st_mode): if stat.S_ISDIR(inf.st_mode):
@ -518,7 +523,7 @@ class Up2k(object):
continue continue
# self.log(" dir: {}".format(abspath)) # self.log(" dir: {}".format(abspath))
try: try:
ret += self._build_dir(dbw, top, excl, abspath, nohash, seen) ret += self._build_dir(dbw, top, excl, abspath, rei, reh, seen)
except: except:
m = "failed to index subdir [{}]:\n{}" m = "failed to index subdir [{}]:\n{}"
self.log(m.format(abspath, min_ex()), c=1) self.log(m.format(abspath, min_ex()), c=1)

View file

@ -48,7 +48,8 @@ class Cfg(Namespace):
mte="a", mte="a",
mth="", mth="",
hist=None, hist=None,
no_hash=False, no_idx=None,
no_hash=None,
css_browser=None, css_browser=None,
**{k: False for k in "e2d e2ds e2dsa e2t e2ts e2tsr".split()} **{k: False for k in "e2d e2ds e2dsa e2t e2ts e2tsr".split()}
) )

View file

@ -23,7 +23,8 @@ class Cfg(Namespace):
"mte": "a", "mte": "a",
"mth": "", "mth": "",
"hist": None, "hist": None,
"no_hash": False, "no_idx": None,
"no_hash": None,
"css_browser": None, "css_browser": None,
"no_voldump": True, "no_voldump": True,
"no_logues": False, "no_logues": False,