exclude search results by regex (#120)

a better alternative to using `--no-idx` for this purpose since
this also excludes recent uploads, not just during fs-indexing,
and it doesn't prevent deduplication

also speeds up searches by a tiny amount due to building the
sanchecks into the exclude-filter while parsing the config,
instead of during each search query
This commit is contained in:
ed 2024-11-26 23:57:01 +00:00
parent 2f83c6c7d1
commit 697a4fa8a4
8 changed files with 55 additions and 29 deletions

View file

@ -1097,11 +1097,12 @@ using the GUI (winXP or later):
* on winXP only, click the `Sign up for online storage` hyperlink instead and put the URL there
* providing your password as the username is recommended; the password field can be anything or empty
known client bugs:
the webdav client that's built into windows has the following list of bugs; you can avoid all of these by connecting with rclone instead:
* win7+ doesn't actually send the password to the server when reauthenticating after a reboot unless you first try to login with an incorrect password and then switch to the correct password
* or just type your password into the username field instead to get around it entirely
* connecting to a folder which allows anonymous read will make writing impossible, as windows has decided it doesn't need to login
* workaround: connect twice; first to a folder which requires auth, then to the folder you actually want, and leave both of those mounted
* or set the server-option `--dav-auth` to force password-auth for all webdav clients
* win7+ may open a new tcp connection for every file and sometimes forgets to close them, eventually needing a reboot
* maybe NIC-related (??), happens with win10-ltsc on e1000e but not virtio
* windows cannot access folders which contain filenames with invalid unicode or forbidden characters (`<>:"/\|?*`), or names ending with `.`
@ -1268,7 +1269,7 @@ note:
### exclude-patterns
to save some time, you can provide a regex pattern for filepaths to only index by filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash \.iso$` or the volflag `:c,nohash=\.iso$`, this has the following consequences:
to save some time, you can provide a regex pattern for filepaths to only index by filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash '\.iso$'` or the volflag `:c,nohash=\.iso$`, this has the following consequences:
* initial indexing is way faster, especially when the volume is on a network disk
* makes it impossible to [file-search](#file-search)
* if someone uploads the same file contents, the upload will not be detected as a dupe, so it will not get symlinked or rejected
@ -1279,6 +1280,8 @@ similarly, you can fully ignore files/folders using `--no-idx [...]` and `:c,noi
if you set `--no-hash [...]` globally, you can enable hashing for specific volumes using flag `:c,nohash=`
to exclude certain filepaths from search-results, use `--srch-excl` or volflag `srch_excl` instead of `--no-idx`, for example `--srch-excl 'password|logs/[0-9]'`
### filesystem guards
avoid traversing into other filesystems using `--xdev` / volflag `:c,xdev`, skipping any symlinks or bind-mounts to another HDD for example

View file

@ -1401,6 +1401,7 @@ def add_db_general(ap, hcores):
ap2.add_argument("--db-act", metavar="SEC", type=float, default=10.0, help="defer any scheduled volume reindexing until \033[33mSEC\033[0m seconds after last db write (uploads, renames, ...)")
ap2.add_argument("--srch-time", metavar="SEC", type=int, default=45, help="search deadline -- terminate searches running for more than \033[33mSEC\033[0m seconds")
ap2.add_argument("--srch-hits", metavar="N", type=int, default=7999, help="max search results to allow clients to fetch; 125 results will be shown initially")
ap2.add_argument("--srch-excl", metavar="PTN", type=u, default="", help="regex: exclude files from search results if the file-URL matches \033[33mPTN\033[0m (case-sensitive). Example: [\033[32mpassword|logs/[0-9]\033[0m] any URL containing 'password' or 'logs/DIGIT' (volflag=srch_excl)")
ap2.add_argument("--dotsrch", action="store_true", help="show dotfiles in search results (volflags: dotsrch | nodotsrch)")

View file

@ -1880,6 +1880,7 @@ class AuthSrv(object):
["no_hash", "nohash"],
["no_idx", "noidx"],
["og_ua", "og_ua"],
["srch_excl", "srch_excl"],
]:
if vf in vol.flags:
ptn = re.compile(vol.flags.pop(vf))
@ -2086,6 +2087,22 @@ class AuthSrv(object):
self.log(t.format(mtp), 1)
errors = True
for vol in vfs.all_vols.values():
re1: Optional[re.Pattern] = vol.flags.get("srch_excl")
excl = [re1.pattern] if re1 else []
vpaths = []
vtop = vol.vpath
for vp2 in vfs.all_vols.keys():
if vp2.startswith((vtop + "/").lstrip("/")) and vtop != vp2:
vpaths.append(re.escape(vp2[len(vtop) :].lstrip("/")))
if vpaths:
excl.append("^(%s)/" % ("|".join(vpaths),))
vol.flags["srch_re_dots"] = re.compile("|".join(excl or ["^$"]))
excl.extend([r"^\.", r"/\."])
vol.flags["srch_re_nodot"] = re.compile("|".join(excl))
have_daw = False
for vol in vfs.all_nodes.values():
daw = vol.flags.get("daw") or self.args.daw

View file

@ -191,6 +191,7 @@ flagcats = {
"xvol": "do not follow symlinks leaving the volume root",
"dotsrch": "show dotfiles in search results",
"nodotsrch": "hide dotfiles in search results (default)",
"srch_excl": "exclude search results with URL matching this regex",
},
'database, audio tags\n"mte", "mth", "mtp", "mtm" all work the same as -mte, -mth, ...': {
"mtp=.bpm=f,audio-bpm.py": 'uses the "audio-bpm.py" program to\ngenerate ".bpm" tags from uploads (f = overwrite tags)',

View file

@ -793,7 +793,7 @@ class SvcHub(object):
al.exp_md = odfusion(exp, al.exp_md.replace(" ", ","))
al.exp_lg = odfusion(exp, al.exp_lg.replace(" ", ","))
for k in ["no_hash", "no_idx", "og_ua"]:
for k in ["no_hash", "no_idx", "og_ua", "srch_excl"]:
ptn = getattr(self.args, k)
if ptn:
setattr(self.args, k, re.compile(ptn))

View file

@ -324,7 +324,8 @@ class U2idx(object):
sort: bool,
lim: int,
) -> tuple[list[dict[str, Any]], list[str], bool]:
if self.args.srch_dbg:
dbg = self.args.srch_dbg
if dbg:
t = "searching across all %s volumes in which the user has 'r' (full read access):\n %s"
zs = "\n ".join(["/%s = %s" % (x.vpath, x.realpath) for x in vols])
self.log(t % (len(vols), zs), 5)
@ -367,14 +368,14 @@ class U2idx(object):
if not cur:
continue
excl = []
for vp2 in self.asrv.vfs.all_vols.keys():
if vp2.startswith((vtop + "/").lstrip("/")) and vtop != vp2:
excl.append(vp2[len(vtop) :].lstrip("/"))
dots = flags.get("dotsrch") and uname in vol.axs.udot
zs = "srch_re_dots" if dots else "srch_re_nodot"
rex: re.Pattern = flags.get(zs) # type: ignore
if self.args.srch_dbg:
t = "searching in volume /%s (%s), excludelist %s"
self.log(t % (vtop, ptop, excl), 5)
if dbg:
t = "searching in volume /%s (%s), excluding %s"
self.log(t % (vtop, ptop, rex.pattern), 5)
rex_cfg: Optional[re.Pattern] = flags.get("srch_excl")
self.active_cur = cur
@ -387,7 +388,6 @@ class U2idx(object):
sret = []
fk = flags.get("fk")
dots = flags.get("dotsrch") and uname in vol.axs.udot
fk_alg = 2 if "fka" in flags else 1
c = cur.execute(uq, tuple(vuv))
for hit in c:
@ -396,20 +396,23 @@ class U2idx(object):
if rd.startswith("//") or fn.startswith("//"):
rd, fn = s3dec(rd, fn)
if rd in excl or any([x for x in excl if rd.startswith(x + "/")]):
if self.args.srch_dbg:
zs = vjoin(vjoin(vtop, rd), fn)
vp = vjoin(vjoin(vtop, rd), fn)
if vp in seen_rps:
continue
if rex.search(vp):
if dbg:
if rex_cfg and rex_cfg.search(vp): # type: ignore
self.log("filtered by srch_excl: %s" % (vp,), 6)
elif not dots and "/." in ("/" + vp):
pass
else:
t = "database inconsistency in volume '/%s'; ignoring: %s"
self.log(t % (vtop, zs), 1)
continue
rp = quotep("/".join([x for x in [vtop, rd, fn] if x]))
if not dots and "/." in ("/" + rp):
continue
if rp in seen_rps:
self.log(t % (vtop, vp), 1)
continue
rp = quotep(vp)
if not fk:
suf = ""
else:
@ -431,7 +434,7 @@ class U2idx(object):
if lim < 0:
break
if self.args.srch_dbg:
if dbg:
t = "in volume '/%s': hit: %s"
self.log(t % (vtop, rp), 5)
@ -461,7 +464,7 @@ class U2idx(object):
ret.extend(sret)
# print("[{}] {}".format(ptop, sret))
if self.args.srch_dbg:
if dbg:
t = "in volume '/%s': got %d hits, %d total so far"
self.log(t % (vtop, len(sret), len(ret)), 5)

View file

@ -1078,7 +1078,8 @@ class Up2k(object):
ft = "\033[0;32m{}{:.0}"
ff = "\033[0;35m{}{:.0}"
fv = "\033[0;36m{}:\033[90m{}"
fx = set(("html_head", "rm_re_t", "rm_re_r", "mv_re_t", "mv_re_r"))
zs = "html_head mv_re_r mv_re_t rm_re_r rm_re_t srch_re_dots srch_re_nodot"
fx = set(zs.split())
fd = vf_bmap()
fd.update(vf_cmap())
fd.update(vf_vmap())
@ -1241,9 +1242,9 @@ class Up2k(object):
# also consider volflags which affect indexing
for vp in vps:
vf = self.vfs.all_vols[vp].flags.items()
vf = {k: v for k, v in vf if k in VF_AFFECTS_INDEXING}
seed.append(str(vf))
vf = self.vfs.all_vols[vp].flags
vf = {k: v for k, v in vf.items() if k in VF_AFFECTS_INDEXING}
seed.append(str(sorted(vf.items())))
zb = hashlib.sha1("\n".join(seed).encode("utf-8", "replace")).digest()
vcfg = ub64enc(zb[:18]).decode("ascii")

View file

@ -122,7 +122,7 @@ class Cfg(Namespace):
def __init__(self, a=None, v=None, c=None, **ka0):
ka = {}
ex = "chpw daw dav_auth dav_inf dav_mac dav_rt e2d e2ds e2dsa e2t e2ts e2tsr e2v e2vu e2vp early_ban ed emp exp force_js getmod grid gsel hardlink ih ihead magic hardlink_only nid nih no_acode no_athumb no_clone no_cp no_dav no_db_ip no_del no_dirsz no_dupe no_lifetime no_logues no_mv no_pipe no_poll no_readme no_robots no_sb_md no_sb_lg no_scandir no_tarcmp no_thumb no_vthumb no_zip nrand nsort nw og og_no_head og_s_title ohead q rand re_dirsz rss smb srch_dbg stats uqe vague_403 vc ver write_uplog xdev xlink xvol zs"
ex = "chpw daw dav_auth dav_inf dav_mac dav_rt e2d e2ds e2dsa e2t e2ts e2tsr e2v e2vu e2vp early_ban ed emp exp force_js getmod grid gsel hardlink ih ihead magic hardlink_only nid nih no_acode no_athumb no_clone no_cp no_dav no_db_ip no_del no_dirsz no_dupe no_lifetime no_logues no_mv no_pipe no_poll no_readme no_robots no_sb_md no_sb_lg no_scandir no_tarcmp no_thumb no_vthumb no_zip nrand nsort nw og og_no_head og_s_title ohead q rand re_dirsz rss smb srch_dbg srch_excl stats uqe vague_403 vc ver write_uplog xdev xlink xvol zs"
ka.update(**{k: False for k in ex.split()})
ex = "dedup dotpart dotsrch hook_v no_dhash no_fastboot no_fpool no_htp no_rescan no_sendfile no_ses no_snap no_up_list no_voldump re_dhash plain_ip"