mirror of
https://github.com/9001/copyparty.git
synced 2025-08-17 09:02:15 -06:00
exclude search results by regex (#120)
a better alternative to using `--no-idx` for this purpose, since this also excludes recent uploads, not just during fs-indexing, and it doesn't prevent deduplication. also speeds up searches by a tiny amount due to building the sanity-checks into the exclude-filter while parsing the config, instead of during each search query
This commit is contained in:
parent
2f83c6c7d1
commit
697a4fa8a4
|
@ -1097,11 +1097,12 @@ using the GUI (winXP or later):
|
|||
* on winXP only, click the `Sign up for online storage` hyperlink instead and put the URL there
|
||||
* providing your password as the username is recommended; the password field can be anything or empty
|
||||
|
||||
known client bugs:
|
||||
the webdav client that's built into windows has the following list of bugs; you can avoid all of these by connecting with rclone instead:
|
||||
* win7+ doesn't actually send the password to the server when reauthenticating after a reboot unless you first try to log in with an incorrect password and then switch to the correct password
|
||||
* or just type your password into the username field instead to get around it entirely
|
||||
* connecting to a folder which allows anonymous read will make writing impossible, as windows has decided it doesn't need to log in
|
||||
* workaround: connect twice; first to a folder which requires auth, then to the folder you actually want, and leave both of those mounted
|
||||
* or set the server-option `--dav-auth` to force password-auth for all webdav clients
|
||||
* win7+ may open a new tcp connection for every file and sometimes forgets to close them, eventually needing a reboot
|
||||
* maybe NIC-related (??), happens with win10-ltsc on e1000e but not virtio
|
||||
* windows cannot access folders which contain filenames with invalid unicode or forbidden characters (`<>:"/\|?*`), or names ending with `.`
|
||||
|
@ -1268,7 +1269,7 @@ note:
|
|||
|
||||
### exclude-patterns
|
||||
|
||||
to save some time, you can provide a regex pattern for filepaths to only index by filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash \.iso$` or the volflag `:c,nohash=\.iso$`, this has the following consequences:
|
||||
to save some time, you can provide a regex pattern for filepaths to only index by filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash '\.iso$'` or the volflag `:c,nohash=\.iso$`, this has the following consequences:
|
||||
* initial indexing is way faster, especially when the volume is on a network disk
|
||||
* makes it impossible to [file-search](#file-search)
|
||||
* if someone uploads the same file contents, the upload will not be detected as a dupe, so it will not get symlinked or rejected
|
||||
|
@ -1279,6 +1280,8 @@ similarly, you can fully ignore files/folders using `--no-idx [...]` and `:c,noi
|
|||
|
||||
if you set `--no-hash [...]` globally, you can enable hashing for specific volumes using flag `:c,nohash=`
|
||||
|
||||
to exclude certain filepaths from search-results, use `--srch-excl` or volflag `srch_excl` instead of `--no-idx`, for example `--srch-excl 'password|logs/[0-9]'`
|
||||
|
||||
### filesystem guards
|
||||
|
||||
avoid traversing into other filesystems using `--xdev` / volflag `:c,xdev`, skipping any symlinks or bind-mounts to another HDD for example
|
||||
|
|
|
@ -1401,6 +1401,7 @@ def add_db_general(ap, hcores):
|
|||
ap2.add_argument("--db-act", metavar="SEC", type=float, default=10.0, help="defer any scheduled volume reindexing until \033[33mSEC\033[0m seconds after last db write (uploads, renames, ...)")
|
||||
ap2.add_argument("--srch-time", metavar="SEC", type=int, default=45, help="search deadline -- terminate searches running for more than \033[33mSEC\033[0m seconds")
|
||||
ap2.add_argument("--srch-hits", metavar="N", type=int, default=7999, help="max search results to allow clients to fetch; 125 results will be shown initially")
|
||||
ap2.add_argument("--srch-excl", metavar="PTN", type=u, default="", help="regex: exclude files from search results if the file-URL matches \033[33mPTN\033[0m (case-sensitive). Example: [\033[32mpassword|logs/[0-9]\033[0m] any URL containing 'password' or 'logs/DIGIT' (volflag=srch_excl)")
|
||||
ap2.add_argument("--dotsrch", action="store_true", help="show dotfiles in search results (volflags: dotsrch | nodotsrch)")
|
||||
|
||||
|
||||
|
|
|
@ -1880,6 +1880,7 @@ class AuthSrv(object):
|
|||
["no_hash", "nohash"],
|
||||
["no_idx", "noidx"],
|
||||
["og_ua", "og_ua"],
|
||||
["srch_excl", "srch_excl"],
|
||||
]:
|
||||
if vf in vol.flags:
|
||||
ptn = re.compile(vol.flags.pop(vf))
|
||||
|
@ -2086,6 +2087,22 @@ class AuthSrv(object):
|
|||
self.log(t.format(mtp), 1)
|
||||
errors = True
|
||||
|
||||
for vol in vfs.all_vols.values():
|
||||
re1: Optional[re.Pattern] = vol.flags.get("srch_excl")
|
||||
excl = [re1.pattern] if re1 else []
|
||||
|
||||
vpaths = []
|
||||
vtop = vol.vpath
|
||||
for vp2 in vfs.all_vols.keys():
|
||||
if vp2.startswith((vtop + "/").lstrip("/")) and vtop != vp2:
|
||||
vpaths.append(re.escape(vp2[len(vtop) :].lstrip("/")))
|
||||
if vpaths:
|
||||
excl.append("^(%s)/" % ("|".join(vpaths),))
|
||||
|
||||
vol.flags["srch_re_dots"] = re.compile("|".join(excl or ["^$"]))
|
||||
excl.extend([r"^\.", r"/\."])
|
||||
vol.flags["srch_re_nodot"] = re.compile("|".join(excl))
|
||||
|
||||
have_daw = False
|
||||
for vol in vfs.all_nodes.values():
|
||||
daw = vol.flags.get("daw") or self.args.daw
|
||||
|
|
|
@ -191,6 +191,7 @@ flagcats = {
|
|||
"xvol": "do not follow symlinks leaving the volume root",
|
||||
"dotsrch": "show dotfiles in search results",
|
||||
"nodotsrch": "hide dotfiles in search results (default)",
|
||||
"srch_excl": "exclude search results with URL matching this regex",
|
||||
},
|
||||
'database, audio tags\n"mte", "mth", "mtp", "mtm" all work the same as -mte, -mth, ...': {
|
||||
"mtp=.bpm=f,audio-bpm.py": 'uses the "audio-bpm.py" program to\ngenerate ".bpm" tags from uploads (f = overwrite tags)',
|
||||
|
|
|
@ -793,7 +793,7 @@ class SvcHub(object):
|
|||
al.exp_md = odfusion(exp, al.exp_md.replace(" ", ","))
|
||||
al.exp_lg = odfusion(exp, al.exp_lg.replace(" ", ","))
|
||||
|
||||
for k in ["no_hash", "no_idx", "og_ua"]:
|
||||
for k in ["no_hash", "no_idx", "og_ua", "srch_excl"]:
|
||||
ptn = getattr(self.args, k)
|
||||
if ptn:
|
||||
setattr(self.args, k, re.compile(ptn))
|
||||
|
|
|
@ -324,7 +324,8 @@ class U2idx(object):
|
|||
sort: bool,
|
||||
lim: int,
|
||||
) -> tuple[list[dict[str, Any]], list[str], bool]:
|
||||
if self.args.srch_dbg:
|
||||
dbg = self.args.srch_dbg
|
||||
if dbg:
|
||||
t = "searching across all %s volumes in which the user has 'r' (full read access):\n %s"
|
||||
zs = "\n ".join(["/%s = %s" % (x.vpath, x.realpath) for x in vols])
|
||||
self.log(t % (len(vols), zs), 5)
|
||||
|
@ -367,14 +368,14 @@ class U2idx(object):
|
|||
if not cur:
|
||||
continue
|
||||
|
||||
excl = []
|
||||
for vp2 in self.asrv.vfs.all_vols.keys():
|
||||
if vp2.startswith((vtop + "/").lstrip("/")) and vtop != vp2:
|
||||
excl.append(vp2[len(vtop) :].lstrip("/"))
|
||||
dots = flags.get("dotsrch") and uname in vol.axs.udot
|
||||
zs = "srch_re_dots" if dots else "srch_re_nodot"
|
||||
rex: re.Pattern = flags.get(zs) # type: ignore
|
||||
|
||||
if self.args.srch_dbg:
|
||||
t = "searching in volume /%s (%s), excludelist %s"
|
||||
self.log(t % (vtop, ptop, excl), 5)
|
||||
if dbg:
|
||||
t = "searching in volume /%s (%s), excluding %s"
|
||||
self.log(t % (vtop, ptop, rex.pattern), 5)
|
||||
rex_cfg: Optional[re.Pattern] = flags.get("srch_excl")
|
||||
|
||||
self.active_cur = cur
|
||||
|
||||
|
@ -387,7 +388,6 @@ class U2idx(object):
|
|||
|
||||
sret = []
|
||||
fk = flags.get("fk")
|
||||
dots = flags.get("dotsrch") and uname in vol.axs.udot
|
||||
fk_alg = 2 if "fka" in flags else 1
|
||||
c = cur.execute(uq, tuple(vuv))
|
||||
for hit in c:
|
||||
|
@ -396,20 +396,23 @@ class U2idx(object):
|
|||
if rd.startswith("//") or fn.startswith("//"):
|
||||
rd, fn = s3dec(rd, fn)
|
||||
|
||||
if rd in excl or any([x for x in excl if rd.startswith(x + "/")]):
|
||||
if self.args.srch_dbg:
|
||||
zs = vjoin(vjoin(vtop, rd), fn)
|
||||
t = "database inconsistency in volume '/%s'; ignoring: %s"
|
||||
self.log(t % (vtop, zs), 1)
|
||||
vp = vjoin(vjoin(vtop, rd), fn)
|
||||
|
||||
if vp in seen_rps:
|
||||
continue
|
||||
|
||||
rp = quotep("/".join([x for x in [vtop, rd, fn] if x]))
|
||||
if not dots and "/." in ("/" + rp):
|
||||
continue
|
||||
|
||||
if rp in seen_rps:
|
||||
if rex.search(vp):
|
||||
if dbg:
|
||||
if rex_cfg and rex_cfg.search(vp): # type: ignore
|
||||
self.log("filtered by srch_excl: %s" % (vp,), 6)
|
||||
elif not dots and "/." in ("/" + vp):
|
||||
pass
|
||||
else:
|
||||
t = "database inconsistency in volume '/%s'; ignoring: %s"
|
||||
self.log(t % (vtop, vp), 1)
|
||||
continue
|
||||
|
||||
rp = quotep(vp)
|
||||
if not fk:
|
||||
suf = ""
|
||||
else:
|
||||
|
@ -431,7 +434,7 @@ class U2idx(object):
|
|||
if lim < 0:
|
||||
break
|
||||
|
||||
if self.args.srch_dbg:
|
||||
if dbg:
|
||||
t = "in volume '/%s': hit: %s"
|
||||
self.log(t % (vtop, rp), 5)
|
||||
|
||||
|
@ -461,7 +464,7 @@ class U2idx(object):
|
|||
ret.extend(sret)
|
||||
# print("[{}] {}".format(ptop, sret))
|
||||
|
||||
if self.args.srch_dbg:
|
||||
if dbg:
|
||||
t = "in volume '/%s': got %d hits, %d total so far"
|
||||
self.log(t % (vtop, len(sret), len(ret)), 5)
|
||||
|
||||
|
|
|
@ -1078,7 +1078,8 @@ class Up2k(object):
|
|||
ft = "\033[0;32m{}{:.0}"
|
||||
ff = "\033[0;35m{}{:.0}"
|
||||
fv = "\033[0;36m{}:\033[90m{}"
|
||||
fx = set(("html_head", "rm_re_t", "rm_re_r", "mv_re_t", "mv_re_r"))
|
||||
zs = "html_head mv_re_r mv_re_t rm_re_r rm_re_t srch_re_dots srch_re_nodot"
|
||||
fx = set(zs.split())
|
||||
fd = vf_bmap()
|
||||
fd.update(vf_cmap())
|
||||
fd.update(vf_vmap())
|
||||
|
@ -1241,9 +1242,9 @@ class Up2k(object):
|
|||
|
||||
# also consider volflags which affect indexing
|
||||
for vp in vps:
|
||||
vf = self.vfs.all_vols[vp].flags.items()
|
||||
vf = {k: v for k, v in vf if k in VF_AFFECTS_INDEXING}
|
||||
seed.append(str(vf))
|
||||
vf = self.vfs.all_vols[vp].flags
|
||||
vf = {k: v for k, v in vf.items() if k in VF_AFFECTS_INDEXING}
|
||||
seed.append(str(sorted(vf.items())))
|
||||
|
||||
zb = hashlib.sha1("\n".join(seed).encode("utf-8", "replace")).digest()
|
||||
vcfg = ub64enc(zb[:18]).decode("ascii")
|
||||
|
|
|
@ -122,7 +122,7 @@ class Cfg(Namespace):
|
|||
def __init__(self, a=None, v=None, c=None, **ka0):
|
||||
ka = {}
|
||||
|
||||
ex = "chpw daw dav_auth dav_inf dav_mac dav_rt e2d e2ds e2dsa e2t e2ts e2tsr e2v e2vu e2vp early_ban ed emp exp force_js getmod grid gsel hardlink ih ihead magic hardlink_only nid nih no_acode no_athumb no_clone no_cp no_dav no_db_ip no_del no_dirsz no_dupe no_lifetime no_logues no_mv no_pipe no_poll no_readme no_robots no_sb_md no_sb_lg no_scandir no_tarcmp no_thumb no_vthumb no_zip nrand nsort nw og og_no_head og_s_title ohead q rand re_dirsz rss smb srch_dbg stats uqe vague_403 vc ver write_uplog xdev xlink xvol zs"
|
||||
ex = "chpw daw dav_auth dav_inf dav_mac dav_rt e2d e2ds e2dsa e2t e2ts e2tsr e2v e2vu e2vp early_ban ed emp exp force_js getmod grid gsel hardlink ih ihead magic hardlink_only nid nih no_acode no_athumb no_clone no_cp no_dav no_db_ip no_del no_dirsz no_dupe no_lifetime no_logues no_mv no_pipe no_poll no_readme no_robots no_sb_md no_sb_lg no_scandir no_tarcmp no_thumb no_vthumb no_zip nrand nsort nw og og_no_head og_s_title ohead q rand re_dirsz rss smb srch_dbg srch_excl stats uqe vague_403 vc ver write_uplog xdev xlink xvol zs"
|
||||
ka.update(**{k: False for k in ex.split()})
|
||||
|
||||
ex = "dedup dotpart dotsrch hook_v no_dhash no_fastboot no_fpool no_htp no_rescan no_sendfile no_ses no_snap no_up_list no_voldump re_dhash plain_ip"
|
||||
|
|
Loading…
Reference in a new issue