diff --git a/README.md b/README.md index 1afedcf6..c47dad89 100644 --- a/README.md +++ b/README.md @@ -1097,11 +1097,12 @@ using the GUI (winXP or later): * on winXP only, click the `Sign up for online storage` hyperlink instead and put the URL there * providing your password as the username is recommended; the password field can be anything or empty -known client bugs: +the webdav client that's built into windows has the following list of bugs; you can avoid all of these by connecting with rclone instead: * win7+ doesn't actually send the password to the server when reauthenticating after a reboot unless you first try to login with an incorrect password and then switch to the correct password * or just type your password into the username field instead to get around it entirely * connecting to a folder which allows anonymous read will make writing impossible, as windows has decided it doesn't need to login * workaround: connect twice; first to a folder which requires auth, then to the folder you actually want, and leave both of those mounted + * or set the server-option `--dav-auth` to force password-auth for all webdav clients * win7+ may open a new tcp connection for every file and sometimes forgets to close them, eventually needing a reboot * maybe NIC-related (??), happens with win10-ltsc on e1000e but not virtio * windows cannot access folders which contain filenames with invalid unicode or forbidden characters (`<>:"/\|?*`), or names ending with `.` @@ -1268,7 +1269,7 @@ note: ### exclude-patterns -to save some time, you can provide a regex pattern for filepaths to only index by filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash \.iso$` or the volflag `:c,nohash=\.iso$`, this has the following consequences: +to save some time, you can provide a regex pattern for filepaths to only index by filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash '\.iso$'` or the volflag `:c,nohash=\.iso$`, this has the following consequences: * initial indexing is way faster, especially when the volume is on a network disk * makes it impossible to [file-search](#file-search) * if someone uploads the same file contents, the upload will not be detected as a dupe, so it will not get symlinked or rejected @@ -1279,6 +1280,8 @@ similarly, you can fully ignore files/folders using `--no-idx [...]` and `:c,noi if you set `--no-hash [...]` globally, you can enable hashing for specific volumes using flag `:c,nohash=` +to exclude certain filepaths from search-results, use `--srch-excl` or volflag `srch_excl` instead of `--no-idx`, for example `--srch-excl 'password|logs/[0-9]'` + ### filesystem guards avoid traversing into other filesystems using `--xdev` / volflag `:c,xdev`, skipping any symlinks or bind-mounts to another HDD for example diff --git a/copyparty/__main__.py b/copyparty/__main__.py index 864391d9..789491c2 100644 --- a/copyparty/__main__.py +++ b/copyparty/__main__.py @@ -1401,6 +1401,7 @@ def add_db_general(ap, hcores): ap2.add_argument("--db-act", metavar="SEC", type=float, default=10.0, help="defer any scheduled volume reindexing until \033[33mSEC\033[0m seconds after last db write (uploads, renames, ...)") ap2.add_argument("--srch-time", metavar="SEC", type=int, default=45, help="search deadline -- terminate searches running for more than \033[33mSEC\033[0m seconds") ap2.add_argument("--srch-hits", metavar="N", type=int, default=7999, help="max search results to allow clients to fetch; 125 results will be shown initially") + ap2.add_argument("--srch-excl", metavar="PTN", type=u, default="", help="regex: exclude files from search results if the file-URL matches \033[33mPTN\033[0m (case-sensitive). Example: [\033[32mpassword|logs/[0-9]\033[0m] any URL containing 'password' or 'logs/DIGIT' (volflag=srch_excl)") ap2.add_argument("--dotsrch", action="store_true", help="show dotfiles in search results (volflags: dotsrch | nodotsrch)") diff --git a/copyparty/authsrv.py b/copyparty/authsrv.py index d6ad2cdf..74b3991d 100644 --- a/copyparty/authsrv.py +++ b/copyparty/authsrv.py @@ -1880,6 +1880,7 @@ class AuthSrv(object): ["no_hash", "nohash"], ["no_idx", "noidx"], ["og_ua", "og_ua"], + ["srch_excl", "srch_excl"], ]: if vf in vol.flags: ptn = re.compile(vol.flags.pop(vf)) @@ -2086,6 +2087,22 @@ class AuthSrv(object): self.log(t.format(mtp), 1) errors = True + for vol in vfs.all_vols.values(): + re1: Optional[re.Pattern] = vol.flags.get("srch_excl") + excl = [re1.pattern] if re1 else [] + + vpaths = [] + vtop = vol.vpath + for vp2 in vfs.all_vols.keys(): + if vp2.startswith((vtop + "/").lstrip("/")) and vtop != vp2: + vpaths.append(re.escape(vp2[len(vtop) :].lstrip("/"))) + if vpaths: + excl.append("^(%s)/" % ("|".join(vpaths),)) + + vol.flags["srch_re_dots"] = re.compile("|".join(excl or ["^$"])) + excl.extend([r"^\.", r"/\."]) + vol.flags["srch_re_nodot"] = re.compile("|".join(excl)) + have_daw = False for vol in vfs.all_nodes.values(): daw = vol.flags.get("daw") or self.args.daw diff --git a/copyparty/cfg.py b/copyparty/cfg.py index 496879eb..a7641adb 100644 --- a/copyparty/cfg.py +++ b/copyparty/cfg.py @@ -191,6 +191,7 @@ flagcats = { "xvol": "do not follow symlinks leaving the volume root", "dotsrch": "show dotfiles in search results", "nodotsrch": "hide dotfiles in search results (default)", + "srch_excl": "exclude search results with URL matching this regex", }, 'database, audio tags\n"mte", "mth", "mtp", "mtm" all work the same as -mte, -mth, ...': { "mtp=.bpm=f,audio-bpm.py": 'uses the "audio-bpm.py" program to\ngenerate ".bpm" tags from uploads (f = overwrite tags)', diff --git a/copyparty/svchub.py b/copyparty/svchub.py index b0068386..a4e858fd 100644 --- a/copyparty/svchub.py +++ b/copyparty/svchub.py @@ -793,7 +793,7 @@ class SvcHub(object): al.exp_md = odfusion(exp, al.exp_md.replace(" ", ",")) al.exp_lg = odfusion(exp, al.exp_lg.replace(" ", ",")) - for k in ["no_hash", "no_idx", "og_ua"]: + for k in ["no_hash", "no_idx", "og_ua", "srch_excl"]: ptn = getattr(self.args, k) if ptn: setattr(self.args, k, re.compile(ptn)) diff --git a/copyparty/u2idx.py b/copyparty/u2idx.py index 8201c246..7995ec7e 100644 --- a/copyparty/u2idx.py +++ b/copyparty/u2idx.py @@ -324,7 +324,8 @@ class U2idx(object): sort: bool, lim: int, ) -> tuple[list[dict[str, Any]], list[str], bool]: - if self.args.srch_dbg: + dbg = self.args.srch_dbg + if dbg: t = "searching across all %s volumes in which the user has 'r' (full read access):\n %s" zs = "\n ".join(["/%s = %s" % (x.vpath, x.realpath) for x in vols]) self.log(t % (len(vols), zs), 5) @@ -367,14 +368,14 @@ class U2idx(object): if not cur: continue - excl = [] - for vp2 in self.asrv.vfs.all_vols.keys(): - if vp2.startswith((vtop + "/").lstrip("/")) and vtop != vp2: - excl.append(vp2[len(vtop) :].lstrip("/")) + dots = flags.get("dotsrch") and uname in vol.axs.udot + zs = "srch_re_dots" if dots else "srch_re_nodot" + rex: re.Pattern = flags.get(zs) # type: ignore - if self.args.srch_dbg: - t = "searching in volume /%s (%s), excludelist %s" - self.log(t % (vtop, ptop, excl), 5) + if dbg: + t = "searching in volume /%s (%s), excluding %s" + self.log(t % (vtop, ptop, rex.pattern), 5) + rex_cfg: Optional[re.Pattern] = flags.get("srch_excl") self.active_cur = cur @@ -387,7 +388,6 @@ class U2idx(object): sret = [] fk = flags.get("fk") - dots = flags.get("dotsrch") and uname in vol.axs.udot fk_alg = 2 if "fka" in flags else 1 c = cur.execute(uq, tuple(vuv)) for hit in c: @@ -396,20 +396,23 @@ class U2idx(object): if rd.startswith("//") or fn.startswith("//"): rd, fn = s3dec(rd, fn) - if rd in excl or any([x for x in excl if rd.startswith(x + "/")]): - if self.args.srch_dbg: - zs = vjoin(vjoin(vtop, rd), fn) - t = "database inconsistency in volume '/%s'; ignoring: %s" - self.log(t % (vtop, zs), 1) + vp = vjoin(vjoin(vtop, rd), fn) + + if vp in seen_rps: continue - rp = quotep("/".join([x for x in [vtop, rd, fn] if x])) - if not dots and "/." in ("/" + rp): - continue - - if rp in seen_rps: + if rex.search(vp): + if dbg: + if rex_cfg and rex_cfg.search(vp): # type: ignore + self.log("filtered by srch_excl: %s" % (vp,), 6) + elif not dots and "/." in ("/" + vp): + pass + else: + t = "database inconsistency in volume '/%s'; ignoring: %s" + self.log(t % (vtop, vp), 1) continue + rp = quotep(vp) if not fk: suf = "" else: @@ -431,7 +434,7 @@ class U2idx(object): if lim < 0: break - if self.args.srch_dbg: + if dbg: t = "in volume '/%s': hit: %s" self.log(t % (vtop, rp), 5) @@ -461,7 +464,7 @@ class U2idx(object): ret.extend(sret) # print("[{}] {}".format(ptop, sret)) - if self.args.srch_dbg: + if dbg: t = "in volume '/%s': got %d hits, %d total so far" self.log(t % (vtop, len(sret), len(ret)), 5) diff --git a/copyparty/up2k.py b/copyparty/up2k.py index b4dc22f2..3f2fa8a0 100644 --- a/copyparty/up2k.py +++ b/copyparty/up2k.py @@ -1078,7 +1078,8 @@ class Up2k(object): ft = "\033[0;32m{}{:.0}" ff = "\033[0;35m{}{:.0}" fv = "\033[0;36m{}:\033[90m{}" - fx = set(("html_head", "rm_re_t", "rm_re_r", "mv_re_t", "mv_re_r")) + zs = "html_head mv_re_r mv_re_t rm_re_r rm_re_t srch_re_dots srch_re_nodot" + fx = set(zs.split()) fd = vf_bmap() fd.update(vf_cmap()) fd.update(vf_vmap()) @@ -1241,9 +1242,9 @@ class Up2k(object): # also consider volflags which affect indexing for vp in vps: - vf = self.vfs.all_vols[vp].flags.items() - vf = {k: v for k, v in vf if k in VF_AFFECTS_INDEXING} - seed.append(str(vf)) + vf = self.vfs.all_vols[vp].flags + vf = {k: v for k, v in vf.items() if k in VF_AFFECTS_INDEXING} + seed.append(str(sorted(vf.items()))) zb = hashlib.sha1("\n".join(seed).encode("utf-8", "replace")).digest() vcfg = ub64enc(zb[:18]).decode("ascii") diff --git a/tests/util.py b/tests/util.py index 00cb2159..36e253c6 100644 --- a/tests/util.py +++ b/tests/util.py @@ -122,7 +122,7 @@ class Cfg(Namespace): def __init__(self, a=None, v=None, c=None, **ka0): ka = {} - ex = "chpw daw dav_auth dav_inf dav_mac dav_rt e2d e2ds e2dsa e2t e2ts e2tsr e2v e2vu e2vp early_ban ed emp exp force_js getmod grid gsel hardlink ih ihead magic hardlink_only nid nih no_acode no_athumb no_clone no_cp no_dav no_db_ip no_del no_dirsz no_dupe no_lifetime no_logues no_mv no_pipe no_poll no_readme no_robots no_sb_md no_sb_lg no_scandir no_tarcmp no_thumb no_vthumb no_zip nrand nsort nw og og_no_head og_s_title ohead q rand re_dirsz rss smb srch_dbg stats uqe vague_403 vc ver write_uplog xdev xlink xvol zs" + ex = "chpw daw dav_auth dav_inf dav_mac dav_rt e2d e2ds e2dsa e2t e2ts e2tsr e2v e2vu e2vp early_ban ed emp exp force_js getmod grid gsel hardlink ih ihead magic hardlink_only nid nih no_acode no_athumb no_clone no_cp no_dav no_db_ip no_del no_dirsz no_dupe no_lifetime no_logues no_mv no_pipe no_poll no_readme no_robots no_sb_md no_sb_lg no_scandir no_tarcmp no_thumb no_vthumb no_zip nrand nsort nw og og_no_head og_s_title ohead q rand re_dirsz rss smb srch_dbg srch_excl stats uqe vague_403 vc ver write_uplog xdev xlink xvol zs" ka.update(**{k: False for k in ex.split()}) ex = "dedup dotpart dotsrch hook_v no_dhash no_fastboot no_fpool no_htp no_rescan no_sendfile no_ses no_snap no_up_list no_voldump re_dhash plain_ip"