From 99f63adf58d1554f22a5487800b75794ede2c24c Mon Sep 17 00:00:00 2001 From: ed Date: Sun, 23 Mar 2025 21:21:41 +0000 Subject: [PATCH] google isn't taking the hint specifically google, but also some others, have started ignoring rel="nofollow" while also understanding just enough javascript to try viewing binary files as text --- copyparty/__main__.py | 3 +++ copyparty/httpcli.py | 7 +++++++ copyparty/svchub.py | 3 ++- copyparty/util.py | 3 +++ tests/util.py | 2 +- 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/copyparty/__main__.py b/copyparty/__main__.py index d664fabd..c2b42e3e 100644 --- a/copyparty/__main__.py +++ b/copyparty/__main__.py @@ -40,6 +40,7 @@ from .cfg import flagcats, onedash from .svchub import SvcHub from .util import ( APPLESAN_TXT, + BAD_BOTS, DEF_EXP, DEF_MTE, DEF_MTH, @@ -1244,6 +1245,7 @@ def add_optouts(ap): ap2.add_argument("--zipmaxt", metavar="TXT", type=u, default="", help="custom errormessage when download size exceeds max (volflag=zipmaxt)") ap2.add_argument("--zipmaxu", action="store_true", help="authenticated users bypass the zip size limit (volflag=zipmaxu)") ap2.add_argument("--zip-who", metavar="LVL", type=int, default=3, help="who can download as zip/tar? [\033[32m0\033[0m]=nobody, [\033[32m1\033[0m]=admins, [\033[32m2\033[0m]=authenticated-with-read-access, [\033[32m3\033[0m]=everyone-with-read-access (volflag=zip_who)\n\033[1;31mWARNING:\033[0m if a nested volume has a more restrictive value than a parent volume, then this will be \033[33mignored\033[0m if the download is initiated from the parent, more lenient volume") + ap2.add_argument("--ua-nozip", metavar="PTN", type=u, default=BAD_BOTS, help="regex of user-agents to reject from download-as-zip/tar; disable with [\033[32mno\033[0m] or blank") ap2.add_argument("--no-zip", action="store_true", help="disable download as zip/tar; same as \033[33m--zip-who=0\033[0m") ap2.add_argument("--no-tarcmp", action="store_true", help="disable download as compressed tar (?tar=gz, ?tar=bz2, ?tar=xz, ?tar=gz:9, ...)") ap2.add_argument("--no-lifetime", action="store_true", help="do not allow clients (or server config) to schedule an upload to be deleted after a given time") @@ -1434,6 +1436,7 @@ def add_txt(ap): ap2.add_argument("--exp", action="store_true", help="enable textfile expansion -- replace {{self.ip}} and such; see \033[33m--help-exp\033[0m (volflag=exp)") ap2.add_argument("--exp-md", metavar="V,V,V", type=u, default=DEF_EXP, help="comma/space-separated list of placeholders to expand in markdown files; add/remove stuff on the default list with +hdr_foo or /vf.scan (volflag=exp_md)") ap2.add_argument("--exp-lg", metavar="V,V,V", type=u, default=DEF_EXP, help="comma/space-separated list of placeholders to expand in prologue/epilogue files (volflag=exp_lg)") + ap2.add_argument("--ua-nodoc", metavar="PTN", type=u, default=BAD_BOTS, help="regex of user-agents to reject from viewing documents through ?doc=[...]; disable with [\033[32mno\033[0m] or blank") def add_og(ap): diff --git a/copyparty/httpcli.py b/copyparty/httpcli.py index 3c12595f..d2f9a30d 100644 --- a/copyparty/httpcli.py +++ b/copyparty/httpcli.py @@ -3807,6 +3807,9 @@ class HttpCli(object): return "download-as-zip/tar is admin-only on this server" elif lvl <= 2 and self.uname in ("", "*"): return "you must be authenticated to download-as-zip/tar on this server" + elif self.args.ua_nozip and self.args.ua_nozip.search(self.ua): + t = "this URL contains no valuable information for bots/crawlers" + raise Pebkac(403, t) return "" def tx_res(self, req_path: str) -> bool: @@ -6291,6 +6294,10 @@ class HttpCli(object): doc = self.uparam.get("doc") if self.can_read else None if doc: + zp = self.args.ua_nodoc + if zp and zp.search(self.ua): + t = "this URL contains no valuable information for bots/crawlers" + raise Pebkac(403, t) j2a["docname"] = doc doctxt = None dfn = lnames.get(doc.lower()) diff --git a/copyparty/svchub.py b/copyparty/svchub.py index cd55e735..9f6b6085 100644 --- a/copyparty/svchub.py +++ b/copyparty/svchub.py @@ -769,7 +769,8 @@ class SvcHub(object): vs = os.path.expandvars(os.path.expanduser(vs)) setattr(al, k, vs) - for k in "dav_ua1 sus_urls nonsus_urls".split(" "): + zs = "dav_ua1 sus_urls nonsus_urls ua_nodoc ua_nozip" + for k in zs.split(" "): vs = getattr(al, k) if not vs or vs == "no": setattr(al, k, None) diff --git a/copyparty/util.py b/copyparty/util.py index cf7be3f3..e663ff28 100644 --- a/copyparty/util.py +++ b/copyparty/util.py @@ -245,6 +245,9 @@ SYMTIME = PY36 and os.utime in os.supports_follow_symlinks META_NOBOTS = '\n' +# smart enough to understand javascript while also ignoring rel="nofollow" +BAD_BOTS = r"Barkrowler|bingbot|BLEXBot|Googlebot|GPTBot|PetalBot|SeekportBot|SemrushBot|YandexBot" + FFMPEG_URL = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-git-full.7z" URL_PRJ = "https://github.com/9001/copyparty" diff --git a/tests/util.py b/tests/util.py index 74fc64c5..075c5d2e 100644 --- a/tests/util.py +++ b/tests/util.py @@ -135,7 +135,7 @@ class Cfg(Namespace): ex = "dav_inf dedup dotpart dotsrch hook_v no_dhash no_fastboot no_fpool no_htp no_rescan no_sendfile no_ses no_snap no_up_list no_voldump re_dhash plain_ip" ka.update(**{k: True for k in ex.split()}) - ex = "ah_cli ah_gen css_browser hist ipu js_browser js_other mime mimes no_forget no_hash no_idx nonsus_urls og_tpl og_ua" + ex = "ah_cli ah_gen css_browser hist ipu js_browser js_other mime mimes no_forget no_hash no_idx nonsus_urls og_tpl og_ua ua_nodoc ua_nozip" ka.update(**{k: None for k in ex.split()}) ex = "hash_mt hsortn safe_dedup srch_time u2abort u2j u2sz"