diff --git a/README.md b/README.md index 0c5dd2fd..f1f445c1 100644 --- a/README.md +++ b/README.md @@ -296,9 +296,16 @@ the same arguments can be set as volume flags, in addition to `d2d` and `d2t` fo * `-v ~/music::r:cd2d` disables **all** indexing, even if any `-e2*` are on * `-v ~/music::r:cd2t` disables all `-e2t*` (tags), does not affect `-e2d*` -`e2tsr` is probably always overkill, since `e2ds`/`e2dsa` would pick up any file modifications and cause `e2ts` to reindex those +note: +* `e2tsr` is probably always overkill, since `e2ds`/`e2dsa` would pick up any file modifications and cause `e2ts` to reindex those +* the rescan button in the admin panel has no effect unless the volume has `-e2ds` or higher -the rescan button in the admin panel has no effect unless the volume has `-e2ds` or higher +you can choose to only index filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash` or the volume-flag `cdhash`, this has the following consequences: +* initial indexing is way faster, especially when the volume is on a networked disk +* makes it impossible to [file-search](#file-search) +* if someone uploads the same file contents, the upload will not be detected as a dupe, so it will not get symlinked or rejected + +if you set `--no-hash`, you can enable hashing for specific volumes using flag `cehash` ## database location @@ -308,9 +315,9 @@ copyparty creates a subfolder named `.hist` inside each volume where it stores t this can instead be kept in a single place using the `--hist` argument, or the `hist=` volume flag, or a mix of both: * `--hist ~/.cache/copyparty -v ~/music::r:chist=-` sets `~/.cache/copyparty` as the default place to put volume info, but `~/music` gets the regular `.hist` subfolder (`-` restores default behavior) -btw, +note: * markdown edits are always stored in a local `.hist` subdirectory -* on windows the volflag path is cyglike, so 
`/c/temp` means `C:\temp` but use regular paths for `--hist` ## metadata from audio files diff --git a/copyparty/__main__.py b/copyparty/__main__.py index 6c2a369d..816b7c55 100644 --- a/copyparty/__main__.py +++ b/copyparty/__main__.py @@ -286,6 +286,7 @@ def run_argparse(argv, formatter): ap2.add_argument("-e2ts", action="store_true", help="enable metadata scanner, sets -e2t") ap2.add_argument("-e2tsr", action="store_true", help="rescan all metadata, sets -e2ts") ap2.add_argument("--hist", metavar="PATH", type=str, help="where to store volume state") + ap2.add_argument("--no-hash", action="store_true", help="disable hashing during e2ds folder scans") ap2.add_argument("--no-mutagen", action="store_true", help="use ffprobe for tags instead") ap2.add_argument("--no-mtag-mt", action="store_true", help="disable tag-read parallelism") ap2.add_argument("-mtm", metavar="M=t,t,t", action="append", type=str, help="add/replace metadata mapping") diff --git a/copyparty/authsrv.py b/copyparty/authsrv.py index 48c43da8..1281d720 100644 --- a/copyparty/authsrv.py +++ b/copyparty/authsrv.py @@ -31,7 +31,7 @@ class VFS(object): self.all_vols = {vpath: self} # flattened recursive else: self.histpath = None - self.all_vols = {} + self.all_vols = None def __repr__(self): return "VFS({})".format( @@ -41,9 +41,10 @@ class VFS(object): ) ) - def _trk(self, vol): - self.all_vols[vol.vpath] = vol - return vol + def get_all_vols(self, outdict): + for v in self.nodes.values(): + v.get_all_vols(outdict) + outdict[v.vpath] = v def add(self, src, dst): """get existing, or add new path to the vfs""" @@ -55,19 +56,18 @@ class VFS(object): name, dst = dst.split("/", 1) if name in self.nodes: # exists; do not manipulate permissions - return self._trk(self.nodes[name].add(src, dst)) + return self.nodes[name].add(src, dst) vn = VFS( - "{}/{}".format(self.realpath, name), + os.path.join(self.realpath, name), "{}/{}".format(self.vpath, name).lstrip("/"), self.uread, self.uwrite, self.uadm, 
self.flags, ) - self._trk(vn) self.nodes[name] = vn - return self._trk(vn.add(src, dst)) + return vn.add(src, dst) if dst in self.nodes: # leaf exists; return as-is @@ -77,7 +77,7 @@ class VFS(object): vp = "{}/{}".format(self.vpath, dst).lstrip("/") vn = VFS(src, vp) self.nodes[dst] = vn - return self._trk(vn) + return vn def _find(self, vpath): """return [vfs,remainder]""" @@ -462,6 +462,9 @@ class AuthSrv(object): v.uadm = madm[dst] v.flags = mflags[dst] + vfs.all_vols = {} + vfs.get_all_vols(vfs.all_vols) + missing_users = {} for d in [mread, mwrite]: for _, ul in d.items(): @@ -526,6 +529,10 @@ class AuthSrv(object): if self.args.e2d or "e2ds" in vol.flags: vol.flags["e2d"] = True + if self.args.no_hash: + if "ehash" not in vol.flags: + vol.flags["dhash"] = True + for k in ["e2t", "e2ts", "e2tsr"]: if getattr(self.args, k): vol.flags[k] = True diff --git a/copyparty/u2idx.py b/copyparty/u2idx.py index 00230011..c5091b5e 100644 --- a/copyparty/u2idx.py +++ b/copyparty/u2idx.py @@ -7,7 +7,7 @@ import time import threading from datetime import datetime -from .util import s3dec, Pebkac +from .util import s3dec, Pebkac, min_ex from .up2k import up2k_wark_from_hashlist @@ -54,8 +54,8 @@ class U2idx(object): try: return self.run_query(vols, uq, uv)[0] - except Exception as ex: - raise Pebkac(500, repr(ex)) + except: + raise Pebkac(500, min_ex()) def get_cur(self, ptop): cur = self.cur.get(ptop) @@ -245,6 +245,7 @@ class U2idx(object): hit["tags"] = tags ret.extend(sret) + # print("[{}] {}".format(ptop, sret)) done_flag.append(True) self.active_id = None diff --git a/copyparty/up2k.py b/copyparty/up2k.py index cf321a11..dc5d9dab 100644 --- a/copyparty/up2k.py +++ b/copyparty/up2k.py @@ -359,6 +359,7 @@ class Up2k(object): def _build_file_index(self, vol, all_vols): do_vac = False top = vol.realpath + nohash = "dhash" in vol.flags with self.mutex: cur, _ = self.register_vpath(top, vol.flags) @@ -373,7 +374,7 @@ class Up2k(object): if WINDOWS: excl = [x.replace("/", 
"\\") for x in excl] - n_add = self._build_dir(dbw, top, set(excl), top) + n_add = self._build_dir(dbw, top, set(excl), top, nohash) n_rm = self._drop_lost(dbw[0], top) if dbw[1]: self.log("commit {} new files".format(dbw[1])) @@ -381,7 +382,7 @@ class Up2k(object): return True, n_add or n_rm or do_vac - def _build_dir(self, dbw, top, excl, cdir): + def _build_dir(self, dbw, top, excl, cdir, nohash): self.pp.msg = "a{} {}".format(self.pp.n, cdir) histdir = self.vfs.histtab[top] ret = 0 @@ -389,16 +390,17 @@ class Up2k(object): for iname, inf in sorted(g): abspath = os.path.join(cdir, iname) lmod = int(inf.st_mtime) + sz = inf.st_size if stat.S_ISDIR(inf.st_mode): if abspath in excl or abspath == histdir: continue # self.log(" dir: {}".format(abspath)) - ret += self._build_dir(dbw, top, excl, abspath) + ret += self._build_dir(dbw, top, excl, abspath, nohash) else: # self.log("file: {}".format(abspath)) rp = abspath[len(top) :].replace("\\", "/").strip("/") rd, fn = rp.rsplit("/", 1) if "/" in rp else ["", rp] - sql = "select * from up where rd = ? and fn = ?" + sql = "select w, mt, sz from up where rd = ? and fn = ?" 
try: c = dbw[0].execute(sql, (rd, fn)) except: @@ -407,18 +409,18 @@ class Up2k(object): in_db = list(c.fetchall()) if in_db: self.pp.n -= 1 - _, dts, dsz, _, _ = in_db[0] + dw, dts, dsz = in_db[0] if len(in_db) > 1: m = "WARN: multiple entries: [{}] => [{}] |{}|\n{}" rep_db = "\n".join([repr(x) for x in in_db]) self.log(m.format(top, rp, len(in_db), rep_db)) dts = -1 - if dts == lmod and dsz == inf.st_size: + if dts == lmod and dsz == sz and (nohash or dw[0] != "#"): continue m = "reindex [{}] => [{}] ({}/{}) ({}/{})".format( - top, rp, dts, lmod, dsz, inf.st_size + top, rp, dts, lmod, dsz, sz ) self.log(m) self.db_rm(dbw[0], rd, fn) @@ -427,17 +429,22 @@ class Up2k(object): in_db = None self.pp.msg = "a{} {}".format(self.pp.n, abspath) - if inf.st_size > 1024 * 1024: - self.log("file: {}".format(abspath)) - try: - hashes = self._hashlist_from_file(abspath) - except Exception as ex: - self.log("hash: {} @ [{}]".format(repr(ex), abspath)) - continue + if nohash: + wark = up2k_wark_from_metadata(self.salt, sz, lmod, rd, fn) + else: + if sz > 1024 * 1024: + self.log("file: {}".format(abspath)) - wark = up2k_wark_from_hashlist(self.salt, inf.st_size, hashes) - self.db_add(dbw[0], wark, rd, fn, lmod, inf.st_size) + try: + hashes = self._hashlist_from_file(abspath) + except Exception as ex: + self.log("hash: {} @ [{}]".format(repr(ex), abspath)) + continue + + wark = up2k_wark_from_hashlist(self.salt, sz, hashes) + + self.db_add(dbw[0], wark, rd, fn, lmod, sz) dbw[1] += 1 ret += 1 td = time.time() - dbw[2] @@ -1466,9 +1473,12 @@ def up2k_wark_from_hashlist(salt, filesize, hashes): ident.extend(hashes) ident = "\n".join(ident) - hasher = hashlib.sha512() - hasher.update(ident.encode("utf-8")) - digest = hasher.digest()[:32] + wark = hashlib.sha512(ident.encode("utf-8")).digest() + wark = base64.urlsafe_b64encode(wark) + return wark.decode("ascii")[:43] - wark = base64.urlsafe_b64encode(digest) - return wark.decode("utf-8").rstrip("=") + +def up2k_wark_from_metadata(salt, 
sz, lastmod, rd, fn): + ret = fsenc("{}\n{}\n{}\n{}\n{}".format(salt, lastmod, sz, rd, fn)) + ret = base64.urlsafe_b64encode(hashlib.sha512(ret).digest()) + return "#{}".format(ret[:42].decode("ascii")) diff --git a/copyparty/util.py b/copyparty/util.py index 2ab3020c..9dc8eb3a 100644 --- a/copyparty/util.py +++ b/copyparty/util.py @@ -254,6 +254,17 @@ def trace(*args, **kwargs): nuprint(msg) +def min_ex(): + et, ev, tb = sys.exc_info() + tb = traceback.extract_tb(tb, 2) + ex = [ + "{} @ {} <{}>: {}".format(fp.split(os.sep)[-1], ln, fun, txt) + for fp, ln, fun, txt in tb + ] + ex.append("{}: {}".format(et.__name__, ev)) + return "\n".join(ex) + + @contextlib.contextmanager def ren_open(fname, *args, **kwargs): fdir = kwargs.pop("fdir", None) diff --git a/tests/test_httpcli.py b/tests/test_httpcli.py index e99fed81..f3bfdcda 100644 --- a/tests/test_httpcli.py +++ b/tests/test_httpcli.py @@ -38,6 +38,7 @@ class Cfg(Namespace): mtp=[], mte="a", hist=None, + no_hash=False, **{k: False for k in "e2d e2ds e2dsa e2t e2ts e2tsr".split()} ) diff --git a/tests/test_vfs.py b/tests/test_vfs.py index 0936a317..625c6c6f 100644 --- a/tests/test_vfs.py +++ b/tests/test_vfs.py @@ -18,7 +18,7 @@ from copyparty import util class Cfg(Namespace): def __init__(self, a=[], v=[], c=None): ex = {k: False for k in "e2d e2ds e2dsa e2t e2ts e2tsr".split()} - ex2 = {"mtp": [], "mte": "a", "hist": None} + ex2 = {"mtp": [], "mte": "a", "hist": None, "no_hash": False} ex.update(ex2) super(Cfg, self).__init__(a=a, v=v, c=c, **ex) diff --git a/tests/util.py b/tests/util.py index eb412008..1c6dd706 100644 --- a/tests/util.py +++ b/tests/util.py @@ -60,7 +60,7 @@ def get_ramdisk(): if os.path.exists("/Volumes"): # hdiutil eject /Volumes/cptd/ - devname, _ = chkcmd("hdiutil", "attach", "-nomount", "ram://65536") + devname, _ = chkcmd("hdiutil", "attach", "-nomount", "ram://131072") devname = devname.strip() print("devname: [{}]".format(devname)) for _ in range(10):