adding --no-hash

This commit is contained in:
ed 2021-06-10 18:08:30 +02:00
parent d6bf300d80
commit 1078d933b4
9 changed files with 77 additions and 39 deletions

View file

@ -296,9 +296,16 @@ the same arguments can be set as volume flags, in addition to `d2d` and `d2t` fo
* `-v ~/music::r:cd2d` disables **all** indexing, even if any `-e2*` are on * `-v ~/music::r:cd2d` disables **all** indexing, even if any `-e2*` are on
* `-v ~/music::r:cd2t` disables all `-e2t*` (tags), does not affect `-e2d*` * `-v ~/music::r:cd2t` disables all `-e2t*` (tags), does not affect `-e2d*`
`e2tsr` is probably always overkill, since `e2ds`/`e2dsa` would pick up any file modifications and cause `e2ts` to reindex those note:
* `e2tsr` is probably always overkill, since `e2ds`/`e2dsa` would pick up any file modifications and cause `e2ts` to reindex those
* the rescan button in the admin panel has no effect unless the volume has `-e2ds` or higher
the rescan button in the admin panel has no effect unless the volume has `-e2ds` or higher you can choose to only index filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash` or the volume-flag `cdhash`; this has the following consequences:
* initial indexing is way faster, especially when the volume is on a networked disk
* makes it impossible to [file-search](#file-search)
* if someone uploads the same file contents, the upload will not be detected as a dupe, so it will not get symlinked or rejected
if you set `--no-hash`, you can enable hashing for specific volumes using flag `cehash`
## database location ## database location
@ -308,9 +315,9 @@ copyparty creates a subfolder named `.hist` inside each volume where it stores t
this can instead be kept in a single place using the `--hist` argument, or the `hist=` volume flag, or a mix of both: this can instead be kept in a single place using the `--hist` argument, or the `hist=` volume flag, or a mix of both:
* `--hist ~/.cache/copyparty -v ~/music::r:chist=-` sets `~/.cache/copyparty` as the default place to put volume info, but `~/music` gets the regular `.hist` subfolder (`-` restores default behavior) * `--hist ~/.cache/copyparty -v ~/music::r:chist=-` sets `~/.cache/copyparty` as the default place to put volume info, but `~/music` gets the regular `.hist` subfolder (`-` restores default behavior)
btw, note:
* markdown edits are always stored in a local `.hist` subdirectory * markdown edits are always stored in a local `.hist` subdirectory
* on windows the volflag path is cyglike, so `/c/temp` means `C:\temp` * on windows the volflag path is cyglike, so `/c/temp` means `C:\temp` but use regular paths for `--hist`
## metadata from audio files ## metadata from audio files

View file

@ -286,6 +286,7 @@ def run_argparse(argv, formatter):
ap2.add_argument("-e2ts", action="store_true", help="enable metadata scanner, sets -e2t") ap2.add_argument("-e2ts", action="store_true", help="enable metadata scanner, sets -e2t")
ap2.add_argument("-e2tsr", action="store_true", help="rescan all metadata, sets -e2ts") ap2.add_argument("-e2tsr", action="store_true", help="rescan all metadata, sets -e2ts")
ap2.add_argument("--hist", metavar="PATH", type=str, help="where to store volume state") ap2.add_argument("--hist", metavar="PATH", type=str, help="where to store volume state")
ap2.add_argument("--no-hash", action="store_true", help="disable hashing during e2ds folder scans")
ap2.add_argument("--no-mutagen", action="store_true", help="use ffprobe for tags instead") ap2.add_argument("--no-mutagen", action="store_true", help="use ffprobe for tags instead")
ap2.add_argument("--no-mtag-mt", action="store_true", help="disable tag-read parallelism") ap2.add_argument("--no-mtag-mt", action="store_true", help="disable tag-read parallelism")
ap2.add_argument("-mtm", metavar="M=t,t,t", action="append", type=str, help="add/replace metadata mapping") ap2.add_argument("-mtm", metavar="M=t,t,t", action="append", type=str, help="add/replace metadata mapping")

View file

@ -31,7 +31,7 @@ class VFS(object):
self.all_vols = {vpath: self} # flattened recursive self.all_vols = {vpath: self} # flattened recursive
else: else:
self.histpath = None self.histpath = None
self.all_vols = {} self.all_vols = None
def __repr__(self): def __repr__(self):
return "VFS({})".format( return "VFS({})".format(
@ -41,9 +41,10 @@ class VFS(object):
) )
) )
def _trk(self, vol): def get_all_vols(self, outdict):
self.all_vols[vol.vpath] = vol for v in self.nodes.values():
return vol v.get_all_vols(outdict)
outdict[v.vpath] = v
def add(self, src, dst): def add(self, src, dst):
"""get existing, or add new path to the vfs""" """get existing, or add new path to the vfs"""
@ -55,19 +56,18 @@ class VFS(object):
name, dst = dst.split("/", 1) name, dst = dst.split("/", 1)
if name in self.nodes: if name in self.nodes:
# exists; do not manipulate permissions # exists; do not manipulate permissions
return self._trk(self.nodes[name].add(src, dst)) return self.nodes[name].add(src, dst)
vn = VFS( vn = VFS(
"{}/{}".format(self.realpath, name), os.path.join(self.realpath, name),
"{}/{}".format(self.vpath, name).lstrip("/"), "{}/{}".format(self.vpath, name).lstrip("/"),
self.uread, self.uread,
self.uwrite, self.uwrite,
self.uadm, self.uadm,
self.flags, self.flags,
) )
self._trk(vn)
self.nodes[name] = vn self.nodes[name] = vn
return self._trk(vn.add(src, dst)) return vn.add(src, dst)
if dst in self.nodes: if dst in self.nodes:
# leaf exists; return as-is # leaf exists; return as-is
@ -77,7 +77,7 @@ class VFS(object):
vp = "{}/{}".format(self.vpath, dst).lstrip("/") vp = "{}/{}".format(self.vpath, dst).lstrip("/")
vn = VFS(src, vp) vn = VFS(src, vp)
self.nodes[dst] = vn self.nodes[dst] = vn
return self._trk(vn) return vn
def _find(self, vpath): def _find(self, vpath):
"""return [vfs,remainder]""" """return [vfs,remainder]"""
@ -462,6 +462,9 @@ class AuthSrv(object):
v.uadm = madm[dst] v.uadm = madm[dst]
v.flags = mflags[dst] v.flags = mflags[dst]
vfs.all_vols = {}
vfs.get_all_vols(vfs.all_vols)
missing_users = {} missing_users = {}
for d in [mread, mwrite]: for d in [mread, mwrite]:
for _, ul in d.items(): for _, ul in d.items():
@ -526,6 +529,10 @@ class AuthSrv(object):
if self.args.e2d or "e2ds" in vol.flags: if self.args.e2d or "e2ds" in vol.flags:
vol.flags["e2d"] = True vol.flags["e2d"] = True
if self.args.no_hash:
if "ehash" not in vol.flags:
vol.flags["dhash"] = True
for k in ["e2t", "e2ts", "e2tsr"]: for k in ["e2t", "e2ts", "e2tsr"]:
if getattr(self.args, k): if getattr(self.args, k):
vol.flags[k] = True vol.flags[k] = True

View file

@ -7,7 +7,7 @@ import time
import threading import threading
from datetime import datetime from datetime import datetime
from .util import s3dec, Pebkac from .util import s3dec, Pebkac, min_ex
from .up2k import up2k_wark_from_hashlist from .up2k import up2k_wark_from_hashlist
@ -54,8 +54,8 @@ class U2idx(object):
try: try:
return self.run_query(vols, uq, uv)[0] return self.run_query(vols, uq, uv)[0]
except Exception as ex: except:
raise Pebkac(500, repr(ex)) raise Pebkac(500, min_ex())
def get_cur(self, ptop): def get_cur(self, ptop):
cur = self.cur.get(ptop) cur = self.cur.get(ptop)
@ -245,6 +245,7 @@ class U2idx(object):
hit["tags"] = tags hit["tags"] = tags
ret.extend(sret) ret.extend(sret)
# print("[{}] {}".format(ptop, sret))
done_flag.append(True) done_flag.append(True)
self.active_id = None self.active_id = None

View file

@ -359,6 +359,7 @@ class Up2k(object):
def _build_file_index(self, vol, all_vols): def _build_file_index(self, vol, all_vols):
do_vac = False do_vac = False
top = vol.realpath top = vol.realpath
nohash = "dhash" in vol.flags
with self.mutex: with self.mutex:
cur, _ = self.register_vpath(top, vol.flags) cur, _ = self.register_vpath(top, vol.flags)
@ -373,7 +374,7 @@ class Up2k(object):
if WINDOWS: if WINDOWS:
excl = [x.replace("/", "\\") for x in excl] excl = [x.replace("/", "\\") for x in excl]
n_add = self._build_dir(dbw, top, set(excl), top) n_add = self._build_dir(dbw, top, set(excl), top, nohash)
n_rm = self._drop_lost(dbw[0], top) n_rm = self._drop_lost(dbw[0], top)
if dbw[1]: if dbw[1]:
self.log("commit {} new files".format(dbw[1])) self.log("commit {} new files".format(dbw[1]))
@ -381,7 +382,7 @@ class Up2k(object):
return True, n_add or n_rm or do_vac return True, n_add or n_rm or do_vac
def _build_dir(self, dbw, top, excl, cdir): def _build_dir(self, dbw, top, excl, cdir, nohash):
self.pp.msg = "a{} {}".format(self.pp.n, cdir) self.pp.msg = "a{} {}".format(self.pp.n, cdir)
histdir = self.vfs.histtab[top] histdir = self.vfs.histtab[top]
ret = 0 ret = 0
@ -389,16 +390,17 @@ class Up2k(object):
for iname, inf in sorted(g): for iname, inf in sorted(g):
abspath = os.path.join(cdir, iname) abspath = os.path.join(cdir, iname)
lmod = int(inf.st_mtime) lmod = int(inf.st_mtime)
sz = inf.st_size
if stat.S_ISDIR(inf.st_mode): if stat.S_ISDIR(inf.st_mode):
if abspath in excl or abspath == histdir: if abspath in excl or abspath == histdir:
continue continue
# self.log(" dir: {}".format(abspath)) # self.log(" dir: {}".format(abspath))
ret += self._build_dir(dbw, top, excl, abspath) ret += self._build_dir(dbw, top, excl, abspath, nohash)
else: else:
# self.log("file: {}".format(abspath)) # self.log("file: {}".format(abspath))
rp = abspath[len(top) :].replace("\\", "/").strip("/") rp = abspath[len(top) :].replace("\\", "/").strip("/")
rd, fn = rp.rsplit("/", 1) if "/" in rp else ["", rp] rd, fn = rp.rsplit("/", 1) if "/" in rp else ["", rp]
sql = "select * from up where rd = ? and fn = ?" sql = "select w, mt, sz from up where rd = ? and fn = ?"
try: try:
c = dbw[0].execute(sql, (rd, fn)) c = dbw[0].execute(sql, (rd, fn))
except: except:
@ -407,18 +409,18 @@ class Up2k(object):
in_db = list(c.fetchall()) in_db = list(c.fetchall())
if in_db: if in_db:
self.pp.n -= 1 self.pp.n -= 1
_, dts, dsz, _, _ = in_db[0] dw, dts, dsz = in_db[0]
if len(in_db) > 1: if len(in_db) > 1:
m = "WARN: multiple entries: [{}] => [{}] |{}|\n{}" m = "WARN: multiple entries: [{}] => [{}] |{}|\n{}"
rep_db = "\n".join([repr(x) for x in in_db]) rep_db = "\n".join([repr(x) for x in in_db])
self.log(m.format(top, rp, len(in_db), rep_db)) self.log(m.format(top, rp, len(in_db), rep_db))
dts = -1 dts = -1
if dts == lmod and dsz == inf.st_size: if dts == lmod and dsz == sz and (nohash or dw[0] != "#"):
continue continue
m = "reindex [{}] => [{}] ({}/{}) ({}/{})".format( m = "reindex [{}] => [{}] ({}/{}) ({}/{})".format(
top, rp, dts, lmod, dsz, inf.st_size top, rp, dts, lmod, dsz, sz
) )
self.log(m) self.log(m)
self.db_rm(dbw[0], rd, fn) self.db_rm(dbw[0], rd, fn)
@ -427,7 +429,11 @@ class Up2k(object):
in_db = None in_db = None
self.pp.msg = "a{} {}".format(self.pp.n, abspath) self.pp.msg = "a{} {}".format(self.pp.n, abspath)
if inf.st_size > 1024 * 1024:
if nohash:
wark = up2k_wark_from_metadata(self.salt, sz, lmod, rd, fn)
else:
if sz > 1024 * 1024:
self.log("file: {}".format(abspath)) self.log("file: {}".format(abspath))
try: try:
@ -436,8 +442,9 @@ class Up2k(object):
self.log("hash: {} @ [{}]".format(repr(ex), abspath)) self.log("hash: {} @ [{}]".format(repr(ex), abspath))
continue continue
wark = up2k_wark_from_hashlist(self.salt, inf.st_size, hashes) wark = up2k_wark_from_hashlist(self.salt, sz, hashes)
self.db_add(dbw[0], wark, rd, fn, lmod, inf.st_size)
self.db_add(dbw[0], wark, rd, fn, lmod, sz)
dbw[1] += 1 dbw[1] += 1
ret += 1 ret += 1
td = time.time() - dbw[2] td = time.time() - dbw[2]
@ -1466,9 +1473,12 @@ def up2k_wark_from_hashlist(salt, filesize, hashes):
ident.extend(hashes) ident.extend(hashes)
ident = "\n".join(ident) ident = "\n".join(ident)
hasher = hashlib.sha512() wark = hashlib.sha512(ident.encode("utf-8")).digest()
hasher.update(ident.encode("utf-8")) wark = base64.urlsafe_b64encode(wark)
digest = hasher.digest()[:32] return wark.decode("ascii")[:43]
wark = base64.urlsafe_b64encode(digest)
return wark.decode("utf-8").rstrip("=") def up2k_wark_from_metadata(salt, sz, lastmod, rd, fn):
ret = fsenc("{}\n{}\n{}\n{}\n{}".format(salt, lastmod, sz, rd, fn))
ret = base64.urlsafe_b64encode(hashlib.sha512(ret).digest())
return "#{}".format(ret[:42].decode("ascii"))

View file

@ -254,6 +254,17 @@ def trace(*args, **kwargs):
nuprint(msg) nuprint(msg)
def min_ex():
    """Summarize the exception currently being handled as a short string.

    Takes at most two entries from the active traceback and renders each as
    ``file @ line <function>: source``, keeping only the last path component
    of the filename, then appends a final ``ExceptionType: message`` line.
    Meant to be called from inside an ``except`` block, since it reads the
    active exception through ``sys.exc_info()``.

    Returns:
        The rendered lines joined with newlines.
    """
    exc_type, exc_val, exc_tb = sys.exc_info()
    rows = []
    for path, lineno, func, text in traceback.extract_tb(exc_tb, 2):
        # only the filename itself is shown, not the directories leading to it
        fname = path.rsplit(os.sep, 1)[-1]
        rows.append("{} @ {} <{}>: {}".format(fname, lineno, func, text))

    rows.append("{}: {}".format(exc_type.__name__, exc_val))
    return "\n".join(rows)
@contextlib.contextmanager @contextlib.contextmanager
def ren_open(fname, *args, **kwargs): def ren_open(fname, *args, **kwargs):
fdir = kwargs.pop("fdir", None) fdir = kwargs.pop("fdir", None)

View file

@ -38,6 +38,7 @@ class Cfg(Namespace):
mtp=[], mtp=[],
mte="a", mte="a",
hist=None, hist=None,
no_hash=False,
**{k: False for k in "e2d e2ds e2dsa e2t e2ts e2tsr".split()} **{k: False for k in "e2d e2ds e2dsa e2t e2ts e2tsr".split()}
) )

View file

@ -18,7 +18,7 @@ from copyparty import util
class Cfg(Namespace): class Cfg(Namespace):
def __init__(self, a=[], v=[], c=None): def __init__(self, a=[], v=[], c=None):
ex = {k: False for k in "e2d e2ds e2dsa e2t e2ts e2tsr".split()} ex = {k: False for k in "e2d e2ds e2dsa e2t e2ts e2tsr".split()}
ex2 = {"mtp": [], "mte": "a", "hist": None} ex2 = {"mtp": [], "mte": "a", "hist": None, "no_hash": False}
ex.update(ex2) ex.update(ex2)
super(Cfg, self).__init__(a=a, v=v, c=c, **ex) super(Cfg, self).__init__(a=a, v=v, c=c, **ex)

View file

@ -60,7 +60,7 @@ def get_ramdisk():
if os.path.exists("/Volumes"): if os.path.exists("/Volumes"):
# hdiutil eject /Volumes/cptd/ # hdiutil eject /Volumes/cptd/
devname, _ = chkcmd("hdiutil", "attach", "-nomount", "ram://65536") devname, _ = chkcmd("hdiutil", "attach", "-nomount", "ram://131072")
devname = devname.strip() devname = devname.strip()
print("devname: [{}]".format(devname)) print("devname: [{}]".format(devname))
for _ in range(10): for _ in range(10):