optimize indexer for huge filesystems

2025-11-24 07:23:22 -07:00 · 2021-10-09 01:24:19 +02:00 · 2021-10-09 01:24:19 +02:00 · b0fd8bf7d4
parent b9cf8f3973
commit b0fd8bf7d4
1 changed file with 68 additions and 22 deletions
--- a/copyparty/up2k.py
+++ b/copyparty/up2k.py
@@ -29,6 +29,8 @@ from .util import (
    atomic_move,
    quotep,
    vsplit,
+    w8b64enc,
+    w8b64dec,
    s3enc,
    s3dec,
    rmdirs,
@@ -479,11 +481,18 @@ class Up2k(object):
            if WINDOWS:
                excl = [x.replace("/", "\\") for x in excl]

-            n_add = self._build_dir(dbw, top, set(excl), top, nohash, [])
-            n_rm = self._drop_lost(dbw[0], top)
+            n_add = n_rm = 0
+            try:
+                n_add = self._build_dir(dbw, top, set(excl), top, nohash, [])
+                n_rm = self._drop_lost(dbw[0], top)
+            except:
+                m = "failed to index volume [{}]:\n{}"
+                self.log(m.format(top, min_ex()), c=1)
+
            if dbw[1]:
                self.log("commit {} new files".format(dbw[1]))
-                dbw[0].connection.commit()
+
+            dbw[0].connection.commit()

            return True, n_add or n_rm or do_vac

@@ -498,6 +507,7 @@ class Up2k(object):
        self.pp.msg = "a{} {}".format(self.pp.n, cdir)
        histpath = self.asrv.vfs.histtab[top]
        ret = 0
+        seen_files = {}
        g = statdir(self.log_func, not self.args.no_scandir, False, cdir)
        for iname, inf in sorted(g):
            abspath = os.path.join(cdir, iname)
@@ -507,9 +517,14 @@ class Up2k(object):
                if abspath in excl or abspath == histpath:
                    continue
                # self.log(" dir: {}".format(abspath))
-                ret += self._build_dir(dbw, top, excl, abspath, nohash, seen)
+                try:
+                    ret += self._build_dir(dbw, top, excl, abspath, nohash, seen)
+                except:
+                    m = "failed to index subdir [{}]:\n{}"
+                    self.log(m.format(abspath, min_ex()), c=1)
            else:
                # self.log("file: {}".format(abspath))
+                seen_files[iname] = 1
                rp = abspath[len(top) + 1 :]
                if WINDOWS:
                    rp = rp.replace("\\", "/").strip("/")
@@ -568,34 +583,65 @@ class Up2k(object):
                    dbw[0].connection.commit()
                    dbw[1] = 0
                    dbw[2] = time.time()
+
+        # drop missing files
+        rd = cdir[len(top) + 1 :].strip("/")
+        if WINDOWS:
+            rd = rd.replace("\\", "/").strip("/")
+
+        q = "select fn from up where rd = ?"
+        try:
+            c = dbw[0].execute(q, (rd,))
+        except:
+            c = dbw[0].execute(q, ("//" + w8b64enc(rd),))
+
+        hits = [w8b64dec(x[2:]) if x.startswith("//") else x for (x,) in c]
+        rm_files = [x for x in hits if x not in seen_files]
+        n_rm = len(rm_files)
+        for fn in rm_files:
+            self.db_rm(dbw[0], rd, fn)
+
+        if n_rm:
+            self.log("forgot {} deleted files".format(n_rm))
+
        return ret

    def _drop_lost(self, cur, top):
        rm = []
+        n_rm = 0
        nchecked = 0
-        nfiles = next(cur.execute("select count(w) from up"))[0]
-        c = cur.execute("select rd, fn from up")
-        for drd, dfn in c:
+        # `_build_dir` did all the files, now do dirs
+        ndirs = next(cur.execute("select count(distinct rd) from up"))[0]
+        c = cur.execute("select distinct rd from up order by rd desc")
+        for (drd,) in c:
            nchecked += 1
-            if drd.startswith("//") or dfn.startswith("//"):
-                drd, dfn = s3dec(drd, dfn)
+            if drd.startswith("//"):
+                rd = w8b64dec(drd[2:])
+            else:
+                rd = drd

-            abspath = os.path.join(top, drd, dfn)
-            # almost zero overhead dw
-            self.pp.msg = "b{} {}".format(nfiles - nchecked, abspath)
+            abspath = os.path.join(top, rd)
+            self.pp.msg = "b{} {}".format(ndirs - nchecked, abspath)
            try:
-                if not bos.path.exists(abspath):
-                    rm.append([drd, dfn])
-            except Exception as ex:
-                self.log("stat-rm: {} @ [{}]".format(repr(ex), abspath))
+                if os.path.isdir(abspath):
+                    continue
+            except:
+                pass

-        if rm:
-            self.log("forgetting {} deleted files".format(len(rm)))
-            for rd, fn in rm:
-                # self.log("{} / {}".format(rd, fn))
-                self.db_rm(cur, rd, fn)
+            rm.append(drd)

-        return len(rm)
+        if not rm:
+            return 0
+
+        q = "select count(w) from up where rd = ?"
+        for rd in rm:
+            n_rm += next(cur.execute(q, (rd,)))[0]
+
+        self.log("forgetting {} deleted dirs, {} files".format(len(rm), n_rm))
+        for rd in rm:
+            cur.execute("delete from up where rd = ?", (rd,))
+
+        return n_rm

    def _build_tags_index(self, vol):
        ptop = vol.realpath