sqlite and misc optimizations:

* fix exponentially slow upload handshakes, caused by the lack of a
   combined rd+fn sqlite index; became apparent after a volume hit 200k files
* listing big folders 5% faster due to `_quotep3b`
* optimize `unquote`, 20% faster but only used rarely
* reindex on startup 150x faster in some rare cases
   (same filename in MANY folders)

the database is now around 10% larger (likely worst-case)
This commit is contained in:
ed 2024-09-15 13:18:43 +00:00
parent 2927bbb2d6
commit d67e9cc507
5 changed files with 146 additions and 43 deletions

View file

@ -5143,7 +5143,6 @@ class HttpCli(object):
dirs.append(item) dirs.append(item)
else: else:
files.append(item) files.append(item)
item["rd"] = rem
if is_dk and not vf.get("dks"): if is_dk and not vf.get("dks"):
dirs = [] dirs = []
@ -5166,16 +5165,10 @@ class HttpCli(object):
add_up_at = ".up_at" in mte add_up_at = ".up_at" in mte
is_admin = self.can_admin is_admin = self.can_admin
tagset: set[str] = set() tagset: set[str] = set()
for fe in files: rd = vrem
for fe in files if icur else []:
assert icur # !rm
fn = fe["name"] fn = fe["name"]
rd = fe["rd"]
del fe["rd"]
if not icur:
continue
if vn != dbv:
_, rd = vn.get_dbv(rd)
erd_efn = (rd, fn) erd_efn = (rd, fn)
q = "select mt.k, mt.v from up inner join mt on mt.w = substr(up.w,1,16) where up.rd = ? and up.fn = ? and +mt.k != 'x'" q = "select mt.k, mt.v from up inner join mt on mt.w = substr(up.w,1,16) where up.rd = ? and up.fn = ? and +mt.k != 'x'"
try: try:

View file

@ -1313,6 +1313,9 @@ class Up2k(object):
if WINDOWS: if WINDOWS:
rd = rd.replace("\\", "/").strip("/") rd = rd.replace("\\", "/").strip("/")
rds = rd + "/" if rd else ""
cdirs = cdir + os.sep
g = statdir(self.log_func, not self.args.no_scandir, True, cdir) g = statdir(self.log_func, not self.args.no_scandir, True, cdir)
gl = sorted(g) gl = sorted(g)
partials = set([x[0] for x in gl if "PARTIAL" in x[0]]) partials = set([x[0] for x in gl if "PARTIAL" in x[0]])
@ -1320,8 +1323,8 @@ class Up2k(object):
if self.stop: if self.stop:
return -1 return -1
rp = vjoin(rd, iname) rp = rds + iname
abspath = os.path.join(cdir, iname) abspath = cdirs + iname
if rei and rei.search(abspath): if rei and rei.search(abspath):
unreg.append(rp) unreg.append(rp)
@ -1451,8 +1454,8 @@ class Up2k(object):
if self.stop: if self.stop:
return -1 return -1
rp = vjoin(rd, fn) rp = rds + fn
abspath = os.path.join(cdir, fn) abspath = cdirs + fn
nohash = reh.search(abspath) if reh else False nohash = reh.search(abspath) if reh else False
sql = "select w, mt, sz, ip, at from up where rd = ? and fn = ?" sql = "select w, mt, sz, ip, at from up where rd = ? and fn = ?"
@ -1536,7 +1539,7 @@ class Up2k(object):
# drop shadowed folders # drop shadowed folders
for sh_rd in unreg: for sh_rd in unreg:
n = 0 n = 0
q = "select count(w) from up where (rd = ? or rd like ?||'%') and at == 0" q = "select count(w) from up where (rd=? or rd like ?||'%') and +at == 0"
for sh_erd in [sh_rd, "//" + w8b64enc(sh_rd)]: for sh_erd in [sh_rd, "//" + w8b64enc(sh_rd)]:
try: try:
n = db.c.execute(q, (sh_erd, sh_erd + "/")).fetchone()[0] n = db.c.execute(q, (sh_erd, sh_erd + "/")).fetchone()[0]
@ -1552,7 +1555,7 @@ class Up2k(object):
q = "delete from dh where (d = ? or d like ?||'%')" q = "delete from dh where (d = ? or d like ?||'%')"
db.c.execute(q, (sh_erd, sh_erd + "/")) db.c.execute(q, (sh_erd, sh_erd + "/"))
q = "delete from up where (rd = ? or rd like ?||'%') and at == 0" q = "delete from up where (rd=? or rd like ?||'%') and +at == 0"
db.c.execute(q, (sh_erd, sh_erd + "/")) db.c.execute(q, (sh_erd, sh_erd + "/"))
ret += n ret += n
@ -1650,7 +1653,7 @@ class Up2k(object):
# then covers # then covers
n_rm3 = 0 n_rm3 = 0
qu = "select 1 from up where rd=? and +fn=? limit 1" qu = "select 1 from up where rd=? and fn=? limit 1"
q = "delete from cv where rd=? and dn=? and +fn=?" q = "delete from cv where rd=? and dn=? and +fn=?"
for crd, cdn, fn in cur.execute("select * from cv"): for crd, cdn, fn in cur.execute("select * from cv"):
urd = vjoin(crd, cdn) urd = vjoin(crd, cdn)
@ -2471,12 +2474,10 @@ class Up2k(object):
self.log("WARN: failed to upgrade from v4", 3) self.log("WARN: failed to upgrade from v4", 3)
if ver == DB_VER: if ver == DB_VER:
try:
self._add_cv_tab(cur)
self._add_xiu_tab(cur)
self._add_dhash_tab(cur) self._add_dhash_tab(cur)
except: self._add_xiu_tab(cur)
pass self._add_cv_tab(cur)
self._add_idx_up_vp(cur, db_path)
try: try:
nfiles = next(cur.execute("select count(w) from up"))[0] nfiles = next(cur.execute("select count(w) from up"))[0]
@ -2573,9 +2574,10 @@ class Up2k(object):
for cmd in [ for cmd in [
r"create table up (w text, mt int, sz int, rd text, fn text, ip text, at int)", r"create table up (w text, mt int, sz int, rd text, fn text, ip text, at int)",
r"create index up_rd on up(rd)", r"create index up_vp on up(rd, fn)",
r"create index up_fn on up(fn)", r"create index up_fn on up(fn)",
r"create index up_ip on up(ip)", r"create index up_ip on up(ip)",
r"create index up_at on up(at)",
idx, idx,
r"create table mt (w text, k text, v int)", r"create table mt (w text, k text, v int)",
r"create index mt_w on mt(w)", r"create index mt_w on mt(w)",
@ -2605,6 +2607,12 @@ class Up2k(object):
def _add_dhash_tab(self, cur: "sqlite3.Cursor") -> None: def _add_dhash_tab(self, cur: "sqlite3.Cursor") -> None:
# v5 -> v5a # v5 -> v5a
try:
cur.execute("select d, h from dh limit 1").fetchone()
return
except:
pass
for cmd in [ for cmd in [
r"create table dh (d text, h text)", r"create table dh (d text, h text)",
r"create index dh_d on dh(d)", r"create index dh_d on dh(d)",
@ -2658,6 +2666,24 @@ class Up2k(object):
cur.connection.commit() cur.connection.commit()
def _add_idx_up_vp(self, cur: "sqlite3.Cursor", db_path: str) -> None:
    """
    replace the single-column `up_rd` index with the composite `up_vp`
    index on (rd, fn), add the `up_at` index, then commit and vacuum

    returns without touching anything else if `up_rd` cannot be dropped;
    `db_path` is only used in the log messages
    """
    # v5c -> v5d
    try:
        cur.execute("drop index up_rd")
    except Exception:
        # the drop doubles as the "is an upgrade needed" probe, so a failure
        # means there is nothing to do; KeyboardInterrupt and SystemExit
        # are deliberately not swallowed
        return

    for cmd in [
        r"create index up_vp on up(rd, fn)",
        r"create index up_at on up(at)",
    ]:
        # the first 18 characters are exactly "create index <name>"
        self.log("upgrading db [%s]: %s" % (db_path, cmd[:18]))
        cur.execute(cmd)

    self.log("upgrading db [%s]: writing to disk..." % (db_path,))
    cur.connection.commit()
    cur.execute("vacuum")
def wake_rescanner(self): def wake_rescanner(self):
with self.rescan_cond: with self.rescan_cond:
self.rescan_cond.notify_all() self.rescan_cond.notify_all()

View file

@ -164,12 +164,8 @@ except ImportError:
if not PY2: if not PY2:
from io import BytesIO from io import BytesIO
from urllib.parse import quote_from_bytes as quote
from urllib.parse import unquote_to_bytes as unquote
else: else:
from StringIO import StringIO as BytesIO # type: ignore from StringIO import StringIO as BytesIO # type: ignore
from urllib import quote # type: ignore # pylint: disable=no-name-in-module
from urllib import unquote # type: ignore # pylint: disable=no-name-in-module
try: try:
@ -488,19 +484,6 @@ VERSIONS = (
) )
_: Any = (mp, BytesIO, quote, unquote, SQLITE_VER, JINJA_VER, PYFTPD_VER, PARTFTPY_VER)
__all__ = [
"mp",
"BytesIO",
"quote",
"unquote",
"SQLITE_VER",
"JINJA_VER",
"PYFTPD_VER",
"PARTFTPY_VER",
]
class Daemon(threading.Thread): class Daemon(threading.Thread):
def __init__( def __init__(
self, self,
@ -2074,6 +2057,8 @@ def html_bescape(s: bytes, quot: bool = False, crlf: bool = False) -> bytes:
def _quotep2(txt: str) -> str: def _quotep2(txt: str) -> str:
"""url quoter which deals with bytes correctly""" """url quoter which deals with bytes correctly"""
if not txt:
return ""
btxt = w8enc(txt) btxt = w8enc(txt)
quot = quote(btxt, safe=b"/") quot = quote(btxt, safe=b"/")
return w8dec(quot.replace(b" ", b"+")) # type: ignore return w8dec(quot.replace(b" ", b"+")) # type: ignore
@ -2081,18 +2066,61 @@ def _quotep2(txt: str) -> str:
def _quotep3(txt: str) -> str: def _quotep3(txt: str) -> str:
"""url quoter which deals with bytes correctly""" """url quoter which deals with bytes correctly"""
if not txt:
return ""
btxt = w8enc(txt) btxt = w8enc(txt)
quot = quote(btxt, safe=b"/").encode("utf-8") quot = quote(btxt, safe=b"/").encode("utf-8")
return w8dec(quot.replace(b" ", b"+")) return w8dec(quot.replace(b" ", b"+"))
quotep = _quotep3 if not PY2 else _quotep2 if not PY2:
# bytes which are emitted as-is by the url quoter
_uqsb = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.-~/"

# byte value (int) -> quoted form; bytes listed in _uqsb map to themselves,
# every other byte (space included) becomes %XX with uppercase hex.
# keys must be ints because iterating over a bytes object yields ints
_uqtl = {
    n: ("%%%02X" % (n,) if n not in _uqsb else chr(n)).encode("utf-8")
    for n in range(256)
}


def _quotep3b(txt: str) -> str:
    """url quoter which deals with bytes correctly"""
    if not txt:
        return ""
    btxt = w8enc(txt)
    # rstrip only leaves something behind if at least one byte is outside
    # the safe set, so all-safe input skips the per-byte table lookup
    if btxt.rstrip(_uqsb):
        lut = _uqtl
        btxt = b"".join([lut[ch] for ch in btxt])
    return w8dec(btxt)


quotep = _quotep3b
# every two-character hex pair (any mix of upper/lowercase), as bytes,
# mapped to the single byte it encodes
_hexd = "0123456789ABCDEFabcdef"
_hex2b = {
    (hi + lo).encode(): bytes.fromhex(hi + lo)
    for hi in _hexd
    for lo in _hexd
}


def unquote(btxt: bytes) -> bytes:
    """percent-decode bytes; malformed escapes are kept verbatim"""
    lut = _hex2b
    segs = btxt.split(b"%")

    # everything before the first "%" is copied through untouched
    out = [segs[0]]
    for seg in segs[1:]:
        decoded = lut.get(seg[:2])
        if decoded is not None:
            out.append(decoded)
            out.append(seg[2:])
        else:
            # not followed by two hex digits; put the "%" back
            out.append(b"%")
            out.append(seg)

    return b"".join(out)
from urllib.parse import quote_from_bytes as quote
else:
from urllib import quote # type: ignore # pylint: disable=no-name-in-module
from urllib import unquote # type: ignore # pylint: disable=no-name-in-module
quotep = _quotep2
def unquotep(txt: str) -> str: def unquotep(txt: str) -> str:
"""url unquoter which deals with bytes correctly""" """url unquoter which deals with bytes correctly"""
btxt = w8enc(txt) btxt = w8enc(txt)
# btxt = btxt.replace(b"+", b" ")
unq2 = unquote(btxt) unq2 = unquote(btxt)
return w8dec(unq2) return w8dec(unq2)
@ -3521,3 +3549,16 @@ class WrongPostKey(Pebkac):
self.got = got self.got = got
self.fname = fname self.fname = fname
self.datagen = datagen self.datagen = datagen
_: Any = (mp, BytesIO, quote, unquote, SQLITE_VER, JINJA_VER, PYFTPD_VER, PARTFTPY_VER)
__all__ = [
"mp",
"BytesIO",
"quote",
"unquote",
"SQLITE_VER",
"JINJA_VER",
"PYFTPD_VER",
"PARTFTPY_VER",
]

38
tests/test_utils.py Normal file
View file

@ -0,0 +1,38 @@
#!/usr/bin/env python3
# coding: utf-8
from __future__ import print_function, unicode_literals
import unittest
from copyparty.__main__ import PY2
from copyparty.util import w8enc
from tests import util as tu
class TestUtils(unittest.TestCase):
    """compares the optimized url quoting helpers against reference implementations"""

    def cmp(self, orig, t1, t2):
        """fail the test if t1 and t2 differ, showing the input and both results"""
        if t1 != t2:
            # test_unquote passes bytes, which must not be encoded again
            borig = orig if isinstance(orig, bytes) else w8enc(orig)
            # self.fail raises AssertionError, so a mismatch is reported
            # as a test failure rather than an unexpected error
            self.fail("\n%r\n%r\n%r\n" % (borig, t1, t2))

    @unittest.skipIf(PY2, "_quotep3b is only defined on python3")
    def test_quotep(self):
        """_quotep3b must produce the same output as _quotep3"""
        from copyparty.util import _quotep3, _quotep3b, w8dec

        txt = w8dec(tu.randbytes(8192))
        self.cmp(txt, _quotep3(txt), _quotep3b(txt))

    @unittest.skipIf(PY2, "reference implementation is urllib.parse (python3)")
    def test_unquote(self):
        """unquote must produce the same output as urllib's unquote_to_bytes"""
        from urllib.parse import unquote_to_bytes as u2b

        from copyparty.util import unquote

        for btxt in (
            tu.randbytes(8192),
            br"%ed%91qw,er;ty%20as df?gh+jkl%zxc&vbn <qwe>\"rty'uio&asd&nbsp;fgh",
        ):
            self.cmp(btxt, unquote(btxt), u2b(btxt))

View file

@ -3,6 +3,7 @@
from __future__ import print_function, unicode_literals from __future__ import print_function, unicode_literals
import os import os
import random
import re import re
import shutil import shutil
import socket import socket
@ -49,6 +50,10 @@ from copyparty.util import FHC, CachedDict, Garda, Unrecv
init_E(E) init_E(E)
def randbytes(n):
    """return n random bytes from the `random` module (for test data)"""
    if not n:
        # random.getrandbits(0) raises ValueError before python 3.9
        return b""

    return random.getrandbits(n * 8).to_bytes(n, "little")
def runcmd(argv): def runcmd(argv):
p = sp.Popen(argv, stdout=sp.PIPE, stderr=sp.PIPE) p = sp.Popen(argv, stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = p.communicate() stdout, stderr = p.communicate()