sqlite and misc optimizations:

* exponentially slow upload handshakes caused by lack of rd+fn
   sqlite index; became apparent after a volume hit 200k files
* listing big folders 5% faster due to `_quotep3b`
* optimize `unquote`, 20% faster but only used rarely
* reindex on startup 150x faster in some rare cases
   (same filename in MANY folders)

the database is now around 10% larger (likely worst-case)
This commit is contained in:
ed 2024-09-15 13:18:43 +00:00
parent 2927bbb2d6
commit d67e9cc507
5 changed files with 146 additions and 43 deletions

View file

@ -5143,7 +5143,6 @@ class HttpCli(object):
dirs.append(item)
else:
files.append(item)
item["rd"] = rem
if is_dk and not vf.get("dks"):
dirs = []
@ -5166,16 +5165,10 @@ class HttpCli(object):
add_up_at = ".up_at" in mte
is_admin = self.can_admin
tagset: set[str] = set()
for fe in files:
rd = vrem
for fe in files if icur else []:
assert icur # !rm
fn = fe["name"]
rd = fe["rd"]
del fe["rd"]
if not icur:
continue
if vn != dbv:
_, rd = vn.get_dbv(rd)
erd_efn = (rd, fn)
q = "select mt.k, mt.v from up inner join mt on mt.w = substr(up.w,1,16) where up.rd = ? and up.fn = ? and +mt.k != 'x'"
try:

View file

@ -1313,6 +1313,9 @@ class Up2k(object):
if WINDOWS:
rd = rd.replace("\\", "/").strip("/")
rds = rd + "/" if rd else ""
cdirs = cdir + os.sep
g = statdir(self.log_func, not self.args.no_scandir, True, cdir)
gl = sorted(g)
partials = set([x[0] for x in gl if "PARTIAL" in x[0]])
@ -1320,8 +1323,8 @@ class Up2k(object):
if self.stop:
return -1
rp = vjoin(rd, iname)
abspath = os.path.join(cdir, iname)
rp = rds + iname
abspath = cdirs + iname
if rei and rei.search(abspath):
unreg.append(rp)
@ -1451,8 +1454,8 @@ class Up2k(object):
if self.stop:
return -1
rp = vjoin(rd, fn)
abspath = os.path.join(cdir, fn)
rp = rds + fn
abspath = cdirs + fn
nohash = reh.search(abspath) if reh else False
sql = "select w, mt, sz, ip, at from up where rd = ? and fn = ?"
@ -1536,7 +1539,7 @@ class Up2k(object):
# drop shadowed folders
for sh_rd in unreg:
n = 0
q = "select count(w) from up where (rd = ? or rd like ?||'%') and at == 0"
q = "select count(w) from up where (rd=? or rd like ?||'%') and +at == 0"
for sh_erd in [sh_rd, "//" + w8b64enc(sh_rd)]:
try:
n = db.c.execute(q, (sh_erd, sh_erd + "/")).fetchone()[0]
@ -1552,7 +1555,7 @@ class Up2k(object):
q = "delete from dh where (d = ? or d like ?||'%')"
db.c.execute(q, (sh_erd, sh_erd + "/"))
q = "delete from up where (rd = ? or rd like ?||'%') and at == 0"
q = "delete from up where (rd=? or rd like ?||'%') and +at == 0"
db.c.execute(q, (sh_erd, sh_erd + "/"))
ret += n
@ -1650,7 +1653,7 @@ class Up2k(object):
# then covers
n_rm3 = 0
qu = "select 1 from up where rd=? and +fn=? limit 1"
qu = "select 1 from up where rd=? and fn=? limit 1"
q = "delete from cv where rd=? and dn=? and +fn=?"
for crd, cdn, fn in cur.execute("select * from cv"):
urd = vjoin(crd, cdn)
@ -2471,12 +2474,10 @@ class Up2k(object):
self.log("WARN: failed to upgrade from v4", 3)
if ver == DB_VER:
try:
self._add_cv_tab(cur)
self._add_xiu_tab(cur)
self._add_dhash_tab(cur)
except:
pass
self._add_dhash_tab(cur)
self._add_xiu_tab(cur)
self._add_cv_tab(cur)
self._add_idx_up_vp(cur, db_path)
try:
nfiles = next(cur.execute("select count(w) from up"))[0]
@ -2573,9 +2574,10 @@ class Up2k(object):
for cmd in [
r"create table up (w text, mt int, sz int, rd text, fn text, ip text, at int)",
r"create index up_rd on up(rd)",
r"create index up_vp on up(rd, fn)",
r"create index up_fn on up(fn)",
r"create index up_ip on up(ip)",
r"create index up_at on up(at)",
idx,
r"create table mt (w text, k text, v int)",
r"create index mt_w on mt(w)",
@ -2605,6 +2607,12 @@ class Up2k(object):
def _add_dhash_tab(self, cur: "sqlite3.Cursor") -> None:
# v5 -> v5a
try:
cur.execute("select d, h from dh limit 1").fetchone()
return
except:
pass
for cmd in [
r"create table dh (d text, h text)",
r"create index dh_d on dh(d)",
@ -2658,6 +2666,24 @@ class Up2k(object):
cur.connection.commit()
def _add_idx_up_vp(self, cur: "sqlite3.Cursor", db_path: str) -> None:
    """
    v5c -> v5d: swap the `up_rd` index for `up_vp` (rd, fn) and add `up_at`

    Dropping `up_rd` doubles as the probe for whether an upgrade is
    needed; if the drop fails, nothing else is touched.  Otherwise the
    two indexes are created, the change is committed, and the database
    is vacuumed.

    cur: cursor into the database which should be upgraded
    db_path: only used to identify the database in the log messages
    """
    try:
        cur.execute("drop index up_rd")
    except Exception:
        # presumably "no such index" because the db was already upgraded;
        # `Exception` rather than a bare except so that KeyboardInterrupt
        # and SystemExit are not swallowed (which would silently skip
        # the upgrade)
        return

    for cmd in [
        r"create index up_vp on up(rd, fn)",
        r"create index up_at on up(at)",
    ]:
        # cmd[:18] is just "create index <name>"
        self.log("upgrading db [%s]: %s" % (db_path, cmd[:18]))
        cur.execute(cmd)

    self.log("upgrading db [%s]: writing to disk..." % (db_path,))
    cur.connection.commit()
    cur.execute("vacuum")
def wake_rescanner(self):
with self.rescan_cond:
self.rescan_cond.notify_all()

View file

@ -164,12 +164,8 @@ except ImportError:
if not PY2:
from io import BytesIO
from urllib.parse import quote_from_bytes as quote
from urllib.parse import unquote_to_bytes as unquote
else:
from StringIO import StringIO as BytesIO # type: ignore
from urllib import quote # type: ignore # pylint: disable=no-name-in-module
from urllib import unquote # type: ignore # pylint: disable=no-name-in-module
try:
@ -488,19 +484,6 @@ VERSIONS = (
)
_: Any = (mp, BytesIO, quote, unquote, SQLITE_VER, JINJA_VER, PYFTPD_VER, PARTFTPY_VER)
__all__ = [
"mp",
"BytesIO",
"quote",
"unquote",
"SQLITE_VER",
"JINJA_VER",
"PYFTPD_VER",
"PARTFTPY_VER",
]
class Daemon(threading.Thread):
def __init__(
self,
@ -2074,6 +2057,8 @@ def html_bescape(s: bytes, quot: bool = False, crlf: bool = False) -> bytes:
def _quotep2(txt: str) -> str:
    """url-quote txt by way of its w8enc byte form; "/" is left unescaped"""
    if txt:
        raw = w8enc(txt)
        escaped = quote(raw, safe=b"/")
        return w8dec(escaped.replace(b" ", b"+"))  # type: ignore

    # nothing to quote
    return ""
@ -2081,18 +2066,61 @@ def _quotep2(txt: str) -> str:
def _quotep3(txt: str) -> str:
    """url-quote txt by way of its w8enc byte form; "/" is left unescaped"""
    if txt:
        # quote() returns str here, so go back to bytes before w8dec
        escaped = quote(w8enc(txt), safe=b"/")
        return w8dec(escaped.encode("utf-8").replace(b" ", b"+"))

    # nothing to quote
    return ""
quotep = _quotep3 if not PY2 else _quotep2
if not PY2:
# bytes which are emitted as-is by _quotep3b
_uqsb = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.-~/"

# byte value -> quoted form; bytes in _uqsb map to themselves and every
# other value becomes an uppercase %XX escape (so space is %20).
# Keys must be ints since that is what iterating over bytes yields;
# an entry keyed by a bytes object such as b" " would never be looked up
_uqtl = {
    n: ("%%%02X" % (n,) if n not in _uqsb else chr(n)).encode("utf-8")
    for n in range(256)
}


def _quotep3b(txt: str) -> str:
    """url quoter which deals with bytes correctly"""
    if not txt:
        return ""

    btxt = w8enc(txt)
    if btxt.rstrip(_uqsb):
        # something is left after stripping safe bytes from the end,
        # so at least one byte needs escaping; translate everything
        lut = _uqtl
        btxt = b"".join([lut[ch] for ch in btxt])

    return w8dec(btxt)
quotep = _quotep3b
_hexd = "0123456789ABCDEFabcdef"

# every two-character hex pair (any mix of upper/lowercase) -> the byte it encodes
_hex2b = {
    (hi + lo).encode(): bytes.fromhex(hi + lo) for hi in _hexd for lo in _hexd
}


def unquote(btxt: bytes) -> bytes:
    """
    decode the %XX escapes in btxt; a "%" which is not followed by
    two hex digits is kept verbatim, and "+" is not touched
    """
    chunks = btxt.split(b"%")

    # whatever precedes the first "%" cannot contain any escapes
    out = chunks[:1]
    lookup = _hex2b.get
    for chunk in chunks[1:]:
        byte = lookup(chunk[:2])
        if byte is not None:
            out += (byte, chunk[2:])
        else:
            # not a valid escape; put the "%" back
            out += (b"%", chunk)

    return b"".join(out)
from urllib.parse import quote_from_bytes as quote
else:
from urllib import quote # type: ignore # pylint: disable=no-name-in-module
from urllib import unquote # type: ignore # pylint: disable=no-name-in-module
quotep = _quotep2
def unquotep(txt: str) -> str:
    """url unquoter which deals with bytes correctly"""
    # "+" is not translated to space; that step is disabled:
    # btxt = btxt.replace(b"+", b" ")
    return w8dec(unquote(w8enc(txt)))
@ -3521,3 +3549,16 @@ class WrongPostKey(Pebkac):
self.got = got
self.fname = fname
self.datagen = datagen
# reference the names below once; presumably so linters don't flag
# these imports/constants as unused -- TODO confirm
_: Any = (mp, BytesIO, quote, unquote, SQLITE_VER, JINJA_VER, PYFTPD_VER, PARTFTPY_VER)
# the names which a star-import of this module provides
__all__ = [
"mp",
"BytesIO",
"quote",
"unquote",
"SQLITE_VER",
"JINJA_VER",
"PYFTPD_VER",
"PARTFTPY_VER",
]

38
tests/test_utils.py Normal file
View file

@ -0,0 +1,38 @@
#!/usr/bin/env python3
# coding: utf-8
from __future__ import print_function, unicode_literals
import unittest
from copyparty.__main__ import PY2
from copyparty.util import w8enc
from tests import util as tu
class TestUtils(unittest.TestCase):
    """checks the url quoting helpers in copyparty.util against reference implementations"""

    def cmp(self, orig, t1, t2):
        """raise, showing the input and both results, unless t1 equals t2"""
        if t1 == t2:
            return

        raise Exception("\n%r\n%r\n%r\n" % (w8enc(orig), t1, t2))

    def test_quotep(self):
        """_quotep3b must give the same output as _quotep3 for random input"""
        if PY2:
            raise unittest.SkipTest()

        from copyparty.util import _quotep3, _quotep3b, w8dec

        sample = w8dec(tu.randbytes(8192))
        expected = _quotep3(sample)
        actual = _quotep3b(sample)
        self.cmp(sample, expected, actual)

    def test_unquote(self):
        """unquote must give the same output as urllib's unquote_to_bytes"""
        if PY2:
            raise unittest.SkipTest()

        from urllib.parse import unquote_to_bytes as u2b

        from copyparty.util import unquote

        samples = (
            tu.randbytes(8192),
            br"%ed%91qw,er;ty%20as df?gh+jkl%zxc&vbn <qwe>\"rty'uio&asd&nbsp;fgh",
        )
        for raw in samples:
            self.cmp(raw, unquote(raw), u2b(raw))

View file

@ -3,6 +3,7 @@
from __future__ import print_function, unicode_literals
import os
import random
import re
import shutil
import socket
@ -49,6 +50,10 @@ from copyparty.util import FHC, CachedDict, Garda, Unrecv
init_E(E)
def randbytes(n):
    """return n bytes from the `random` module's generator (reproducible with random.seed)"""
    if not n:
        # getrandbits(0) raises ValueError before python 3.9
        return b""

    return random.getrandbits(n * 8).to_bytes(n, "little")
def runcmd(argv):
p = sp.Popen(argv, stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = p.communicate()