#!/usr/bin/env python3
from __future__ import print_function, unicode_literals
"""
up2k.py: upload to copyparty
2021-10-29, v0.10, ed <irc.rizon.net>, MIT-Licensed
https://github.com/9001/copyparty/blob/hovudstraum/bin/up2k.py
- dependencies: requests
- supports python 2.6, 2.7, and 3.3 through 3.10
- almost zero error-handling
- but if something breaks just try again and it'll autoresume
"""
import os
import sys
import stat
import math
import time
import atexit
import signal
import base64
import hashlib
import argparse
import platform
import threading
import requests
import datetime
# from copyparty/__init__.py
PY2 = sys.version_info[0] == 2
if PY2:
from Queue import Queue
from urllib import unquote
from urllib import quote
sys.dont_write_bytecode = True
bytes = str
else:
from queue import Queue
from urllib.parse import unquote_to_bytes as unquote
from urllib.parse import quote_from_bytes as quote
unicode = str
VT100 = platform.system() != "Windows"
req_ses = requests.Session()
class File(object):
"""an up2k upload task; represents a single file"""
def __init__(self, top, rel, size, lmod):
self.top = top # type: bytes
self.rel = rel.replace(b"\\", b"/") # type: bytes
self.size = size # type: int
self.lmod = lmod # type: float
self.abs = os.path.join(top, rel) # type: bytes
self.name = self.rel.split(b"/")[-1].decode("utf-8", "replace") # type: str
# set by get_hashlist
self.cids = [] # type: list[tuple[str, int, int]] # [ hash, ofs, sz ]
self.kchunks = {} # type: dict[str, tuple[int, int]] # hash: [ ofs, sz ]
# set by handshake
self.ucids = [] # type: list[str] # chunks which need to be uploaded
self.wark = None # type: str
self.url = None # type: str
# set by upload
self.up_b = 0 # type: int
self.up_c = 0 # type: int
# m = "size({}) lmod({}) top({}) rel({}) abs({}) name({})\n"
# eprint(m.format(self.size, self.lmod, self.top, self.rel, self.abs, self.name))
class FileSlice(object):
"""file-like object providing a fixed window into a file"""
def __init__(self, file, cid):
        # type: (File, str) -> None
self.car, self.len = file.kchunks[cid]
self.cdr = self.car + self.len
self.ofs = 0 # type: int
self.f = open(file.abs, "rb", 512 * 1024)
self.f.seek(self.car)
# https://stackoverflow.com/questions/4359495/what-is-exactly-a-file-like-object-in-python
# IOBase, RawIOBase, BufferedIOBase
funs = "close closed __enter__ __exit__ __iter__ isatty __next__ readable seekable writable"
try:
for fun in funs.split():
setattr(self, fun, getattr(self.f, fun))
except:
pass # py27 probably
def tell(self):
return self.ofs
def seek(self, ofs, wh=0):
if wh == 1:
ofs = self.ofs + ofs
elif wh == 2:
ofs = self.len + ofs # provided ofs is negative
if ofs < 0:
ofs = 0
elif ofs >= self.len:
ofs = self.len - 1
self.ofs = ofs
self.f.seek(self.car + ofs)
def read(self, sz):
sz = min(sz, self.len - self.ofs)
ret = self.f.read(sz)
self.ofs += len(ret)
return ret
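# rough usage sketch (assuming `file` has been hashed and `cid` is one of its
# chunk-hashes); requests can stream this as a request body since it quacks
# like a file:
#
#   fs = FileSlice(file, cid)
#   buf = fs.read(64 * 1024)  # reads are clamped to the chunk boundary
#   fs.f.close()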
_print = print
def eprint(*a, **ka):
ka["file"] = sys.stderr
ka["end"] = ""
if not PY2:
ka["flush"] = True
_print(*a, **ka)
if PY2 or not VT100:
sys.stderr.flush()
def flushing_print(*a, **ka):
_print(*a, **ka)
if "flush" not in ka:
sys.stdout.flush()
if not VT100:
print = flushing_print
def termsize():
import os
env = os.environ
def ioctl_GWINSZ(fd):
try:
import fcntl, termios, struct, os
cr = struct.unpack("hh", fcntl.ioctl(fd, termios.TIOCGWINSZ, "1234"))
except:
return
return cr
cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
if not cr:
try:
fd = os.open(os.ctermid(), os.O_RDONLY)
cr = ioctl_GWINSZ(fd)
os.close(fd)
except:
pass
if not cr:
try:
cr = (env["LINES"], env["COLUMNS"])
except:
cr = (25, 80)
return int(cr[1]), int(cr[0])
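# termsize() returns (cols, rows); note the (25, 80) fallback above comes out
# as (80, 25) since the tuple is flipped on return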
class CTermsize(object):
def __init__(self):
self.ev = False
self.margin = None
self.g = None
self.w, self.h = termsize()
try:
signal.signal(signal.SIGWINCH, self.ev_sig)
except:
return
thr = threading.Thread(target=self.worker)
thr.daemon = True
thr.start()
def worker(self):
while True:
time.sleep(0.5)
if not self.ev:
continue
self.ev = False
self.w, self.h = termsize()
if self.margin is not None:
self.scroll_region(self.margin)
def ev_sig(self, *a, **ka):
self.ev = True
def scroll_region(self, margin):
self.margin = margin
if margin is None:
self.g = None
eprint("\033[s\033[r\033[u")
else:
self.g = 1 + self.h - margin
m = "{0}\033[{1}A".format("\n" * margin, margin)
eprint("{0}\033[s\033[1;{1}r\033[u".format(m, self.g - 1))
ss = CTermsize()
def statdir(top):
"""non-recursive listing of directory contents, along with stat() info"""
if hasattr(os, "scandir"):
with os.scandir(top) as dh:
for fh in dh:
yield [os.path.join(top, fh.name), fh.stat()]
else:
for name in os.listdir(top):
abspath = os.path.join(top, name)
yield [abspath, os.stat(abspath)]
def walkdir(top):
"""recursive statdir"""
for ap, inf in sorted(statdir(top)):
if stat.S_ISDIR(inf.st_mode):
for x in walkdir(ap):
yield x
else:
yield ap, inf
def walkdirs(tops):
"""recursive statdir for a list of tops, yields [top, relpath, stat]"""
sep = "{0}".format(os.sep).encode("ascii")
for top in tops:
if top[-1:] == sep:
stop = top.rstrip(sep)
else:
stop = os.path.dirname(top)
if os.path.isdir(top):
for ap, inf in walkdir(top):
yield stop, ap[len(stop) :].lstrip(sep), inf
else:
d, n = top.rsplit(sep, 1)
yield d, n, os.stat(top)
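# e.g. (hypothetical paths) walkdirs([b"/home/ed/music"]) yields tuples like
#   (b"/home/ed", b"music/song.flac", <stat_result>)
# while a trailing slash, walkdirs([b"/home/ed/music/"]), yields
#   (b"/home/ed/music", b"song.flac", <stat_result>)
# which is what gives the rsync-like semantics described in --help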
# mostly from copyparty/util.py
def quotep(btxt):
quot1 = quote(btxt, safe=b"/")
if not PY2:
quot1 = quot1.encode("ascii")
return quot1.replace(b" ", b"+")
# from copyparty/util.py
def humansize(sz, terse=False):
"""picks a sensible unit for the given extent"""
for unit in ["B", "KiB", "MiB", "GiB", "TiB"]:
if sz < 1024:
break
sz /= 1024.0
ret = " ".join([str(sz)[:4].rstrip("."), unit])
if not terse:
return ret
return ret.replace("iB", "").replace(" ", "")
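# e.g. humansize(1536) == "1.5 KiB", humansize(1536, True) == "1.5K",
# humansize(3 * 1024 ** 3) == "3.0 GiB"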
# from copyparty/up2k.py
def up2k_chunksize(filesize):
"""gives The correct chunksize for up2k hashing"""
chunksize = 1024 * 1024
stepsize = 512 * 1024
while True:
for mul in [1, 2]:
nchunks = math.ceil(filesize * 1.0 / chunksize)
if nchunks <= 256 or chunksize >= 32 * 1024 * 1024:
return chunksize
chunksize += stepsize
stepsize *= mul
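# so chunksizes go 1, 1.5, 2, 3, 4, 6, 8, 12, ... 32 MiB; for example a
# 100 MiB file gets 1 MiB chunks (100 of them) and a 1 GiB file gets
# 4 MiB chunks (exactly 256)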
# mostly from copyparty/up2k.py
def get_hashlist(file, pcb):
# type: (File, any) -> None
"""generates the up2k hashlist from file contents, inserts it into `file`"""
chunk_sz = up2k_chunksize(file.size)
file_rem = file.size
file_ofs = 0
ret = []
with open(file.abs, "rb", 512 * 1024) as f:
while file_rem > 0:
hashobj = hashlib.sha512()
chunk_sz = chunk_rem = min(chunk_sz, file_rem)
while chunk_rem > 0:
buf = f.read(min(chunk_rem, 64 * 1024))
if not buf:
raise Exception("EOF at " + str(f.tell()))
hashobj.update(buf)
chunk_rem -= len(buf)
digest = hashobj.digest()[:33]
digest = base64.urlsafe_b64encode(digest).decode("utf-8")
ret.append([digest, file_ofs, chunk_sz])
file_ofs += chunk_sz
file_rem -= chunk_sz
if pcb:
pcb(file, file_ofs)
file.cids = ret
file.kchunks = {}
for k, v1, v2 in ret:
file.kchunks[k] = [v1, v2]
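# resulting structure (hash values made up; each is the 44-char urlsafe-b64
# of the first 33 bytes of that chunk's sha512):
#   file.cids = [["tNjGIyPs...", 0, 1048576], ["kpMXYZqF...", 1048576, 1048576]]
#   file.kchunks = {"tNjGIyPs...": [0, 1048576], ...}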
def handshake(req_ses, url, file, pw, search):
# type: (requests.Session, str, File, any, bool) -> List[str]
"""
performs a handshake with the server; reply is:
if search, a list of search results
otherwise, a list of chunks to upload
"""
req = {
"hash": [x[0] for x in file.cids],
"name": file.name,
"lmod": file.lmod,
"size": file.size,
}
if search:
req["srch"] = 1
headers = {"Content-Type": "text/plain"} # wtf ed
if pw:
headers["Cookie"] = "=".join(["cppwd", pw])
if file.url:
url = file.url
elif b"/" in file.rel:
url += quotep(file.rel.rsplit(b"/", 1)[0]).decode("utf-8", "replace")
while True:
try:
r = req_ses.post(url, headers=headers, json=req)
break
except:
eprint("handshake failed, retry...\n")
time.sleep(1)
try:
r = r.json()
except:
raise Exception(r.text)
if search:
return r["hits"]
try:
pre, url = url.split("://")
pre += "://"
except:
pre = ""
file.url = pre + url.split("/")[0] + r["purl"]
file.name = r["name"]
file.wark = r["wark"]
return r["hash"]
def upload(req_ses, file, cid, pw):
# type: (requests.Session, File, str, any) -> None
"""upload one specific chunk, `cid` (a chunk-hash)"""
headers = {
"X-Up2k-Hash": cid,
"X-Up2k-Wark": file.wark,
"Content-Type": "application/octet-stream",
}
if pw:
headers["Cookie"] = "=".join(["cppwd", pw])
f = FileSlice(file, cid)
try:
r = req_ses.post(file.url, headers=headers, data=f)
if not r:
raise Exception(repr(r))
_ = r.content
finally:
f.f.close()
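# note: requests picks up the Content-Length from FileSlice.len and reads the
# body in small blocks, so a chunk is never buffered in ram all at once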
class Daemon(threading.Thread):
def __init__(self, *a, **ka):
threading.Thread.__init__(self, *a, **ka)
self.daemon = True
class Ctl(object):
"""
this will be the coordinator which runs everything in parallel
    (hashing, handshakes, uploads) but right now it's pretty dumb
"""
def __init__(self, ar):
self.ar = ar
ar.files = [
os.path.abspath(os.path.realpath(x.encode("utf-8")))
+ (x[-1:] if x[-1:] == os.sep else "").encode("utf-8")
for x in ar.files
]
ar.url = ar.url.rstrip("/") + "/"
if "://" not in ar.url:
ar.url = "http://" + ar.url
eprint("\nscanning {0} locations\n".format(len(ar.files)))
nfiles = 0
nbytes = 0
for _, _, inf in walkdirs(ar.files):
nfiles += 1
nbytes += inf.st_size
eprint("found {0} files, {1}\n\n".format(nfiles, humansize(nbytes)))
self.nfiles = nfiles
self.nbytes = nbytes
if ar.td:
req_ses.verify = False
if ar.te:
req_ses.verify = ar.te
self.filegen = walkdirs(ar.files)
if ar.safe:
self.safe()
else:
self.fancy()
def safe(self):
"""minimal basic slow boring fallback codepath"""
search = self.ar.s
for nf, (top, rel, inf) in enumerate(self.filegen):
file = File(top, rel, inf.st_size, inf.st_mtime)
upath = file.abs.decode("utf-8", "replace")
print("{0} {1}\n hash...".format(self.nfiles - nf, upath))
get_hashlist(file, None)
burl = self.ar.url[:8] + self.ar.url[8:].split("/")[0] + "/"
while True:
print(" hs...")
hs = handshake(req_ses, self.ar.url, file, self.ar.a, search)
if search:
if hs:
for hit in hs:
print(" found: {0}{1}".format(burl, hit["rp"]))
else:
print(" NOT found")
break
file.ucids = hs
if not hs:
break
print("{0} {1}".format(self.nfiles - nf, upath))
ncs = len(hs)
for nc, cid in enumerate(hs):
print(" {0} up {1}".format(ncs - nc, cid))
upload(req_ses, file, cid, self.ar.a)
print(" ok!")
def fancy(self):
self.hash_f = 0
self.hash_c = 0
self.hash_b = 0
self.up_f = 0
self.up_c = 0
self.up_b = 0
self.up_br = 0
self.hasher_busy = 1
self.handshaker_busy = 0
self.uploader_busy = 0
self.t0 = time.time()
self.t0_up = None
self.spd = None
self.mutex = threading.Lock()
self.q_handshake = Queue() # type: Queue[File]
self.q_recheck = Queue() # type: Queue[File] # partial upload exists [...]
self.q_upload = Queue() # type: Queue[tuple[File, str]]
self.st_hash = [None, "(idle, starting...)"] # type: tuple[File, int]
self.st_up = [None, "(idle, starting...)"] # type: tuple[File, int]
if VT100:
atexit.register(self.cleanup_vt100)
ss.scroll_region(3)
Daemon(target=self.hasher).start()
for _ in range(self.ar.j):
Daemon(target=self.handshaker).start()
Daemon(target=self.uploader).start()
idles = 0
while idles < 3:
time.sleep(0.07)
with self.mutex:
if (
self.q_handshake.empty()
and self.q_upload.empty()
and not self.hasher_busy
and not self.handshaker_busy
and not self.uploader_busy
):
idles += 1
else:
idles = 0
if VT100:
maxlen = ss.w - len(str(self.nfiles)) - 14
txt = "\033[s\033[{0}H".format(ss.g)
for y, k, st, f in [
[0, "hash", self.st_hash, self.hash_f],
[1, "send", self.st_up, self.up_f],
]:
txt += "\033[{0}H{1}:".format(ss.g + y, k)
file, arg = st
if not file:
txt += " {0}\033[K".format(arg)
else:
if y:
p = 100 * file.up_b / file.size
else:
p = 100 * arg / file.size
name = file.abs.decode("utf-8", "replace")[-maxlen:]
if "/" in name:
name = "\033[36m{0}\033[0m/{1}".format(*name.rsplit("/", 1))
m = "{0:6.1f}% {1} {2}\033[K"
txt += m.format(p, self.nfiles - f, name)
txt += "\033[{0}H ".format(ss.g + 2)
else:
txt = " "
if not self.up_br:
spd = self.hash_b / (time.time() - self.t0)
eta = (self.nbytes - self.hash_b) / (spd + 1)
else:
spd = self.up_br / (time.time() - self.t0_up)
                spd = self.spd = (self.spd or spd) * 0.9 + spd * 0.1  # ema-smoothed
eta = (self.nbytes - self.up_b) / (spd + 1)
spd = humansize(spd)
eta = str(datetime.timedelta(seconds=int(eta)))
left = humansize(self.nbytes - self.up_b)
tail = "\033[K\033[u" if VT100 else "\r"
m = "eta: {0} @ {1}/s, {2} left".format(eta, spd, left)
eprint(txt + "\033]0;{0}\033\\\r{1}{2}".format(m, m, tail))
def cleanup_vt100(self):
ss.scroll_region(None)
eprint("\033[J\033]0;\033\\")
def cb_hasher(self, file, ofs):
self.st_hash = [file, ofs]
def hasher(self):
prd = None
ls = {}
for top, rel, inf in self.filegen:
if self.ar.z:
rd = os.path.dirname(rel)
if prd != rd:
prd = rd
headers = {}
if self.ar.a:
headers["Cookie"] = "=".join(["cppwd", self.ar.a])
ls = {}
try:
print(" ls ~{0}".format(rd.decode("utf-8", "replace")))
r = req_ses.get(
self.ar.url.encode("utf-8") + quotep(rd) + b"?ls",
headers=headers,
)
for f in r.json()["files"]:
rfn = f["href"].split("?")[0].encode("utf-8", "replace")
ls[unquote(rfn)] = f
except:
print(" mkdir ~{0}".format(rd.decode("utf-8", "replace")))
rf = ls.get(os.path.basename(rel), None)
if rf and rf["sz"] == inf.st_size and abs(rf["ts"] - inf.st_mtime) <= 1:
self.nfiles -= 1
self.nbytes -= inf.st_size
continue
file = File(top, rel, inf.st_size, inf.st_mtime)
while True:
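                # backpressure: hash at most 128 MiB / 64 chunks ahead of the
                # uploads, and with -nh not while anything is uploading at all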
with self.mutex:
if (
self.hash_b - self.up_b < 1024 * 1024 * 128
and self.hash_c - self.up_c < 64
and (
not self.ar.nh
or (
self.q_upload.empty()
and self.q_handshake.empty()
and not self.uploader_busy
)
)
):
break
time.sleep(0.05)
get_hashlist(file, self.cb_hasher)
with self.mutex:
self.hash_f += 1
self.hash_c += len(file.cids)
self.hash_b += file.size
self.q_handshake.put(file)
self.hasher_busy = 0
self.st_hash = [None, "(finished)"]
def handshaker(self):
search = self.ar.s
q = self.q_handshake
burl = self.ar.url[:8] + self.ar.url[8:].split("/")[0] + "/"
while True:
file = q.get()
if not file:
if q == self.q_handshake:
q = self.q_recheck
q.put(None)
continue
self.q_upload.put(None)
break
with self.mutex:
self.handshaker_busy += 1
upath = file.abs.decode("utf-8", "replace")
try:
hs = handshake(req_ses, self.ar.url, file, self.ar.a, search)
except Exception as ex:
if q == self.q_handshake and "<pre>partial upload exists" in str(ex):
self.q_recheck.put(file)
hs = []
else:
raise
if search:
if hs:
for hit in hs:
m = "found: {0}\n {1}{2}\n"
print(m.format(upath, burl, hit["rp"]), end="")
else:
print("NOT found: {0}\n".format(upath), end="")
with self.mutex:
self.up_f += 1
self.up_c += len(file.cids)
self.up_b += file.size
self.handshaker_busy -= 1
continue
with self.mutex:
if not hs:
# all chunks done
self.up_f += 1
self.up_c += len(file.cids) - file.up_c
self.up_b += file.size - file.up_b
if hs and file.up_c:
# some chunks failed
self.up_c -= len(hs)
file.up_c -= len(hs)
for cid in hs:
sz = file.kchunks[cid][1]
self.up_b -= sz
file.up_b -= sz
file.ucids = hs
self.handshaker_busy -= 1
if not hs:
kw = "uploaded" if file.up_b else " found"
print("{0} {1}".format(kw, upath))
for cid in hs:
self.q_upload.put([file, cid])
def uploader(self):
while True:
task = self.q_upload.get()
if not task:
self.st_up = [None, "(finished)"]
break
with self.mutex:
self.uploader_busy += 1
self.t0_up = self.t0_up or time.time()
file, cid = task
try:
upload(req_ses, file, cid, self.ar.a)
except:
eprint("upload failed, retry...\n")
pass # handshake will fix it
with self.mutex:
sz = file.kchunks[cid][1]
file.ucids = [x for x in file.ucids if x != cid]
if not file.ucids:
self.q_handshake.put(file)
self.st_up = [file, cid]
file.up_b += sz
self.up_b += sz
self.up_br += sz
file.up_c += 1
self.up_c += 1
self.uploader_busy -= 1
class APF(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
pass
def main():
    time.strptime("19970815", "%Y%m%d")  # python#7980 (strptime is not thread-safe; prime it before threading)
if not VT100:
os.system("rem") # enables colors
# fmt: off
ap = app = argparse.ArgumentParser(formatter_class=APF, epilog="""
NOTE:
source file/folder selection uses rsync syntax, meaning that:
"foo" uploads the entire folder to URL/foo/
"foo/" uploads the CONTENTS of the folder into URL/
""")
ap.add_argument("url", type=unicode, help="server url, including destination folder")
ap.add_argument("files", type=unicode, nargs="+", help="files and/or folders to process")
ap.add_argument("-a", metavar="PASSWORD", help="password")
ap.add_argument("-s", action="store_true", help="file-search (disables upload)")
ap = app.add_argument_group("performance tweaks")
ap.add_argument("-j", type=int, metavar="THREADS", default=4, help="parallel connections")
ap.add_argument("-nh", action="store_true", help="disable hashing while uploading")
ap.add_argument("--safe", action="store_true", help="use simple fallback approach")
ap.add_argument("-z", action="store_true", help="ZOOMIN' (skip uploading files if they exist at the destination with the ~same last-modified timestamp, so same as yolo / turbo with date-chk but even faster)")
ap = app.add_argument_group("tls")
ap.add_argument("-te", metavar="PEM_FILE", help="certificate to expect/verify")
ap.add_argument("-td", action="store_true", help="disable certificate check")
# fmt: on
Ctl(app.parse_args())
if __name__ == "__main__":
main()