cromulent rfc2388/rfc6266 parser

2025-11-24 07:23:22 -07:00 · 2019-06-04 22:07:58 +00:00 · 2019-06-04 22:07:58 +00:00 · c53413d57c
parent afa4216591
commit c53413d57c
1 changed files with 289 additions and 0 deletions
--- a/copyparty/util.py
+++ b/copyparty/util.py
@ -2,6 +2,9 @@
 # coding: utf-8
 from __future__ import print_function

+import re
+import hashlib
+

 class Unrecv(object):
    """
@ -25,3 +28,289 @@ class Unrecv(object):

    def unrecv(self, buf):
        self.buf = buf + self.buf
+
+
+class MultipartParser(object):
+    """
+    streaming parser for a multipart/form-data request body
+
+    call parse() once, then fetch fields in order with require(),
+    and finally drop() whatever remains; all reads go through `sr`,
+    so the data iterator of one field must be fully consumed before
+    the next field is requested
+    """
+
+    def __init__(self, log_func, sr, http_headers):
+        """
+        log_func: callable taking one message (gets every part-header line)
+        sr: object providing recv(nbytes) and unrecv(buf)
+        http_headers: request headers; "content-type" and "user-agent"
+            are looked up by those lowercase keys
+        """
+        self.sr = sr
+        self.log = log_func
+        self.headers = http_headers
+
+        # both are assigned by parse()
+        self.boundary = None
+        self.gen = None
+
+        self.re_ctype = re.compile(r"^content-type: *([^;]+)", re.IGNORECASE)
+        self.re_cdisp = re.compile(r"^content-disposition: *([^;]+)", re.IGNORECASE)
+        self.re_cdisp_field = re.compile(
+            r'^content-disposition:(?: *|.*; *)name="([^"]+)"', re.IGNORECASE
+        )
+        self.re_cdisp_file = re.compile(
+            r'^content-disposition:(?: *|.*; *)filename="(.*)"', re.IGNORECASE
+        )
+
+    def _read_header(self):
+        """
+        returns (fieldname, filename) after eating a block of multipart headers
+        while doing a decent job at dealing with the absolute mess that is
+        rfc1341/rfc1521/rfc2047/rfc2231/rfc2388/rfc6266/the-real-world
+        (only the fallback non-js uploader relies on these filenames)
+
+        filename is None when the part is not a file upload
+
+        raises Pebkac if the content-disposition is missing, is not
+        form-data, or has no field name; raises Exception on multipart/mixed
+        """
+        for ln in read_header(self.sr):
+            self.log(ln)
+
+            m = self.re_ctype.match(ln)
+            if m:
+                if m.group(1).lower() == "multipart/mixed":
+                    # rfc-7578 overrides rfc-2388 so this is not-impl
+                    # (opera >=9 <11.10 is the only thing i've ever seen use it)
+                    raise Exception(
+                        "you can't use that browser to upload multiple files at once"
+                    )
+
+                continue
+
+            # the only other header we care about is content-disposition
+            m = self.re_cdisp.match(ln)
+            if not m:
+                continue
+
+            if m.group(1).lower() != "form-data":
+                raise Pebkac("not form-data: {}".format(ln))
+
+            m = self.re_cdisp_field.match(ln)
+            if not m:
+                raise Pebkac("missing field name: {}".format(ln))
+
+            field = m.group(1)
+
+            m = self.re_cdisp_file.match(ln)
+            if not m:
+                # this is not a file upload, we're done
+                return field, None
+
+            fn = m.group(1)
+
+            # everything up to the first quote, escaped or not
+            head = fn.split('"')[0]
+
+            try:
+                is_webkit = "applewebkit" in self.headers["user-agent"].lower()
+            except Exception:
+                # best-effort sniffing; no usable user-agent means not webkit
+                is_webkit = False
+
+            # webkit ignores the spec and makes this real easy:
+            # quotes become %22 but the % itself is not escaped,
+            # so unescaping the quotes could turn messy; take it as-is
+            if is_webkit:
+                return field, head
+
+            # also easy if the first quote is not backslash-escaped
+            if not head.endswith("\\"):
+                return field, head
+
+            # this breaks on firefox uploads that contain \"
+            # since firefox escapes " but forgets to escape \
+            # so it'll truncate after the \
+            return field, self._unescape_filename(fn)
+
+        # ran out of header lines without seeing a content-disposition
+        raise Pebkac("multipart headers did not include a content-disposition")
+
+    @staticmethod
+    def _unescape_filename(fn):
+        """
+        undoes backslash-escaping of " and \\ in a quoted filename value,
+        stopping at the first quote which is not escaped
+        """
+        ret = u""
+        esc = False
+        for ch in fn:
+            if esc:
+                esc = False
+                if ch not in (u'"', u"\\"):
+                    # not an escape we know; the backslash was literal
+                    ret += u"\\"
+
+                ret += ch
+            elif ch == u"\\":
+                esc = True
+            elif ch == u'"':
+                break
+            else:
+                ret += ch
+
+        return ret
+
+    def _read_data(self):
+        """
+        yields chunks of the current field value, ending at self.boundary;
+        the boundary itself is consumed and whatever follows it is
+        pushed back into the reader with unrecv
+
+        raises Exception if recv returns nothing before the boundary
+        """
+        blen = len(self.boundary)
+        bufsz = 32 * 1024
+        while True:
+            buf = self.sr.recv(bufsz)
+            if not buf:
+                # abort: client disconnected
+                raise Exception("client disconnected during post")
+
+            while True:
+                ofs = buf.find(self.boundary)
+                if ofs != -1:
+                    self.sr.unrecv(buf[ofs + blen :])
+                    yield buf[:ofs]
+                    return
+
+                d = len(buf) - blen
+                if d > 0:
+                    # buffer growing large; yield everything except
+                    # the part at the end (maybe start of boundary)
+                    yield buf[:d]
+                    buf = buf[d:]
+
+                # longest tail of the buffer which is the start of a boundary;
+                # must test every length, since a long tail can be a substring
+                # of the boundary while only a shorter one is a prefix of it
+                n = 0
+                for k in range(min(len(buf), blen), 0, -1):
+                    if self.boundary.startswith(buf[-k:]):
+                        n = k
+                        break
+
+                if n == 0:
+                    # no boundary contents near the buffer edge
+                    break
+
+                if blen == n:
+                    # EOF: found boundary
+                    yield buf[:-n]
+                    return
+
+                # possible split boundary; read more and look again
+                buf2 = self.sr.recv(bufsz)
+                if not buf2:
+                    # abort: client disconnected
+                    raise Exception("client disconnected during post")
+
+                buf += buf2
+
+            yield buf
+
+    def _run_gen(self):
+        """
+        yields [fieldname, unsanitized_filename, fieldvalue]
+        where fieldvalue yields chunks of data
+
+        the two bytes after each boundary decide what happens next:
+        "--" ends the body, CRLF means another part follows,
+        and anything else raises Pebkac
+        """
+        while True:
+            fieldname, filename = self._read_header()
+            yield [fieldname, filename, self._read_data()]
+
+            tail = self.sr.recv(2)
+
+            if tail == b"--":
+                # EOF indicated by this immediately after final boundary
+                self.sr.recv(2)
+                return
+
+            if tail != b"\r\n":
+                raise Pebkac("protocol error after field value")
+
+    def _read_value(self, iterator, max_len):
+        """
+        returns all chunks of iterator joined into one bytestring;
+        raises Pebkac as soon as the total exceeds max_len bytes
+        """
+        ret = b""
+        for buf in iterator:
+            ret += buf
+            if len(ret) > max_len:
+                raise Pebkac("field length is too long")
+
+        return ret
+
+    def parse(self):
+        """
+        reads up to and including the first boundary, logging and
+        discarding anything before it, then prepares the field generator
+        """
+        # spec says there might be junk before the first boundary,
+        # can't have the leading \r\n if that's not the case
+        self.boundary = b"--" + get_boundary(self.headers).encode("utf-8")
+
+        # discard junk before the first boundary
+        for junk in self._read_data():
+            self.log(
+                u"discarding preamble: [{}]".format(junk.decode("utf-8", "ignore"))
+            )
+
+        # nice, now make it fast
+        self.boundary = b"\r\n" + self.boundary
+        self.gen = self._run_gen()
+
+    def require(self, field_name, max_len):
+        """
+        returns the value of the next field in the multipart body,
+        raises if the field name is not as expected
+
+        the value is decoded as utf-8 (undecodable bytes are dropped);
+        raises Pebkac if it is larger than max_len bytes
+        """
+        p_field, _, p_data = next(self.gen)
+        if p_field != field_name:
+            raise Pebkac('expected field "{}", got "{}"'.format(field_name, p_field))
+
+        return self._read_value(p_data, max_len).decode("utf-8", "ignore")
+
+    def drop(self):
+        """discards the remaining multipart body"""
+        for _, _, data in self.gen:
+            for _ in data:
+                pass
+
+
+def get_boundary(headers):
+    """
+    returns the multipart boundary given in the content-type header,
+    without surrounding quotes or trailing whitespace
+
+    raises KeyError if headers has no "content-type" entry, and
+    Pebkac if the value is not multipart/form-data with a boundary
+    """
+    # boundaries contain a-z A-Z 0-9 ' ( ) + _ , - . / : = ?
+    # (whitespace allowed except as the last char)
+    ptn = r"^multipart/form-data; *(.*; *)?boundary=([^;]+)"
+    ct = headers["content-type"]
+    m = re.match(ptn, ct, re.IGNORECASE)
+    if not m:
+        raise Pebkac("invalid content-type for a multipart post: {}".format(ct))
+
+    # a boundary never ends with whitespace, so anything trailing
+    # is just padding in front of the next parameter
+    boundary = m.group(2).rstrip()
+
+    # the parameter value may be a quoted-string; quotes are not
+    # valid boundary characters so they can only be the wrapping
+    if len(boundary) >= 2 and boundary.startswith('"') and boundary.endswith('"'):
+        boundary = boundary[1:-1]
+
+    if not boundary:
+        raise Pebkac("invalid content-type for a multipart post: {}".format(ct))
+
+    return boundary
+
+
+def read_header(sr):
+    """
+    reads from sr until a blank line (CRLF CRLF) and returns the
+    header lines before it as a list of text strings; bytes which
+    are not valid utf-8 become replacement characters
+
+    never reads past the blank line; raises Exception if recv
+    returns nothing before the blank line was seen
+    """
+    terminator = b"\r\n\r\n"
+    raw = b""
+    while not raw.endswith(terminator):
+        # ask only for as many bytes as the terminator still needs,
+        # judging by how much of it is already at the tail
+        wanted = len(terminator)
+        for have in (3, 2, 1):
+            if raw.endswith(terminator[:have]):
+                wanted = len(terminator) - have
+                break
+
+        chunk = sr.recv(wanted)
+        if not chunk:
+            raise Exception("failed to read headers")
+
+        raw += chunk
+
+    text = raw[:-4].decode("utf-8", "replace")
+    return text.split("\r\n")
+
+
+def sanitize_fn(fn):
+    """
+    returns the part of fn after the last slash or backslash,
+    with leading and trailing whitespace removed
+    """
+    unified = fn.replace("\\", "/")
+    return unified.rsplit("/", 1)[-1].strip()
+
+
+def hashcopy(actor, fin, fout):
+    """
+    writes every chunk from the iterable fin to fout while hashing it;
+    returns (total_bytes, sha512_hexdigest)
+
+    actor.workload is bumped by one per chunk, and is set back
+    to 100 whenever it grows past 90% of 2**31
+    """
+    workload_cap = int((2 ** 31) * 0.9)
+    digest = hashlib.sha512()
+    total = 0
+    for chunk in fin:
+        actor.workload += 1
+        if actor.workload > workload_cap:
+            # prevent overflow
+            actor.workload = 100
+
+        total += len(chunk)
+        digest.update(chunk)
+        fout.write(chunk)
+
+    return total, digest.hexdigest()
+
+
+def unescape_cookie(orig):
+    """
+    decodes the %XX escapes in a cookie value and returns the result
+
+    everything else is passed through untouched (a "+" stays a "+"),
+    and a "%" which is not followed by two characters that parse
+    as a hex number is kept as-is
+
+    qwe%2Crty%3Basd%2Bfgh%25zxc%26vbn  ->  qwe,rty;asd+fgh%zxc&vbn
+    """
+    ret = u""
+    esc = u""  # the escape collected so far, including the "%"
+    for ch in orig:
+        if ch == u"%":
+            # a new escape begins; an unfinished one was literal text
+            ret += esc
+            esc = ch
+
+        elif esc:
+            esc += ch
+            if len(esc) == 3:
+                try:
+                    ret += chr(int(esc[1:], 16))
+                except ValueError:
+                    ret += esc
+
+                # this escape is done whether or not it decoded;
+                # failing to reset here would swallow the rest of the input
+                esc = u""
+
+        else:
+            ret += ch
+
+    # input ended in the middle of an escape
+    return ret + esc
+
+
+class Pebkac(Exception):
+    """raised by this module when the request itself is malformed or unexpected"""