mirror of https://github.com/9001/copyparty.git (synced 2025-08-17 09:02:15 -06:00)

cromulent rfc2388/rfc6266 parser
This commit is contained in:
parent afa4216591 · commit c53413d57c

@@ -2,6 +2,9 @@
# coding: utf-8
from __future__ import print_function

import re
import hashlib


class Unrecv(object):
    """
@@ -25,3 +28,289 @@ class Unrecv(object):

    def unrecv(self, buf):
        self.buf = buf + self.buf


class MultipartParser(object):
    def __init__(self, log_func, sr, http_headers):
        self.sr = sr
        self.log = log_func
        self.headers = http_headers

        self.re_ctype = re.compile(r"^content-type: *([^;]+)", re.IGNORECASE)
        self.re_cdisp = re.compile(r"^content-disposition: *([^;]+)", re.IGNORECASE)
        self.re_cdisp_field = re.compile(
            r'^content-disposition:(?: *|.*; *)name="([^"]+)"', re.IGNORECASE
        )
        self.re_cdisp_file = re.compile(
            r'^content-disposition:(?: *|.*; *)filename="(.*)"', re.IGNORECASE
        )

    def _read_header(self):
        """
        returns [fieldname, filename] after eating a block of multipart headers
        while doing a decent job at dealing with the absolute mess that is
        rfc1341/rfc1521/rfc2047/rfc2231/rfc2388/rfc6266/the-real-world
        (only the fallback non-js uploader relies on these filenames)
        """
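        # an assumed example of a line these regexes must handle:
        #   content-disposition: form-data; name="f"; filename="a.jpg"
        # re_cdisp_field grabs the field name ("f"),
        # re_cdisp_file grabs the raw, still-escaped filename ("a.jpg")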
        for ln in read_header(self.sr):
            self.log(ln)

            m = self.re_ctype.match(ln)
            if m:
                if m.group(1).lower() == "multipart/mixed":
                    # rfc-7578 overrides rfc-2388 so this is not-impl
                    # (opera >=9 <11.10 is the only thing i've ever seen use it)
                    raise Exception(
                        "you can't use that browser to upload multiple files at once"
                    )

                continue

            # the only other header we care about is content-disposition
            m = self.re_cdisp.match(ln)
            if not m:
                continue

            if m.group(1).lower() != "form-data":
                raise Pebkac("not form-data: {}".format(ln))

            try:
                field = self.re_cdisp_field.match(ln).group(1)
            except:
                raise Pebkac("missing field name: {}".format(ln))

            try:
                fn = self.re_cdisp_file.match(ln).group(1)
            except:
                # this is not a file upload, we're done
                return field, None

            try:
                is_webkit = self.headers["user-agent"].lower().find("applewebkit") >= 0
            except:
                is_webkit = False

            # chrome ignores the spec and makes this real easy
            if is_webkit:
                # quotes become %22 but they don't escape the %
                # so unescaping the quotes could turn messy
                return field, fn.split('"')[0]

            # also ez if filename doesn't contain "
            if not fn.split('"')[0].endswith("\\"):
                return field, fn.split('"')[0]

            # this breaks on firefox uploads that contain \"
            # since firefox escapes " but forgets to escape \
            # so it'll truncate after the \
            ret = u""
            esc = False
            for ch in fn:
                if esc:
                    if ch in [u'"', u"\\"]:
                        ret += ch
                    else:
                        ret += u"\\" + ch
                    esc = False
                elif ch == u"\\":
                    esc = True
                elif ch == u'"':
                    break
                else:
                    ret += ch

            return [field, ret]

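    # worked example of the unescape loop above (assumed filename from a
    # non-webkit client): the header value  filename="a\"b.jpg"  reaches
    # this point as  a\"b.jpg  and the loop recovers  a"b.jpg
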
    def _read_data(self):
        blen = len(self.boundary)
        bufsz = 32 * 1024
        while True:
            buf = self.sr.recv(bufsz)
            if not buf:
                # abort: client disconnected
                raise Exception("client disconnected during post")

            while True:
                ofs = buf.find(self.boundary)
                if ofs != -1:
                    self.sr.unrecv(buf[ofs + blen :])
                    yield buf[:ofs]
                    return

                d = len(buf) - blen
                if d > 0:
                    # buffer growing large; yield everything except
                    # the part at the end (maybe start of boundary)
                    yield buf[:d]
                    buf = buf[d:]

                # look for boundary near the end of the buffer
                for n in range(1, len(buf) + 1):
                    if buf[-n:] not in self.boundary:
                        n -= 1
                        break

                if n == 0 or not self.boundary.startswith(buf[-n:]):
                    # no boundary contents near the buffer edge
                    break

                if blen == n:
                    # EOF: found boundary
                    yield buf[:-n]
                    return

                buf2 = self.sr.recv(bufsz)
                if not buf2:
                    # abort: client disconnected
                    raise Exception("client disconnected during post")

                buf += buf2

            yield buf

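    # trace of the tail-handling above (assumed values): with the needle
    # b"\r\n--xyz" and a recv of b"hello\r\n--x", this yields b"hel" and
    # holds on to b"lo\r\n--x" until more data shows whether the buffer
    # edge really was the start of a boundary
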
    def _run_gen(self):
        """
        yields [fieldname, unsanitized_filename, fieldvalue]
        where fieldvalue yields chunks of data
        """
        while True:
            fieldname, filename = self._read_header()
            yield [fieldname, filename, self._read_data()]

            tail = self.sr.recv(2)

            if tail == b"--":
                # EOF indicated by this immediately after final boundary
                self.sr.recv(2)
                return

            if tail != b"\r\n":
                raise Pebkac("protocol error after field value")

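    # stream consumed per field (a minimal, assumed example; _read_data
    # eats everything up to and including the boundary needle):
    #   content-disposition: form-data; name="x"\r\n
    #   \r\n
    #   hello\r\n--boundary
    # leaving either "--" (final boundary) or "\r\n" as the 2-byte tail
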
    def _read_value(self, iterator, max_len):
        ret = b""
        for buf in iterator:
            ret += buf
            if len(ret) > max_len:
                raise Pebkac("field length is too long")

        return ret

    def parse(self):
        # spec says there might be junk before the first boundary,
        # so we can't expect a leading \r\n in front of the first one
        self.boundary = b"--" + get_boundary(self.headers).encode("utf-8")

        # discard junk before the first boundary
        for junk in self._read_data():
            self.log(
                u"discarding preamble: [{}]".format(junk.decode("utf-8", "ignore"))
            )

        # nice, now make it fast
        self.boundary = b"\r\n" + self.boundary
        self.gen = self._run_gen()

    def require(self, field_name, max_len):
        """
        returns the value of the next field in the multipart body,
        raises if the field name is not as expected
        """
        p_field, _, p_data = next(self.gen)
        if p_field != field_name:
            raise Pebkac('expected field "{}", got "{}"'.format(field_name, p_field))

        return self._read_value(p_data, max_len).decode("utf-8", "ignore")

    def drop(self):
        """discards the remaining multipart body"""
        for _, _, data in self.gen:
            for _ in data:
                pass


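# minimal usage sketch (assumptions: "sr" is a recv/unrecv reader like
# Unrecv above wrapped around the request socket, the http layer has
# lowercased the header names as the regexes expect, and the field name
# "act" is made up for illustration):
#
#   parser = MultipartParser(print, sr, headers)
#   parser.parse()                      # eats the preamble, arms self.gen
#   act = parser.require("act", 64)     # next field must be named "act"
#   parser.drop()                       # discard whatever follows

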
def get_boundary(headers):
    # boundaries contain a-z A-Z 0-9 ' ( ) + _ , - . / : = ?
    # (whitespace allowed except as the last char)
    ptn = r"^multipart/form-data; *(.*; *)?boundary=([^;]+)"
    ct = headers["content-type"]
    m = re.match(ptn, ct, re.IGNORECASE)
    if not m:
        raise Pebkac("invalid content-type for a multipart post: {}".format(ct))

    return m.group(2)


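# example (an assumed, webkit-style content-type value):
#   multipart/form-data; boundary=----WebKitFormBoundaryALeqBBZpBHob
# get_boundary returns "----WebKitFormBoundaryALeqBBZpBHob"; parse()
# then prepends "--" (and later "\r\n") to form the search needle

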
def read_header(sr):
    ret = b""
    while True:
        if ret.endswith(b"\r\n\r\n"):
            break
        elif ret.endswith(b"\r\n\r"):
            n = 1
        elif ret.endswith(b"\r\n"):
            n = 2
        elif ret.endswith(b"\r"):
            n = 3
        else:
            n = 4

        buf = sr.recv(n)
        if not buf:
            raise Exception("failed to read headers")

        ret += buf

    return ret[:-4].decode("utf-8", "replace").split("\r\n")


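# the recv sizes above never over-read past the blank line: with part of
# the terminator already at the end of "ret", only the bytes still missing
# from "\r\n\r\n" are requested, so body bytes stay in the socket buffer
# (example: ret ending in "\r\n" means at most 2 more bytes are safe)

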
def sanitize_fn(fn):
    return fn.replace("\\", "/").split("/")[-1].strip()


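# examples (assumed inputs): "c:\\foo\\bar.jpg" and "../../etc/passwd"
# both reduce to their final path component, "bar.jpg" and "passwd",
# so a client-supplied filename can't escape the upload directory

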
def hashcopy(actor, fin, fout):
    u32_lim = int((2 ** 31) * 0.9)
    hashobj = hashlib.sha512()
    tlen = 0
    for buf in fin:
        actor.workload += 1
        if actor.workload > u32_lim:
            actor.workload = 100  # prevent overflow

        tlen += len(buf)
        hashobj.update(buf)
        fout.write(buf)

    return tlen, hashobj.hexdigest()


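# minimal usage sketch (assumptions: "Actor" is a stand-in for anything
# with a numeric .workload attribute; fin is any iterable of bytes):
#
#   import io
#
#   class Actor(object):
#       workload = 0
#
#   dst = io.BytesIO()
#   tlen, sha = hashcopy(Actor(), [b"hello ", b"world"], dst)
#   # -> tlen == 11, sha == sha512 hexdigest of b"hello world"

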
def unescape_cookie(orig):
    # mw=idk; doot=qwe%2Crty%3Basd+fgh%2Bjkl%25zxc%26vbn  # qwe,rty;asd fgh+jkl%zxc&vbn
    ret = u""
    esc = u""
    for ch in orig:
        if ch == u"%":
            if len(esc) > 0:
                ret += esc
            esc = ch

        elif len(esc) > 0:
            esc += ch
            if len(esc) == 3:
                try:
                    ret += chr(int(esc[1:], 16))
                except:
                    ret += esc
                esc = u""

        else:
            ret += ch

    if len(esc) > 0:
        ret += esc

    return ret


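# example (from the comment above): "qwe%2Crty%3Basd" -> "qwe,rty;asd";
# a dangling or malformed escape like "50%" or "%zz" is passed through
# unchanged instead of raising

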
class Pebkac(Exception):
    pass