improve errmsg when reading non-utf8 files (#143)

Previously, the native Python error was printed when reading the contents of a text file that used the wrong character encoding. While technically correct, it could be confusing for end-users. Add a helper to produce a more helpful error message when someone (for example) tries to load a Latin-1 config file.
2025-11-24 07:23:22 -07:00 · 2025-03-09 11:59:33 +01:00 · 2025-03-09 11:59:33 +01:00 · 25974d660d
parent 12fcb42201
commit 25974d660d
4 changed files with 57 additions and 56 deletions
--- a/copyparty/main.py
+++ b/copyparty/main.py
@ -65,6 +65,7 @@ from .util import (
    load_resource,
    min_ex,
    pybin,
    read_utf8,
    termsize,
    wrap,
 )
@ -255,8 +256,7 @@ def get_srvname(verbose) -> str:
    if verbose:
        lprint("using hostname from {}\n".format(fp))
    try:
-        with open(fp, "rb") as f:
+        return read_utf8(None, fp, True).strip()
            ret = f.read().decode("utf-8", "replace").strip()
    except:
        ret = ""
        namelen = 5
@ -265,47 +265,18 @@ def get_srvname(verbose) -> str:
            ret = re.sub("[234567=]", "", ret)[:namelen]
        with open(fp, "wb") as f:
            f.write(ret.encode("utf-8") + b"\n")
-
+        return ret
    return ret
-def get_fk_salt() -> str:
+def get_salt(name: str, nbytes: int) -> str:
-    fp = os.path.join(E.cfg, "fk-salt.txt")
+    fp = os.path.join(E.cfg, "%s-salt.txt" % (name,))
    try:
-        with open(fp, "rb") as f:
+        return read_utf8(None, fp, True).strip()
            ret = f.read().strip()
    except:
-        ret = b64enc(os.urandom(18))
+        ret = b64enc(os.urandom(nbytes))
        with open(fp, "wb") as f:
            f.write(ret + b"\n")
-
+        return ret.decode("utf-8")
    return ret.decode("utf-8")
 def get_dk_salt() -> str:
     """
     Return the contents of "dk-salt.txt" (inside E.cfg) as a UTF-8 string.

     If the file cannot be read for any reason, a new value is produced by
     passing 30 random bytes through b64enc, written to that file followed
     by a newline, and returned instead.
     """
     salt_path = os.path.join(E.cfg, "dk-salt.txt")
     try:
         with open(salt_path, "rb") as src:
             salt = src.read().strip()
     except:
         # reading failed (typically because the file does not exist yet);
         # create a fresh salt and persist it for subsequent runs
         salt = b64enc(os.urandom(30))
         with open(salt_path, "wb") as dst:
             dst.write(salt + b"\n")

     return salt.decode("utf-8")
 def get_ah_salt() -> str:
     """
     Return the contents of "ah-salt.txt" (inside E.cfg) as a UTF-8 string.

     If the file cannot be read for any reason, a new value is produced by
     passing 18 random bytes through b64enc, written to that file followed
     by a newline, and returned instead.
     """
     salt_path = os.path.join(E.cfg, "ah-salt.txt")
     try:
         with open(salt_path, "rb") as src:
             salt = src.read().strip()
     except:
         # reading failed (typically because the file does not exist yet);
         # create a fresh salt and persist it for subsequent runs
         salt = b64enc(os.urandom(18))
         with open(salt_path, "wb") as dst:
             dst.write(salt + b"\n")

     return salt.decode("utf-8")
 def ensure_locale() -> None:
@ -1552,9 +1523,9 @@ def run_argparse(
    cert_path = os.path.join(E.cfg, "cert.pem")
-    fk_salt = get_fk_salt()
+    fk_salt = get_salt("fk", 18)
-    dk_salt = get_dk_salt()
+    dk_salt = get_salt("dk", 30)
-    ah_salt = get_ah_salt()
+    ah_salt = get_salt("ah", 18)
    # alpine peaks at 5 threads for some reason,
    # all others scale past that (but try to avoid SMT),
--- a/copyparty/authsrv.py
+++ b/copyparty/authsrv.py
@ -33,6 +33,7 @@ from .util import (
    get_df,
    humansize,
    odfusion,
    read_utf8,
    relchk,
    statdir,
    ub64enc,
@ -2547,8 +2548,8 @@ class AuthSrv(object):
            if not bos.path.exists(ap):
                pwdb = {}
            else:
-                with open(ap, "r", encoding="utf-8") as f:
+                jtxt = read_utf8(self.log, ap, True)
-                    pwdb = json.load(f)
+                pwdb = json.loads(jtxt)
            pwdb = [x for x in pwdb if x[0] != uname]
            pwdb.append((uname, self.defpw[uname], hpw))
@ -2571,8 +2572,8 @@ class AuthSrv(object):
        if not self.args.chpw or not bos.path.exists(ap):
            return
-        with open(ap, "r", encoding="utf-8") as f:
+        jtxt = read_utf8(self.log, ap, True)
-            pwdb = json.load(f)
+        pwdb = json.loads(jtxt)
        useen = set()
        urst = set()
@ -3068,8 +3069,9 @@ def expand_config_file(
    ipath += " -> " + fp
    ret.append("#\033[36m opening cfg file{}\033[0m".format(ipath))
-    with open(fp, "rb") as f:
+    cfg_lines = read_utf8(log, fp, True).split("\n")
-        for oln in [x.decode("utf-8").rstrip() for x in f]:
+    if True:  # diff-golf
        for oln in [x.rstrip() for x in cfg_lines]:
            ln = oln.split("  #")[0].strip()
            if ln.startswith("% "):
                pad = " " * len(oln.split("%")[0])
--- a/copyparty/httpcli.py
+++ b/copyparty/httpcli.py
@ -87,6 +87,7 @@ from .util import (
    quotep,
    rand_name,
    read_header,
    read_utf8,
    read_socket,
    read_socket_chunked,
    read_socket_unbounded,
@ -870,8 +871,7 @@ class HttpCli(object):
            html = html.replace("%", "", 1)
        if html.startswith("@"):
-            with open(html[1:], "rb") as f:
+            html = read_utf8(self.log, html[1:], True)
                html = f.read().decode("utf-8")
        if html.startswith("%"):
            html = html[1:]
@ -3740,8 +3740,7 @@ class HttpCli(object):
                    continue
                fn = "%s/%s" % (abspath, fn)
                if bos.path.isfile(fn):
-                    with open(fsenc(fn), "rb") as f:
+                    logues[n] = read_utf8(self.log, fsenc(fn), False)
                        logues[n] = f.read().decode("utf-8")
                    if "exp" in vn.flags:
                        logues[n] = self._expand(
                            logues[n], vn.flags.get("exp_lg") or []
@ -3762,9 +3761,8 @@ class HttpCli(object):
            for fn in fns:
                fn = "%s/%s" % (abspath, fn)
                if bos.path.isfile(fn):
-                    with open(fsenc(fn), "rb") as f:
+                    txt = read_utf8(self.log, fsenc(fn), False)
-                        txt = f.read().decode("utf-8")
+                    break
                        break
            if txt and "exp" in vn.flags:
                txt = self._expand(txt, vn.flags.get("exp_md") or [])
@ -6254,9 +6252,7 @@ class HttpCli(object):
                docpath = os.path.join(abspath, doc)
                sz = bos.path.getsize(docpath)
                if sz < 1024 * self.args.txt_max:
-                    with open(fsenc(docpath), "rb") as f:
+                    doctxt = read_utf8(self.log, fsenc(docpath), False)
                        doctxt = f.read().decode("utf-8", "replace")
                    if doc.lower().endswith(".md") and "exp" in vn.flags:
                        doctxt = self._expand(doctxt, vn.flags.get("exp_md") or [])
                else:
--- a/copyparty/util.py
+++ b/copyparty/util.py
@ -594,6 +594,38 @@ except Exception as ex:
        print("using fallback base64 codec due to %r" % (ex,))
 class NotUTF8(Exception):
     """Raised by read_utf8 when a file is not valid UTF-8 and strict=True."""


 def read_utf8(log: Optional["NamedLogger"], ap: Union[str, bytes], strict: bool) -> str:
     """
     Read the whole file at `ap` and return its contents decoded as UTF-8.

     log:    callable invoked as log(message, 3) when the file is not valid
             UTF-8; if falsy, the message is print()ed instead
     ap:     path of the file to read, as either str or bytes
     strict: if True, raise NotUTF8 for a file that is not valid UTF-8;
             if False, emit a warning and return the text decoded with the
             "replace" error handler

     Raises NotUTF8 (carrying the same message that was logged) when
     strict is True and the file cannot be decoded.
     """
     with open(ap, "rb") as f:
         buf = f.read()

     try:
         return buf.decode("utf-8", "strict")
     except UnicodeDecodeError as ex:
         eo = ex.start  # offset of the first undecodable byte
         eb = buf[eo : eo + 1]  # sliced (not indexed) so %r shows it as bytes

     # callers may hand us a bytes path; decode it for display so the
     # message shows [/the/path] rather than [b'/the/path']
     # (which would also trip BytesWarning under python -b)
     dp = ap.decode("utf-8", "replace") if isinstance(ap, bytes) else ap

     if strict:
         t = "ERROR: The file [%s] is not using the UTF-8 character encoding, and cannot be loaded. The first unreadable character was byte %r at offset %d. Please convert this file to UTF-8 by opening the file in your text-editor and saving it as UTF-8."
     else:
         t = "WARNING: The file [%s] is not using the UTF-8 character encoding; some characters in the file will be skipped/ignored. The first unreadable character was byte %r at offset %d. Please convert this file to UTF-8 by opening the file in your text-editor and saving it as UTF-8."

     t = t % (dp, eb, eo)
     if log:
         log(t, 3)
     else:
         print(t)

     if strict:
         raise NotUTF8(t)

     return buf.decode("utf-8", "replace")
 class Daemon(threading.Thread):
    def __init__(
        self,