improve errmsg when reading non-utf8 files (#143)

previously, the native Python error was printed when reading
the contents of a text file that used the wrong character encoding

while technically correct, it could be confusing for end-users

add a helper to produce a more helpful error message when
someone (for example) tries to load a latin-1 config file
This commit is contained in:
ed 2025-03-09 11:59:33 +01:00
parent 12fcb42201
commit 25974d660d
4 changed files with 57 additions and 56 deletions

View file

@ -65,6 +65,7 @@ from .util import (
load_resource, load_resource,
min_ex, min_ex,
pybin, pybin,
read_utf8,
termsize, termsize,
wrap, wrap,
) )
@ -255,8 +256,7 @@ def get_srvname(verbose) -> str:
if verbose: if verbose:
lprint("using hostname from {}\n".format(fp)) lprint("using hostname from {}\n".format(fp))
try: try:
with open(fp, "rb") as f: return read_utf8(None, fp, True).strip()
ret = f.read().decode("utf-8", "replace").strip()
except: except:
ret = "" ret = ""
namelen = 5 namelen = 5
@ -265,47 +265,18 @@ def get_srvname(verbose) -> str:
ret = re.sub("[234567=]", "", ret)[:namelen] ret = re.sub("[234567=]", "", ret)[:namelen]
with open(fp, "wb") as f: with open(fp, "wb") as f:
f.write(ret.encode("utf-8") + b"\n") f.write(ret.encode("utf-8") + b"\n")
return ret
return ret
def get_fk_salt() -> str: def get_salt(name: str, nbytes: int) -> str:
fp = os.path.join(E.cfg, "fk-salt.txt") fp = os.path.join(E.cfg, "%s-salt.txt" % (name,))
try: try:
with open(fp, "rb") as f: return read_utf8(None, fp, True).strip()
ret = f.read().strip()
except: except:
ret = b64enc(os.urandom(18)) ret = b64enc(os.urandom(nbytes))
with open(fp, "wb") as f: with open(fp, "wb") as f:
f.write(ret + b"\n") f.write(ret + b"\n")
return ret.decode("utf-8")
return ret.decode("utf-8")
def get_dk_salt() -> str:
    """Return the salt stored in "dk-salt.txt" inside E.cfg, creating it if needed.

    The file is read as bytes and stripped of surrounding whitespace.
    If reading fails, a new salt is produced by passing 30 random bytes
    through b64enc, written to the file followed by a newline, and used
    instead.

    Returns:
        The salt, decoded as UTF-8.
    """
    fp = os.path.join(E.cfg, "dk-salt.txt")
    try:
        with open(fp, "rb") as f:
            ret = f.read().strip()
    except Exception:
        # catch Exception rather than everything, so that Ctrl-C and
        # SystemExit are not mistaken for "salt file is missing"
        ret = b64enc(os.urandom(30))
        with open(fp, "wb") as f:
            f.write(ret + b"\n")

    return ret.decode("utf-8")
def get_ah_salt() -> str:
    """Return the salt stored in "ah-salt.txt" inside E.cfg, creating it if needed.

    The file is read as bytes and stripped of surrounding whitespace.
    If reading fails, a new salt is produced by passing 18 random bytes
    through b64enc, written to the file followed by a newline, and used
    instead.

    Returns:
        The salt, decoded as UTF-8.
    """
    fp = os.path.join(E.cfg, "ah-salt.txt")
    try:
        with open(fp, "rb") as f:
            ret = f.read().strip()
    except Exception:
        # catch Exception rather than everything, so that Ctrl-C and
        # SystemExit are not mistaken for "salt file is missing"
        ret = b64enc(os.urandom(18))
        with open(fp, "wb") as f:
            f.write(ret + b"\n")

    return ret.decode("utf-8")
def ensure_locale() -> None: def ensure_locale() -> None:
@ -1552,9 +1523,9 @@ def run_argparse(
cert_path = os.path.join(E.cfg, "cert.pem") cert_path = os.path.join(E.cfg, "cert.pem")
fk_salt = get_fk_salt() fk_salt = get_salt("fk", 18)
dk_salt = get_dk_salt() dk_salt = get_salt("dk", 30)
ah_salt = get_ah_salt() ah_salt = get_salt("ah", 18)
# alpine peaks at 5 threads for some reason, # alpine peaks at 5 threads for some reason,
# all others scale past that (but try to avoid SMT), # all others scale past that (but try to avoid SMT),

View file

@ -33,6 +33,7 @@ from .util import (
get_df, get_df,
humansize, humansize,
odfusion, odfusion,
read_utf8,
relchk, relchk,
statdir, statdir,
ub64enc, ub64enc,
@ -2547,8 +2548,8 @@ class AuthSrv(object):
if not bos.path.exists(ap): if not bos.path.exists(ap):
pwdb = {} pwdb = {}
else: else:
with open(ap, "r", encoding="utf-8") as f: jtxt = read_utf8(self.log, ap, True)
pwdb = json.load(f) pwdb = json.loads(jtxt)
pwdb = [x for x in pwdb if x[0] != uname] pwdb = [x for x in pwdb if x[0] != uname]
pwdb.append((uname, self.defpw[uname], hpw)) pwdb.append((uname, self.defpw[uname], hpw))
@ -2571,8 +2572,8 @@ class AuthSrv(object):
if not self.args.chpw or not bos.path.exists(ap): if not self.args.chpw or not bos.path.exists(ap):
return return
with open(ap, "r", encoding="utf-8") as f: jtxt = read_utf8(self.log, ap, True)
pwdb = json.load(f) pwdb = json.loads(jtxt)
useen = set() useen = set()
urst = set() urst = set()
@ -3068,8 +3069,9 @@ def expand_config_file(
ipath += " -> " + fp ipath += " -> " + fp
ret.append("#\033[36m opening cfg file{}\033[0m".format(ipath)) ret.append("#\033[36m opening cfg file{}\033[0m".format(ipath))
with open(fp, "rb") as f: cfg_lines = read_utf8(log, fp, True).split("\n")
for oln in [x.decode("utf-8").rstrip() for x in f]: if True: # diff-golf
for oln in [x.rstrip() for x in cfg_lines]:
ln = oln.split(" #")[0].strip() ln = oln.split(" #")[0].strip()
if ln.startswith("% "): if ln.startswith("% "):
pad = " " * len(oln.split("%")[0]) pad = " " * len(oln.split("%")[0])

View file

@ -87,6 +87,7 @@ from .util import (
quotep, quotep,
rand_name, rand_name,
read_header, read_header,
read_utf8,
read_socket, read_socket,
read_socket_chunked, read_socket_chunked,
read_socket_unbounded, read_socket_unbounded,
@ -870,8 +871,7 @@ class HttpCli(object):
html = html.replace("%", "", 1) html = html.replace("%", "", 1)
if html.startswith("@"): if html.startswith("@"):
with open(html[1:], "rb") as f: html = read_utf8(self.log, html[1:], True)
html = f.read().decode("utf-8")
if html.startswith("%"): if html.startswith("%"):
html = html[1:] html = html[1:]
@ -3740,8 +3740,7 @@ class HttpCli(object):
continue continue
fn = "%s/%s" % (abspath, fn) fn = "%s/%s" % (abspath, fn)
if bos.path.isfile(fn): if bos.path.isfile(fn):
with open(fsenc(fn), "rb") as f: logues[n] = read_utf8(self.log, fsenc(fn), False)
logues[n] = f.read().decode("utf-8")
if "exp" in vn.flags: if "exp" in vn.flags:
logues[n] = self._expand( logues[n] = self._expand(
logues[n], vn.flags.get("exp_lg") or [] logues[n], vn.flags.get("exp_lg") or []
@ -3762,9 +3761,8 @@ class HttpCli(object):
for fn in fns: for fn in fns:
fn = "%s/%s" % (abspath, fn) fn = "%s/%s" % (abspath, fn)
if bos.path.isfile(fn): if bos.path.isfile(fn):
with open(fsenc(fn), "rb") as f: txt = read_utf8(self.log, fsenc(fn), False)
txt = f.read().decode("utf-8") break
break
if txt and "exp" in vn.flags: if txt and "exp" in vn.flags:
txt = self._expand(txt, vn.flags.get("exp_md") or []) txt = self._expand(txt, vn.flags.get("exp_md") or [])
@ -6254,9 +6252,7 @@ class HttpCli(object):
docpath = os.path.join(abspath, doc) docpath = os.path.join(abspath, doc)
sz = bos.path.getsize(docpath) sz = bos.path.getsize(docpath)
if sz < 1024 * self.args.txt_max: if sz < 1024 * self.args.txt_max:
with open(fsenc(docpath), "rb") as f: doctxt = read_utf8(self.log, fsenc(docpath), False)
doctxt = f.read().decode("utf-8", "replace")
if doc.lower().endswith(".md") and "exp" in vn.flags: if doc.lower().endswith(".md") and "exp" in vn.flags:
doctxt = self._expand(doctxt, vn.flags.get("exp_md") or []) doctxt = self._expand(doctxt, vn.flags.get("exp_md") or [])
else: else:

View file

@ -594,6 +594,38 @@ except Exception as ex:
print("using fallback base64 codec due to %r" % (ex,)) print("using fallback base64 codec due to %r" % (ex,))
class NotUTF8(Exception):
    """Signals that a file's contents could not be decoded as UTF-8."""
def read_utf8(log: Optional["NamedLogger"], ap: Union[str, bytes], strict: bool) -> str:
    """Read the file at `ap` and return its contents decoded as UTF-8.

    Args:
        log: Optional callable invoked as `log(message, 3)` when the file
            is not valid UTF-8; if falsy, the message is printed instead.
        ap: Path of the file to read, either as text or as bytes.
        strict: If True, invalid UTF-8 raises `NotUTF8`. If False, a
            warning is emitted and the contents are decoded with the
            "replace" error handler.

    Returns:
        The decoded file contents.

    Raises:
        NotUTF8: The contents are not valid UTF-8 and `strict` is True.
    """
    with open(ap, "rb") as f:
        buf = f.read()

    try:
        return buf.decode("utf-8", "strict")
    except UnicodeDecodeError as ex:
        eo = ex.start
        eb = buf[eo : eo + 1]

    # a bytes path would otherwise be rendered as its repr (b'...') inside
    # the message shown to the user, so decode it for display purposes only
    shown = ap.decode("utf-8", "replace") if isinstance(ap, bytes) else ap

    if strict:
        t = "ERROR: The file [%s] is not using the UTF-8 character encoding, and cannot be loaded. The first unreadable character was byte %r at offset %d. Please convert this file to UTF-8 by opening the file in your text-editor and saving it as UTF-8."
    else:
        t = "WARNING: The file [%s] is not using the UTF-8 character encoding; some characters in the file will be skipped/ignored. The first unreadable character was byte %r at offset %d. Please convert this file to UTF-8 by opening the file in your text-editor and saving it as UTF-8."

    t = t % (shown, eb, eo)
    if log:
        log(t, 3)
    else:
        print(t)

    if strict:
        raise NotUTF8(t)

    return buf.decode("utf-8", "replace")
class Daemon(threading.Thread): class Daemon(threading.Thread):
def __init__( def __init__(
self, self,