add filetype detection for nameless uploads

This commit is contained in:
ed 2022-09-18 17:30:57 +02:00
parent df64a62a03
commit 9401b5ae13
7 changed files with 141 additions and 15 deletions

View file

@ -62,6 +62,7 @@ try the **[read-only demo server](https://a.ocv.me/pub/demo/)** 👀 running fro
* [periodic rescan](#periodic-rescan) - filesystem monitoring * [periodic rescan](#periodic-rescan) - filesystem monitoring
* [upload rules](#upload-rules) - set upload rules using volflags * [upload rules](#upload-rules) - set upload rules using volflags
* [compress uploads](#compress-uploads) - files can be autocompressed on upload * [compress uploads](#compress-uploads) - files can be autocompressed on upload
* [other flags](#other-flags)
* [database location](#database-location) - in-volume (`.hist/up2k.db`, default) or somewhere else * [database location](#database-location) - in-volume (`.hist/up2k.db`, default) or somewhere else
* [metadata from audio files](#metadata-from-audio-files) - set `-e2t` to index tags on upload * [metadata from audio files](#metadata-from-audio-files) - set `-e2t` to index tags on upload
* [file parser plugins](#file-parser-plugins) - provide custom parsers to index additional tags, also see [./bin/mtag/README.md](./bin/mtag/README.md) * [file parser plugins](#file-parser-plugins) - provide custom parsers to index additional tags, also see [./bin/mtag/README.md](./bin/mtag/README.md)
@ -264,6 +265,8 @@ some improvement ideas
* [Chrome issue 1352210](https://bugs.chromium.org/p/chromium/issues/detail?id=1352210) -- plaintext http may be faster at filehashing than https (but also extremely CPU-intensive and likely to run into the above gc bugs) * [Chrome issue 1352210](https://bugs.chromium.org/p/chromium/issues/detail?id=1352210) -- plaintext http may be faster at filehashing than https (but also extremely CPU-intensive and likely to run into the above gc bugs)
* [Firefox issue 1790500](https://bugzilla.mozilla.org/show_bug.cgi?id=1790500) -- sometimes forgets to close file descriptors during upload, so the browser can crash after ~4000 files
* iPhones: the volume control doesn't work because [apple doesn't want it to](https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/Using_HTML5_Audio_Video/Device-SpecificConsiderations/Device-SpecificConsiderations.html#//apple_ref/doc/uid/TP40009523-CH5-SW11) * iPhones: the volume control doesn't work because [apple doesn't want it to](https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/Using_HTML5_Audio_Video/Device-SpecificConsiderations/Device-SpecificConsiderations.html#//apple_ref/doc/uid/TP40009523-CH5-SW11)
* *future workaround:* enable the equalizer, make it all-zero, and set a negative boost to reduce the volume * *future workaround:* enable the equalizer, make it all-zero, and set a negative boost to reduce the volume
* "future" because `AudioContext` is broken in the current iOS version (15.1), maybe one day... * "future" because `AudioContext` is broken in the current iOS version (15.1), maybe one day...
@ -773,6 +776,11 @@ some examples,
allows (but does not force) gz compression if client uploads to `/inc?pk` or `/inc?gz` or `/inc?gz=4` allows (but does not force) gz compression if client uploads to `/inc?pk` or `/inc?gz` or `/inc?gz=4`
## other flags
* `:c,magic` enables filetype detection for nameless uploads, same as `--magic`
## database location ## database location
in-volume (`.hist/up2k.db`, default) or somewhere else in-volume (`.hist/up2k.db`, default) or somewhere else
@ -1191,9 +1199,9 @@ upload modifiers:
| `Rand: 4` | `rand=4` | generate random filename with 4 characters | | `Rand: 4` | `rand=4` | generate random filename with 4 characters |
| `Life: 30` | `life=30` | delete file after 30 seconds | | `Life: 30` | `life=30` | delete file after 30 seconds |
`life` only has an effect if the volume has a lifetime, and the volume lifetime must be greater than the file's * `life` only has an effect if the volume has a lifetime, and the volume lifetime must be greater than the file's
server behavior of `msg` can be reconfigured with `--urlform` * server behavior of `msg` can be reconfigured with `--urlform`
## admin ## admin

View file

@ -476,6 +476,7 @@ def run_argparse(argv: list[str], formatter: Any, retry: bool) -> argparse.Names
\033[0muploads, general: \033[0muploads, general:
\033[36mnodupe\033[35m rejects existing files (instead of symlinking them) \033[36mnodupe\033[35m rejects existing files (instead of symlinking them)
\033[36mnosub\033[35m forces all uploads into the top folder of the vfs \033[36mnosub\033[35m forces all uploads into the top folder of the vfs
\033[36mmagic$\033[35m enables filetype detection for nameless uploads
\033[36mgz\033[35m allows server-side gzip of uploads with ?gz (also c,xz) \033[36mgz\033[35m allows server-side gzip of uploads with ?gz (also c,xz)
\033[36mpk\033[35m forces server-side compression, optional arg: xz,9 \033[36mpk\033[35m forces server-side compression, optional arg: xz,9
@ -591,6 +592,7 @@ def run_argparse(argv: list[str], formatter: Any, retry: bool) -> argparse.Names
ap2.add_argument("--hardlink", action="store_true", help="prefer hardlinks instead of symlinks when possible (within same filesystem)") ap2.add_argument("--hardlink", action="store_true", help="prefer hardlinks instead of symlinks when possible (within same filesystem)")
ap2.add_argument("--never-symlink", action="store_true", help="do not fallback to symlinks when a hardlink cannot be made") ap2.add_argument("--never-symlink", action="store_true", help="do not fallback to symlinks when a hardlink cannot be made")
ap2.add_argument("--no-dedup", action="store_true", help="disable symlink/hardlink creation; copy file contents instead") ap2.add_argument("--no-dedup", action="store_true", help="disable symlink/hardlink creation; copy file contents instead")
ap2.add_argument("--magic", action="store_true", help="enable filetype detection on nameless uploads")
ap2.add_argument("--df", metavar="GiB", type=float, default=0, help="ensure GiB free disk space by rejecting upload requests") ap2.add_argument("--df", metavar="GiB", type=float, default=0, help="ensure GiB free disk space by rejecting upload requests")
ap2.add_argument("--sparse", metavar="MiB", type=int, default=4, help="windows-only: minimum size of incoming uploads through up2k before they are made into sparse files") ap2.add_argument("--sparse", metavar="MiB", type=int, default=4, help="windows-only: minimum size of incoming uploads through up2k before they are made into sparse files")
ap2.add_argument("--turbo", metavar="LVL", type=int, default=0, help="configure turbo-mode in up2k client; 0 = off and warn if enabled, 1 = off, 2 = on, 3 = on and disable datecheck") ap2.add_argument("--turbo", metavar="LVL", type=int, default=0, help="configure turbo-mode in up2k client; 0 = off and warn if enabled, 1 = off, 2 = on, 3 = on and disable datecheck")

View file

@ -1071,7 +1071,7 @@ class AuthSrv(object):
if getattr(self.args, k): if getattr(self.args, k):
vol.flags[k] = True vol.flags[k] = True
for ga, vf in [["no_forget", "noforget"]]: for ga, vf in [["no_forget", "noforget"], ["magic", "magic"]]:
if getattr(self.args, ga): if getattr(self.args, ga):
vol.flags[vf] = True vol.flags[vf] = True

View file

@ -784,7 +784,8 @@ class HttpCli(object):
self.log("fallthrough? thats a bug", 1) self.log("fallthrough? thats a bug", 1)
suffix = "-{:.6f}-{}".format(time.time(), self.dip()) suffix = "-{:.6f}-{}".format(time.time(), self.dip())
if not fn: nameless = not fn
if nameless:
suffix += ".bin" suffix += ".bin"
fn = "put" + suffix fn = "put" + suffix
@ -815,6 +816,28 @@ class HttpCli(object):
if self.args.nw: if self.args.nw:
return post_sz, sha_hex, sha_b64, remains, path, "" return post_sz, sha_hex, sha_b64, remains, path, ""
if nameless and "magic" in vfs.flags:
try:
ext = self.conn.hsrv.magician.ext(path)
except Exception as ex:
self.log("filetype detection failed for [{}]: {}".format(path, ex), 6)
ext = None
if ext:
if rnd:
fn2 = self.rand_name(fdir, "a." + ext, rnd)
else:
fn2 = fn.rsplit(".", 1)[0] + "." + ext
params["suffix"] = suffix[:-4]
with ren_open(fn, *open_a, **params) as zfw:
f, fn = zfw["orz"]
path2 = os.path.join(fdir, fn2)
atomic_move(path, path2)
fn = fn2
path = path2
vfs, rem = vfs.get_dbv(rem) vfs, rem = vfs.get_dbv(rem)
self.conn.hsrv.broker.say( self.conn.hsrv.broker.say(
"up2k.hash_file", "up2k.hash_file",

View file

@ -31,7 +31,15 @@ except ImportError:
from .__init__ import MACOS, TYPE_CHECKING, EnvParams from .__init__ import MACOS, TYPE_CHECKING, EnvParams
from .bos import bos from .bos import bos
from .httpconn import HttpConn from .httpconn import HttpConn
from .util import FHC, min_ex, shut_socket, spack, start_log_thrs, start_stackmon from .util import (
FHC,
Magician,
min_ex,
shut_socket,
spack,
start_log_thrs,
start_stackmon,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from .broker_util import BrokerCli from .broker_util import BrokerCli
@ -60,6 +68,7 @@ class HttpSrv(object):
socket.setdefaulttimeout(120) socket.setdefaulttimeout(120)
nsuf = "-n{}-i{:x}".format(nid, os.getpid()) if nid else "" nsuf = "-n{}-i{:x}".format(nid, os.getpid()) if nid else ""
self.magician = Magician()
self.name = "hsrv" + nsuf self.name = "hsrv" + nsuf
self.mutex = threading.Lock() self.mutex = threading.Lock()

View file

@ -69,6 +69,7 @@ except:
if TYPE_CHECKING: if TYPE_CHECKING:
from .authsrv import VFS from .authsrv import VFS
import magic
FAKE_MP = False FAKE_MP = False
@ -154,22 +155,18 @@ IMPLICATIONS = [
MIMES = { MIMES = {
"md": "text/plain",
"txt": "text/plain",
"js": "text/javascript",
"opus": "audio/ogg; codecs=opus", "opus": "audio/ogg; codecs=opus",
"caf": "audio/x-caf",
"mp3": "audio/mpeg",
"m4a": "audio/mp4",
"jpg": "image/jpeg",
} }
def _add_mimes() -> None: def _add_mimes() -> None:
# `mimetypes` is woefully unpopulated on windows
# but will be used as fallback on linux
for ln in """text css html csv for ln in """text css html csv
application json wasm xml pdf rtf zip application json wasm xml pdf rtf zip jar fits wasm
image webp jpeg png gif bmp image webp jpeg png gif bmp jxl jp2 jxs jxr tiff bpg heic heif avif
audio aac ogg wav audio aac ogg wav flac ape amr
video webm mp4 mpeg video webm mp4 mpeg
font woff woff2 otf ttf font woff woff2 otf ttf
""".splitlines(): """.splitlines():
@ -177,10 +174,35 @@ font woff woff2 otf ttf
for v in vs.strip().split(): for v in vs.strip().split():
MIMES[v] = "{}/{}".format(k, v) MIMES[v] = "{}/{}".format(k, v)
for ln in """text md=plain txt=plain js=javascript
application 7z=x-7z-compressed tar=x-tar bz2=x-bzip2 gz=gzip rar=x-rar-compressed zst=zstd xz=x-xz lz=lzip cpio=x-cpio
application exe=vnd.microsoft.portable-executable msi=x-ms-installer cab=vnd.ms-cab-compressed rpm=x-rpm crx=x-chrome-extension
application epub=epub+zip mobi=x-mobipocket-ebook lit=x-ms-reader rss=rss+xml atom=atom+xml torrent=x-bittorrent
application p7s=pkcs7-signature dcm=dicom shx=vnd.shx shp=vnd.shp dbf=x-dbf gml=gml+xml gpx=gpx+xml amf=x-amf
application swf=x-shockwave-flash m3u=vnd.apple.mpegurl db3=vnd.sqlite3 sqlite=vnd.sqlite3
image jpg=jpeg xpm=x-xpixmap psd=vnd.adobe.photoshop jpf=jpx tif=tiff ico=x-icon djvu=vnd.djvu
image heic=heic-sequence heif=heif-sequence hdr=vnd.radiance svg=svg+xml
audio caf=x-caf mp3=mpeg m4a=mp4 mid=midi mpc=musepack aif=aiff au=basic qcp=qcelp
video mkv=x-matroska mov=quicktime avi=x-msvideo m4v=x-m4v ts=mp2t
video asf=x-ms-asf flv=x-flv 3gp=3gpp 3g2=3gpp2 rmvb=vnd.rn-realmedia-vbr
font ttc=collection
""".splitlines():
k, ems = ln.split(" ", 1)
for em in ems.strip().split():
ext, mime = em.split("=")
MIMES[ext] = "{}/{}".format(k, mime)
_add_mimes() _add_mimes()
# reverse lookup: full mimetype ("type/subtype") -> file extension;
# when several extensions share one mimetype, the last one in MIMES wins
EXTS: dict[str, str] = {v: k for k, v in MIMES.items()}

# manual additions must use the complete "type/subtype" form as well,
# otherwise a lookup by full mimetype (see Magician.ext) can never hit them
EXTS["image/vnd.mozilla.apng"] = "png"

# remaps an extension reported by the magic lookup in Magician.ext
# to the spelling that should end up in the filename
MAGIC_MAP = {"jpeg": "jpg"}
REKOBO_KEY = { REKOBO_KEY = {
v: ln.split(" ", 1)[0] v: ln.split(" ", 1)[0]
for ln in """ for ln in """
@ -625,6 +647,50 @@ class HMaccas(object):
return self.b(msg.encode("utf-8", "replace")) return self.b(msg.encode("utf-8", "replace"))
class Magician(object):
    """Guesses a file extension from the contents of a file using python-magic.

    A single extension-mode ``magic.Magic`` detector is created on first use
    and shared by all callers; access to it is serialized with ``self.mutex``.
    """

    def __init__(self) -> None:
        # set once creating the extension-mode detector has failed, so that
        # later calls go straight to the mimetype-based fallback
        self.bad_magic = False
        # held while creating and while using self.magic
        self.mutex = threading.Lock()
        # created by the first call to ext()
        self.magic: Optional["magic.Magic"] = None

    def ext(self, fpath: str) -> str:
        """Return a file extension (without leading dot) for the file at fpath.

        The extension-mode detector is tried first; the first "/"-separated
        entry of its answer is used, after remapping through MAGIC_MAP.
        If that step fails or the answer contains "?", the file's mimetype
        is looked up in EXTS, and finally in the stdlib `mimetypes` table.

        Raises:
            Exception: if no extension could be determined (errors raised by
                the mimetype lookup itself also propagate to the caller).
        """
        import magic

        ret = "?"
        if not self.bad_magic:
            try:
                with self.mutex:
                    if not self.magic:
                        try:
                            self.magic = magic.Magic(uncompress=False, extension=True)
                        except Exception:
                            # remember the failure so we don't retry on every upload
                            self.bad_magic = True
                            raise

                    ret = self.magic.from_file(fpath)
            except Exception:
                # best-effort; fall through to the mimetype-based lookup
                ret = "?"

        ret = ret.split("/")[0]
        ret = MAGIC_MAP.get(ret, ret)
        if "?" not in ret:
            return ret

        mime = magic.from_file(fpath, mime=True)
        # keep only the bare "type/subtype", dropping any parameters
        mime = re.split("[; ]", mime, maxsplit=1)[0]

        ret = EXTS.get(mime)
        if ret:
            # previously this hit was computed but never returned,
            # so the method fell off the end and returned None
            return ret

        mg = mimetypes.guess_extension(mime)
        if mg:
            return mg[1:]  # strip the leading dot

        raise Exception("no known file extension for mimetype [{}]".format(mime))
if WINDOWS and sys.version_info < (3, 8): if WINDOWS and sys.version_info < (3, 8):
_popen = sp.Popen _popen = sp.Popen

View file

@ -170,6 +170,23 @@ tmpdir="$(
wget -O$f "$url" || curl -L "$url" >$f) wget -O$f "$url" || curl -L "$url" >$f)
done done
# fetch the python-magic source tarball and keep only its `magic` package
echo collecting python-magic
v=0.4.27
f=python-magic-$v.tar.gz
# skip the download if the tarball already exists; try wget first, curl as fallback
# NOTE(review): the url below hardcodes 0.4.27 rather than using $v -- keep both in sync
[ -e "$f" ] ||
(url=https://files.pythonhosted.org/packages/da/db/0b3e28ac047452d079d375ec6798bf76a036a08182dbb39ed38116a49130/python-magic-0.4.27.tar.gz;
wget -O$f "$url" || curl -L "$url" >$f)
# unpack, move the `magic` directory out of the extracted tree, then delete the rest
tar -zxf $f
mkdir magic
mv python-magic-*/magic .
rm -rf python-magic-*
rm magic/compat.py
f=magic/__init__.py
# drop every line from `def _add_compat` through the next line starting with `_add_compat`
# (presumably the code that depends on the compat.py removed above); result goes to `t`
awk '/^def _add_compat/{o=1} !o; /^_add_compat/{o=0}' <$f >t
# NOTE(review): tmv is defined outside this view; presumably it replaces "$f" with `t` -- confirm
tmv "$f"
mv magic ftp/ # doesn't provide a version label anyways
# enable this to dynamically remove type hints at startup, # enable this to dynamically remove type hints at startup,
# in case a future python version can use them for performance # in case a future python version can use them for performance
true || ( true || (
@ -326,6 +343,7 @@ rm have
f=j2/jinja2/constants.py f=j2/jinja2/constants.py
awk '/^LOREM_IPSUM_WORDS/{o=1;print "LOREM_IPSUM_WORDS = u\"a\"";next} !o; /"""/{o=0}' <$f >t awk '/^LOREM_IPSUM_WORDS/{o=1;print "LOREM_IPSUM_WORDS = u\"a\"";next} !o; /"""/{o=0}' <$f >t
tmv "$f" tmv "$f"
rm -f j2/jinja2/async*
grep -rLE '^#[^a-z]*coding: utf-8' j2 | grep -rLE '^#[^a-z]*coding: utf-8' j2 |
while IFS= read -r f; do while IFS= read -r f; do