EPUB Thumbnailing support

This commit is contained in:
AppleTheGolden 2025-08-10 17:16:43 +02:00
parent db2a03409c
commit 1c37c13c44
No known key found for this signature in database
GPG key ID: F6AC8A62154C42AA
2 changed files with 89 additions and 4 deletions

View file

@ -1443,13 +1443,13 @@ def add_thumbnail(ap):
# https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
# https://github.com/libvips/libvips
# ffmpeg -hide_banner -demuxers | awk '/^ D /{print$2}' | while IFS= read -r x; do ffmpeg -hide_banner -h demuxer=$x; done | grep -E '^Demuxer |extensions:'
ap2.add_argument("--th-r-pil", metavar="T,T", type=u, default="avif,avifs,blp,bmp,cbz,dcx,dds,dib,emf,eps,fits,flc,fli,fpx,gif,heic,heics,heif,heifs,icns,ico,im,j2p,j2k,jp2,jpeg,jpg,jpx,pbm,pcx,pgm,png,pnm,ppm,psd,qoi,sgi,spi,tga,tif,tiff,webp,wmf,xbm,xpm", help="image formats to decode using pillow")
ap2.add_argument("--th-r-pil", metavar="T,T", type=u, default="avif,avifs,blp,bmp,cbz,dcx,dds,dib,emf,eps,epub,fits,flc,fli,fpx,gif,heic,heics,heif,heifs,icns,ico,im,j2p,j2k,jp2,jpeg,jpg,jpx,pbm,pcx,pgm,png,pnm,ppm,psd,qoi,sgi,spi,tga,tif,tiff,webp,wmf,xbm,xpm", help="image formats to decode using pillow")
ap2.add_argument("--th-r-vips", metavar="T,T", type=u, default="avif,exr,fit,fits,fts,gif,hdr,heic,jp2,jpeg,jpg,jpx,jxl,nii,pfm,pgm,png,ppm,svg,tif,tiff,webp", help="image formats to decode using pyvips")
ap2.add_argument("--th-r-ffi", metavar="T,T", type=u, default="apng,avif,avifs,bmp,cbz,dds,dib,fit,fits,fts,gif,hdr,heic,heics,heif,heifs,icns,ico,jp2,jpeg,jpg,jpx,jxl,pbm,pcx,pfm,pgm,png,pnm,ppm,psd,qoi,sgi,tga,tif,tiff,webp,xbm,xpm", help="image formats to decode using ffmpeg")
ap2.add_argument("--th-r-ffi", metavar="T,T", type=u, default="apng,avif,avifs,bmp,cbz,dds,dib,epub,fit,fits,fts,gif,hdr,heic,heics,heif,heifs,icns,ico,jp2,jpeg,jpg,jpx,jxl,pbm,pcx,pfm,pgm,png,pnm,ppm,psd,qoi,sgi,tga,tif,tiff,webp,xbm,xpm", help="image formats to decode using ffmpeg")
ap2.add_argument("--th-r-ffv", metavar="T,T", type=u, default="3gp,asf,av1,avc,avi,flv,h264,h265,hevc,m4v,mjpeg,mjpg,mkv,mov,mp4,mpeg,mpeg2,mpegts,mpg,mpg2,mts,nut,ogm,ogv,rm,ts,vob,webm,wmv", help="video formats to decode using ffmpeg")
ap2.add_argument("--th-r-ffa", metavar="T,T", type=u, default="aac,ac3,aif,aiff,alac,alaw,amr,apac,ape,au,bonk,dfpwm,dts,flac,gsm,ilbc,it,itgz,itxz,itz,m4a,mdgz,mdxz,mdz,mo3,mod,mp2,mp3,mpc,mptm,mt2,mulaw,oga,ogg,okt,opus,ra,s3m,s3gz,s3xz,s3z,tak,tta,ulaw,wav,wma,wv,xm,xmgz,xmxz,xmz,xpk", help="audio formats to decode using ffmpeg")
ap2.add_argument("--th-spec-cnv", metavar="T", type=u, default="it,itgz,itxz,itz,mdgz,mdxz,mdz,mo3,mod,s3m,s3gz,s3xz,s3z,xm,xmgz,xmxz,xmz,xpk", help="audio formats which provoke https://trac.ffmpeg.org/ticket/10797 (huge ram usage for s3xmodit spectrograms)")
ap2.add_argument("--au-unpk", metavar="E=F.C", type=u, default="mdz=mod.zip, mdgz=mod.gz, mdxz=mod.xz, s3z=s3m.zip, s3gz=s3m.gz, s3xz=s3m.xz, xmz=xm.zip, xmgz=xm.gz, xmxz=xm.xz, itz=it.zip, itgz=it.gz, itxz=it.xz, cbz=jpg.cbz", help="audio/image formats to decompress before passing to ffmpeg")
ap2.add_argument("--au-unpk", metavar="E=F.C", type=u, default="mdz=mod.zip, mdgz=mod.gz, mdxz=mod.xz, s3z=s3m.zip, s3gz=s3m.gz, s3xz=s3m.xz, xmz=xm.zip, xmgz=xm.gz, xmxz=xm.xz, itz=it.zip, itgz=it.gz, itxz=it.xz, cbz=jpg.cbz, epub=jpg.epub", help="audio/image formats to decompress before passing to ffmpeg")
def add_transcoding(ap):

View file

@ -29,7 +29,7 @@ from .util import (
)
if True: # pylint: disable=using-constant-test
from typing import Any, Optional, Union
from typing import Any, Optional, Union, IO
from .util import NamedLogger, RootLogger
@ -60,9 +60,23 @@ def have_ff(scmd: str) -> bool:
else:
return bool(shutil.which(scmd))
def expat_is_secure():
"""
From the python xml docs:
An attacker can abuse XML features to carry out denial of service attacks, access local files, generate network connections to other machines, or circumvent firewalls.
Expat versions lower that 2.6.0 may be vulnerable to billion laughs, quadratic blowup and large tokens. Python may be vulnerable if it uses such older versions of Expat as a system-provided library. Check pyexpat.EXPAT_VERSION.
"""
import pyexpat
# expat_2.7.1
if len(pyexpat.EXPAT_VERSION) < 11:
return False
major, minor, patch = (int(x) for x in pyexpat.EXPAT_VERSION[6:].split("."))
return major > 2 or major == 2 and minor >= 6
HAVE_FFMPEG = not os.environ.get("PRTY_NO_FFMPEG") and have_ff("ffmpeg")
HAVE_FFPROBE = not os.environ.get("PRTY_NO_FFPROBE") and have_ff("ffprobe")
HAVE_SECURE_EXPAT = os.environ.get("PRTY_ALLOW_INSECURE_EXPAT") or expat_is_secure()
CBZ_PICS = set("png jpg jpeg gif bmp tga tif tiff webp avif".split())
CBZ_01 = re.compile(r"(^|[^0-9v])0+[01]\b")
@ -176,6 +190,10 @@ def au_unpk(
raise Exception("no images inside cbz")
fi = zf.open(using)
elif pk == "epub":
if HAVE_SECURE_EXPAT:
fi = get_cover_from_epub(log, abspath)
else:
raise Exception("unknown compression %s" % (pk,))
@ -365,6 +383,70 @@ def parse_ffprobe(txt: str) -> tuple[dict[str, tuple[int, Any]], dict[str, list[
return zd, md
def get_cover_from_epub(log: "NamedLogger", abspath: str) -> IO[bytes] | None:
import zipfile
import xml.etree.ElementTree as ElTree
try:
from urlparse import urljoin # Python2
except ImportError:
from urllib.parse import urljoin # Python3
with zipfile.ZipFile(abspath, "r") as z:
# First open the container file to find the package document (.opf file)
try:
container_root = ElTree.parse(z.open("META-INF/container.xml"))
except KeyError:
log(f"epub: no container file found in {abspath}")
return None
# https://www.w3.org/TR/epub-33/#sec-container.xml-rootfile-elem
container_namesapce = {"": "urn:oasis:names:tc:opendocument:xmlns:container"}
# One file could contain multiple package documents, default to the first one
rootfile_path = container_root\
.find("./rootfiles/rootfile", container_namesapce)\
.get("full-path")
# Then open the first package document to find the path of the cover image
try:
package_root = ElTree.parse(z.open(rootfile_path))
except KeyError:
log(f"epub: no package document found in {abspath}")
return None
# https://www.w3.org/TR/epub-33/#sec-package-doc
package_namespaces = {"": "http://www.idpf.org/2007/opf"}
# https://www.w3.org/TR/epub-33/#sec-cover-image
coverimage_path_node = package_root\
.find("./manifest/item[@properties='cover-image']", package_namespaces)
if coverimage_path_node:
coverimage_path = coverimage_path_node.get("href")
else:
# This might be an EPUB2 file, try the legacy way of specifying covers
coverimage_path = _get_cover_from_epub2(log, package_root, package_namespaces)
# This url is either absolute (in the .epub) or relative to the package document
adjusted_cover_path = urljoin(rootfile_path, coverimage_path)
return z.open(adjusted_cover_path)
def _get_cover_from_epub2(log: "NamedLogger", package_root, package_namespaces) -> str | None:
# <meta name="cover" content="id-to-cover-image"> in <metadata>, then
# <item> in <manifest>
cover_id = package_root \
.find("./metadata/meta[@name='cover']", package_namespaces) \
.get("content")
if not cover_id:
return None
for node in package_root.iterfind("./manifest/item", package_namespaces):
if node.get("id") == cover_id:
cover_path = node.get("href")
return cover_path
return None
class MTag(object):
def __init__(self, log_func: "RootLogger", args: argparse.Namespace) -> None:
self.log_func = log_func
@ -407,6 +489,9 @@ class MTag(object):
self.log(msg.format(or_ffprobe, " " * 37, pyname), c=1)
return
if not HAVE_SECURE_EXPAT:
self.log("expat version is missing critical security fixes; epub thumbnails will not be available", c=3)
# https://picard-docs.musicbrainz.org/downloads/MusicBrainz_Picard_Tag_Map.html
tagmap = {
"album": ["album", "talb", "\u00a9alb", "original-album", "toal"],