up2k-hook-ytid: add mp4/webm/mkv metadata scanner

2026-06-22 05:53:29 -06:00 · 2022-07-26 22:09:18 +02:00 · 2022-07-26 22:09:18 +02:00 · 76adac7c72
parent 5dc75ebb67
commit 76adac7c72
2 changed files with 167 additions and 38 deletions
--- a/contrib/plugins/up2k-hook-ytid.js
+++ b/contrib/plugins/up2k-hook-ytid.js
@ -2,29 +2,140 @@
 // assumes all files dropped into the uploader have a youtube-id somewhere in the filename,
 // locates the youtube-ids and passes them to an API which returns a list of IDs which should be uploaded
 //
+// also tries to find the youtube-id in the embedded metadata
+//
 // assumes copyparty is behind nginx as /ytq is a standalone service which must be rproxied in place

 function up2k_namefilter(good_files, nil_files, bad_files, hooks) {
-    var filenames = [],
-        file_lists = [good_files, nil_files, bad_files];
+    var passthru = up2k.uc.fsearch;
+    if (passthru)
+        return hooks[0](good_files, nil_files, bad_files, hooks.slice(1));

-    for (var lst of file_lists)
-        for (var ent of lst)
-            filenames.push(ent[1]);
+    a_up2k_namefilter(good_files, nil_files, bad_files, hooks).then(() => { });
+}
+
+function bstrpos(buf, ptn) {
+    var ofs = 0,
+        ch0 = ptn[0],
+        sz = buf.byteLength;

-    var yt_ids = new Set();
-    for (var lst of file_lists)
-        for (var ent of lst) {
-            var m, name = ent[1];
    while (true) {
+        ofs = buf.indexOf(ch0, ofs);
+        if (ofs < 0 || ofs >= sz)
+            return -1;
+
+        for (var a = 1; a < ptn.length; a++)
+            if (buf[ofs + a] !== ptn[a])
+                break;
+
+        if (a === ptn.length)
+            return ofs;
+
+        ++ofs;
+    }
+}
+
+async function a_up2k_namefilter(good_files, nil_files, bad_files, hooks) {
+    var t0 = Date.now(),
+        yt_ids = new Set(),
+        textdec = new TextDecoder('latin1'),
+        md_ptn = new TextEncoder().encode('youtube.com/watch?v='),
+        file_ids = [],  // all IDs found for each good_files
+        mofs = 0,
+        mnchk = 0,
+        mfile = '';
+
+    for (var a = 0; a < good_files.length; a++) {
+        var [fobj, name] = good_files[a],
+            sz = fobj.size,
+            ids = [],
+            id_ok = false,
+            m;
+
+        // all IDs found in this file
+        file_ids.push(ids);
+
+        // look for ID in filename; reduce the
+        // metadata-scan intensity if the id looks safe
+        m = /[\[(-]([\w-]{11})[\])]?\.(?:mp4|webm|mkv)$/i.exec(name);
+        id_ok = !!m;
+
+        while (true) {
+            // fuzzy catch-all;
            // some ytdl fork did %(title)-%(id).%(ext) ...
            m = /(?:^|[^\w])([\w-]{11})(?:$|[^\w-])/.exec(name);
            if (!m)
                break;

-                yt_ids.add(m[1]);
            name = name.replace(m[1], '');
+            yt_ids.add(m[1]);
+            ids.push(m[1]);
        }
+
+        // look for IDs in video metadata,
+        if (/\.(mp4|webm|mkv)$/i.exec(name)) {
+            toast.show('inf r', 0, `analyzing file ${a + 1} / ${good_files.length} :\n${name}\n\nhave analysed ${++mnchk} files in ${(Date.now() - t0) / 1000} seconds, ${humantime((good_files.length - (a + 1)) * (((Date.now() - t0) / 1000) / mnchk))} remaining,\n\nbiggest offset so far is ${mofs}, in this file:\n\n${mfile}`);
+
+            // check first and last 128 MiB;
+            // pWxOroN5WCo.mkv @  6edb98 (6.92M)
+            // Nf-nN1wF5Xo.mp4 @ 4a98034 (74.6M)
+            var chunksz = 1024 * 1024 * 2,  // byte
+                aspan = id_ok ? 128 : 512;  // MiB
+
+            aspan = parseInt(Math.min(sz / 2, aspan * 1024 * 1024) / chunksz) * chunksz;
+
+            for (var side = 0; side < 2; side++) {
+                var ofs = side ? Math.max(0, sz - aspan) : 0,
+                    nchunks = aspan / chunksz;
+
+                for (var chunk = 0; chunk < nchunks; chunk++) {
+                    var bchunk = await fobj.slice(ofs, ofs + chunksz + 16).arrayBuffer(),
+                        uchunk = new Uint8Array(bchunk, 0, bchunk.byteLength),
+                        bofs = bstrpos(uchunk, md_ptn),
+                        absofs = Math.min(ofs + bofs, (sz - ofs) + bofs),
+                        txt = bofs < 0 ? '' : textdec.decode(uchunk.subarray(bofs)),
+                        m;
+
+                    //console.log(`side ${ side }, chunk ${ chunk }, ofs ${ ofs }, bchunk ${ bchunk.byteLength }, txt ${ txt.length }`);
+                    while (true) {
+                        // mkv/webm have [a-z] immediately after url
+                        m = /(youtube\.com\/watch\?v=[\w-]{11})/.exec(txt);
+                        if (!m)
+                            break;
+
+                        txt = txt.replace(m[1], '');
+                        m = m[1].slice(-11);
+
+                        console.log(`found ${m} @${bofs}, ${name} `);
+                        yt_ids.add(m);
+                        if (!has(ids, m))
+                            ids.push(m);
+
+                        // bail after next iteration
+                        chunk = nchunks - 1;
+                        side = 9;
+
+                        if (mofs < absofs) {
+                            mofs = absofs;
+                            mfile = name;
+                        }
+                    }
+                    ofs += chunksz;
+                    if (ofs >= sz)
+                        break;
+                }
+            }
+        }
+    }
+
+    if (false) {
+        var msg = `finished analysing ${mnchk} files in ${(Date.now() - t0) / 1000} seconds,\n\nbiggest offset was ${mofs} in this file:\n\n${mfile}`,
+            mfun = function () { toast.ok(0, msg); };
+
+        mfun();
+        setTimeout(mfun, 200);
+
+        return hooks[0]([], [], [], hooks.slice(1));
    }

    toast.inf(5, `running query for ${yt_ids.size} videos...`);
@ -34,48 +145,65 @@ function up2k_namefilter(good_files, nil_files, bad_files, hooks) {
    xhr.setRequestHeader('Content-Type', 'text/plain');
    xhr.onload = xhr.onerror = function () {
        if (this.status != 200)
-            return toast.err(0, `sorry, database query failed ;_;\n\nplease let us know so we can look at it, thx!!\n\nerror ${this.status}: ${(this.response && this.response.err) || this.responseText}`);
+            return toast.err(0, `sorry, database query failed; _; \n\nplease let us know so we can look at it, thx!!\n\nerror ${this.status}: ${(this.response && this.response.err) || this.responseText} `);

-        var new_lists = [],
-            ptn = new RegExp(this.responseText.trim().split('\n').join('|') || '\n'),
-            nothing_to_do = true,
-            n_skip = 0;
+        process_id_list(this.responseText);
+    };
+    xhr.send(Array.from(yt_ids).join('\n'));

-        for (var lst of file_lists) {
-            var keep = [];
-            new_lists.push(keep);
+    setTimeout(function () { process_id_list('Nf-nN1wF5Xo\n'); }, 500);

-            for (var ent of lst)
-                if (ptn.exec(ent[1]))
-                    keep.push(ent);
-                else
-                    n_skip++;
+    function process_id_list(txt) {
+        var wanted_ids = new Set(txt.trim().split('\n')),
+            wanted_names = new Set(),  // basenames with a wanted ID
+            wanted_files = new Set();  // filedrops

-            if (keep.length)
-                nothing_to_do = false;
+        for (var a = 0; a < good_files.length; a++) {
+            var name = good_files[a][1];
+            for (var b = 0; b < file_ids[a].length; b++)
+                if (wanted_ids.has(file_ids[a][b])) {
+                    wanted_files.add(good_files[a]);
+
+                    var m = /(.*)\.(mp4|webm|mkv)$/i.exec(name);
+                    if (m)
+                        wanted_names.add(m[1]);
+
+                    break;
+                }
+        }
+
+        // add all files with the same basename as each explicitly wanted file
+        // (infojson/chatlog/etc when ID was discovered from metadata)
+        for (var a = 0; a < good_files.length; a++) {
+            var name = good_files[a][1];
+            for (var b = 0; b < 3; b++) {
+                name = name.replace(/\.[^\.]+$/, '');
+                if (wanted_names.has(name)) {
+                    wanted_files.add(good_files[a]);
+                    break;
+                }
+            }
        }

        function upload_filtered() {
-            if (nothing_to_do)
+            if (!wanted_files.size)
                return modal.alert('Good news -- turns out we already have all those.\n\nBut thank you for checking in!');

-            [good_files, nil_files, bad_files] = new_lists;
-            hooks[0](good_files, nil_files, bad_files, hooks.slice(1));
+            hooks[0](Array.from(wanted_files), nil_files, bad_files, hooks.slice(1));
        }

        function upload_all() {
            hooks[0](good_files, nil_files, bad_files, hooks.slice(1));
        }

-        var msg = `you added ${good_files.length} files; ${n_skip} of them were skipped --\neither because we already have them,\nor because there is no youtube-ID in your filename.\n\n<code>OK</code> / <code>Enter</code> = continue uploading the ${new_lists[0].length} files we definitely need\n\n<code>Cancel</code> / <code>ESC</code> = override the filter; upload ALL the files you added`;
+        var n_skip = good_files.length - wanted_files.size,
+            msg = `you added ${good_files.length} files; ${n_skip} of them were skipped --\neither because we already have them,\nor because there is no youtube-ID in your filename.\n\n<code>OK</code> / <code>Enter</code> = continue uploading just the ${wanted_files.size} files we definitely need\n\n<code>Cancel</code> / <code>ESC</code> = override the filter; upload ALL the files you added`;

        if (!n_skip)
            upload_filtered();
        else
            modal.confirm(msg, upload_filtered, upload_all);
-
    };
-    xhr.send(Array.from(yt_ids).join('\n'));
 }

 up2k_hooks.push(function () {
--- a/copyparty/up2k.py
+++ b/copyparty/up2k.py
@ -132,7 +132,7 @@ class Up2k(object):
        self.mem_cur = None
        self.sqlite_ver = None
        self.no_expr_idx = False
-        self.timeout = int(max(self.args.srch_time, 5) * 1.2)
+        self.timeout = int(max(self.args.srch_time, 5) * 1.2) + 1
        self.spools: set[tempfile.SpooledTemporaryFile[bytes]] = set()
        if HAVE_SQLITE3:
            # mojibake detector
@ -1178,6 +1178,7 @@ class Up2k(object):
        params: tuple[Any, ...],
        flt: int,
    ) -> tuple[tempfile.SpooledTemporaryFile[bytes], int]:
+        """mutex me"""
        n = 0
        c2 = cur.connection.cursor()
        tf = tempfile.SpooledTemporaryFile(1024 * 1024 * 8, "w+b", prefix="cpp-tq-")