diff --git a/contrib/plugins/up2k-hook-ytid.js b/contrib/plugins/up2k-hook-ytid.js
index 23b60d1b..0982f953 100644
--- a/contrib/plugins/up2k-hook-ytid.js
+++ b/contrib/plugins/up2k-hook-ytid.js
@@ -2,30 +2,141 @@
 // assumes all files dropped into the uploader have a youtube-id somewhere in the filename,
 // locates the youtube-ids and passes them to an API which returns a list of IDs which should be uploaded
 //
+// also tries to find the youtube-id in the embedded metadata
+//
 // assumes copyparty is behind nginx as /ytq is a standalone service which must be rproxied in place
 
 function up2k_namefilter(good_files, nil_files, bad_files, hooks) {
-    var filenames = [],
-        file_lists = [good_files, nil_files, bad_files];
+    // filesearch is incompatible with this filter; pass straight through
+    var passthru = up2k.uc.fsearch;
+    if (passthru)
+        return hooks[0](good_files, nil_files, bad_files, hooks.slice(1));
 
-    for (var lst of file_lists)
-        for (var ent of lst)
-            filenames.push(ent[1]);
+    a_up2k_namefilter(good_files, nil_files, bad_files, hooks).then(() => { });
+}
 
-    var yt_ids = new Set();
-    for (var lst of file_lists)
-        for (var ent of lst) {
-            var m, name = ent[1];
-            while (true) {
-                // some ytdl fork did %(title)-%(id).%(ext) ...
-                m = /(?:^|[^\w])([\w-]{11})(?:$|[^\w-])/.exec(name);
-                if (!m)
-                    break;
+// naive byte-substring search in a Uint8Array;
+// returns the offset of ptn in buf, or -1
+function bstrpos(buf, ptn) {
+    var ofs = 0,
+        ch0 = ptn[0],
+        sz = buf.byteLength;
 
-                yt_ids.add(m[1]);
-                name = name.replace(m[1], '');
+    while (true) {
+        ofs = buf.indexOf(ch0, ofs);
+        if (ofs < 0 || ofs >= sz)
+            return -1;
+
+        for (var a = 1; a < ptn.length; a++)
+            if (buf[ofs + a] !== ptn[a])
+                break;
+
+        if (a === ptn.length)
+            return ofs;
+
+        ++ofs;
+    }
+}
+
+async function a_up2k_namefilter(good_files, nil_files, bad_files, hooks) {
+    var t0 = Date.now(),
+        yt_ids = new Set(),
+        textdec = new TextDecoder('latin1'),
+        md_ptn = new TextEncoder().encode('youtube.com/watch?v='),
+        file_ids = [],  // IDs found for each entry in good_files
+        mofs = 0,
+        mnchk = 0,
+        mfile = '';
+
+    for (var a = 0; a < good_files.length; a++) {
+        var [fobj, name] = good_files[a],
+            sz = fobj.size,
+            ids = [],
+            id_ok = false,
+            m;
+
+        // all IDs found in this file
+        file_ids.push(ids);
+
+        // look for ID in filename; reduce the
+        // metadata-scan intensity if the ID looks safe
+        m = /[\[(-]([\w-]{11})[\])]?\.(?:mp4|webm|mkv)$/i.exec(name);
+        id_ok = !!m;
+
+        while (true) {
+            // fuzzy catch-all;
+            // some ytdl fork did %(title)-%(id).%(ext) ...
+            m = /(?:^|[^\w])([\w-]{11})(?:$|[^\w-])/.exec(name);
+            if (!m)
+                break;
+
+            name = name.replace(m[1], '');
+            yt_ids.add(m[1]);
+            ids.push(m[1]);
+        }
+
+        // also look for IDs in the video metadata
+        if (/\.(mp4|webm|mkv)$/i.exec(name)) {
+            toast.show('inf r', 0, `analyzing file ${a + 1} / ${good_files.length}:\n${name}\n\nhave analysed ${++mnchk} files in ${(Date.now() - t0) / 1000} seconds, ${humantime((good_files.length - (a + 1)) * (((Date.now() - t0) / 1000) / mnchk))} remaining,\n\nbiggest offset so far is ${mofs}, in this file:\n\n${mfile}`);
+
+            // check the first and last 128 MiB (512 if the filename gave
+            // no trustworthy ID); deepest offsets observed so far:
+            // pWxOroN5WCo.mkv @ 6edb98 (6.92M)
+            // Nf-nN1wF5Xo.mp4 @ 4a98034 (74.6M)
+            var chunksz = 1024 * 1024 * 2,  // bytes
+                aspan = id_ok ? 128 : 512;  // MiB
+
+            aspan = parseInt(Math.min(sz / 2, aspan * 1024 * 1024) / chunksz) * chunksz;
+
+            for (var side = 0; side < 2; side++) {
+                var ofs = side ? Math.max(0, sz - aspan) : 0,
+                    nchunks = aspan / chunksz;
+
+                for (var chunk = 0; chunk < nchunks; chunk++) {
+                    var bchunk = await fobj.slice(ofs, ofs + chunksz + 16).arrayBuffer(),
+                        uchunk = new Uint8Array(bchunk, 0, bchunk.byteLength),
+                        bofs = bstrpos(uchunk, md_ptn),
+                        absofs = Math.min(ofs + bofs, (sz - ofs) + bofs),
+                        txt = bofs < 0 ? '' : textdec.decode(uchunk.subarray(bofs)),
+                        m;
+
+                    //console.log(`side ${side}, chunk ${chunk}, ofs ${ofs}, bchunk ${bchunk.byteLength}, txt ${txt.length}`);
+                    while (true) {
+                        // mkv/webm have [a-z] immediately after the url
+                        m = /(youtube\.com\/watch\?v=[\w-]{11})/.exec(txt);
+                        if (!m)
+                            break;
+
+                        txt = txt.replace(m[1], '');
+                        m = m[1].slice(-11);
+
+                        console.log(`found ${m} @${bofs}, ${name}`);
+                        yt_ids.add(m);
+                        if (!has(ids, m))
+                            ids.push(m);
+
+                        // bail after next iteration
+                        chunk = nchunks - 1;
+                        side = 9;
+
+                        if (mofs < absofs) {
+                            mofs = absofs;
+                            mfile = name;
+                        }
+                    }
+                    ofs += chunksz;
+                    if (ofs >= sz)
+                        break;
+                }
+            }
+        }
+    }
+
+    if (false) {  // debug: show scan stats and bail without uploading
+        var msg = `finished analysing ${mnchk} files in ${(Date.now() - t0) / 1000} seconds,\n\nbiggest offset was ${mofs} in this file:\n\n${mfile}`,
+            mfun = function () { toast.ok(0, msg); };
+
+        mfun();
+        setTimeout(mfun, 200);
+
+        return hooks[0]([], [], [], hooks.slice(1));
+    }
 
     toast.inf(5, `running query for ${yt_ids.size} videos...`);
 
@@ -34,48 +145,65 @@ function up2k_namefilter(good_files, nil_files, bad_files, hooks) {
     xhr.setRequestHeader('Content-Type', 'text/plain');
     xhr.onload = xhr.onerror = function () {
         if (this.status != 200)
             return toast.err(0, `sorry, database query failed ;_;\n\nplease let us know so we can look at it, thx!!\n\nerror ${this.status}: ${(this.response && this.response.err) || this.responseText}`);
 
-        var new_lists = [],
-            ptn = new RegExp(this.responseText.trim().split('\n').join('|') || '\n'),
-            nothing_to_do = true,
-            n_skip = 0;
+        process_id_list(this.responseText);
+    };
+    xhr.send(Array.from(yt_ids).join('\n'));
 
-        for (var lst of file_lists) {
-            var keep = [];
-            new_lists.push(keep);
+    // debug: pretend the server wants exactly one ID
+    //setTimeout(function () { process_id_list('Nf-nN1wF5Xo\n'); }, 500);
 
-            for (var ent of lst)
-                if (ptn.exec(ent[1]))
-                    keep.push(ent);
-                else
-                    n_skip++;
+    function process_id_list(txt) {
+        var wanted_ids = new Set(txt.trim().split('\n')),
+            wanted_names = new Set(),  // basenames with a wanted ID
+            wanted_files = new Set();  // filedrops
 
-            if (keep.length)
-                nothing_to_do = false;
+        for (var a = 0; a < good_files.length; a++) {
+            var name = good_files[a][1];
+            for (var b = 0; b < file_ids[a].length; b++)
+                if (wanted_ids.has(file_ids[a][b])) {
+                    wanted_files.add(good_files[a]);
+
+                    var m = /(.*)\.(mp4|webm|mkv)$/i.exec(name);
+                    if (m)
+                        wanted_names.add(m[1]);
+
+                    break;
+                }
+        }
+
+        // add all files with the same basename as each explicitly wanted file
+        // (infojson/chatlog/etc when the ID was discovered from metadata);
+        // strip up to three extensions so "x.info.json" also matches "x.mp4"
+        for (var a = 0; a < good_files.length; a++) {
+            var name = good_files[a][1];
+            for (var b = 0; b < 3; b++) {
+                name = name.replace(/\.[^\.]+$/, '');
+                if (wanted_names.has(name)) {
+                    wanted_files.add(good_files[a]);
+                    break;
+                }
+            }
         }
 
         function upload_filtered() {
-            if (nothing_to_do)
+            if (!wanted_files.size)
                 return modal.alert('Good news -- turns out we already have all those.\n\nBut thank you for checking in!');
 
-            [good_files, nil_files, bad_files] = new_lists;
-            hooks[0](good_files, nil_files, bad_files, hooks.slice(1));
+            hooks[0](Array.from(wanted_files), nil_files, bad_files, hooks.slice(1));
         }
 
         function upload_all() {
             hooks[0](good_files, nil_files, bad_files, hooks.slice(1));
         }
 
-        var msg = `you added ${good_files.length} files; ${n_skip} of them were skipped --\neither because we already have them,\nor because there is no youtube-ID in your filename.\n\nOK / Enter = continue uploading the ${new_lists[0].length} files we definitely need\n\nCancel / ESC = override the filter; upload ALL the files you added`;
+        var n_skip = good_files.length - wanted_files.size,
+            msg = `you added ${good_files.length} files; ${n_skip} of them were skipped --\neither because we already have them,\nor because there is no youtube-ID in your filename.\n\nOK / Enter = continue uploading just the ${wanted_files.size} files we definitely need\n\nCancel / ESC = override the filter; upload ALL the files you added`;
 
         if (!n_skip)
             upload_filtered();
         else
             modal.confirm(msg, upload_filtered, upload_all);
-    };
-    xhr.send(Array.from(yt_ids).join('\n'));
+    }
 }
 
 up2k_hooks.push(function () {
diff --git a/copyparty/up2k.py b/copyparty/up2k.py
index df1a61df..07c2d7f8 100644
--- a/copyparty/up2k.py
+++ b/copyparty/up2k.py
@@ -132,7 +132,7 @@ class Up2k(object):
         self.mem_cur = None
         self.sqlite_ver = None
         self.no_expr_idx = False
-        self.timeout = int(max(self.args.srch_time, 5) * 1.2)
+        self.timeout = int(max(self.args.srch_time, 5) * 1.2) + 1
         self.spools: set[tempfile.SpooledTemporaryFile[bytes]] = set()
         if HAVE_SQLITE3:
             # mojibake detector
@@ -1178,6 +1178,7 @@ class Up2k(object):
         params: tuple[Any, ...],
         flt: int,
     ) -> tuple[tempfile.SpooledTemporaryFile[bytes], int]:
+        """mutex me"""
         n = 0
         c2 = cur.connection.cursor()
         tf = tempfile.SpooledTemporaryFile(1024 * 1024 * 8, "w+b", prefix="cpp-tq-")
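
review notes (examples below are sketches, not part of the patch):

bstrpos() exists because TypedArray.prototype.indexOf only locates single
elements, not subsequences. A minimal demonstration of its contract,
runnable standalone next to the function above:

    var enc = new TextEncoder(),
        buf = enc.encode('xxx youtube.com/watch?v=dQw4w9WgXcQ yyy'),
        ptn = enc.encode('youtube.com/watch?v=');

    console.log(bstrpos(buf, ptn));                 // 4  (offset of the match)
    console.log(bstrpos(buf, enc.encode('nope')));  // -1 (no match)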
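
The metadata scan never loads a whole file: File.slice() is lazy, so only the
head and tail of each video are read, 2 MiB at a time. The slice is extended
past the chunk boundary so a match straddling two chunks can still be seen;
the patch uses a fixed +16, while ptn.length - 1 is the smallest overlap that
cannot miss one. The same access pattern in isolation, as a hypothetical
scan_blob helper (not part of the plugin):

    // collect chunk-level hits of ptn (a Uint8Array) within the first
    // and last `span` bytes of blob, reading `chunksz` bytes at a time
    async function scan_blob(blob, ptn, span, chunksz) {
        var hits = [];
        for (var side = 0; side < 2; side++) {
            var ofs = side ? Math.max(0, blob.size - span) : 0,
                end = Math.min(blob.size, ofs + span);

            for (; ofs < end; ofs += chunksz) {
                // overlap by ptn.length - 1 bytes so a match straddling
                // two chunks is fully contained in one slice
                var buf = await blob.slice(ofs, ofs + chunksz + ptn.length - 1).arrayBuffer(),
                    bofs = bstrpos(new Uint8Array(buf), ptn);

                if (bofs >= 0)
                    hits.push(ofs + bofs);  // absolute offset of first hit in this chunk
            }
        }
        return hits;  // may contain duplicates if head and tail windows overlap
    }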
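
Two regexes cooperate on each filename: a strict one decides whether the ID
"looks safe" (shrinking the metadata scan from 512 to 128 MiB per side), and
the fuzzy catch-all harvests every 11-character candidate. With a made-up
filename:

    var name = 'Cool Video-dQw4w9WgXcQ.mp4';

    // strict: ID delimited by [, ( or -, right before the extension
    console.log(/[\[(-]([\w-]{11})[\])]?\.(?:mp4|webm|mkv)$/i.exec(name)[1]);
    // "dQw4w9WgXcQ"

    // fuzzy: any 11 [\w-] chars with non-word characters around them
    console.log(/(?:^|[^\w])([\w-]{11})(?:$|[^\w-])/.exec(name)[1]);
    // "dQw4w9WgXcQ"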
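
The /ytq service itself lives outside this repo; the plugin only assumes it
accepts a newline-separated ID list as text/plain and replies with the subset
that should be uploaded. A minimal stand-in for local testing (node; the
wanted-set and port are made up, and a real deployment would consult its own
database instead, rproxied as /ytq per the header comment):

    const http = require('http');
    const WANTED = new Set(['Nf-nN1wF5Xo']);  // hypothetical "please upload these"

    http.createServer(function (req, res) {
        var body = '';
        req.on('data', function (c) { body += c; });
        req.on('end', function () {
            var keep = body.trim().split('\n').filter(function (id) {
                return WANTED.has(id);
            });
            res.writeHead(200, { 'Content-Type': 'text/plain' });
            res.end(keep.join('\n') + '\n');
        });
    }).listen(3939);  // hypothetical port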