* scripts: add log repacker
* bench/filehash: msys support + add more stats
ed 2024-01-06 01:15:43 +00:00
parent 143f72fe36
commit dee0950f74
2 changed files with 96 additions and 6 deletions


@@ -12,6 +12,11 @@ set -euo pipefail
#
# can be adjusted with --hash-mt (but alpine caps out at 5)
fsize=256
nfiles=128
pybin=$(command -v python3 || command -v python)
#pybin=~/.pyenv/versions/nogil-3.9.10-2/bin/python3
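# (the commented-out pybin above presumably points at a local build of the
#  nogil fork, for comparing GIL-free hashing; the results table below
#  includes one nogil 3.9 sample)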
[ $# -ge 1 ] || {
echo 'need arg 1: path to copyparty-sfx.py'
echo ' (remaining args will be passed on to copyparty,'
@@ -22,6 +27,8 @@ sfx="$1"
shift
sfx="$(realpath "$sfx" || readlink -e "$sfx" || echo "$sfx")"
awk=$(command -v gawk || command -v awk)
uname -s | grep -E MSYS && win=1 || win=
totalsize=$((fsize*nfiles))
# try to use /dev/shm to avoid hitting filesystems at all,
# otherwise fallback to mktemp which probably uses /tmp
@@ -30,20 +37,24 @@ mkdir $td || td=$(mktemp -d)
trap "rm -rf $td" INT TERM EXIT
cd $td
echo creating 256 MiB testfile in $td
head -c $((1024*1024*256)) /dev/urandom > 1
echo creating $fsize MiB testfile in $td
sz=$((1024*1024*fsize))
head -c $sz /dev/zero | openssl enc -aes-256-ctr -iter 1 -pass pass:k -nosalt 2>/dev/null >1 || true
wc -c 1 | awk '$1=='$sz'{r=1}END{exit 1-r}' || head -c $sz /dev/urandom >1
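# (piping zeros through aes-256-ctr is just a fast keystream generator;
#  with AES-NI this should manage several GiB/s, while /dev/urandom is often
#  an order of magnitude slower. the wc size-check falls back to urandom
#  when the local openssl is too old to support -iter)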
echo creating 127 symlinks to it
for n in $(seq 2 128); do ln -s 1 $n; done
echo creating $((nfiles-1)) symlinks to it
for n in $(seq 2 $nfiles); do MSYS=winsymlinks:nativestrict ln -s 1 $n; done
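# (MSYS=winsymlinks:nativestrict makes msys2's ln create real NTFS symlinks
#  instead of copying the target; needs developer-mode or admin on windows)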
echo warming up cache
cat 1 >/dev/null
echo ok lets go
python3 "$sfx" -p39204 -e2dsa --dbd=yolo --exit=idx -lo=t -q "$@"
$pybin "$sfx" -p39204 -e2dsa --dbd=yolo --exit=idx -lo=t -q "$@" && err= || err=$?
[ $win ] && [ "$err" = 15 ] && err= # sigterm doesn't hook on windows, ah whatever
[ $err ] && echo ERROR $err && exit $err
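# (the `&& err= || err=$?` dance grabs the exit code without tripping set -e;
#  a clean run leaves $err empty so both checks above fall through)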
echo and the results are...
LC_ALL=C $awk '/1 volumes in / {s=$(NF-1); printf "speed: %.1f MiB/s (time=%.2fs)\n", 256*128/s, s}' <t
LC_ALL=C $awk '/1 volumes in / {s=$(NF-1); printf "speed: %.1f MiB/s (time=%.2fs)\n", '$totalsize'/s, s}' <t
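# (worked example: totalsize = 256*128 = 32768 MiB, so if the logfile says
#  the volume was indexed in 9.00 seconds, this prints "speed: 3640.9 MiB/s")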
echo deleting $td and exiting
@@ -52,6 +63,8 @@ echo deleting $td and exiting
# MiB/s @ cpu or device (copyparty, pythonver, distro/os) // comment
# 3887 @ Ryzen 5 4500U (cpp 1.9.5, nogil 3.9, fedora 39) // --hash-mt=6; laptop
# 3732 @ Ryzen 5 4500U (cpp 1.9.5, py 3.12.1, fedora 39) // --hash-mt=6; laptop
# 3608 @ Ryzen 5 4500U (cpp 1.9.5, py 3.11.5, fedora 38) // --hash-mt=6; laptop
# 2726 @ Ryzen 5 4500U (cpp 1.9.5, py 3.11.5, fedora 38) // --hash-mt=4 (old-default)
# 2202 @ Ryzen 5 4500U (cpp 1.9.5, py 3.11.5, docker-alpine 3.18.3) // ??? alpine slow
@@ -62,6 +75,10 @@ echo deleting $td and exiting
# 5544 @ Intel i5-12500 (cpp 1.9.5, py 3.11.2, debian 12.0) // --hash-mt=12; desktop
# 5197 @ Ryzen 7 3700X (cpp 1.9.5, py 3.9.18, freebsd 13.2) // --hash-mt=8; 2u server
# 4551 @ mbp 2020 m1 (cpp 1.9.5, py 3.11.7, macos 14.2.1)
# 4190 @ Ryzen 7 5800X (cpp 1.9.5, py 3.11.6, fedora 37) // --hash-mt=8 (vbox-VM on win10-17763.4974)
# 3028 @ Ryzen 7 5800X (cpp 1.9.5, py 3.11.6, fedora 37) // --hash-mt=5 (vbox-VM on win10-17763.4974)
# 2629 @ Ryzen 7 5800X (cpp 1.9.5, py 3.11.7, win10-ltsc-1809-17763.4974) // --hash-mt=5 (default)
# 2576 @ Ryzen 7 5800X (cpp 1.9.5, py 3.11.7, win10-ltsc-1809-17763.4974) // --hash-mt=8 (hello??)
# 2606 @ Ryzen 7 3700X (cpp 1.9.5, py 3.9.18, freebsd 13.2) // --hash-mt=4 (old-default)
# 1436 @ Ryzen 5 5500U (cpp 1.9.5, py 3.11.4, alpine 3.18.3) // nuc
# 1065 @ Pixel 7 (cpp 1.9.5, py 3.11.5, termux 2023-09)

scripts/logpack.sh Executable file

@@ -0,0 +1,73 @@
#!/bin/bash
set -e
# recompress logs so they decompress faster + save some space;
# * will not recurse into subfolders
# * each file in current folder gets recompressed to zstd; input file is DELETED
# * any xz-compressed logfiles are decompressed before converting to zstd
# * SHOULD ignore and skip files which are currently open; SHOULD be safe to run while copyparty is running
# for files larger than $cutoff, compress with `zstd -T0`
# (otherwise do several files in parallel (scales better))
cutoff=400M
# osx support:
# port install findutils gsed coreutils
command -v gfind >/dev/null &&
command -v gsed >/dev/null &&
command -v gsort >/dev/null && {
find() { gfind "$@"; }
sed() { gsed "$@"; }
sort() { gsort "$@"; }
}
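# (the shell functions above shadow the bsd tools in PATH; `export -f sed`
#  further down makes the wrapper visible inside the bash workers that
#  xargs spawns, since packfun calls sed)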
packfun() {
local jobs=$1 fn="$2"
printf '%s\n' "$fn" | grep -qF .zst && return
local of="$(printf '%s\n' "$fn" | sed -r 's/\.(xz|txt)$/.zst/')"
[ "$fn" = "$of" ] &&
of="$of.zst"
[ -e "$of" ] &&
echo "SKIP: output file exists: $of" &&
return
lsof -- "$fn" 2>/dev/null | grep -E .. &&
printf "SKIP: file in use: %s\n\n" $fn &&
return
# determine compression from the file header; old copyparty versions would write xz output without the .xz extension
head -c3 "$fn" | grep -qF 7z &&
cmd="xz -dkc" || cmd="cat"
printf '<%s> T%d: %s\n' "$cmd" $jobs "$of"
$cmd <"$fn" >/dev/null || {
echo "ERROR: uncompress failed: $fn"
return
}
$cmd <"$fn" | zstd --long -19 -T$jobs >"$of"
touch -r "$fn" -- "$of"
cmp <($cmd <"$fn") <(zstd -d <"$of") || {
echo "ERROR: data mismatch: $of"
mv "$of"{,.BAD}
return
}
rm -- "$fn"
}
# do small files in parallel first (in descending size);
# each file still gets 4 threads in case the cutoff is poorly tuned
export -f packfun
export -f sed 2>/dev/null || true
find -maxdepth 1 -type f -size -$cutoff -printf '%s %p\n' |
sort -nr | sed -r 's`[^ ]+ ``; s`^\./``' | tr '\n' '\0' |
xargs "$@" -0i -P$(nproc) bash -c 'packfun 4 "$@"' _ {}
# then the big ones, letting each file use the whole cpu
for f in *; do packfun 0 "$f"; done
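# usage sketch, assuming the logs live in /var/log/copyparty (hypothetical):
#   cd /var/log/copyparty && /path/to/logpack.sh
# should be safe while copyparty is up, since open files are skipped via lsof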