From dee0950f74376ae89d19cdf5220578335386faa8 Mon Sep 17 00:00:00 2001
From: ed
Date: Sat, 6 Jan 2024 01:15:43 +0000
Subject: [PATCH] misc;

* scripts: add log repacker
* bench/filehash: msys support + add more stats
---
Reviewer notes (free text after the "---" marker; not part of the commit):

* This copy of the patch arrived with every line break and all
  indentation collapsed. The line structure below was rebuilt from the
  hunk headers and the shell syntax; the three visible filehash.sh hunks
  match their header counts again. Indentation (tabs) and the exact
  position of blank context lines are a best-effort reconstruction --
  compare against commit dee0950f before applying.
* Text is missing from this copy and is NOT reinvented here:
  - the author address on the "From:" line (presumably stripped; confirm);
  - everything between the removed awk "speed:" line in filehash.sh and
    the "/dev/null &&" that starts the visible part of logpack.sh. That
    covers the redirect at the end of the awk line, the rest of the
    filehash.sh diff, the diff header of scripts/logpack.sh and the top
    of that script (where $cutoff is presumably set -- it is used below
    but never assigned in the visible text). The gap is marked in place.
  - "command -v gfind >" in front of that "/dev/null &&" is inferred from
    the two parallel gsed/gsort checks and the find() wrapper; confirm.
  Until the gap is restored the patch cannot be applied.
* Deliberate deviations from the collapsed text (same line count, only
  quoting added):
  - filehash.sh: `[ "$win" ] && [ "$err" = 15 ]` instead of the unquoted
    form, which expanded to `[ = 15 ]` and printed a "unary operator
    expected" error whenever the run succeeded on MSYS (empty $err).
  - logpack.sh, packfun: `"$fn"` is quoted in the "SKIP: file in use"
    printf; unquoted, a filename with whitespace was split into several
    arguments and the format was repeated once per word.
* What packfun does, per the visible code: skips names containing
  ".zst", skips when the target name already exists or lsof reports the
  input as open, decompresses with "xz -dkc" when the first 3 bytes
  contain "7z" (else "cat"), recompresses with "zstd --long -19",
  copies the mtime, verifies the round trip with cmp, and only then
  removes the input; on mismatch the output is renamed to *.BAD.

 scripts/bench/filehash.sh | 29 ++++++++++++----
 scripts/logpack.sh        | 73 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+), 6 deletions(-)
 create mode 100755 scripts/logpack.sh

diff --git a/scripts/bench/filehash.sh b/scripts/bench/filehash.sh
index f729c071..5a5f4c98 100755
--- a/scripts/bench/filehash.sh
+++ b/scripts/bench/filehash.sh
@@ -12,6 +12,11 @@ set -euo pipefail
 #
 # can be adjusted with --hash-mt (but alpine caps out at 5)
 
+fsize=256
+nfiles=128
+pybin=$(command -v python3 || command -v python)
+#pybin=~/.pyenv/versions/nogil-3.9.10-2/bin/python3
+
 [ $# -ge 1 ] || {
 	echo 'need arg 1: path to copyparty-sfx.py'
 	echo ' (remaining args will be passed on to copyparty,'
@@ -22,6 +27,8 @@ sfx="$1"
 shift
 sfx="$(realpath "$sfx" || readlink -e "$sfx" || echo "$sfx")"
 awk=$(command -v gawk || command -v awk)
+uname -s | grep -E MSYS && win=1 || win=
+totalsize=$((fsize*nfiles))
 
 # try to use /dev/shm to avoid hitting filesystems at all,
 # otherwise fallback to mktemp which probably uses /tmp
@@ -30,20 +37,24 @@ mkdir $td || td=$(mktemp -d)
 trap "rm -rf $td" INT TERM EXIT
 cd $td
 
-echo creating 256 MiB testfile in $td
-head -c $((1024*1024*256)) /dev/urandom > 1
+echo creating $fsize MiB testfile in $td
+sz=$((1024*1024*fsize))
+head -c $sz /dev/zero | openssl enc -aes-256-ctr -iter 1 -pass pass:k -nosalt 2>/dev/null >1 || true
+wc -c 1 | awk '$1=='$sz'{r=1}END{exit 1-r}' || head -c $sz /dev/urandom >1
 
-echo creating 127 symlinks to it
-for n in $(seq 2 128); do ln -s 1 $n; done
+echo creating $((nfiles-1)) symlinks to it
+for n in $(seq 2 $nfiles); do MSYS=winsymlinks:nativestrict ln -s 1 $n; done
 
 echo warming up cache
 cat 1 >/dev/null
 
 echo ok lets go
-python3 "$sfx" -p39204 -e2dsa --dbd=yolo --exit=idx -lo=t -q "$@"
+$pybin "$sfx" -p39204 -e2dsa --dbd=yolo --exit=idx -lo=t -q "$@" && err= || err=$?
+[ "$win" ] && [ "$err" = 15 ] && err= # sigterm doesn't hook on windows, ah whatever
+[ $err ] && echo ERROR $err && exit $err
 
 echo and the results are...
-LC_ALL=C $awk '/1 volumes in / {s=$(NF-1); printf "speed: %.1f MiB/s (time=%.2fs)\n", 256*128/s, s}'
[... gap: text lost from this copy -- see the reviewer notes above ...]
+command -v gfind >/dev/null &&
+command -v gsed >/dev/null &&
+command -v gsort >/dev/null && {
+	find() { gfind "$@"; }
+	sed() { gsed "$@"; }
+	sort() { gsort "$@"; }
+}
+
+packfun() {
+	local jobs=$1 fn="$2"
+	printf '%s\n' "$fn" | grep -qF .zst && return
+
+	local of="$(printf '%s\n' "$fn" | sed -r 's/\.(xz|txt)/.zst/')"
+	[ "$fn" = "$of" ] &&
+		of="$of.zst"
+
+	[ -e "$of" ] &&
+		echo "SKIP: output file exists: $of" &&
+		return
+
+	lsof -- "$fn" 2>/dev/null | grep -E .. &&
+		printf "SKIP: file in use: %s\n\n" "$fn" &&
+		return
+
+	# determine by header; old copyparty versions would produce xz output without .xz names
+	head -c3 "$fn" | grep -qF 7z &&
+		cmd="xz -dkc" || cmd="cat"
+
+	printf '<%s> T%d: %s\n' "$cmd" $jobs "$of"
+
+	$cmd <"$fn" >/dev/null || {
+		echo "ERROR: uncompress failed: $fn"
+		return
+	}
+
+	$cmd <"$fn" | zstd --long -19 -T$jobs >"$of"
+	touch -r "$fn" -- "$of"
+
+	cmp <($cmd <"$fn") <(zstd -d <"$of") || {
+		echo "ERROR: data mismatch: $of"
+		mv "$of"{,.BAD}
+		return
+	}
+	rm -- "$fn"
+}
+
+# do small files in parallel first (in descending size);
+# each file can use 4 threads in case the cutoff is poor
+export -f packfun
+export -f sed 2>/dev/null || true
+find -maxdepth 1 -type f -size -$cutoff -printf '%s %p\n' |
+sort -nr | sed -r 's`[^ ]+ ``; s`^\./``' | tr '\n' '\0' |
+xargs "$@" -0i -P$(nproc) bash -c 'packfun 4 "$@"' _ {}
+
+# then the big ones, letting each file use the whole cpu
+for f in *; do packfun 0 "$f"; done