From 4b720f41500a80a8c4436508b5b5fecd26a360d4 Mon Sep 17 00:00:00 2001 From: ed Date: Sat, 4 Nov 2023 20:32:34 +0000 Subject: [PATCH] add more prometheus metrics; breaking changes: * cpp_uptime is now a gauge * cpp_bans is now cpp_active_bans (and also a gauge) and other related fixes: * stop emitting invalid cpp_disk_size/free for offline volumes * support overriding the spec-mandatory mimetype with ?mime=foo --- README.md | 22 ++++++++++- copyparty/__main__.py | 1 + copyparty/ftpd.py | 6 +++ copyparty/httpcli.py | 4 ++ copyparty/httpsrv.py | 3 ++ copyparty/metrics.py | 88 ++++++++++++++++++++++++++++++++++++++----- copyparty/up2k.py | 1 + 7 files changed, 113 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 3cbf5289..20cc91ea 100644 --- a/README.md +++ b/README.md @@ -1304,8 +1304,23 @@ scrape_configs: ``` currently the following metrics are available, -* `cpp_uptime_seconds` -* `cpp_bans` number of banned IPs +* `cpp_uptime_seconds` time since last copyparty restart +* `cpp_boot_unixtime_seconds` same but as an absolute timestamp +* `cpp_http_conns` number of open http(s) connections +* `cpp_http_reqs` number of http(s) requests handled +* `cpp_sus_reqs` number of 403/422/malicious requests +* `cpp_active_bans` number of currently banned IPs +* `cpp_total_bans` number of IPs banned since last restart + +these are available unless `--nos-vst` is specified: +* `cpp_db_idle_seconds` time since last database activity (upload/rename/delete) +* `cpp_db_act_seconds` same but as an absolute timestamp +* `cpp_idle_vols` number of volumes which are idle / ready +* `cpp_busy_vols` number of volumes which are busy / indexing +* `cpp_offline_vols` number of volumes which are offline / unavailable +* `cpp_hashing_files` number of files queued for hashing / indexing +* `cpp_tagq_files` number of files queued for metadata scanning +* `cpp_mtpq_files` number of files queued for plugin-based analysis and these are available per-volume only: * `cpp_disk_size_bytes` total HDD size @@ -1324,9 +1339,12 @@ some of the metrics have additional requirements to function correctly, the following options are available to disable some of the metrics: * `--nos-hdd` disables `cpp_disk_*` which can prevent spinning up HDDs * `--nos-vol` disables `cpp_vol_*` which reduces server startup time +* `--nos-vst` disables volume state, reducing the worst-case prometheus query time by 0.5 sec * `--nos-dup` disables `cpp_dupe_*` which reduces the server load caused by prometheus queries * `--nos-unf` disables `cpp_unf_*` for no particular purpose +note: the following metrics are counted incorrectly if multiprocessing is enabled with `-j`: `cpp_http_conns`, `cpp_http_reqs`, `cpp_sus_reqs`, `cpp_active_bans`, `cpp_total_bans` + # packages diff --git a/copyparty/__main__.py b/copyparty/__main__.py index ead96543..3e09427d 100755 --- a/copyparty/__main__.py +++ b/copyparty/__main__.py @@ -1014,6 +1014,7 @@ def add_stats(ap): ap2.add_argument("--stats", action="store_true", help="enable openmetrics at /.cpr/metrics for admin accounts") ap2.add_argument("--nos-hdd", action="store_true", help="disable disk-space metrics (used/free space)") ap2.add_argument("--nos-vol", action="store_true", help="disable volume size metrics (num files, total bytes, vmaxb/vmaxn)") + ap2.add_argument("--nos-vst", action="store_true", help="disable volume state metrics (indexing, analyzing, activity)") ap2.add_argument("--nos-dup", action="store_true", help="disable dupe-files metrics (good idea; very slow)") ap2.add_argument("--nos-unf", action="store_true", help="disable unfinished-uploads metrics") diff --git a/copyparty/ftpd.py b/copyparty/ftpd.py index cb0573bb..15f84264 100644 --- a/copyparty/ftpd.py +++ b/copyparty/ftpd.py @@ -92,6 +92,12 @@ class FtpAuth(DummyAuthorizer): if bonk: logging.warning("client banned: invalid passwords") bans[ip] = bonk + try: + # only possible if multiprocessing disabled + self.hub.broker.httpsrv.bans[ip] = bonk + self.hub.broker.httpsrv.nban += 1 + except: + pass raise AuthenticationFailed("Authentication failed.") diff --git a/copyparty/httpcli.py b/copyparty/httpcli.py index 0c2a76e2..ecd24d20 100644 --- a/copyparty/httpcli.py +++ b/copyparty/httpcli.py @@ -277,6 +277,8 @@ class HttpCli(object): return False + self.conn.hsrv.nreq += 1 + self.ua = self.headers.get("user-agent", "") self.is_rclone = self.ua.startswith("rclone/") @@ -567,6 +569,7 @@ class HttpCli(object): return self.conn.iphash.s(self.ip) def cbonk(self, g: Garda, v: str, reason: str, descr: str) -> bool: + self.conn.hsrv.nsus += 1 if not g.lim: return False @@ -590,6 +593,7 @@ class HttpCli(object): ): self.log("client banned: %s" % (descr,), 1) self.conn.hsrv.bans[ip] = bonk + self.conn.hsrv.nban += 1 return True return False diff --git a/copyparty/httpsrv.py b/copyparty/httpsrv.py index 2f253dad..994cdcaa 100644 --- a/copyparty/httpsrv.py +++ b/copyparty/httpsrv.py @@ -128,6 +128,9 @@ class HttpSrv(object): self.u2fh = FHC() self.metrics = Metrics(self) + self.nreq = 0 + self.nsus = 0 + self.nban = 0 self.srvs: list[socket.socket] = [] self.ncli = 0 # exact self.clients: set[HttpConn] = set() # laggy diff --git a/copyparty/metrics.py b/copyparty/metrics.py index 4cf7ff5b..72e86fdb 100644 --- a/copyparty/metrics.py +++ b/copyparty/metrics.py @@ -34,14 +34,23 @@ class Metrics(object): ret: list[str] = [] - def addc(k: str, unit: str, v: str, desc: str) -> None: - if unit: - k += "_" + unit - zs = "# TYPE %s counter\n# UNIT %s %s\n# HELP %s %s\n%s_created %s\n%s_total %s" - ret.append(zs % (k, k, unit, k, desc, k, int(self.hsrv.t0), k, v)) - else: - zs = "# TYPE %s counter\n# HELP %s %s\n%s_created %s\n%s_total %s" - ret.append(zs % (k, k, desc, k, int(self.hsrv.t0), k, v)) + def addc(k: str, v: str, desc: str) -> None: + zs = "# TYPE %s counter\n# HELP %s %s\n%s_created %s\n%s_total %s" + ret.append(zs % (k, k, desc, k, int(self.hsrv.t0), k, v)) + + def adduc(k: str, unit: str, v: str, desc: str) -> None: + k += "_" + unit + zs = "# TYPE %s counter\n# UNIT %s %s\n# HELP %s %s\n%s_created %s\n%s_total %s" + ret.append(zs % (k, k, unit, k, desc, k, int(self.hsrv.t0), k, v)) + + def addg(k: str, v: str, desc: str) -> None: + zs = "# TYPE %s gauge\n# HELP %s %s\n%s %s" + ret.append(zs % (k, k, desc, k, v)) + + def addug(k: str, unit: str, v: str, desc: str) -> None: + k += "_" + unit + zs = "# TYPE %s gauge\n# UNIT %s %s\n# HELP %s %s\n%s %s" + ret.append(zs % (k, k, unit, k, desc, k, v)) def addh(k: str, typ: str, desc: str) -> None: zs = "# TYPE %s %s\n# HELP %s %s" @@ -54,17 +63,75 @@ class Metrics(object): def addv(k: str, v: str) -> None: ret.append("%s %s" % (k, v)) + t = "time since last copyparty restart" v = "{:.3f}".format(time.time() - self.hsrv.t0) - addc("cpp_uptime", "seconds", v, "time since last server restart") + addug("cpp_uptime", "seconds", v, t) + + # timestamps are gauges because initial value is not zero + t = "unixtime of last copyparty restart" + v = "{:.3f}".format(self.hsrv.t0) + addug("cpp_boot_unixtime", "seconds", v, t) + + t = "number of open http(s) client connections" + addg("cpp_http_conns", str(self.hsrv.ncli), t) + + t = "number of http(s) requests since last restart" + addc("cpp_http_reqs", str(self.hsrv.nreq), t) + + t = "number of 403/422/malicious reqs since restart" + addc("cpp_sus_reqs", str(self.hsrv.nsus), t) v = str(len(conn.bans or [])) - addc("cpp_bans", "", v, "number of banned IPs") + addg("cpp_active_bans", v, "number of currently banned IPs") + + t = "number of IPs banned since last restart" + addg("cpp_total_bans", str(self.hsrv.nban), t) + + if not args.nos_vst: + x = self.hsrv.broker.ask("up2k.get_state") + vs = json.loads(x.get()) + + nvidle = 0 + nvbusy = 0 + nvoffline = 0 + for v in vs["volstate"].values(): + if v == "online, idle": + nvidle += 1 + elif "OFFLINE" in v: + nvoffline += 1 + else: + nvbusy += 1 + + addg("cpp_idle_vols", str(nvidle), "number of idle/ready volumes") + addg("cpp_busy_vols", str(nvbusy), "number of busy/indexing volumes") + addg("cpp_offline_vols", str(nvoffline), "number of offline volumes") + + t = "time since last database activity (upload/rename/delete)" + addug("cpp_db_idle", "seconds", str(vs["dbwt"]), t) + + t = "unixtime of last database activity (upload/rename/delete)" + addug("cpp_db_act", "seconds", str(vs["dbwu"]), t) + + t = "number of files queued for hashing/indexing" + addg("cpp_hashing_files", str(vs["hashq"]), t) + + t = "number of files queued for metadata scanning" + addg("cpp_tagq_files", str(vs["tagq"]), t) + + try: + t = "number of files queued for plugin-based analysis" + addg("cpp_mtpq_files", str(int(vs["mtpq"])), t) + except: + pass if not args.nos_hdd: addbh("cpp_disk_size_bytes", "total HDD size of volume") addbh("cpp_disk_free_bytes", "free HDD space in volume") for vpath, vol in allvols: free, total = get_df(vol.realpath) + if free is None or total is None: + continue + addv('cpp_disk_size_bytes{vol="/%s"}' % (vpath), str(total)) addv('cpp_disk_free_bytes{vol="/%s"}' % (vpath), str(free)) @@ -161,5 +228,6 @@ class Metrics(object): ret.append("# EOF") mime = "application/openmetrics-text; version=1.0.0; charset=utf-8" + mime = cli.uparam.get("mime") or mime cli.reply("\n".join(ret).encode("utf-8"), mime=mime) return True diff --git a/copyparty/up2k.py b/copyparty/up2k.py index 15022a2a..0ec28b70 100644 --- a/copyparty/up2k.py +++ b/copyparty/up2k.py @@ -266,6 +266,7 @@ class Up2k(object): "hashq": self.n_hashq, "tagq": self.n_tagq, "mtpq": mtpq, + "dbwu": "{:.2f}".format(self.db_act), "dbwt": "{:.2f}".format( min(1000 * 24 * 60 * 60 - 1, time.time() - self.db_act) ),