add more prometheus metrics; breaking changes:

* cpp_uptime is now a gauge
* cpp_bans is now cpp_active_bans (and also a gauge)

and other related fixes:
* stop emitting invalid cpp_disk_size/free for offline volumes
* support overriding the spec-mandatory mimetype with ?mime=foo
This commit is contained in:
ed 2023-11-04 20:32:34 +00:00
parent 2e85a25614
commit 4b720f4150
7 changed files with 113 additions and 12 deletions

View file

@ -1304,8 +1304,23 @@ scrape_configs:
``` ```
currently the following metrics are available,
* `cpp_uptime_seconds` * `cpp_uptime_seconds` time since last copyparty restart
* `cpp_bans` number of banned IPs * `cpp_boot_unixtime_seconds` same but as an absolute timestamp
* `cpp_http_conns` number of open http(s) connections
* `cpp_http_reqs` number of http(s) requests handled
* `cpp_sus_reqs` number of 403/422/malicious requests
* `cpp_active_bans` number of currently banned IPs
* `cpp_total_bans` number of IPs banned since last restart
these are available unless `--nos-vst` is specified:
* `cpp_db_idle_seconds` time since last database activity (upload/rename/delete)
* `cpp_db_act_seconds` same but as an absolute timestamp
* `cpp_idle_vols` number of volumes which are idle / ready
* `cpp_busy_vols` number of volumes which are busy / indexing
* `cpp_offline_vols` number of volumes which are offline / unavailable
* `cpp_hashing_files` number of files queued for hashing / indexing
* `cpp_tagq_files` number of files queued for metadata scanning
* `cpp_mtpq_files` number of files queued for plugin-based analysis
and these are available per-volume only:
* `cpp_disk_size_bytes` total HDD size
@ -1324,9 +1339,12 @@ some of the metrics have additional requirements to function correctly,
the following options are available to disable some of the metrics:
* `--nos-hdd` disables `cpp_disk_*` which can prevent spinning up HDDs
* `--nos-vol` disables `cpp_vol_*` which reduces server startup time
* `--nos-vst` disables volume state, reducing the worst-case prometheus query time by 0.5 sec
* `--nos-dup` disables `cpp_dupe_*` which reduces the server load caused by prometheus queries
* `--nos-unf` disables `cpp_unf_*` for no particular purpose
note: the following metrics are counted incorrectly if multiprocessing is enabled with `-j`: `cpp_http_conns`, `cpp_http_reqs`, `cpp_sus_reqs`, `cpp_active_bans`, `cpp_total_bans`
# packages

View file

@ -1014,6 +1014,7 @@ def add_stats(ap):
ap2.add_argument("--stats", action="store_true", help="enable openmetrics at /.cpr/metrics for admin accounts") ap2.add_argument("--stats", action="store_true", help="enable openmetrics at /.cpr/metrics for admin accounts")
ap2.add_argument("--nos-hdd", action="store_true", help="disable disk-space metrics (used/free space)") ap2.add_argument("--nos-hdd", action="store_true", help="disable disk-space metrics (used/free space)")
ap2.add_argument("--nos-vol", action="store_true", help="disable volume size metrics (num files, total bytes, vmaxb/vmaxn)") ap2.add_argument("--nos-vol", action="store_true", help="disable volume size metrics (num files, total bytes, vmaxb/vmaxn)")
ap2.add_argument("--nos-vst", action="store_true", help="disable volume state metrics (indexing, analyzing, activity)")
ap2.add_argument("--nos-dup", action="store_true", help="disable dupe-files metrics (good idea; very slow)") ap2.add_argument("--nos-dup", action="store_true", help="disable dupe-files metrics (good idea; very slow)")
ap2.add_argument("--nos-unf", action="store_true", help="disable unfinished-uploads metrics") ap2.add_argument("--nos-unf", action="store_true", help="disable unfinished-uploads metrics")

View file

@ -92,6 +92,12 @@ class FtpAuth(DummyAuthorizer):
if bonk: if bonk:
logging.warning("client banned: invalid passwords") logging.warning("client banned: invalid passwords")
bans[ip] = bonk bans[ip] = bonk
try:
# only possible if multiprocessing disabled
self.hub.broker.httpsrv.bans[ip] = bonk
self.hub.broker.httpsrv.nban += 1
except:
pass
raise AuthenticationFailed("Authentication failed.") raise AuthenticationFailed("Authentication failed.")

View file

@ -277,6 +277,8 @@ class HttpCli(object):
return False return False
self.conn.hsrv.nreq += 1
self.ua = self.headers.get("user-agent", "") self.ua = self.headers.get("user-agent", "")
self.is_rclone = self.ua.startswith("rclone/") self.is_rclone = self.ua.startswith("rclone/")
@ -567,6 +569,7 @@ class HttpCli(object):
return self.conn.iphash.s(self.ip) return self.conn.iphash.s(self.ip)
def cbonk(self, g: Garda, v: str, reason: str, descr: str) -> bool: def cbonk(self, g: Garda, v: str, reason: str, descr: str) -> bool:
self.conn.hsrv.nsus += 1
if not g.lim: if not g.lim:
return False return False
@ -590,6 +593,7 @@ class HttpCli(object):
): ):
self.log("client banned: %s" % (descr,), 1) self.log("client banned: %s" % (descr,), 1)
self.conn.hsrv.bans[ip] = bonk self.conn.hsrv.bans[ip] = bonk
self.conn.hsrv.nban += 1
return True return True
return False return False

View file

@ -128,6 +128,9 @@ class HttpSrv(object):
self.u2fh = FHC() self.u2fh = FHC()
self.metrics = Metrics(self) self.metrics = Metrics(self)
self.nreq = 0
self.nsus = 0
self.nban = 0
self.srvs: list[socket.socket] = [] self.srvs: list[socket.socket] = []
self.ncli = 0 # exact self.ncli = 0 # exact
self.clients: set[HttpConn] = set() # laggy self.clients: set[HttpConn] = set() # laggy

View file

@ -34,14 +34,23 @@ class Metrics(object):
ret: list[str] = [] ret: list[str] = []
def addc(k: str, unit: str, v: str, desc: str) -> None: def addc(k: str, v: str, desc: str) -> None:
if unit: zs = "# TYPE %s counter\n# HELP %s %s\n%s_created %s\n%s_total %s"
k += "_" + unit ret.append(zs % (k, k, desc, k, int(self.hsrv.t0), k, v))
zs = "# TYPE %s counter\n# UNIT %s %s\n# HELP %s %s\n%s_created %s\n%s_total %s"
ret.append(zs % (k, k, unit, k, desc, k, int(self.hsrv.t0), k, v)) def adduc(k: str, unit: str, v: str, desc: str) -> None:
else: k += "_" + unit
zs = "# TYPE %s counter\n# HELP %s %s\n%s_created %s\n%s_total %s" zs = "# TYPE %s counter\n# UNIT %s %s\n# HELP %s %s\n%s_created %s\n%s_total %s"
ret.append(zs % (k, k, desc, k, int(self.hsrv.t0), k, v)) ret.append(zs % (k, k, unit, k, desc, k, int(self.hsrv.t0), k, v))
def addg(k: str, v: str, desc: str) -> None:
zs = "# TYPE %s gauge\n# HELP %s %s\n%s %s"
ret.append(zs % (k, k, desc, k, v))
def addug(k: str, unit: str, v: str, desc: str) -> None:
k += "_" + unit
zs = "# TYPE %s gauge\n# UNIT %s %s\n# HELP %s %s\n%s %s"
ret.append(zs % (k, k, unit, k, desc, k, v))
def addh(k: str, typ: str, desc: str) -> None: def addh(k: str, typ: str, desc: str) -> None:
zs = "# TYPE %s %s\n# HELP %s %s" zs = "# TYPE %s %s\n# HELP %s %s"
@ -54,17 +63,75 @@ class Metrics(object):
def addv(k: str, v: str) -> None: def addv(k: str, v: str) -> None:
ret.append("%s %s" % (k, v)) ret.append("%s %s" % (k, v))
t = "time since last copyparty restart"
v = "{:.3f}".format(time.time() - self.hsrv.t0) v = "{:.3f}".format(time.time() - self.hsrv.t0)
addc("cpp_uptime", "seconds", v, "time since last server restart") addug("cpp_uptime", "seconds", v, t)
# timestamps are gauges because initial value is not zero
t = "unixtime of last copyparty restart"
v = "{:.3f}".format(self.hsrv.t0)
addug("cpp_boot_unixtime", "seconds", v, t)
t = "number of open http(s) client connections"
addg("cpp_http_conns", str(self.hsrv.ncli), t)
t = "number of http(s) requests since last restart"
addc("cpp_http_reqs", str(self.hsrv.nreq), t)
t = "number of 403/422/malicious reqs since restart"
addc("cpp_sus_reqs", str(self.hsrv.nsus), t)
v = str(len(conn.bans or [])) v = str(len(conn.bans or []))
addc("cpp_bans", "", v, "number of banned IPs") addg("cpp_active_bans", v, "number of currently banned IPs")
t = "number of IPs banned since last restart"
addg("cpp_total_bans", str(self.hsrv.nban), t)
if not args.nos_vst:
x = self.hsrv.broker.ask("up2k.get_state")
vs = json.loads(x.get())
nvidle = 0
nvbusy = 0
nvoffline = 0
for v in vs["volstate"].values():
if v == "online, idle":
nvidle += 1
elif "OFFLINE" in v:
nvoffline += 1
else:
nvbusy += 1
addg("cpp_idle_vols", str(nvidle), "number of idle/ready volumes")
addg("cpp_busy_vols", str(nvbusy), "number of busy/indexing volumes")
addg("cpp_offline_vols", str(nvoffline), "number of offline volumes")
t = "time since last database activity (upload/rename/delete)"
addug("cpp_db_idle", "seconds", str(vs["dbwt"]), t)
t = "unixtime of last database activity (upload/rename/delete)"
addug("cpp_db_act", "seconds", str(vs["dbwu"]), t)
t = "number of files queued for hashing/indexing"
addg("cpp_hashing_files", str(vs["hashq"]), t)
t = "number of files queued for metadata scanning"
addg("cpp_tagq_files", str(vs["tagq"]), t)
try:
t = "number of files queued for plugin-based analysis"
addg("cpp_mtpq_files", str(int(vs["mtpq"])), t)
except:
pass
if not args.nos_hdd: if not args.nos_hdd:
addbh("cpp_disk_size_bytes", "total HDD size of volume") addbh("cpp_disk_size_bytes", "total HDD size of volume")
addbh("cpp_disk_free_bytes", "free HDD space in volume") addbh("cpp_disk_free_bytes", "free HDD space in volume")
for vpath, vol in allvols: for vpath, vol in allvols:
free, total = get_df(vol.realpath) free, total = get_df(vol.realpath)
if free is None or total is None:
continue
addv('cpp_disk_size_bytes{vol="/%s"}' % (vpath), str(total)) addv('cpp_disk_size_bytes{vol="/%s"}' % (vpath), str(total))
addv('cpp_disk_free_bytes{vol="/%s"}' % (vpath), str(free)) addv('cpp_disk_free_bytes{vol="/%s"}' % (vpath), str(free))
@ -161,5 +228,6 @@ class Metrics(object):
ret.append("# EOF") ret.append("# EOF")
mime = "application/openmetrics-text; version=1.0.0; charset=utf-8" mime = "application/openmetrics-text; version=1.0.0; charset=utf-8"
mime = cli.uparam.get("mime") or mime
cli.reply("\n".join(ret).encode("utf-8"), mime=mime) cli.reply("\n".join(ret).encode("utf-8"), mime=mime)
return True return True

View file

@ -266,6 +266,7 @@ class Up2k(object):
"hashq": self.n_hashq, "hashq": self.n_hashq,
"tagq": self.n_tagq, "tagq": self.n_tagq,
"mtpq": mtpq, "mtpq": mtpq,
"dbwu": "{:.2f}".format(self.db_act),
"dbwt": "{:.2f}".format( "dbwt": "{:.2f}".format(
min(1000 * 24 * 60 * 60 - 1, time.time() - self.db_act) min(1000 * 24 * 60 * 60 - 1, time.time() - self.db_act)
), ),