Skip to content

Commit 370c57d

Browse files
committed
Run test probes every 15 minutes on example domains to collect statistics
1 parent 68b7f7f commit 370c57d

File tree

8 files changed

+1108
-4
lines changed

8 files changed

+1108
-4
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
2525
current_dir := $(notdir $(patsubst %/,%,$(dir $(mkfile_path))))
2626
ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
2727

28-
pysrcdirs = internetnl tests interface checks integration_tests
28+
pysrcdirs = internetnl tests interface checks integration_tests docker
2929
pysrc = $(shell find ${pysrcdirs} -name \*.py)
3030

3131
bin = .venv/bin

docker/cron.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
FROM alpine:3.18
22

3-
RUN apk add --no-cache curl postgresql15
3+
RUN apk add --no-cache curl postgresql15 python3 py3-prometheus-client py3-requests
44

55
COPY docker/cron/periodic /etc/periodic/
66

docker/cron/periodic/15min/tests.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#!/usr/bin/env python3
2+
3+
# run tests on example domains and write metrics to prometheus textfile
4+
5+
# for iterative development
6+
# docker run -ti -e INTERNETNL_DOMAINNAME=internet.nl -v $PWD/docker/cron/periodic/15min/tests.py:/tests.py \
7+
# ghcr.io/internetstandards/cron:latest /tests.py --debug
8+
9+
import sys
10+
import os
11+
import time
12+
from prometheus_client import REGISTRY, Gauge, generate_latest
13+
import prometheus_client
14+
import logging
15+
import requests
16+
17+
log = logging.getLogger(__name__)
18+
19+
DEBUG = "--debug" in sys.argv
20+
21+
# file to write metrics to https://github.com/prometheus/node_exporter?tab=readme-ov-file#textfile-collector
22+
OUTPUT_TEXTFILE = "/prometheus-textfile-directory/tests.prom"
23+
24+
# maximum number of seconds to keep polling the probes of a single test.
# Falls back to a fixed value when INTERNETNL_CACHE_TTL is unset or empty so the
# script does not crash on import with int(None) / int("").
# NOTE(review): 200 is a fallback picked here; confirm it matches the deployment default.
TEST_TIMEOUT = int(os.environ.get("INTERNETNL_CACHE_TTL") or "200")
25+
REQUEST_TIMEOUT = 30
26+
27+
TESTS = ["site", "mail"]
28+
29+
IPV4_IP_APP_INTERNAL = os.environ.get("IPV4_IP_APP_INTERNAL")
30+
INTERNETNL_DOMAINNAME = os.environ.get("INTERNETNL_DOMAINNAME")
31+
# talk directly to the internal app container as the webserver might
32+
# have access restrictions in place
33+
URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
34+
HEADERS = {"Host": INTERNETNL_DOMAINNAME}
35+
36+
# domains to run the tests on (used for every entry in TESTS)
37+
WEBSITE_TEST_DOMAINS = [
38+
"example.nl",
39+
"example.com",
40+
]
41+
42+
43+
METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test", "domain", "probe"])
44+
METRIC_PROBE_SUCCESS = Gauge("tests_probe_success_total", "Whether the probe succeeded.", ["test", "domain", "probe"])
45+
METRIC_PROBE_RUNTIME = Gauge(
46+
"tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test", "domain", "probe"]
47+
)
48+
METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test", "domain", "probe"])
49+
50+
METRIC_TEST_RUN = Gauge("tests_test_run_total", "Test that have been run.", ["test", "domain"])
51+
METRIC_TEST_CACHE = Gauge("tests_test_cached_total", "Test runs that returned cached results.", ["test", "domain"])
52+
METRIC_TEST_FAILURE = Gauge("tests_test_failure_total", "Test runs that failed.", ["test", "domain"])
53+
METRIC_TEST_SUCCESS = Gauge("tests_test_success_total", "Test runs that succeeded.", ["test", "domain"])
54+
METRIC_TEST_TIMEOUT = Gauge("tests_test_timeout", "Test that ran into timeout.", ["test", "domain"])
55+
METRIC_TEST_RUNTIME = Gauge("tests_test_runtime_seconds", "Amount of time test ran before done.", ["test", "domain"])
56+
57+
58+
def run_tests_on_domain(test, domain):
    """Start `test` for `domain`, poll its probes and record the metrics.

    The probes endpoint is requested once to initiate the test. When every
    probe in that first response is already done the result is counted as
    cached and nothing else is recorded. Otherwise the probes endpoint is
    polled once per second until all probes are done or TEST_TIMEOUT seconds
    have passed, recording done/success/runtime per probe, and finally the
    score of every finished probe is fetched.

    Raises whatever `requests` raises for the initial and polling requests
    (connection errors, HTTP error statuses, invalid JSON); failures while
    fetching scores are logged and skipped.
    """

    def fetch(segment):
        # the current timestamp is appended as query string
        # (presumably to avoid getting a cached response - TODO confirm)
        response = requests.get(
            f"{URL_BASE}/{test}/{segment}/{domain}/?{time.time()}",
            timeout=REQUEST_TIMEOUT,
            allow_redirects=False,
            headers=HEADERS,
        )
        response.raise_for_status()
        return response

    # initiate the test
    response = fetch("probes")
    log.debug(response.text)

    # nothing left to wait for: the result came from the cache
    statuses = response.json()
    pending = [s for s in statuses if not s["done"]]
    if not pending:
        METRIC_TEST_CACHE.labels(test, domain).set(1)
        return

    # poll until every probe reports done or the deadline passes
    started = int(time.time())
    deadline = started + TEST_TIMEOUT
    recorded = set()
    while int(time.time()) < deadline:
        response = fetch("probes")
        log.debug(response.text)

        statuses = response.json()
        for status in statuses:
            name = status["name"]
            # metrics of a finished probe are written only once
            if name in recorded:
                continue
            METRIC_PROBE_DONE.labels(test, domain, name).set(status["done"])
            if not status["done"]:
                continue
            METRIC_PROBE_SUCCESS.labels(test, domain, name).set(status["success"])
            METRIC_PROBE_RUNTIME.labels(test, domain, name).set(int(time.time() - started))
            recorded.add(name)

        pending = [s for s in statuses if not s["done"]]
        if not pending:
            break

        time.sleep(1)
    else:
        # loop ended because the deadline passed, not because of `break`
        METRIC_TEST_TIMEOUT.labels(test, domain).set(1)

    METRIC_TEST_RUNTIME.labels(test, domain).set(int(time.time() - started))

    # best effort: fetch the score of each finished probe
    for name in recorded:
        try:
            response = fetch(name)
            # redirects are not followed and do not raise, so skip non-200 answers
            if response.status_code != 200:
                continue
            result = response.json()
            METRIC_PROBE_SCORE.labels(test, domain, name).set(result["totalscore"])
        except Exception:
            log.exception("failed to get probe score")
125+
126+
127+
def run_tests():
128+
for test in TESTS:
129+
for domain in WEBSITE_TEST_DOMAINS:
130+
log.info(f"testing: {test} {domain}")
131+
METRIC_TEST_RUN.labels(test, domain).set(1)
132+
METRIC_TEST_CACHE.labels(test, domain).set(0)
133+
METRIC_TEST_FAILURE.labels(test, domain).set(0)
134+
try:
135+
run_tests_on_domain(test, domain)
136+
except Exception:
137+
log.exception("Error during test")
138+
METRIC_TEST_FAILURE.labels(test, domain).set(1)
139+
140+
141+
def main():
142+
logging.basicConfig(level=logging.DEBUG if DEBUG else logging.ERROR)
143+
144+
# disable internal metrics
145+
REGISTRY.unregister(prometheus_client.GC_COLLECTOR)
146+
REGISTRY.unregister(prometheus_client.PLATFORM_COLLECTOR)
147+
REGISTRY.unregister(prometheus_client.PROCESS_COLLECTOR)
148+
149+
# run test probes against domains and collect metrics
150+
run_tests()
151+
152+
# write metrics to stdout or file in prometheus textfile format
153+
if DEBUG:
154+
print(generate_latest(REGISTRY).decode())
155+
else:
156+
with open(OUTPUT_TEXTFILE, "w") as f:
157+
f.write(generate_latest(REGISTRY).decode())
158+
159+
160+
if __name__ == "__main__" and os.environ.get("CRON_15MIN_RUN_TESTS", "True") == "True":
161+
main()

docker/defaults.env

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,9 @@ CRON_DAILY_POSTGRESQL_BACKUP=True
201201
CRON_DAILY_TRUNCATE_EXPORTER_LOGS=True
202202
CRON_WEEKLY_POSTGRESQL_BACKUP=False
203203

204+
# enable running tests every 15 minutes for metrics collection
205+
CRON_15MIN_RUN_TESTS=True
206+
204207
INTERNETNL_BRANDING=False
205208

206209
# enable caching, set to off to disable

docker/develop.env

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,10 @@ ROUTINATOR_URL=https://rpki-validator.ripe.net/api/v1/validity
6060
# use default logging driver instead of journald
6161
LOGGING_DRIVER=json-file
6262

63-
# disable backup crons
63+
# disable backup and other crons
6464
CRON_DAILY_POSTGRESQL_BACKUP=False
6565
CRON_WEEKLY_POSTGRESQL_BACKUP=False
66+
CRON_15MIN_RUN_TESTS=False
6667

6768
INTERNETNL_BRANDING=False
6869

docker/docker-compose.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,9 @@ services:
627627
- DB_PASSWORD=password
628628
- CRON_DAILY_POSTGRESQL_BACKUP
629629
- CRON_WEEKLY_POSTGRESQL_BACKUP
630+
- IPV4_IP_APP_INTERNAL
631+
- INTERNETNL_DOMAINNAME
632+
- INTERNETNL_CACHE_TTL
630633

631634
restart: unless-stopped
632635
logging:
@@ -647,6 +650,7 @@ services:
647650
- manual-hof:/app/manual-hall-of-fame/
648651
- postgres-backups:/var/lib/postgresql/backups
649652
- nginx-logs-exporter:/var/log/nginx/prometheus-nginxlog-exporter/
653+
- prometheus-textfile-directory:/prometheus-textfile-directory
650654

651655
healthcheck:
652656
test: ["CMD", "pgrep", "crond"]
@@ -793,6 +797,8 @@ services:
793797
- --collector.netdev.device-exclude=veth
794798
# ignore docker container interfaces
795799
- --collector.netclass.ignored-devices=veth
800+
- --collector.textfile
801+
- --collector.textfile.directory=/prometheus-textfile-directory
796802
restart: unless-stopped
797803
logging:
798804
driver: $LOGGING_DRIVER
@@ -804,6 +810,7 @@ services:
804810
volumes:
805811
- /:/host:ro
806812
- /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket
813+
- prometheus-textfile-directory:/prometheus-textfile-directory
807814

808815
docker_stats_exporter:
809816
# https://github.com/jan4843/docker_stats_exporter
@@ -851,6 +858,8 @@ volumes:
851858
unbound-zones: {}
852859
# permanent storage for Prometheus metrics
853860
prometheus-data: {}
861+
# prometheus metric textfile shared volume
862+
prometheus-textfile-directory: {}
854863
# permanent storage for Grafana custom dashboards
855864
grafana-data: {}
856865
# shares hosters HoF file between cron and app

0 commit comments

Comments
 (0)