Run test probes every 15 minutes on example domains to collect statistics

aequitas · aequitas · commit 370c57def61f · 2024-03-12T09:34:07.000+01:00
diff --git a/Makefile b/Makefile
@@ -25,7 +25,7 @@ mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
 current_dir := $(notdir $(patsubst %/,%,$(dir $(mkfile_path))))
 ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
-pysrcdirs = internetnl tests interface checks integration_tests
+pysrcdirs = internetnl tests interface checks integration_tests docker
 pysrc = $(shell find ${pysrcdirs} -name \*.py)
 
 bin = .venv/bin
diff --git a/docker/cron.Dockerfile b/docker/cron.Dockerfile
@@ -1,6 +1,6 @@
 FROM alpine:3.18
 
-RUN apk add --no-cache curl postgresql15
+RUN apk add --no-cache curl postgresql15 python3 py3-prometheus-client py3-requests
 
 COPY docker/cron/periodic /etc/periodic/
 
diff --git a/docker/cron/periodic/15min/tests.py b/docker/cron/periodic/15min/tests.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+
+# run tests on example domains and write metrics to prometheus textfile
+
+# for iterative development
+# docker run -ti -e INTERNETNL_DOMAINNAME=internet.nl -v $PWD/docker/cron/periodic/15min/tests.py:/tests.py \
+# ghcr.io/internetstandards/cron:latest /tests.py --debug
+
+import sys
+import os
+import time
+from prometheus_client import REGISTRY, Gauge, generate_latest
+import prometheus_client
+import logging
+import requests
+
+log = logging.getLogger(__name__)
+
+# passing --debug enables debug logging and prints the metrics to stdout instead of writing the textfile
+DEBUG = "--debug" in sys.argv
+
+# file to write metrics to https://github.com/prometheus/node_exporter?tab=readme-ov-file#textfile-collector
+OUTPUT_TEXTFILE = "/prometheus-textfile-directory/tests.prom"
+
+# maximum number of seconds a single test is polled before it is recorded as timed out, normally
+# taken from INTERNETNL_CACHE_TTL. The fallback (also used for an empty value) keeps standalone runs,
+# like the `docker run` example at the top of this file which does not pass the variable, from
+# crashing on int(None) at import time.
+# NOTE(review): 200 is an assumed fallback, confirm it matches the deployment default
+DEFAULT_TEST_TIMEOUT = 200
+TEST_TIMEOUT = int(os.environ.get("INTERNETNL_CACHE_TTL") or DEFAULT_TEST_TIMEOUT)
+# timeout in seconds passed to every individual HTTP request
+REQUEST_TIMEOUT = 30
+
+# test types to run, used as the first path segment of the request URLs
+TESTS = ["site", "mail"]
+
+IPV4_IP_APP_INTERNAL = os.environ.get("IPV4_IP_APP_INTERNAL")
+INTERNETNL_DOMAINNAME = os.environ.get("INTERNETNL_DOMAINNAME")
+# talk directly to the internal app container as the webserver might
+# have access restrictions in place
+URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
+# the public domain name is sent as Host header, since requests are addressed to an IP address
+HEADERS = {"Host": INTERNETNL_DOMAINNAME}
+
+# domains to run the tests against (used for every test type in TESTS, not only website tests)
+WEBSITE_TEST_DOMAINS = [
+    "example.nl",
+    "example.com",
+]
+
+
+# All metrics are gauges: the script runs once, sets the values for that run and exports them.
+
+# per-probe metrics, labelled with the test type, the tested domain and the probe name
+METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test", "domain", "probe"])
+METRIC_PROBE_SUCCESS = Gauge("tests_probe_success_total", "Whether the probe succeeded.", ["test", "domain", "probe"])
+METRIC_PROBE_RUNTIME = Gauge(
+    "tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test", "domain", "probe"]
+)
+METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test", "domain", "probe"])
+
+# per-test metrics, labelled with the test type and the tested domain
+# NOTE(review): METRIC_TEST_SUCCESS is declared but never set anywhere in this script, and
+# METRIC_TEST_TIMEOUT is only ever set to 1 (it has no sample for runs that did not time out)
+METRIC_TEST_RUN = Gauge("tests_test_run_total", "Test that have been run.", ["test", "domain"])
+METRIC_TEST_CACHE = Gauge("tests_test_cached_total", "Test runs that returned cached results.", ["test", "domain"])
+METRIC_TEST_FAILURE = Gauge("tests_test_failure_total", "Test runs that failed.", ["test", "domain"])
+METRIC_TEST_SUCCESS = Gauge("tests_test_success_total", "Test runs that succeeded.", ["test", "domain"])
+METRIC_TEST_TIMEOUT = Gauge("tests_test_timeout", "Test that ran into timeout.", ["test", "domain"])
+METRIC_TEST_RUNTIME = Gauge("tests_test_runtime_seconds", "Amount of time test ran before done.", ["test", "domain"])
+
+
+def _get_from_app(path):
+    """GET `path` from the internal app container and return the response.
+
+    Redirects are not followed. Raises `requests.HTTPError` for 4xx/5xx statuses and any other
+    `requests` exception for connection problems or timeouts.
+    """
+    # the current timestamp is appended as query string, presumably to avoid cached responses
+    response = requests.get(
+        f"{URL_BASE}/{path}/?{time.time()}",
+        timeout=REQUEST_TIMEOUT,
+        allow_redirects=False,
+        headers=HEADERS,
+    )
+    response.raise_for_status()
+    return response
+
+
+def run_tests_on_domain(test, domain):
+    """Start a test of type `test` for `domain`, poll its probes and record the results as metrics.
+
+    When the first response already reports every probe as done (a cached result) only
+    METRIC_TEST_CACHE is set and the function returns. Otherwise the probes are polled once per
+    second until all are done or TEST_TIMEOUT seconds have passed (recorded in METRIC_TEST_TIMEOUT).
+    Done/success/runtime are recorded per probe, the total runtime per test, and finally the score
+    of every finished probe is fetched.
+
+    Raises the exceptions of the initial and polling requests and of malformed responses (for
+    example a missing key); failures while fetching scores are only logged.
+    """
+    probes_path = f"{test}/probes/{domain}"
+
+    # initiate the test
+    r = _get_from_app(probes_path)
+    log.debug(r.text)
+
+    # abort early if cached result
+    if all(p["done"] for p in r.json()):
+        METRIC_TEST_CACHE.labels(test, domain).set(1)
+        return
+
+    # poll probes until done; the monotonic clock is used so that wall-clock adjustments
+    # cannot distort the timeout or the measured runtimes
+    test_start = time.monotonic()
+    finished_probes = set()
+    while time.monotonic() < test_start + TEST_TIMEOUT:
+        # get probe status
+        r = _get_from_app(probes_path)
+        log.debug(r.text)
+
+        # record probe statuses for probes that are finished
+        probes = r.json()
+        for probe in probes:
+            # a finished probe keeps the runtime of the poll in which it was first seen done
+            if probe["name"] in finished_probes:
+                continue
+            METRIC_PROBE_DONE.labels(test, domain, probe["name"]).set(probe["done"])
+            if probe["done"]:
+                METRIC_PROBE_SUCCESS.labels(test, domain, probe["name"]).set(probe["success"])
+                METRIC_PROBE_RUNTIME.labels(test, domain, probe["name"]).set(int(time.monotonic() - test_start))
+                finished_probes.add(probe["name"])
+
+        # stop when all probes are finished
+        if all(p["done"] for p in probes):
+            break
+
+        time.sleep(1)
+    else:
+        # the loop condition expired without a break: not all probes finished in time
+        METRIC_TEST_TIMEOUT.labels(test, domain).set(1)
+
+    METRIC_TEST_RUNTIME.labels(test, domain).set(int(time.monotonic() - test_start))
+
+    # get additional metrics like score
+    for probe_name in finished_probes:
+        try:
+            r = _get_from_app(f"{test}/{probe_name}/{domain}")
+            # redirects are not followed and do not raise, so only parse the body of a plain 200
+            if r.status_code == 200:
+                probe_result = r.json()
+                METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
+        except Exception:
+            # best effort: a missing score must not fail the whole test run
+            log.exception("failed to get probe score")
+
+
+def run_tests():
+    """Run every test type in TESTS against every domain in WEBSITE_TEST_DOMAINS.
+
+    An exception raised by a single test run is logged and recorded in METRIC_TEST_FAILURE, after
+    which the remaining test/domain combinations are still run.
+    """
+    for test in TESTS:
+        for domain in WEBSITE_TEST_DOMAINS:
+            log.info(f"testing: {test} {domain}")
+            METRIC_TEST_RUN.labels(test, domain).set(1)
+            # start every outcome gauge at 0 so it is exported on each run, also when the
+            # outcome did not occur (the timeout gauge would otherwise only exist after a timeout)
+            METRIC_TEST_CACHE.labels(test, domain).set(0)
+            METRIC_TEST_FAILURE.labels(test, domain).set(0)
+            METRIC_TEST_TIMEOUT.labels(test, domain).set(0)
+            try:
+                run_tests_on_domain(test, domain)
+            except Exception:
+                log.exception("Error during test")
+                METRIC_TEST_FAILURE.labels(test, domain).set(1)
+
+
+def main():
+    """Run all test probes once and export the collected metrics.
+
+    With --debug the metrics are printed to stdout, otherwise they are written to OUTPUT_TEXTFILE.
+    """
+    logging.basicConfig(level=logging.DEBUG if DEBUG else logging.ERROR)
+
+    # disable internal metrics
+    REGISTRY.unregister(prometheus_client.GC_COLLECTOR)
+    REGISTRY.unregister(prometheus_client.PLATFORM_COLLECTOR)
+    REGISTRY.unregister(prometheus_client.PROCESS_COLLECTOR)
+
+    # run test probes against domains and collect metrics
+    run_tests()
+
+    # write metrics to stdout or file in prometheus textfile format
+    if DEBUG:
+        print(generate_latest(REGISTRY).decode())
+    else:
+        # write_to_textfile writes to a temporary file and renames it into place, so the
+        # textfile collector can never scrape a partially written file
+        prometheus_client.write_to_textfile(OUTPUT_TEXTFILE, REGISTRY)
+
+
+if __name__ == "__main__":
+    # CRON_15MIN_RUN_TESTS is the on/off switch for this job; it counts as enabled when unset
+    if os.environ.get("CRON_15MIN_RUN_TESTS", "True") == "True":
+        main()
diff --git a/docker/defaults.env b/docker/defaults.env
@@ -201,6 +201,9 @@ CRON_DAILY_POSTGRESQL_BACKUP=True
 CRON_DAILY_TRUNCATE_EXPORTER_LOGS=True
 CRON_WEEKLY_POSTGRESQL_BACKUP=False
 
+# enable running tests every 15 minutes for metrics collection
+CRON_15MIN_RUN_TESTS=True
+
 INTERNETNL_BRANDING=False
 
 # enable caching, set to off to disable
diff --git a/docker/develop.env b/docker/develop.env
@@ -60,9 +60,10 @@ ROUTINATOR_URL=https://rpki-validator.ripe.net/api/v1/validity
 # use default logging driver instead of journald
 LOGGING_DRIVER=json-file
 
-# disable backup crons
+# disable backup and other crons
 CRON_DAILY_POSTGRESQL_BACKUP=False
 CRON_WEEKLY_POSTGRESQL_BACKUP=False
+CRON_15MIN_RUN_TESTS=False
 
 INTERNETNL_BRANDING=False
 
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
@@ -627,6 +627,9 @@ services:
       - DB_PASSWORD=password
       - CRON_DAILY_POSTGRESQL_BACKUP
       - CRON_WEEKLY_POSTGRESQL_BACKUP
+      - IPV4_IP_APP_INTERNAL
+      - INTERNETNL_DOMAINNAME
+      - INTERNETNL_CACHE_TTL
 
     restart: unless-stopped
     logging:
@@ -647,6 +650,7 @@ services:
       - manual-hof:/app/manual-hall-of-fame/
       - postgres-backups:/var/lib/postgresql/backups
       - nginx-logs-exporter:/var/log/nginx/prometheus-nginxlog-exporter/
+      - prometheus-textfile-directory:/prometheus-textfile-directory
 
     healthcheck:
       test: ["CMD", "pgrep", "crond"]
@@ -793,6 +797,8 @@ services:
       - --collector.netdev.device-exclude=veth
       # ignore docker container interfaces
       - --collector.netclass.ignored-devices=veth
+      - --collector.textfile
+      - --collector.textfile.directory=/prometheus-textfile-directory
     restart: unless-stopped
     logging:
       driver: $LOGGING_DRIVER
@@ -804,6 +810,7 @@ services:
     volumes:
       - /:/host:ro
       - /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket
+      - prometheus-textfile-directory:/prometheus-textfile-directory
 
   docker_stats_exporter:
     # https://github.com/jan4843/docker_stats_exporter
@@ -851,6 +858,8 @@ volumes:
   unbound-zones: {}
   # permanent storage for Prometheus metrics
   prometheus-data: {}
+  # prometheus metric textfile shared volume
+  prometheus-textfile-directory: {}
   # permanent storage for Grafana custom dashboards
   grafana-data: {}
   # shares hosters HoF file between cron and app
diff --git a/docker/monitoring/grafana/dashboards/periodic-tests.json b/docker/monitoring/grafana/dashboards/periodic-tests.json
diff --git a/interface/urls.py b/interface/urls.py