From 59e53572990347a6ad9942ca344d8d29017abc6e Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Sat, 1 Feb 2025 20:48:45 -0800 Subject: [PATCH] [tmpnet] Deploy collectors with golang to simplify cross-repo use Previously, prometheus and promtail were installed and launched with bash scripts. Migrating installation to nix and launch to golang enables directly sharing the functionality with subnet-evm and hypersdk. No more having to copy and maintain copies of the scripts in multiple repos. --- .../run-monitored-tmpnet-cmd/action.yml | 21 +- .github/workflows/ci.yml | 8 +- scripts/run_prometheus.sh | 93 ------- scripts/run_promtail.sh | 91 ------- tests/fixture/e2e/env.go | 4 + tests/fixture/e2e/flags.go | 36 +-- tests/fixture/e2e/metrics_link.go | 2 +- tests/fixture/tmpnet/node_process.go | 7 +- tests/fixture/tmpnet/start_collectors.go | 242 ++++++++++++++++++ 9 files changed, 279 insertions(+), 225 deletions(-) delete mode 100755 scripts/run_prometheus.sh delete mode 100755 scripts/run_promtail.sh create mode 100644 tests/fixture/tmpnet/start_collectors.go diff --git a/.github/actions/run-monitored-tmpnet-cmd/action.yml b/.github/actions/run-monitored-tmpnet-cmd/action.yml index 3a3bc911efbc..a64e9970598f 100644 --- a/.github/actions/run-monitored-tmpnet-cmd/action.yml +++ b/.github/actions/run-monitored-tmpnet-cmd/action.yml @@ -36,23 +36,6 @@ inputs: runs: using: composite steps: - - name: Start prometheus - # Only run for the original repo; a forked repo won't have access to the monitoring credentials - if: (inputs.prometheus_username != '') - shell: bash - # Assumes calling project has a nix flake that ensures a compatible prometheus - run: nix develop --impure --command bash -x ./scripts/run_prometheus.sh - env: - PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }} - PROMETHEUS_PASSWORD: ${{ inputs.prometheus_password }} - - name: Start promtail - if: (inputs.prometheus_username != '') - shell: bash - # Assumes calling project has a nix flake that ensures 
a compatible promtail - run: nix develop --impure --command bash -x ./scripts/run_promtail.sh - env: - LOKI_USERNAME: ${{ inputs.loki_username }} - LOKI_PASSWORD: ${{ inputs.loki_password }} - name: Notify of metrics availability if: (inputs.prometheus_username != '') shell: bash @@ -65,6 +48,10 @@ runs: shell: bash run: ${{ inputs.run }} env: + LOKI_USERNAME: ${{ inputs.loki_username }} + LOKI_PASSWORD: ${{ inputs.loki_password }} + PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }} + PROMETHEUS_PASSWORD: ${{ inputs.prometheus_password }} GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }} GH_WORKFLOW: ${{ inputs.workflow }} GH_RUN_ID: ${{ inputs.run_id }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 142615ced9d9..956f2cdfd9b6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,7 +62,7 @@ jobs: - name: Run e2e tests uses: ./.github/actions/run-monitored-tmpnet-cmd with: - run: E2E_SERIAL=1 ./scripts/tests.e2e.sh --delay-network-shutdown + run: nix develop --impure --command E2E_SERIAL=1 bash -x ./scripts/tests.e2e.sh --enable-collectors prometheus_username: ${{ secrets.PROMETHEUS_ID || '' }} prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }} loki_username: ${{ secrets.LOKI_ID || '' }} @@ -87,7 +87,7 @@ jobs: - name: Run e2e tests with existing network uses: ./.github/actions/run-monitored-tmpnet-cmd with: - run: E2E_SERIAL=1 ./scripts/tests.e2e.existing.sh --delay-network-shutdown + run: nix develop --impure --command E2E_SERIAL=1 bash -x ./scripts/tests.e2e.existing.sh --enable-collectors prometheus_username: ${{ secrets.PROMETHEUS_ID || '' }} prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }} loki_username: ${{ secrets.LOKI_ID || '' }} @@ -112,8 +112,8 @@ jobs: - name: Run e2e tests uses: ./.github/actions/run-monitored-tmpnet-cmd with: - run: ./scripts/tests.upgrade.sh - filter_by_owner: avalanchego-e2e + run: nix develop --impure --command bash -x 
./scripts/tests.upgrade.sh + filter_by_owner: avalanchego-upgrade prometheus_username: ${{ secrets.PROMETHEUS_ID || '' }} prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }} loki_username: ${{ secrets.LOKI_ID || '' }} diff --git a/scripts/run_prometheus.sh b/scripts/run_prometheus.sh deleted file mode 100755 index 92585a10cb76..000000000000 --- a/scripts/run_prometheus.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -# - Starts a prometheus instance in agent-mode to collect metrics from nodes running -# locally and in CI. -# -# - promtail will remain running in the background and will forward metrics to the -# specified prometheus endpoint. -# -# - Each node is configured with a file written to ~/.tmpnet/prometheus/file_sd_configs -# -# - To stop the running instance: -# $ kill -9 `cat ~/.tmpnet/promtheus/run.pid` && rm ~/.tmpnet/promtail/run.pid - -# e.g., -# PROMETHEUS_USERNAME= PROMETHEUS_PASSWORD= ./scripts/run_prometheus.sh -if ! [[ "$0" =~ scripts/run_prometheus.sh ]]; then - echo "must be run from repository root" - exit 255 -fi - -CMD=prometheus - -if ! command -v "${CMD}" &> /dev/null; then - echo "prometheus not found, have you run 'nix develop'?" - echo "To install nix: https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix" - exit 1 -fi - -PROMETHEUS_WORKING_DIR="${HOME}/.tmpnet/prometheus" -PIDFILE="${PROMETHEUS_WORKING_DIR}"/run.pid - -# First check if an agent-mode prometheus is already running. A single instance can collect -# metrics from all local temporary networks. 
-if pgrep --pidfile="${PIDFILE}" -f 'prometheus.*enable-feature=agent' &> /dev/null; then - echo "prometheus is already running locally with --enable-feature=agent" - exit 0 -fi - -PROMETHEUS_URL="${PROMETHEUS_URL:-https://prometheus-poc.avax-dev.network}" -if [[ -z "${PROMETHEUS_URL}" ]]; then - echo "Please provide a value for PROMETHEUS_URL" - exit 1 -fi - -PROMETHEUS_USERNAME="${PROMETHEUS_USERNAME:-}" -if [[ -z "${PROMETHEUS_USERNAME}" ]]; then - echo "Please provide a value for PROMETHEUS_USERNAME" - exit 1 -fi - -PROMETHEUS_PASSWORD="${PROMETHEUS_PASSWORD:-}" -if [[ -z "${PROMETHEUS_PASSWORD}" ]]; then - echo "Please provide a value for PROMETHEUS_PASSWORD" - exit 1 -fi - -# Configure prometheus -FILE_SD_PATH="${PROMETHEUS_WORKING_DIR}/file_sd_configs" -mkdir -p "${FILE_SD_PATH}" - -CONFIG_PATH="${PROMETHEUS_WORKING_DIR}/prometheus.yaml" -cat > "${CONFIG_PATH}" < prometheus.log 2>&1 & -echo $! > "${PIDFILE}" -echo "prometheus started with pid $(cat "${PIDFILE}")" -# shellcheck disable=SC2016 -echo 'To stop prometheus: "kill -SIGTERM `cat ~/.tmpnet/prometheus/run.pid` && rm ~/.tmpnet/prometheus/run.pid"' diff --git a/scripts/run_promtail.sh b/scripts/run_promtail.sh deleted file mode 100755 index 43030bd8aad3..000000000000 --- a/scripts/run_promtail.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -# - Starts a promtail instance to collect logs from nodes running locally and in CI. -# -# - promtail will remain running in the background and will forward logs to the -# specified Loki endpoint. -# -# - Each node is configured with a file written to ~/.tmpnet/promtail/file_sd_configs/ -# -# - To stop the running instance: -# $ kill -9 `cat ~/.tmpnet/promtail/run.pid` && rm ~/.tmpnet/promtail/run.pid - -# e.g., -# LOKI_USERNAME= LOKI_PASSWORD= ./scripts/run_promtail.sh -if ! [[ "$0" =~ scripts/run_promtail.sh ]]; then - echo "must be run from repository root" - exit 255 -fi - -CMD=promtail - -if ! 
command -v "${CMD}" &> /dev/null; then - echo "promtail not found, have you run 'nix develop'?" - echo "To install nix: https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix" - exit 1 -fi - -PROMTAIL_WORKING_DIR="${HOME}/.tmpnet/promtail" -PIDFILE="${PROMTAIL_WORKING_DIR}"/run.pid - -# First check if promtail is already running. A single instance can -# collect logs from all local temporary networks. -if pgrep --pidfile="${PIDFILE}" &> /dev/null; then - echo "promtail is already running" - exit 0 -fi - -LOKI_URL="${LOKI_URL:-https://loki-poc.avax-dev.network}" -if [[ -z "${LOKI_URL}" ]]; then - echo "Please provide a value for LOKI_URL" - exit 1 -fi - -LOKI_USERNAME="${LOKI_USERNAME:-}" -if [[ -z "${LOKI_USERNAME}" ]]; then - echo "Please provide a value for LOKI_USERNAME" - exit 1 -fi - -LOKI_PASSWORD="${LOKI_PASSWORD:-}" -if [[ -z "${LOKI_PASSWORD}" ]]; then - echo "Please provide a value for LOKI_PASSWORD" - exit 1 -fi - -# Configure promtail -FILE_SD_PATH="${PROMTAIL_WORKING_DIR}/file_sd_configs" -mkdir -p "${FILE_SD_PATH}" - -CONFIG_PATH="${PROMTAIL_WORKING_DIR}/promtail.yaml" -cat > "${CONFIG_PATH}" < promtail.log 2>&1 & -echo $! 
> "${PIDFILE}" -echo "promtail started with pid $(cat "${PIDFILE}")" -# shellcheck disable=SC2016 -echo 'To stop promtail: "kill -SIGTERM `cat ~/.tmpnet/promtail/run.pid` && rm ~/.tmpnet/promtail/run.pid"' diff --git a/tests/fixture/e2e/env.go b/tests/fixture/e2e/env.go index cfb6e8d76d39..88d1b7baaa72 100644 --- a/tests/fixture/e2e/env.go +++ b/tests/fixture/e2e/env.go @@ -130,6 +130,10 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork } } + if flagVars.EnableCollectors() { + require.NoError(tmpnet.EnsureCollectorsRunning(tc.Log())) + } + // Start a new network if network == nil { network = desiredNetwork diff --git a/tests/fixture/e2e/flags.go b/tests/fixture/e2e/flags.go index e5afcb9aa5f2..89d7b4c5b6c6 100644 --- a/tests/fixture/e2e/flags.go +++ b/tests/fixture/e2e/flags.go @@ -12,20 +12,16 @@ import ( "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" ) -// Ensure that this value takes into account the scrape_interval -// defined in scripts/run_prometheus.sh. -const networkShutdownDelay = 12 * time.Second - type FlagVars struct { - avalancheGoExecPath string - pluginDir string - networkDir string - reuseNetwork bool - delayNetworkShutdown bool - startNetwork bool - stopNetwork bool - restartNetwork bool - nodeCount int + avalancheGoExecPath string + pluginDir string + networkDir string + reuseNetwork bool + enableCollectors bool + startNetwork bool + stopNetwork bool + restartNetwork bool + nodeCount int } func (v *FlagVars) AvalancheGoExecPath() string { @@ -54,10 +50,14 @@ func (v *FlagVars) RestartNetwork() bool { return v.restartNetwork } +func (v *FlagVars) EnableCollectors() bool { + return v.enableCollectors +} + func (v *FlagVars) NetworkShutdownDelay() time.Duration { - if v.delayNetworkShutdown { + if v.enableCollectors { // Only return a non-zero value if the delay is enabled. 
- return networkShutdownDelay + return tmpnet.NetworkShutdownDelay } return 0 } @@ -121,10 +121,10 @@ func RegisterFlags() *FlagVars { "[optional] restart an existing network previously started with --reuse-network. Useful for ensuring a network is running with the current state of binaries on disk. Ignored if a network is not already running or --stop-network is provided.", ) flag.BoolVar( - &vars.delayNetworkShutdown, - "delay-network-shutdown", + &vars.enableCollectors, + "enable-collectors", false, - "[optional] whether to delay network shutdown to allow a final metrics scrape.", + "[optional] whether to enable collectors of logs and metrics from nodes of the temporary network.", ) flag.BoolVar( &vars.startNetwork, diff --git a/tests/fixture/e2e/metrics_link.go b/tests/fixture/e2e/metrics_link.go index 716938520066..2042d0e461e1 100644 --- a/tests/fixture/e2e/metrics_link.go +++ b/tests/fixture/e2e/metrics_link.go @@ -49,7 +49,7 @@ var _ = ginkgo.AfterEach(func() { // Extend the end time by the shutdown delay (a proxy for the metrics // scrape interval) to maximize the chances of the specified duration // including all metrics relevant to the current spec. - endTime := time.Now().Add(networkShutdownDelay).UnixMilli() + endTime := time.Now().Add(tmpnet.NetworkShutdownDelay).UnixMilli() metricsLink := tmpnet.MetricsLinkForNetwork( env.GetNetwork().UUID, strconv.FormatInt(startTime, 10), diff --git a/tests/fixture/tmpnet/node_process.go b/tests/fixture/tmpnet/node_process.go index 2d429c98a01c..d2da2c07b362 100644 --- a/tests/fixture/tmpnet/node_process.go +++ b/tests/fixture/tmpnet/node_process.go @@ -225,7 +225,12 @@ func (p *NodeProcess) getProcess() (*os.Process, error) { return nil, nil } - proc, err := os.FindProcess(p.pid) + return getProcess(p.pid) +} + +// getProcess retrieves the process if it is running. 
+func getProcess(pid int) (*os.Process, error) {
+	proc, err := os.FindProcess(pid)
 	if err != nil {
 		return nil, fmt.Errorf("failed to find process: %w", err)
 	}
diff --git a/tests/fixture/tmpnet/start_collectors.go b/tests/fixture/tmpnet/start_collectors.go
new file mode 100644
index 000000000000..21591cea2fc1
--- /dev/null
+++ b/tests/fixture/tmpnet/start_collectors.go
@@ -0,0 +1,301 @@
+// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
+// See the file LICENSE for licensing terms.
+
+package tmpnet
+
+import (
+	"errors"
+	"fmt"
+	"io/fs"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"go.uber.org/zap"
+
+	"github.com/ava-labs/avalanchego/utils/logging"
+	"github.com/ava-labs/avalanchego/utils/perms"
+)
+
+const (
+	// NetworkShutdownDelay is slightly longer than the 10s scrape interval
+	// configured for prometheus to ensure a final scrape before shutdown.
+	NetworkShutdownDelay = 12 * time.Second
+
+	// collectorStartTimeout bounds how long to wait for a newly launched
+	// collector to record its pid.
+	collectorStartTimeout = 10 * time.Second
+
+	// collectorPollInterval is the delay between checks for the pid file.
+	collectorPollInterval = 100 * time.Millisecond
+)
+
+// configGeneratorFunc renders the configuration of a collector from its
+// working directory and the basic auth credentials of its remote endpoint.
+type configGeneratorFunc func(workingDir string, username string, password string) string
+
+// EnsureCollectorsRunning ensures that prometheus and promtail are running
+// to collect metrics and logs from the nodes of temporary networks. A
+// collector that is already running is left untouched.
+func EnsureCollectorsRunning(log logging.Logger) error {
+	if err := ensurePrometheusRunning(log); err != nil {
+		return err
+	}
+	return ensurePromtailRunning(log)
+}
+
+// ensurePrometheusRunning ensures an agent-mode prometheus is running with
+// credentials read from PROMETHEUS_USERNAME and PROMETHEUS_PASSWORD.
+func ensurePrometheusRunning(log logging.Logger) error {
+	return ensureCollectorRunning(
+		log,
+		"prometheus",
+		"--config.file=prometheus.yaml --storage.agent.path=./data --web.listen-address=localhost:0 --enable-feature=agent",
+		"PROMETHEUS",
+		func(workingDir string, username string, password string) string {
+			return fmt.Sprintf(`
+global:
+  scrape_interval: 10s     # Default is every 1 minute.
+  evaluation_interval: 10s # The default is every 1 minute.
+  scrape_timeout: 5s       # The default is every 10s
+
+scrape_configs:
+  - job_name: "avalanchego"
+    metrics_path: "/ext/metrics"
+    file_sd_configs:
+      - files:
+          - '%s/file_sd_configs/*.json'
+
+remote_write:
+  - url: "https://prometheus-poc.avax-dev.network/api/v1/write"
+    basic_auth:
+      username: "%s"
+      password: "%s"
+`, workingDir, username, password)
+		},
+	)
+}
+
+// ensurePromtailRunning ensures promtail is running with credentials read
+// from LOKI_USERNAME and LOKI_PASSWORD.
+func ensurePromtailRunning(log logging.Logger) error {
+	return ensureCollectorRunning(
+		log,
+		"promtail",
+		"-config.file=promtail.yaml",
+		"LOKI",
+		func(workingDir string, username string, password string) string {
+			return fmt.Sprintf(`
+server:
+  http_listen_port: 0
+  grpc_listen_port: 0
+
+positions:
+  filename: %s/positions.yaml
+
+client:
+  url: "https://loki-poc.avax-dev.network/api/prom/push"
+  basic_auth:
+    username: "%s"
+    password: "%s"
+
+scrape_configs:
+  - job_name: "avalanchego"
+    file_sd_configs:
+      - files:
+          - '%s/file_sd_configs/*.json'
+`, workingDir, username, password, workingDir)
+		},
+	)
+}
+
+// ensureCollectorRunning starts the named collector as a detached process
+// unless the pid recorded in its pid file refers to a running process. The
+// collector's config, log and pid file are written to a directory named for
+// the command beneath the tmpnet path. Credentials are read from the
+// <baseEnvName>_USERNAME and <baseEnvName>_PASSWORD env vars.
+func ensureCollectorRunning(
+	log logging.Logger,
+	cmdName string,
+	args string,
+	baseEnvName string,
+	configGenerator configGeneratorFunc,
+) error {
+	tmpnetDir, err := getTmpnetPath()
+	if err != nil {
+		return err
+	}
+	workingDir := filepath.Join(tmpnetDir, cmdName)
+	pidFilename := "run.pid"
+	pidPath := filepath.Join(workingDir, pidFilename)
+
+	if err := os.MkdirAll(workingDir, perms.ReadWriteExecute); err != nil {
+		return fmt.Errorf("failed to create %s dir: %w", cmdName, err)
+	}
+
+	if err := os.MkdirAll(filepath.Join(workingDir, "file_sd_configs"), perms.ReadWriteExecute); err != nil {
+		return fmt.Errorf("failed to create %s file_sd_configs dir: %w", cmdName, err)
+	}
+
+	running, err := isCollectorRunning(cmdName, pidPath)
+	if err != nil {
+		return err
+	}
+	if running {
+		log.Info(cmdName + " is already running")
+		return nil
+	}
+
+	// Remove the pid file to avoid conflicting with the new one starting
+	if err := os.Remove(pidPath); err != nil {
+		if !errors.Is(err, fs.ErrNotExist) {
+			return fmt.Errorf("failed to remove stale pid file: %w", err)
+		}
+	} else {
+		log.Info("deleted stale "+cmdName+" pid file",
+			zap.String("path", pidPath),
+		)
+	}
+
+	// TODO(marun) Maybe collect errors instead of returning them 1-by-1?
+	if _, err := exec.LookPath(cmdName); err != nil {
+		return fmt.Errorf("%s command not found. Maybe run 'nix develop'?", cmdName)
+	}
+
+	usernameEnvVar := baseEnvName + "_USERNAME"
+	username := getEnv(usernameEnvVar, "")
+	if len(username) == 0 {
+		return fmt.Errorf("%s env var not set", usernameEnvVar)
+	}
+
+	passwordEnvVar := baseEnvName + "_PASSWORD"
+	password := getEnv(passwordEnvVar, "")
+	if len(password) == 0 {
+		return fmt.Errorf("%s env var not set", passwordEnvVar)
+	}
+
+	confFilename := cmdName + ".yaml"
+	confPath := filepath.Join(workingDir, confFilename)
+	log.Info("writing "+cmdName+" config",
+		zap.String("path", confPath),
+	)
+	config := configGenerator(workingDir, username, password)
+	if err := os.WriteFile(confPath, []byte(config), perms.ReadWrite); err != nil {
+		return fmt.Errorf("failed to write %s config: %w", cmdName, err)
+	}
+
+	fullCmd := "nohup " + cmdName + " " + args + " > " + cmdName + ".log 2>&1 & echo -n \"$!\" > " + pidFilename
+	log.Info("starting "+cmdName,
+		zap.String("workingDir", workingDir),
+		zap.String("fullCmd", fullCmd),
+	)
+
+	// TODO(marun) Figure out a way to redirect stdout and stderr of a detached child process without a bash shell
+	cmd := exec.Command("bash", "-c", fullCmd)
+	configureDetachedProcess(cmd) // Ensure the child process will outlive its parent
+	cmd.Dir = workingDir
+
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("failed to start %s: %w", cmdName, err)
+	}
+
+	// TODO(marun) Use a context instead
+	pid, err := waitForPIDFile(pidPath, collectorStartTimeout)
+	if err != nil {
+		return fmt.Errorf("failed to start %s: %w", cmdName, err)
+	}
+
+	// The shell exits once the pid file is written. Reap it so that it does
+	// not linger as a zombie for the lifetime of this process.
+	if err := cmd.Wait(); err != nil {
+		return fmt.Errorf("failed to launch %s: %w", cmdName, err)
+	}
+
+	log.Info(cmdName+" started",
+		zap.String("pid", pid),
+	)
+
+	killMsg := fmt.Sprintf("To stop %s: kill -SIGTERM $(cat %s) && rm %s", cmdName, pidPath, pidPath)
+	log.Info(killMsg)
+
+	return nil
+}
+
+// isCollectorRunning reports whether the pid recorded at pidPath refers to
+// a running process. A missing or empty pid file indicates that the
+// collector is not running.
+func isCollectorRunning(cmdName string, pidPath string) (bool, error) {
+	pidData, err := os.ReadFile(pidPath)
+	if err != nil && !errors.Is(err, fs.ErrNotExist) {
+		return false, fmt.Errorf("failed to read %s PID file %s: %w", cmdName, pidPath, err)
+	}
+	// Tolerate surrounding whitespace such as a trailing newline
+	rawPID := strings.TrimSpace(string(pidData))
+	if len(rawPID) == 0 {
+		return false, nil
+	}
+	pid, err := strconv.Atoi(rawPID)
+	if err != nil {
+		return false, fmt.Errorf("failed to parse %s PID: %w", cmdName, err)
+	}
+	process, err := getProcess(pid)
+	if err != nil {
+		return false, err
+	}
+	return process != nil, nil
+}
+
+// waitForPIDFile polls until the file at pidPath is non-empty and returns
+// its contents, or returns an error once the timeout has elapsed.
+func waitForPIDFile(pidPath string, timeout time.Duration) (string, error) {
+	deadline := time.Now().Add(timeout)
+	for {
+		if fileExistsAndNotEmpty(pidPath) {
+			pid, err := readFileContents(pidPath)
+			if err != nil {
+				return "", fmt.Errorf("failed to read pid file: %w", err)
+			}
+			return pid, nil
+		}
+		if time.Now().After(deadline) {
+			return "", fmt.Errorf("timed out after %s waiting for pid file %s", timeout, pidPath)
+		}
+		time.Sleep(collectorPollInterval)
+	}
+}
+
+// fileExistsAndNotEmpty reports whether the named file exists and has a
+// non-zero size. A stat failure other than the file not existing is printed
+// to stdout and treated as the file not being ready.
+func fileExistsAndNotEmpty(filename string) bool {
+	fileInfo, err := os.Stat(filename)
+	if err != nil {
+		if !errors.Is(err, fs.ErrNotExist) {
+			fmt.Printf("Error stating file: %v\n", err)
+		}
+		return false
+	}
+	return fileInfo.Size() > 0
+}
+
+// readFileContents returns the contents of the named file as a string.
+func readFileContents(filename string) (string, error) {
+	content, err := os.ReadFile(filename)
+	if err != nil {
+		return "", err
+	}
+	return string(content), nil
+}
+
+// getEnv returns the value of the env var named by key, or fallback if the
+// env var is not set. An env var set to the empty string is returned as-is.
+// TODO(marun) Put this somewhere standard
+func getEnv(key, fallback string) string {
+	if value, ok := os.LookupEnv(key); ok {
+		return value
+	}
+	return fallback
+}