From 01300b7aeeb69ab00a4e523a8df0619ccf15406a Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Sat, 22 Feb 2025 17:11:00 +0100 Subject: [PATCH] fixup: Refactor for review --- bin/tmpnetctl | 13 + tests/e2e/README.md | 29 ++ tests/fixture/e2e/env.go | 2 +- tests/fixture/e2e/flags.go | 27 +- tests/fixture/tmpnet/README.md | 88 +++-- tests/fixture/tmpnet/cmd/main.go | 33 +- tests/fixture/tmpnet/collectors.go | 419 +++++++++++++++++++++++ tests/fixture/tmpnet/start_collectors.go | 242 ------------- tests/fixture/tmpnet/utils.go | 9 + tests/upgrade/upgrade_test.go | 6 + 10 files changed, 573 insertions(+), 295 deletions(-) create mode 100755 bin/tmpnetctl create mode 100644 tests/fixture/tmpnet/collectors.go delete mode 100644 tests/fixture/tmpnet/start_collectors.go diff --git a/bin/tmpnetctl b/bin/tmpnetctl new file mode 100755 index 000000000000..d9cd8b8bb595 --- /dev/null +++ b/bin/tmpnetctl @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Ensure the go command is run from the root of the repository +AVALANCHE_PATH=$(cd "$( dirname "${BASH_SOURCE[0]}" )"; cd .. && pwd ) +cd "${AVALANCHE_PATH}" + +# Build if needed +if [[ ! -f ./build/tmpnetctl ]]; then + ./scripts/build_tmpnetctl.sh +fi +./build/tmpnetctl diff --git a/tests/e2e/README.md b/tests/e2e/README.md index 1851ffd3141e..464b0f55462f 100644 --- a/tests/e2e/README.md +++ b/tests/e2e/README.md @@ -107,3 +107,32 @@ these bootstrap checks during development, set the ```bash E2E_SKIP_BOOTSTRAP_CHECKS=1 ./bin/ginkgo -v ./tests/e2e ... ``` + +## Monitoring + +It is possible to enable collection of logs and metrics from the +temporary networks used for e2e testing by: + + - Supplying `--enable-collectors` as an argument to the test suite + - Starting collectors in advance of a test run with `tmpnetctl + start-collectors` + +Both methods require: + + - Auth credentials to be supplied as env vars: + - `PROMETHEUS_USERNAME` + - `PROMETHEUS_PASSWORD` + - `LOKI_USERNAME` + - `LOKI_PASSWORD` + - The availability in the path of binaries for promtail and prometheus + - Starting a development shell with `nix develop` is one way to + ensure this and requires the [installation of + nix](https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix). + +Once started, the collectors will continue to run in the background +until stopped by `tmpnetctl stop-collectors`. + +The results of collection will be viewable at +https://grafana-poc.avax-dev.network. + +For more detail, see the [tmpnet docs](../tmpnet/README.md#monitoring). diff --git a/tests/fixture/e2e/env.go b/tests/fixture/e2e/env.go index 55d22e36632b..35eea8478a2f 100644 --- a/tests/fixture/e2e/env.go +++ b/tests/fixture/e2e/env.go @@ -131,7 +131,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork } if flagVars.EnableCollectors() { - require.NoError(tmpnet.EnsureCollectorsRunning(tc.Log())) + require.NoError(tmpnet.EnsureCollectorsRunning(tc.DefaultContext(), tc.Log())) } // Start a new network diff --git a/tests/fixture/e2e/flags.go b/tests/fixture/e2e/flags.go index 5fba80eb75b8..815c8e011ed2 100644 --- a/tests/fixture/e2e/flags.go +++ b/tests/fixture/e2e/flags.go @@ -96,14 +96,6 @@ func (v *FlagVars) NodeCount() int { return v.nodeCount } -func GetEnvWithDefault(envVar, defaultVal string) string { - val := os.Getenv(envVar) - if len(val) == 0 { - return defaultVal - } - return val -} - func RegisterFlags() *FlagVars { vars := FlagVars{} flag.StringVar( @@ -118,7 +110,7 @@ func RegisterFlags() *FlagVars { flag.StringVar( &vars.pluginDir, "plugin-dir", - GetEnvWithDefault(tmpnet.AvalancheGoPluginDirEnvName, os.ExpandEnv("$HOME/.avalanchego/plugins")), + tmpnet.GetEnvWithDefault(tmpnet.AvalancheGoPluginDirEnvName, os.ExpandEnv("$HOME/.avalanchego/plugins")), fmt.Sprintf( "[optional] the dir containing VM plugins. Also possible to configure via the %s env variable.", tmpnet.AvalancheGoPluginDirEnvName, @@ -142,12 +134,7 @@ func RegisterFlags() *FlagVars { false, "[optional] restart an existing network previously started with --reuse-network. Useful for ensuring a network is running with the current state of binaries on disk. Ignored if a network is not already running or --stop-network is provided.", ) - flag.BoolVar( - &vars.enableCollectors, - "enable-collectors", - cast.ToBool(GetEnvWithDefault("TMPNET_ENABLE_COLLECTORS", "false")), - "[optional] whether to enable collectors of logs and metrics from nodes of the temporary network.", - ) + SetEnableCollectorsFlag(&vars.enableCollectors) flag.BoolVar( &vars.startNetwork, "start-network", @@ -169,3 +156,13 @@ func RegisterFlags() *FlagVars { return &vars } + +// Enable reuse by the upgrade job +func SetEnableCollectorsFlag(p *bool) { + flag.BoolVar( + p, + "enable-collectors", + cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_ENABLE_COLLECTORS", "false")), + "[optional] whether to enable collectors of logs and metrics from nodes of the temporary network.", + ) +} diff --git a/tests/fixture/tmpnet/README.md b/tests/fixture/tmpnet/README.md index b6a0880d611e..dccde8c668b5 100644 --- a/tests/fixture/tmpnet/README.md +++ b/tests/fixture/tmpnet/README.md @@ -24,18 +24,24 @@ repositories. The functionality in this package is grouped by logical purpose into the following non-test files: -| Filename | Types | Purpose | -|:------------------|:------------|:-----------------------------------------------| -| defaults.go | | Defines common default configuration | -| flags.go | FlagsMap | Simplifies configuration of avalanchego flags | -| genesis.go | | Creates test genesis | -| network.go | Network | Orchestrates and configures temporary networks | -| network_config.go | Network | Reads and writes network configuration | -| node.go | Node | Orchestrates and configures nodes | -| node_config.go | Node | Reads and writes node configuration | -| node_process.go | NodeProcess | Orchestrates node processes | -| subnet.go | Subnet | Orchestrates subnets | -| utils.go | | Defines shared utility functions | +| Filename | Types | Purpose | +|:----------------------------|:------------|:----------------------------------------------------| +| collectors.go | | Starts and stops collectors for logs and metrics | +| defaults.go | | Defines common default configuration | +| detached_process_default.go | | Configures detached processes for darwin and linux | +| detached_process_windows.go | | No-op detached process configuration for windows | +| flags.go | FlagsMap | Simplifies configuration of avalanchego flags | +| genesis.go | | Creates test genesis | +| kube.go | | Library for Kubernetes interaction | +| local_network.go | | Defines configuration for the default local network | +| network.go | Network | Orchestrates and configures temporary networks | +| network_config.go | Network | Reads and writes network configuration | +| network_test.go | | Simple test round-tripping Network serialization | +| node.go | Node | Orchestrates and configures nodes | +| node_config.go | Node | Reads and writes node configuration | +| node_process.go | NodeProcess | Orchestrates node processes | +| subnet.go | Subnet | Orchestrates subnets | +| utils.go | | Defines shared utility functions | ## Usage @@ -280,35 +286,54 @@ shared. ### Example usage ```bash -# Start prometheus to collect metrics -PROMETHEUS_USERNAME= PROMETHEUS_PASSWORD= ./scripts/run_prometheus.sh +# Start a nix shell to ensure the availability of promtail and prometheus. +nix develop -# Start promtail to collect logs -LOKI_USERNAME= LOKI_PASSWORD= ./scripts/run_promtail.sh +# Enable collection of logs and metrics +PROMETHEUS_USERNAME= \ +PROMETHEUS_PASSWORD= \ +LOKI_USERNAME= \ +LOKI_PASSWORD= \ +./bin/tmpnetctl start-collectors # Network start emits link to grafana displaying collected logs and metrics ./bin/tmpnetctl start-network -# Configure metrics collection from a local node binding to the default API -# port of 9650 and storing its logs in ~/.avalanchego/logs. The script will -# also emit a link to grafana. -./scripts/configure-local-metrics-collection.sh +# When done with the network, stop the collectors +./bin/tmpnetctl stop-collectors ``` +### Starting collectors + +Collectors for logs and metrics can be started by `tmpnetctl +start-collectors`: + + - Requires that the following env vars be set + - `PROMETHEUS_USERNAME` + - `PROMETHEUS_PASSWORD` + - `LOKI_USERNAME` + - `LOKI_PASSWORD` + - Requires that binaries for promtail and prometheus be available in the path + - Starting a development shell with `nix develop` is one way to + ensure this and requires the [installation of + nix](https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix). + - Starts prometheus in agent mode configured to scrape metrics from + configured nodes and forward them to + https://prometheus-poc.avax-dev.network. + - Starts promtail configured to collect logs from configured nodes + and forward them to https://loki-poc.avax-dev.network. + +### Stopping collectors + +Collectors for logs and metrics can be stopped by `tmpnetctl +stop-collectors`: + ### Metrics collection When a node is started, configuration enabling collection of metrics from the node is written to `~/.tmpnet/prometheus/file_sd_configs/[network uuid]-[node id].json`. -The `scripts/run_prometheus.sh` script starts prometheus in agent mode -configured to scrape metrics from configured nodes and forward the -metrics to a persistent prometheus instance. The script requires that -the `PROMETHEUS_USERNAME` and `PROMETHEUS_PASSWORD` env vars be set. By -default the prometheus instance at -https://prometheus-poc.avax-dev.network will be targeted and -this can be overridden via the `PROMETHEUS_URL` env var. - ### Log collection Nodes log are stored at `~/.tmpnet/networks/[network id]/[node @@ -320,13 +345,6 @@ collection of logs for the node is written to `~/.tmpnet/promtail/file_sd_configs/[network uuid]-[node id].json`. -The `scripts/run_promtail.sh` script starts promtail configured to -collect logs from configured nodes and forward the results to loki. The -script requires that the `LOKI_USERNAME` and `LOKI_PASSWORD` env vars be -set. By default the loki instance at -https://loki-poc.avax-dev.network will be targeted and this -can be overridden via the `LOKI_URL` env var. - ### Labels The logs and metrics collected for temporary networks will have the diff --git a/tests/fixture/tmpnet/cmd/main.go b/tests/fixture/tmpnet/cmd/main.go index 1e5ebbf39735..6257c449ac38 100644 --- a/tests/fixture/tmpnet/cmd/main.go +++ b/tests/fixture/tmpnet/cmd/main.go @@ -16,7 +16,6 @@ import ( "go.uber.org/zap" "github.com/ava-labs/avalanchego/tests" - "github.com/ava-labs/avalanchego/tests/fixture/e2e" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" "github.com/ava-labs/avalanchego/utils/logging" "github.com/ava-labs/avalanchego/version" @@ -124,7 +123,7 @@ func main() { startNetworkCmd.PersistentFlags().StringVar( &pluginDir, "plugin-dir", - e2e.GetEnvWithDefault(tmpnet.AvalancheGoPluginDirEnvName, os.ExpandEnv("$HOME/.avalanchego/plugins")), + tmpnet.GetEnvWithDefault(tmpnet.AvalancheGoPluginDirEnvName, os.ExpandEnv("$HOME/.avalanchego/plugins")), "[optional] the dir containing VM plugins", ) startNetworkCmd.PersistentFlags().Uint8Var(&nodeCount, "node-count", tmpnet.DefaultNodeCount, "Number of nodes the network should initially consist of") @@ -167,6 +166,36 @@ func main() { } rootCmd.AddCommand(restartNetworkCmd) + startCollectorsCmd := &cobra.Command{ + Use: "start-collectors", + Short: "Start log and metric collectors for local process-based nodes", + RunE: func(*cobra.Command, []string) error { + ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout) + defer cancel() + log, err := tests.LoggerForFormat("", rawLogFormat) + if err != nil { + return err + } + return tmpnet.EnsureCollectorsRunning(ctx, log) + }, + } + rootCmd.AddCommand(startCollectorsCmd) + + stopCollectorsCmd := &cobra.Command{ + Use: "stop-collectors", + Short: "Stop log and metric collectors for local process-based nodes", + RunE: func(*cobra.Command, []string) error { + ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout) + defer cancel() + log, err := tests.LoggerForFormat("", rawLogFormat) + if err != nil { + return err + } + return tmpnet.EnsureCollectorsStopped(ctx, log) + }, + } + rootCmd.AddCommand(stopCollectorsCmd) + if err := rootCmd.Execute(); err != nil { fmt.Fprintf(os.Stderr, "tmpnetctl failed: %v\n", err) os.Exit(1) diff --git a/tests/fixture/tmpnet/collectors.go b/tests/fixture/tmpnet/collectors.go new file mode 100644 index 000000000000..ac4625e7b8f5 --- /dev/null +++ b/tests/fixture/tmpnet/collectors.go @@ -0,0 +1,419 @@ +// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. + +package tmpnet + +import ( + "context" + "errors" + "fmt" + "io/fs" + "os" + "os/exec" + "path/filepath" + "strconv" + "syscall" + "time" + + "go.uber.org/zap" + + "github.com/ava-labs/avalanchego/utils/logging" + "github.com/ava-labs/avalanchego/utils/perms" +) + +type configGeneratorFunc func(workingDir string, username string, password string) string + +const ( + collectorTickerInterval = 100 * time.Millisecond + + prometheusScrapeInterval = 10 * time.Second + + prometheusCmd = "prometheus" + promtailCmd = "promtail" + + // Use a delay slightly longer than the scrape interval to ensure a final scrape before shutdown + NetworkShutdownDelay = prometheusScrapeInterval + 2*time.Second +) + +// EnsureCollectorsRunning ensures collectors are running to collect logs and metrics from local nodes. +func EnsureCollectorsRunning(ctx context.Context, log logging.Logger) error { + if _, ok := ctx.Deadline(); !ok { + return errors.New("unable to start collectors with a context without a deadline") + } + if err := ensurePrometheusRunning(ctx, log); err != nil { + return err + } + if err := ensurePromtailRunning(ctx, log); err != nil { + return err + } + + log.Info("To stop: tmpnetctl stop-collectors") + + return nil +} + +// EnsureCollectorsStopped ensures collectors are not running. +func EnsureCollectorsStopped(ctx context.Context, log logging.Logger) error { + if _, ok := ctx.Deadline(); !ok { + return errors.New("unable to start collectors with a context without a deadline") + } + for _, cmdName := range []string{prometheusCmd, promtailCmd} { + // Determine if the process is running + workingDir, err := getWorkingDir(cmdName) + if err != nil { + return err + } + pidPath := getPIDPath(workingDir) + proc, err := processFromPIDFile(workingDir, pidPath) + if err != nil { + return err + } + if proc == nil { + log.Info("collector not running", + zap.String("cmd", cmdName), + ) + continue + } + + log.Info("sending SIGTERM to collector process", + zap.String("cmdName", cmdName), + zap.Int("pid", proc.Pid), + ) + if err := proc.Signal(syscall.SIGTERM); err != nil { + return fmt.Errorf("failed to send SIGTERM to pid %d: %w", proc.Pid, err) + } + + log.Info("waiting for collector process to stop", + zap.String("cmdName", cmdName), + zap.Int("pid", proc.Pid), + ) + ticker := time.NewTicker(collectorTickerInterval) + defer ticker.Stop() + for { + p, err := getProcess(proc.Pid) + if err != nil { + return fmt.Errorf("failed to retrieve process: %w", err) + } + if p == nil { + // Process is no longer running + + // Attempt to clear the PID file. Not critical that it is removed, just good housekeeping. + if err := clearStalePIDFile(log, cmdName, pidPath); err != nil { + log.Warn("failed to remove stale PID file", + zap.String("cmd", cmdName), + zap.String("pidFile", pidPath), + zap.Error(err), + ) + } + + break + } + + select { + case <-ctx.Done(): + return fmt.Errorf("failed to see %s stop before timeout: %w", cmdName, ctx.Err()) + case <-ticker.C: + } + } + log.Info("collector stopped", + zap.String("cmdName", cmdName), + ) + } + + return nil +} + +// ensurePrometheusRunning ensures an agent-mode prometheus process is running to collect metrics from local nodes. +func ensurePrometheusRunning(ctx context.Context, log logging.Logger) error { + return ensureCollectorRunning( + ctx, + log, + prometheusCmd, + "--config.file=prometheus.yaml --storage.agent.path=./data --web.listen-address=localhost:0 --enable-feature=agent", + "PROMETHEUS", + func(workingDir string, username string, password string) string { + return fmt.Sprintf(` +global: + scrape_interval: %v # Default is every 1 minute. + evaluation_interval: 10s # The default is every 1 minute. + scrape_timeout: 5s # The default is every 10s + +scrape_configs: + - job_name: "avalanchego" + metrics_path: "/ext/metrics" + file_sd_configs: + - files: + - '%s/file_sd_configs/*.json' + +remote_write: + - url: "https://prometheus-poc.avax-dev.network/api/v1/write" + basic_auth: + username: "%s" + password: "%s" +`, prometheusScrapeInterval, workingDir, username, password) + }, + ) +} + +// ensurePromtailRunning ensures a promtail process is running to collect logs from local nodes. +func ensurePromtailRunning(ctx context.Context, log logging.Logger) error { + return ensureCollectorRunning( + ctx, + log, + promtailCmd, + "-config.file=promtail.yaml", + "LOKI", + func(workingDir string, username string, password string) string { + return fmt.Sprintf(` +server: + http_listen_port: 0 + grpc_listen_port: 0 + +positions: + filename: %s/positions.yaml + +client: + url: "https://loki-poc.avax-dev.network/api/prom/push" + basic_auth: + username: "%s" + password: "%s" + +scrape_configs: + - job_name: "avalanchego" + file_sd_configs: + - files: + - '%s/file_sd_configs/*.json' +`, workingDir, username, password, workingDir) + }, + ) +} + +func getWorkingDir(cmdName string) (string, error) { + tmpnetDir, err := getTmpnetPath() + if err != nil { + return "", err + } + return filepath.Join(tmpnetDir, cmdName), nil +} + +func getPIDPath(workingDir string) string { + return filepath.Join(workingDir, "run.pid") +} + +// ensureCollectorRunning starts a collector process if it is not already running. +func ensureCollectorRunning( + ctx context.Context, + log logging.Logger, + cmdName string, + args string, + baseEnvName string, + configGenerator configGeneratorFunc, +) error { + // Determine paths + workingDir, err := getWorkingDir(cmdName) + if err != nil { + return err + } + pidPath := getPIDPath(workingDir) + + // Ensure required paths exist + if err := os.MkdirAll(workingDir, perms.ReadWriteExecute); err != nil { + return fmt.Errorf("failed to create %s dir: %w", cmdName, err) + } + if err := os.MkdirAll(filepath.Join(workingDir, "file_sd_configs"), perms.ReadWriteExecute); err != nil { + return fmt.Errorf("failed to create %s file_sd_configs dir: %w", cmdName, err) + } + + // Check if the process is already running + if process, err := processFromPIDFile(cmdName, pidPath); err != nil { + return err + } else if process != nil { + log.Info(cmdName + " is already running") + return nil + } + + // Clear any stale pid file + if err := clearStalePIDFile(log, cmdName, pidPath); err != nil { + return err + } + + // Check if the specified command is available in the path + if _, err := exec.LookPath(cmdName); err != nil { + return fmt.Errorf("%s command not found. Maybe run 'nix develop'?", cmdName) + } + + // Write the collector config file + if err := writeConfigFile(log, cmdName, workingDir, baseEnvName, configGenerator); err != nil { + return err + } + + // Start the collector + return startCollector(ctx, log, cmdName, args, workingDir, pidPath) +} + +// processFromPIDFile attempts to retrieve a running process from the specified PID file. +func processFromPIDFile(cmdName string, pidPath string) (*os.Process, error) { + pid, err := getPID(cmdName, pidPath) + if err != nil { + return nil, err + } + if pid == 0 { + return nil, nil + } + return getProcess(pid) +} + +// getPID attempts to read the PID of the collector from a PID file. +func getPID(cmdName string, pidPath string) (int, error) { + pidData, err := os.ReadFile(pidPath) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return 0, fmt.Errorf("failed to read %s PID file %s: %w", cmdName, pidPath, err) + } + if len(pidData) == 0 { + return 0, nil + } + pid, err := strconv.Atoi(string(pidData)) + if err != nil { + return 0, fmt.Errorf("failed to parse %s PID: %w", cmdName, err) + } + return pid, nil +} + +// clearStalePIDFile remove an existing pid file to avoid conflicting with a new process. +func clearStalePIDFile(log logging.Logger, cmdName string, pidPath string) error { + if err := os.Remove(pidPath); err != nil { + if !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("failed to remove stale pid file: %w", err) + } + } else { + log.Info("deleted stale "+cmdName+" pid file", + zap.String("path", pidPath), + ) + } + return nil +} + +// writeConfigFile writes the configuration file for a collector +func writeConfigFile( + log logging.Logger, + cmdName string, + workingDir string, + baseEnvName string, + configGenerator configGeneratorFunc, +) error { + // Retrieve the credentials for the command + username, password, err := getCredentials(baseEnvName) + if err != nil { + return err + } + + // Generate configuration for the command to its working dir + confFilename := cmdName + ".yaml" + confPath := filepath.Join(workingDir, confFilename) + log.Info("writing "+cmdName+" config", + zap.String("path", confPath), + ) + config := configGenerator(workingDir, username, password) + return os.WriteFile(confPath, []byte(config), perms.ReadWrite) +} + +// getCredentials retrieves the username and password for the given base env name. +func getCredentials(baseEnvName string) (string, string, error) { + usernameEnvVar := baseEnvName + "_USERNAME" + username := GetEnvWithDefault(usernameEnvVar, "") + if len(username) == 0 { + return "", "", fmt.Errorf("%s env var not set", usernameEnvVar) + } + passwordEnvVar := baseEnvName + "_PASSWORD" + password := GetEnvWithDefault(passwordEnvVar, "") + if len(password) == 0 { + return "", "", fmt.Errorf("%s var not set", passwordEnvVar) + } + return username, password, nil +} + +// Start a collector. Use bash to execute the command in the background and enable +// stderr and stdout redirection to a log file. +// +// Ideally this would be possible without bash, but it does not seem possible to +// have this process open a log file, set cmd.Stdout cmd.Stderr to that file, and +// then have the child process be able to write to that file once the parent +// process exits. Attempting to do so resulted in an empty log file. +func startCollector( + ctx context.Context, + log logging.Logger, + cmdName string, + args string, + workingDir string, + pidPath string, +) error { + fullCmd := "nohup " + cmdName + " " + args + " > " + cmdName + ".log 2>&1 & echo -n \"$!\" > " + pidPath + log.Info("starting "+cmdName, + zap.String("workingDir", workingDir), + zap.String("fullCmd", fullCmd), + ) + + cmd := exec.Command("bash", "-c", fullCmd) + configureDetachedProcess(cmd) // Ensure the child process will outlive its parent + cmd.Dir = workingDir + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to start %s: %w", cmdName, err) + } + + // Wait for PID file to be written. It's not enough to check for the PID of cmd + // because the PID we want is a child of the process that cmd represents. + if pid, err := waitForPIDFile(ctx, cmdName, pidPath); err != nil { + return err + } else { + log.Info(cmdName+" started", + zap.String("pid", pid), + ) + } + + return nil +} + +// waitForPIDFile waits for the PID file to be written as an indication of process start. +func waitForPIDFile(ctx context.Context, cmdName string, pidPath string) (string, error) { + var ( + ticker = time.NewTicker(collectorTickerInterval) + pid string + ) + defer ticker.Stop() + for { + if fileExistsAndNotEmpty(pidPath) { + var err error + pid, err = readFileContents(pidPath) + if err != nil { + return "", fmt.Errorf("failed to read pid file: %w", err) + } + break + } + select { + case <-ctx.Done(): + return "", fmt.Errorf("failed to wait for %s to start before timeout: %w", cmdName, ctx.Err()) + case <-ticker.C: + } + } + return pid, nil +} + +func fileExistsAndNotEmpty(filename string) bool { + fileInfo, err := os.Stat(filename) + if err != nil { + if os.IsNotExist(err) { + return false + } + fmt.Printf("Error stating file: %v\n", err) + return false + } + return fileInfo.Size() > 0 +} + +func readFileContents(filename string) (string, error) { + content, err := os.ReadFile(filename) + if err != nil { + return "", err + } + return string(content), nil +} diff --git a/tests/fixture/tmpnet/start_collectors.go b/tests/fixture/tmpnet/start_collectors.go deleted file mode 100644 index 21591cea2fc1..000000000000 --- a/tests/fixture/tmpnet/start_collectors.go +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. -// See the file LICENSE for licensing terms. - -package tmpnet - -import ( - "errors" - "fmt" - "io/fs" - "os" - "os/exec" - "path/filepath" - "strconv" - "time" - - "go.uber.org/zap" - - "github.com/ava-labs/avalanchego/utils/logging" - "github.com/ava-labs/avalanchego/utils/perms" -) - -type configGeneratorFunc func(workingDir string, username string, password string) string - -// Use a delay slightly longer than the 10s scrape interval configured for prometheus to ensure a final scrape before shutdown -const NetworkShutdownDelay = 12 * time.Second - -func EnsureCollectorsRunning(log logging.Logger) error { - if err := ensurePrometheusRunning(log); err != nil { - return err - } - return ensurePromtailRunning(log) -} - -func ensurePrometheusRunning(log logging.Logger) error { - return ensureCollectorRunning( - log, - "prometheus", - "--config.file=prometheus.yaml --storage.agent.path=./data --web.listen-address=localhost:0 --enable-feature=agent", - "PROMETHEUS", - func(workingDir string, username string, password string) string { - return fmt.Sprintf(` -global: - scrape_interval: 10s # Default is every 1 minute. - evaluation_interval: 10s # The default is every 1 minute. - scrape_timeout: 5s # The default is every 10s - -scrape_configs: - - job_name: "avalanchego" - metrics_path: "/ext/metrics" - file_sd_configs: - - files: - - '%s/file_sd_configs/*.json' - -remote_write: - - url: "https://prometheus-poc.avax-dev.network/api/v1/write" - basic_auth: - username: "%s" - password: "%s" -`, workingDir, username, password) - }, - ) -} - -func ensurePromtailRunning(log logging.Logger) error { - return ensureCollectorRunning( - log, - "promtail", - "-config.file=promtail.yaml", - "LOKI", - func(workingDir string, username string, password string) string { - return fmt.Sprintf(` -server: - http_listen_port: 0 - grpc_listen_port: 0 - -positions: - filename: %s/positions.yaml - -client: - url: "https://loki-poc.avax-dev.network/api/prom/push" - basic_auth: - username: "%s" - password: "%s" - -scrape_configs: - - job_name: "avalanchego" - file_sd_configs: - - files: - - '%s/file_sd_configs/*.json' -`, workingDir, username, password, workingDir) - }, - ) -} - -func ensureCollectorRunning( - log logging.Logger, - cmdName string, - args string, - baseEnvName string, - configGenerator configGeneratorFunc, -) error { - tmpnetDir, err := getTmpnetPath() - if err != nil { - return err - } - workingDir := filepath.Join(tmpnetDir, cmdName) - pidFilename := "run.pid" - pidPath := filepath.Join(workingDir, pidFilename) - - if err := os.MkdirAll(workingDir, perms.ReadWriteExecute); err != nil { - return fmt.Errorf("failed to create %s dir: %w", cmdName, err) - } - - if err := os.MkdirAll(filepath.Join(workingDir, "file_sd_configs"), perms.ReadWriteExecute); err != nil { - return fmt.Errorf("failed to create promtail file_sd_configs dir: %w", err) - } - - // Read the PID from the file - pidData, err := os.ReadFile(pidPath) - if err != nil && !errors.Is(err, os.ErrNotExist) { - return fmt.Errorf("failed to read %s PID file %s: %w", cmdName, pidPath, err) - } - if len(pidData) > 0 { - pid, err := strconv.Atoi(string(pidData)) - if err != nil { - return fmt.Errorf("failed to parse %s PID: %w", cmdName, err) - } - process, err := getProcess(pid) - if err != nil { - return err - } - if process != nil { - log.Info(cmdName + " is already running") - return nil - } - } - - // Remove the pid file to avoid conflicting with the new one starting - if err := os.Remove(pidPath); err != nil { - if !errors.Is(err, fs.ErrNotExist) { - return fmt.Errorf("failed to remove stale pid file: %w", err) - } - } else { - log.Info("deleted stale "+cmdName+" pid file", - zap.String("path", pidPath), - ) - } - - // TODO(marun) Maybe collect errors instead of returning them 1-by-1? - if _, err := exec.LookPath(cmdName); err != nil { - return fmt.Errorf("%s command not found. Maybe run 'nix develop'?", cmdName) - } - - usernameEnvVar := baseEnvName + "_USERNAME" - username := getEnv(usernameEnvVar, "") - if len(username) == 0 { - return fmt.Errorf("%s env var not set", usernameEnvVar) - } - - passwordEnvVar := baseEnvName + "_PASSWORD" - password := getEnv(passwordEnvVar, "") - if len(password) == 0 { - return fmt.Errorf("%s var not set", passwordEnvVar) - } - - confFilename := cmdName + ".yaml" - confPath := filepath.Join(workingDir, confFilename) - log.Info("writing "+cmdName+" config", - zap.String("path", confPath), - ) - config := configGenerator(workingDir, username, password) - if err := os.WriteFile(confPath, []byte(config), perms.ReadWrite); err != nil { - return err - } - - fullCmd := "nohup " + cmdName + " " + args + " > " + cmdName + ".log 2>&1 & echo -n \"$!\" > " + pidFilename - log.Info("starting "+cmdName, - zap.String("workingDir", workingDir), - zap.String("fullCmd", fullCmd), - ) - - // TODO(marun) Figure out a way to redirect stdout and stderr of a detached child process without a bash shell - cmd := exec.Command("bash", "-c", fullCmd) - configureDetachedProcess(cmd) // Ensure the child process will outlive its parent - cmd.Dir = workingDir - - if err := cmd.Start(); err != nil { - return fmt.Errorf("failed to start %s: %w", cmdName, err) - } - - var pid string - // TODO(marun) Use a context instead - for { - if fileExistsAndNotEmpty(pidPath) { - var err error - pid, err = readFileContents(pidPath) - if err != nil { - return fmt.Errorf("failed to read pid file: %w", err) - } - break - } - time.Sleep(100 * time.Millisecond) - } - log.Info(cmdName+" started", - zap.String("pid", pid), - ) - - killMsg := fmt.Sprintf("To stop %s: kill -SIGTERM $(cat %s) && rm %s", cmdName, pidPath, pidPath) - log.Info(killMsg) - - return nil -} - -// Function to check if a file exists and is not empty -func fileExistsAndNotEmpty(filename string) bool { - fileInfo, err := os.Stat(filename) - if err != nil { - if os.IsNotExist(err) { - return false - } - fmt.Printf("Error stating file: %v\n", err) - return false - } - return fileInfo.Size() > 0 -} - -// Function to read the contents of a file -func readFileContents(filename string) (string, error) { - content, err := os.ReadFile(filename) - if err != nil { - return "", err - } - return string(content), nil -} - -// TODO(marun) Put this somewhere standard -func getEnv(key, fallback string) string { - if value, ok := os.LookupEnv(key); ok { - return value - } - return fallback -} diff --git a/tests/fixture/tmpnet/utils.go b/tests/fixture/tmpnet/utils.go index 1ed097c864ac..29b36af1330e 100644 --- a/tests/fixture/tmpnet/utils.go +++ b/tests/fixture/tmpnet/utils.go @@ -9,6 +9,7 @@ import ( "errors" "fmt" "net" + "os" "syscall" "time" @@ -125,3 +126,11 @@ func NodesToIDs(nodes ...*Node) []ids.NodeID { } return nodeIDs } + +func GetEnvWithDefault(envVar, defaultVal string) string { + val := os.Getenv(envVar) + if len(val) == 0 { + return defaultVal + } + return val +} diff --git a/tests/upgrade/upgrade_test.go b/tests/upgrade/upgrade_test.go index 2db31b983cfd..99679ccf08d0 100644 --- a/tests/upgrade/upgrade_test.go +++ b/tests/upgrade/upgrade_test.go @@ -22,6 +22,7 @@ func TestUpgrade(t *testing.T) { var ( avalancheGoExecPath string avalancheGoExecPathToUpgradeTo string + enableCollectors bool ) func init() { @@ -37,6 +38,7 @@ func init() { "", "avalanchego executable path to upgrade to", ) + e2e.SetEnableCollectorsFlag(&enableCollectors) } var _ = ginkgo.Describe("[Upgrade]", func() { @@ -51,6 +53,10 @@ var _ = ginkgo.Describe("[Upgrade]", func() { require.NoError(err) network.Genesis = genesis + if enableCollectors { + require.NoError(tmpnet.EnsureCollectorsRunning(tc.DefaultContext(), tc.Log())) + } + e2e.StartNetwork( tc, network,