Skip to content
Draft
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 45 additions & 4 deletions docker/docker_jit_monitor/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::{
collections::HashMap,
process::{Child, Command},
string::FromUtf8Error,
sync::atomic::{AtomicU32, AtomicU64, Ordering},
Expand All @@ -15,6 +16,9 @@ use crate::github_api::{get_idle_runners, spawn_runner};

mod github_api;

/// Upper bound on failed spawn attempts recorded per container type since its last
/// successful spawn; the monitor exits once the recorded count exceeds this value.
const MAX_SPAWN_RETRIES: u32 = 7;
/// Base sleep of the main loop in milliseconds.
/// The actual sleep is this value doubled once per recorded spawn failure.
const BASE_LOOP_SLEEP: u64 = 500;
Comment thread
Narfinger marked this conversation as resolved.
// NOTE(review): presumably a monotonically increasing id used to build unique runner
// names; its use is not visible in this part of the file, so confirm before relying on it.
static RUNNER_ID: AtomicU64 = AtomicU64::new(0);
/// Read with a relaxed load at the top of every iteration of the main loop.
// NOTE(review): looks like a shutdown request flag/counter written elsewhere — confirm.
static EXITING: AtomicU32 = AtomicU32::new(0);

Expand Down Expand Up @@ -143,7 +147,7 @@ enum SpawnRunnerError {
#[cfg(target_os = "linux")]
const OS_TAG: &str = "Linux";

#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum ContainerType {
Builder,
Runner,
Expand Down Expand Up @@ -187,7 +191,7 @@ impl Iterator for ContainerTypeIterator {
None
}
};
self.current.clone()
self.current
}
}

Expand All @@ -207,6 +211,38 @@ struct DockerContainer {
container_type: ContainerType,
}

/// Bookkeeping for failed spawn attempts, tracked separately for every container type.
///
/// A freshly created (`Default`) value holds no entries, which is treated the same as
/// "zero failures" for every container type.
#[derive(Default, Debug)]
struct Retries {
    /// Failure count recorded for each container type.
    t: HashMap<ContainerType, u32>,
}

impl Retries {
    /// Records one more failed spawn attempt for container type `t`.
    ///
    /// Terminates the whole process with exit code `-1` as soon as the number of failures
    /// recorded for `t` since its last [`Retries::reset`] exceeds `MAX_SPAWN_RETRIES`.
    fn inc_and_check(&mut self, t: ContainerType) {
        let failures = self.t.entry(t).or_insert(0);
        // Saturate instead of overflowing; the limit check below fires long before that.
        *failures = failures.saturating_add(1);
        if *failures > MAX_SPAWN_RETRIES {
            // Diagnostics go to stderr so they are not mixed into regular stdout output.
            eprintln!("Failed {failures} times in a row to spawn a {t:?} container. Giving up.");
            std::process::exit(-1);
        }
    }

    /// Resets the failure counter of `t` after a container of that type was spawned
    /// successfully.
    fn reset(&mut self, t: ContainerType) {
        self.t.insert(t, 0);
    }

    /// The time the main loop should currently sleep between iterations.
    ///
    /// Starts at `BASE_LOOP_SLEEP` milliseconds and doubles for every failure recorded for
    /// the container type with the most failures. The computation saturates at `u64::MAX`
    /// milliseconds instead of overflowing.
    fn wait_time(&self) -> Duration {
        // An empty map means nothing has failed yet, i.e. zero failures.
        let worst = self.t.values().copied().max().unwrap_or(0);
        let factor = 2_u64.checked_pow(worst).unwrap_or(u64::MAX);
        Duration::from_millis(BASE_LOOP_SLEEP.saturating_mul(factor))
    }
}

fn main() -> anyhow::Result<()> {
env_logger::init();
info!("Starting monitor for selfhosted docker-based github runners!");
Expand All @@ -232,6 +268,7 @@ fn main() -> anyhow::Result<()> {
let mut running_containers: Vec<DockerContainer> = vec![];
// Todo: implement something to reserve devices for the duration of the docker run child process.

let mut retries = Retries::default();
loop {
let exiting = EXITING.load(Ordering::Relaxed);
for container_type in ContainerType::iter() {
Expand All @@ -254,13 +291,17 @@ fn main() -> anyhow::Result<()> {
};

match spawn_runner(config) {
Ok(container) => running_containers.push(container),
Ok(container) => {
retries.reset(container_type);
running_containers.push(container)
}
Err(SpawnRunnerError::GhApiError(_, message))
if message.contains("gh: Already exists") =>
{
info!("Runner name already taken - Will retry with new name later")
}
Err(e) => {
retries.inc_and_check(container_type);
error!("Failed to spawn JIT runner: {e:?}");
}
}
Expand Down Expand Up @@ -319,7 +360,7 @@ fn main() -> anyhow::Result<()> {
}

running_containers = still_running;
thread::sleep(Duration::from_millis(500));
thread::sleep(retries.wait_time());
}

info!("Exiting....");
Expand Down