Skip to content

Commit dc4b8ab

Browse files
committed
Add cgroup v2 support
cgroup v1 uses two special files to control device access, whereas cgroup v2 uses eBPF programs. The code attaches a custom eBPF program that allows run-time reconfiguration, and detaches Docker's default one. eBPF programs are detached when the attaching process dies, which would be dangerous if container-hotplug exited unexpectedly while the program is running, so we instead pin the program (so that it stays in place when container-hotplug exits) and unpin it after the Docker container is down. If container-hotplug exits unexpectedly, this may leave stale eBPF programs pinned, but that is safe.
1 parent dc435a4 commit dc4b8ab

File tree

6 files changed

+174
-16
lines changed

6 files changed

+174
-16
lines changed

Cargo.lock

+51
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ bollard = "0.16"
3030
futures = "0.3"
3131
rustix = { version = "0.38", features = ["fs", "stdio", "termios"] }
3232
bitflags = "2"
33+
aya = { git = "https://github.com/nbdd0121/aya.git" }
3334

3435
[build-dependencies]
3536
anyhow = { version = "1", features = ["backtrace"] }

README.md

+1-4
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,7 @@ Another concern is providing a container with well known paths for the devices.
2525
On bare-metal systems this would usually be achieved with a `SYMLINK` directive in a udev rule.
2626
This program tries to provide a similar functionality for containers, allowing you to specify symlinks for certain devices.
2727

28-
## Limitations
29-
30-
`container-hotplug` needs to be run as root and relies on `cgroup v1`. It does not support `cgroup v2`.
31-
On distributions with `cgroup v2`, you can switch back to `cgroup v1` by setting the [kernel parameter](https://wiki.ubuntu.com/Kernel/KernelBootParameters) `systemd.unified_cgroup_hierarchy=0`.
28+
This tool supports both cgroup v1 and v2.
3229

3330
## Example
3431

src/docker/cgroup.rs

+100-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
use anyhow::{ensure, Result};
1+
use anyhow::{ensure, Context, Result};
2+
use aya::maps::{HashMap, MapData};
3+
use aya::programs::{CgroupDevice, Link};
4+
use std::fs::File;
5+
use std::mem::ManuallyDrop;
26
use std::path::PathBuf;
37

48
// The numerical representation below needs to match BPF_DEVCG constants.
@@ -26,6 +30,10 @@ pub trait DeviceAccessController {
2630
minor: u32,
2731
access: Access,
2832
) -> Result<()>;
33+
34+
/// Stop performing access control. This may allow all accesses, so should only be used when
35+
/// the cgroup is shut down.
36+
fn stop(self: Box<Self>) -> Result<()>;
2937
}
3038

3139
pub struct DeviceAccessControllerV1 {
@@ -96,4 +104,95 @@ impl DeviceAccessController for DeviceAccessControllerV1 {
96104

97105
Ok(())
98106
}
107+
108+
// The cgroup v1 controller performs no teardown here; stopping always succeeds.
fn stop(self: Box<Self>) -> Result<()> {
109+
Ok(())
110+
}
111+
}
112+
113+
/// Key of the `DEVICE_PERM` BPF map: identifies a device by type and major/minor number.
///
/// The BPF program reads this value as plain bytes, so its layout must be fixed. `#[repr(C)]`
/// guarantees the fields are laid out in declaration order; without it the Rust compiler is
/// free to pick any layout. Three `u32` fields leave no padding.
#[allow(unused)] // This is read as POD by the BPF program.
#[derive(Clone, Copy)]
#[repr(C)]
struct Device {
    /// Numeric value of the `DeviceType` this entry applies to.
    device_type: u32,
    /// Device major number.
    major: u32,
    /// Device minor number.
    minor: u32,
}
120+
121+
// SAFETY: `Device` must be `repr(C)` with no padding; it consists of exactly three `u32` fields.
122+
unsafe impl aya::Pod for Device {}
123+
124+
pub struct DeviceAccessControllerV2 {
125+
map: HashMap<MapData, Device, u32>,
126+
pin: PathBuf,
127+
}
128+
129+
impl DeviceAccessControllerV2 {
130+
pub fn new(id: &str) -> Result<Self> {
131+
// We want to take control of the device cgroup filtering from docker. To do this, we attach our own
132+
// filter program and detach the one by docker.
133+
let cgroup_path = format!("/sys/fs/cgroup/system.slice/docker-{id}.scope");
134+
let cgroup = File::open(cgroup_path)?;
135+
136+
let mut bpf = aya::Bpf::load(include_bytes!(concat!(
137+
env!("CARGO_MANIFEST_DIR"),
138+
"/cgroup_device_filter/target/bpfel-unknown-none/release/cgroup_device_filter"
139+
)))?;
140+
141+
let program: &mut CgroupDevice = bpf
142+
.program_mut("check_device")
143+
.context("cannot find check_device program")?
144+
.try_into()?;
145+
146+
program.load()?;
147+
148+
// Iterate existing programs. We'll need to detach them later.
149+
// Wrap this inside `ManuallyDrop` to prevent accidental detaching.
150+
let existing_programs = ManuallyDrop::new(CgroupDevice::query(&cgroup)?);
151+
152+
program.attach(&cgroup)?;
153+
154+
// Pin the program so that if container-hotplug accidentally exits, the filter won't be removed from the docker
155+
// container.
156+
let pin: PathBuf = format!("/sys/fs/bpf/docker-{id}-device-filter").into();
157+
program.pin(&pin)?;
158+
159+
// Now our new filter is attached, detach all docker filters.
160+
for existing_program in ManuallyDrop::into_inner(existing_programs) {
161+
existing_program.detach()?;
162+
}
163+
164+
let map: HashMap<_, Device, u32> = bpf
165+
.take_map("DEVICE_PERM")
166+
.context("cannot find DEVICE_PERM map")?
167+
.try_into()?;
168+
169+
Ok(Self { map, pin })
170+
}
171+
}
172+
173+
impl DeviceAccessController for DeviceAccessControllerV2 {
174+
fn set_permission(
175+
&mut self,
176+
ty: DeviceType,
177+
major: u32,
178+
minor: u32,
179+
access: Access,
180+
) -> Result<()> {
181+
let device = Device {
182+
device_type: ty as u32,
183+
major,
184+
minor,
185+
};
186+
if access.is_empty() {
187+
self.map.remove(&device)?;
188+
} else {
189+
self.map.insert(device, access.bits(), 0)?;
190+
}
191+
Ok(())
192+
}
193+
194+
fn stop(self: Box<Self>) -> Result<()> {
195+
CgroupDevice::from_pin(&self.pin)?.unpin()?;
196+
Ok(())
197+
}
99198
}

src/docker/container.rs

+19-5
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,15 @@ use tokio::signal::unix::{signal, SignalKind};
1010
use tokio::task::{spawn, JoinHandle};
1111
use tokio_stream::StreamExt;
1212

13-
use super::cgroup::{Access, DeviceAccessController, DeviceAccessControllerV1, DeviceType};
13+
use super::cgroup::{Access, DeviceAccessController, DeviceAccessControllerV1, DeviceAccessControllerV2, DeviceType};
1414
use super::{IoStream, IoStreamSource};
1515

1616
#[derive(Clone)]
1717
pub struct Container {
1818
id: String,
1919
docker: bollard::Docker,
2020
remove_event: Shared<BoxFuture<'static, Option<EventMessage>>>,
21-
cgroup_device_filter: Arc<Mutex<Box<dyn DeviceAccessController + Send>>>,
21+
cgroup_device_filter: Arc<Mutex<Option<Box<dyn DeviceAccessController + Send>>>>,
2222
}
2323

2424
impl Container {
@@ -40,13 +40,19 @@ impl Container {
4040
.shared();
4141

4242
let cgroup_device_filter: Box<dyn DeviceAccessController + Send> =
43-
Box::new(DeviceAccessControllerV1::new(id)?);
43+
match DeviceAccessControllerV2::new(id) {
44+
Ok(v) => Box::new(v),
45+
Err(err) => match DeviceAccessControllerV1::new(id) {
46+
Ok(v) => Box::new(v),
47+
Err(_) => Err(err).context("neither cgroup v1 and cgroup v2 works")?,
48+
},
49+
};
4450

4551
Ok(Self {
4652
id: id.to_owned(),
4753
docker: docker.clone(),
4854
remove_event: remove_evevnt,
49-
cgroup_device_filter: Arc::new(Mutex::new(cgroup_device_filter)),
55+
cgroup_device_filter: Arc::new(Mutex::new(Some(cgroup_device_filter))),
5056
})
5157
}
5258

@@ -83,6 +89,14 @@ impl Container {
8389
.context("no destroy event")?;
8490
}
8591

92+
// Stop the cgroup device filter. Only do so once we're sure that the container is removed.
93+
self.cgroup_device_filter
94+
.lock()
95+
.unwrap()
96+
.take()
97+
.unwrap()
98+
.stop()?;
99+
86100
Ok(())
87101
}
88102

@@ -229,7 +243,7 @@ impl Container {
229243
let controller = self.cgroup_device_filter.clone();
230244
tokio::task::spawn_blocking(move || -> Result<()> {
231245
let mut controller = controller.lock().unwrap();
232-
controller.set_permission(
246+
controller.as_mut().unwrap().set_permission(
233247
DeviceType::Character,
234248
major,
235249
minor,

src/main.rs

+2-6
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@ use cli::{Action, Device, Symlink};
77
use docker::{Container, Docker};
88
use hotplug::{Event as HotPlugEvent, HotPlug, PluggedDevice};
99

10+
use std::fmt::Display;
1011
use std::pin::pin;
11-
use std::{fmt::Display, path::Path};
1212
use tokio_stream::StreamExt;
1313

14-
use anyhow::{bail, Context, Result};
14+
use anyhow::{Context, Result};
1515
use clap::Parser;
1616
use clap_verbosity_flag::{InfoLevel, LogLevel, Verbosity};
1717
use log::info;
@@ -98,10 +98,6 @@ fn run_hotplug(
9898
async fn run(param: cli::Run, verbosity: Verbosity<InfoLevel>) -> Result<u8> {
9999
let mut status = 0;
100100

101-
if !Path::new("/sys/fs/cgroup/devices/").is_dir() {
102-
bail!("Could not find cgroup v1");
103-
}
104-
105101
let docker = Docker::connect_with_defaults()?;
106102
let container = docker.run(param.docker_args).await?;
107103
drop(container.pipe_signals());

0 commit comments

Comments
 (0)