Skip to content

Commit b20d745

Browse files
committed
Implement a second HTTP server within compute_ctl
The compute_ctl HTTP server has the following purposes: - Allow management via the control plane - Provide an endpoint for scaping metrics - Provide APIs for compute internal clients - Neon Postgres extension for installing remote extensions - local_proxy for installing extensions and adding grants The first two purposes require the HTTP server to be available outside the compute. The Neon threat model is a bad actor within our internal network. We need to reduce the surface area of attack. By exposing unnecessary unauthenticated HTTP endpoints to the internal network, we increase the surface area of attack. For endpoints described in the third bullet point, we can just run an extra HTTP server, which is only bound to the loopback interface since all consumers of those endpoints are within the compute.
1 parent 733a572 commit b20d745

File tree

13 files changed

+278
-140
lines changed

13 files changed

+278
-140
lines changed

Diff for: compute_tools/src/bin/compute_ctl.rs

+24-8
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ use anyhow::{Context, Result};
4747
use chrono::Utc;
4848
use clap::Parser;
4949
use compute_tools::disk_quota::set_disk_quota;
50+
use compute_tools::http::server::Server;
5051
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
5152
use signal_hook::consts::{SIGQUIT, SIGTERM};
5253
use signal_hook::{consts::SIGINT, iterator::Signals};
@@ -61,7 +62,6 @@ use compute_tools::compute::{
6162
};
6263
use compute_tools::configurator::launch_configurator;
6364
use compute_tools::extension_server::get_pg_version_string;
64-
use compute_tools::http::launch_http_server;
6565
use compute_tools::logger::*;
6666
use compute_tools::monitor::launch_monitor;
6767
use compute_tools::params::*;
@@ -94,8 +94,20 @@ struct Cli {
9494
#[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
9595
pub remote_ext_config: Option<String>,
9696

97-
#[arg(long, default_value_t = 3080)]
98-
pub http_port: u16,
97+
/// The port to bind the external listening HTTP server to. Clients running
98+
/// outside the compute will talk to the compute through this port. Keep
99+
/// the previous name for this argument around for a smoother release
100+
/// with the control plane.
101+
///
102+
/// TODO: Remove the alias after the control plane release which teaches the
103+
/// control plane about the renamed argument.
104+
#[arg(long, alias = "http-port", default_value_t = 3080)]
105+
pub external_http_port: u16,
106+
107+
/// The port to bind the internal listening HTTP server to. Clients like
108+
/// the neon extension (for installing remote extensions) and local_proxy.
109+
#[arg(long, default_value_t = 3081)]
110+
pub internal_http_port: u16,
99111

100112
#[arg(short = 'D', long, value_name = "DATADIR")]
101113
pub pgdata: String,
@@ -325,7 +337,8 @@ fn wait_spec(
325337
pgdata: cli.pgdata.clone(),
326338
pgbin: cli.pgbin.clone(),
327339
pgversion: get_pg_version_string(&cli.pgbin),
328-
http_port: cli.http_port,
340+
external_http_port: cli.external_http_port,
341+
internal_http_port: cli.internal_http_port,
329342
live_config_allowed,
330343
state: Mutex::new(new_state),
331344
state_changed: Condvar::new(),
@@ -343,10 +356,13 @@ fn wait_spec(
343356
compute.prewarm_postgres()?;
344357
}
345358

346-
// Launch http service first, so that we can serve control-plane requests
347-
// while configuration is still in progress.
348-
let _http_handle =
349-
launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread");
359+
// Launch the external HTTP server first, so that we can serve control plane
360+
// requests while configuration is still in progress.
361+
Server::External(cli.external_http_port).launch(&compute);
362+
363+
// The internal HTTP server could be launched later, but there isn't much
364+
// sense in waiting.
365+
Server::Internal(cli.internal_http_port).launch(&compute);
350366

351367
if !spec_set {
352368
// No spec provided, hang waiting for it.

Diff for: compute_tools/src/compute.rs

+6-4
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,10 @@ pub struct ComputeNode {
8181
/// - we push spec and it does configuration
8282
/// - but then it is restarted without any spec again
8383
pub live_config_allowed: bool,
84-
/// The port that the compute's HTTP server listens on
85-
pub http_port: u16,
84+
/// The port that the compute's external HTTP server listens on
85+
pub external_http_port: u16,
86+
/// The port that the compute's internal HTTP server listens on
87+
pub internal_http_port: u16,
8688
/// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
8789
/// To allow HTTP API server to serving status requests, while configuration
8890
/// is in progress, lock should be held only for short periods of time to do
@@ -634,7 +636,7 @@ impl ComputeNode {
634636
config::write_postgres_conf(
635637
&pgdata_path.join("postgresql.conf"),
636638
&pspec.spec,
637-
self.http_port,
639+
self.internal_http_port,
638640
)?;
639641

640642
// Syncing safekeepers is only safe with primary nodes: if a primary
@@ -1395,7 +1397,7 @@ impl ComputeNode {
13951397
// Write new config
13961398
let pgdata_path = Path::new(&self.pgdata);
13971399
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
1398-
config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;
1400+
config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?;
13991401

14001402
let max_concurrent_connections = spec.reconfigure_concurrency;
14011403

Diff for: compute_tools/src/http/mod.rs

+1-3
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,9 @@ use http::{header::CONTENT_TYPE, StatusCode};
44
use serde::Serialize;
55
use tracing::error;
66

7-
pub use server::launch_http_server;
8-
97
mod extract;
108
mod routes;
11-
mod server;
9+
pub mod server;
1210

1311
/// Convenience response builder for JSON responses
1412
struct JsonResponse;

Diff for: compute_tools/src/http/server.rs

+129-62
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use std::{
2+
fmt::Display,
23
net::{IpAddr, Ipv6Addr, SocketAddr},
34
sync::Arc,
45
thread,
@@ -26,46 +27,65 @@ use super::routes::{
2627
};
2728
use crate::compute::ComputeNode;
2829

29-
async fn handle_404() -> Response {
30-
StatusCode::NOT_FOUND.into_response()
31-
}
32-
3330
const X_REQUEST_ID: &str = "x-request-id";
3431

35-
/// This middleware function allows compute_ctl to generate its own request ID
36-
/// if one isn't supplied. The control plane will always send one as a UUID. The
37-
/// neon Postgres extension on the other hand does not send one.
38-
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
39-
let headers = request.headers_mut();
32+
/// `compute_ctl` has two servers: internal and external. The internal server
33+
/// binds to the loopback interface and handles communication from clients on
34+
/// the compute. The external server is what receives communication from the
35+
/// control plane, the metrics scraper, etc. We make the distinction because
36+
/// certain routes in `compute_ctl` only need to be exposed to local processes
37+
/// like Postgres via the neon extension and local_proxy.
38+
#[derive(Clone, Copy, Debug)]
39+
pub enum Server {
40+
Internal(u16),
41+
External(u16),
42+
}
4043

41-
if headers.get(X_REQUEST_ID).is_none() {
42-
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
44+
impl Display for Server {
45+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
46+
match self {
47+
Server::Internal(_) => f.write_str("internal"),
48+
Server::External(_) => f.write_str("external"),
49+
}
4350
}
44-
45-
next.run(request).await
4651
}
4752

48-
/// Run the HTTP server and wait on it forever.
49-
#[tokio::main]
50-
async fn serve(port: u16, compute: Arc<ComputeNode>) {
51-
let mut app = Router::new()
52-
.route("/check_writability", post(check_writability::is_writable))
53-
.route("/configure", post(configure::configure))
54-
.route("/database_schema", get(database_schema::get_schema_dump))
55-
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
56-
.route(
57-
"/extension_server/{*filename}",
58-
post(extension_server::download_extension),
59-
)
60-
.route("/extensions", post(extensions::install_extension))
61-
.route("/grants", post(grants::add_grant))
62-
.route("/insights", get(insights::get_insights))
63-
.route("/metrics", get(metrics::get_metrics))
64-
.route("/metrics.json", get(metrics_json::get_metrics))
65-
.route("/status", get(status::get_status))
66-
.route("/terminate", post(terminate::terminate))
67-
.fallback(handle_404)
68-
.layer(
53+
impl From<Server> for Router<Arc<ComputeNode>> {
54+
fn from(server: Server) -> Self {
55+
let mut router = Router::<Arc<ComputeNode>>::new();
56+
57+
router = match server {
58+
Server::Internal(_) => {
59+
router = router
60+
.route(
61+
"/extension_server/{*filename}",
62+
post(extension_server::download_extension),
63+
)
64+
.route("/extensions", post(extensions::install_extension))
65+
.route("/grants", post(grants::add_grant));
66+
67+
// Add in any testing support
68+
if cfg!(feature = "testing") {
69+
use super::routes::failpoints;
70+
71+
router = router.route("/failpoints", post(failpoints::configure_failpoints));
72+
}
73+
74+
router
75+
}
76+
Server::External(_) => router
77+
.route("/check_writability", post(check_writability::is_writable))
78+
.route("/configure", post(configure::configure))
79+
.route("/database_schema", get(database_schema::get_schema_dump))
80+
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
81+
.route("/insights", get(insights::get_insights))
82+
.route("/metrics", get(metrics::get_metrics))
83+
.route("/metrics.json", get(metrics_json::get_metrics))
84+
.route("/status", get(status::get_status))
85+
.route("/terminate", post(terminate::terminate)),
86+
};
87+
88+
router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
6989
ServiceBuilder::new()
7090
// Add this middleware since we assume the request ID exists
7191
.layer(middleware::from_fn(maybe_add_request_id_header))
@@ -105,45 +125,92 @@ async fn serve(port: u16, compute: Arc<ComputeNode>) {
105125
)
106126
.layer(PropagateRequestIdLayer::x_request_id()),
107127
)
108-
.with_state(compute);
128+
}
129+
}
130+
131+
impl Server {
132+
async fn handle_404() -> impl IntoResponse {
133+
StatusCode::NOT_FOUND
134+
}
135+
136+
async fn handle_405() -> impl IntoResponse {
137+
StatusCode::METHOD_NOT_ALLOWED
138+
}
139+
140+
async fn listener(&self) -> Result<TcpListener> {
141+
let addr = SocketAddr::new(self.ip(), self.port());
142+
let listener = TcpListener::bind(&addr).await?;
109143

110-
// Add in any testing support
111-
if cfg!(feature = "testing") {
112-
use super::routes::failpoints;
144+
Ok(listener)
145+
}
146+
147+
fn ip(&self) -> IpAddr {
148+
match self {
149+
// TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
150+
// allow binding to localhost
151+
Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
152+
Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
153+
}
154+
}
113155

114-
app = app.route("/failpoints", post(failpoints::configure_failpoints))
156+
fn port(self) -> u16 {
157+
match self {
158+
Server::Internal(port) => port,
159+
Server::External(port) => port,
160+
}
115161
}
116162

117-
// This usually binds to both IPv4 and IPv6 on Linux, see
118-
// https://github.com/rust-lang/rust/pull/34440 for more information
119-
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
120-
let listener = match TcpListener::bind(&addr).await {
121-
Ok(listener) => listener,
122-
Err(e) => {
123-
error!(
124-
"failed to bind the compute_ctl HTTP server to port {}: {}",
125-
port, e
163+
#[tokio::main]
164+
async fn serve(self, compute: Arc<ComputeNode>) {
165+
let listener = self.listener().await.unwrap_or_else(|e| {
166+
// If we can't bind, the compute cannot operate correctly
167+
panic!(
168+
"failed to bind the compute_ctl {} HTTP server to {}: {}",
169+
self,
170+
SocketAddr::new(self.ip(), self.port()),
171+
e
172+
);
173+
});
174+
175+
if tracing::enabled!(tracing::Level::INFO) {
176+
let local_addr = match listener.local_addr() {
177+
Ok(local_addr) => local_addr,
178+
Err(_) => SocketAddr::new(self.ip(), self.port()),
179+
};
180+
181+
info!(
182+
"compute_ctl {} HTTP server listening at {}",
183+
self, local_addr
126184
);
127-
return;
128185
}
129-
};
130186

131-
if let Ok(local_addr) = listener.local_addr() {
132-
info!("compute_ctl HTTP server listening on {}", local_addr);
133-
} else {
134-
info!("compute_ctl HTTP server listening on port {}", port);
187+
let router = Router::from(self).with_state(compute);
188+
189+
if let Err(e) = axum::serve(listener, router).await {
190+
error!("compute_ctl {} HTTP server error: {}", self, e);
191+
}
135192
}
136193

137-
if let Err(e) = axum::serve(listener, app).await {
138-
error!("compute_ctl HTTP server error: {}", e);
194+
pub fn launch(self, compute: &Arc<ComputeNode>) {
195+
let state = Arc::clone(compute);
196+
197+
info!("Launching the {} server", self);
198+
thread::Builder::new()
199+
.name(format!("http-server-{self}"))
200+
.spawn(move || self.serve(state))
201+
.unwrap_or_else(|_| panic!("Failed to start the {self} HTTP server"));
139202
}
140203
}
141204

142-
/// Launch a separate HTTP server thread and return its `JoinHandle`.
143-
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
144-
let state = Arc::clone(state);
205+
/// This middleware function allows compute_ctl to generate its own request ID
206+
/// if one isn't supplied. The control plane will always send one as a UUID. The
207+
/// neon Postgres extension on the other hand does not send one.
208+
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
209+
let headers = request.headers_mut();
145210

146-
Ok(thread::Builder::new()
147-
.name("http-server".into())
148-
.spawn(move || serve(port, state))?)
211+
if headers.get(X_REQUEST_ID).is_none() {
212+
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
213+
}
214+
215+
next.run(request).await
149216
}

Diff for: control_plane/src/bin/neon_local.rs

+5-2
Original file line numberDiff line numberDiff line change
@@ -552,8 +552,10 @@ struct EndpointCreateCmdArgs {
552552
lsn: Option<Lsn>,
553553
#[clap(long)]
554554
pg_port: Option<u16>,
555+
#[clap(long, alias = "http-port")]
556+
external_http_port: Option<u16>,
555557
#[clap(long)]
556-
http_port: Option<u16>,
558+
internal_http_port: Option<u16>,
557559
#[clap(long = "pageserver-id")]
558560
endpoint_pageserver_id: Option<NodeId>,
559561

@@ -1353,7 +1355,8 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
13531355
tenant_id,
13541356
timeline_id,
13551357
args.pg_port,
1356-
args.http_port,
1358+
args.external_http_port,
1359+
args.internal_http_port,
13571360
args.pg_version,
13581361
mode,
13591362
!args.update_catalog,

0 commit comments

Comments
 (0)