diff --git a/Cargo.lock b/Cargo.lock
index d9218f28d..4811fe7c3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -125,183 +125,6 @@ dependencies = [
  "syn 1.0.109",
 ]
 
-[[package]]
-name = "async-channel"
-version = "1.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
-dependencies = [
- "concurrent-queue",
- "event-listener 2.5.3",
- "futures-core",
-]
-
-[[package]]
-name = "async-channel"
-version = "2.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ca33f4bc4ed1babef42cad36cc1f51fa88be00420404e5b1e80ab1b18f7678c"
-dependencies = [
- "concurrent-queue",
- "event-listener 4.0.3",
- "event-listener-strategy",
- "futures-core",
- "pin-project-lite",
-]
-
-[[package]]
-name = "async-dup"
-version = "1.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c2886ab563af5038f79ec016dd7b87947ed138b794e8dd64992962c9cca0411"
-dependencies = [
- "async-lock 3.3.0",
- "futures-io",
-]
-
-[[package]]
-name = "async-executor"
-version = "1.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17ae5ebefcc48e7452b4987947920dac9450be1110cadf34d1b8c116bdbaf97c"
-dependencies = [
- "async-lock 3.3.0",
- "async-task",
- "concurrent-queue",
- "fastrand 2.0.1",
- "futures-lite 2.2.0",
- "slab",
-]
-
-[[package]]
-name = "async-fs"
-version = "1.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "279cf904654eeebfa37ac9bb1598880884924aab82e290aa65c9e77a0e142e06"
-dependencies = [
- "async-lock 2.8.0",
- "autocfg",
- "blocking",
- "futures-lite 1.13.0",
-]
-
-[[package]]
-name = "async-io"
-version = "1.13.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af"
-dependencies = [
- "async-lock 2.8.0",
- "autocfg",
- "cfg-if",
- "concurrent-queue",
- "futures-lite 1.13.0",
- "log 0.4.20",
- "parking",
- "polling 2.8.0",
- "rustix 0.37.27",
- "slab",
- "socket2 0.4.10",
- "waker-fn",
-]
-
-[[package]]
-name = "async-io"
-version = "2.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb41eb19024a91746eba0773aa5e16036045bbf45733766661099e182ea6a744"
-dependencies = [
- "async-lock 3.3.0",
- "cfg-if",
- "concurrent-queue",
- "futures-io",
- "futures-lite 2.2.0",
- "parking",
- "polling 3.3.2",
- "rustix 0.38.30",
- "slab",
- "tracing",
- "windows-sys 0.52.0",
-]
-
-[[package]]
-name = "async-lock"
-version = "2.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b"
-dependencies = [
- "event-listener 2.5.3",
-]
-
-[[package]]
-name = "async-lock"
-version = "3.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d034b430882f8381900d3fe6f0aaa3ad94f2cb4ac519b429692a1bc2dda4ae7b"
-dependencies = [
- "event-listener 4.0.3",
- "event-listener-strategy",
- "pin-project-lite",
-]
-
-[[package]]
-name = "async-net"
-version = "1.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0434b1ed18ce1cf5769b8ac540e33f01fa9471058b5e89da9e06f3c882a8c12f"
-dependencies = [
- "async-io 1.13.0",
- "blocking",
- "futures-lite 1.13.0",
-]
-
-[[package]]
-name = "async-process"
-version = "1.8.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea6438ba0a08d81529c69b36700fa2f95837bfe3e776ab39cde9c14d9149da88"
-dependencies = [
- "async-io 1.13.0",
- "async-lock 2.8.0",
- "async-signal",
- "blocking",
- "cfg-if",
- "event-listener 3.1.0",
- "futures-lite 1.13.0",
- "rustix 0.38.30",
- "windows-sys 0.48.0",
-]
-
-[[package]]
-name = "async-signal"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e47d90f65a225c4527103a8d747001fc56e375203592b25ad103e1ca13124c5"
-dependencies = [
- "async-io 2.3.0",
- "async-lock 2.8.0",
- "atomic-waker",
- "cfg-if",
- "futures-core",
- "futures-io",
- "rustix 0.38.30",
- "signal-hook-registry",
- "slab",
- "windows-sys 0.48.0",
-]
-
-[[package]]
-name = "async-task"
-version = "4.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fbb36e985947064623dbd357f727af08ffd077f93d696782f3c56365fa2e2799"
-
-[[package]]
-name = "atomic-waker"
-version = "1.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
-
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -350,22 +173,6 @@ dependencies = [
  "generic-array",
 ]
 
-[[package]]
-name = "blocking"
-version = "1.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a37913e8dc4ddcc604f0c6d3bf2887c995153af3611de9e23c352b44c1b9118"
-dependencies = [
- "async-channel 2.1.1",
- "async-lock 3.3.0",
- "async-task",
- "fastrand 2.0.1",
- "futures-io",
- "futures-lite 2.2.0",
- "piper",
- "tracing",
-]
-
 [[package]]
 name = "byteorder"
 version = "1.5.0"
@@ -445,15 +252,6 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
 
-[[package]]
-name = "concurrent-queue"
-version = "2.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d16048cd947b08fa32c24458a22f5dc5e835264f689f4f5653210c69fd107363"
-dependencies = [
- "crossbeam-utils",
-]
-
 [[package]]
 name = "cookie-factory"
 version = "0.3.2"
@@ -625,53 +423,6 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
-[[package]]
-name = "event-listener"
-version = "2.5.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
-
-[[package]]
-name = "event-listener"
-version = "3.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d93877bcde0eb80ca09131a08d23f0a5c18a620b01db137dba666d18cd9b30c2"
-dependencies = [
- "concurrent-queue",
- "parking",
- "pin-project-lite",
-]
-
-[[package]]
-name = "event-listener"
-version = "4.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67b215c49b2b248c855fb73579eb1f4f26c38ffdc12973e20e07b91d78d5646e"
-dependencies = [
- "concurrent-queue",
- "parking",
- "pin-project-lite",
-]
-
-[[package]]
-name = "event-listener-strategy"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3"
-dependencies = [
- "event-listener 4.0.3",
- "pin-project-lite",
-]
-
-[[package]]
-name = "fastrand"
-version = "1.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be"
-dependencies = [
- "instant",
-]
-
 [[package]]
 name = "fastrand"
 version = "2.0.1"
@@ -748,34 +499,6 @@ version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
 
-[[package]]
-name = "futures-lite"
-version = "1.13.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce"
-dependencies = [
- "fastrand 1.9.0",
- "futures-core",
- "futures-io",
- "memchr",
- "parking",
- "pin-project-lite",
- "waker-fn",
-]
-
-[[package]]
-name = "futures-lite"
-version = "2.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "445ba825b27408685aaecefd65178908c36c6e96aaf6d8599419d46e624192ba"
-dependencies = [
- "fastrand 2.0.1",
- "futures-core",
- "futures-io",
- "parking",
- "pin-project-lite",
-]
-
 [[package]]
 name = "futures-macro"
 version = "0.3.30"
@@ -950,7 +673,7 @@ dependencies = [
  "httpdate",
  "itoa",
  "pin-project-lite",
- "socket2 0.5.5",
+ "socket2",
  "tokio",
  "tower-service",
  "tracing",
@@ -985,34 +708,14 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "2.1.0"
+version = "2.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f"
+checksum = "433de089bd45971eecf4668ee0ee8f4cec17db4f8bd8f7bc3197a6ce37aa7d9b"
 dependencies = [
  "equivalent",
  "hashbrown",
 ]
 
-[[package]]
-name = "instant"
-version = "0.1.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
-dependencies = [
- "cfg-if",
-]
-
-[[package]]
-name = "io-lifetimes"
-version = "1.0.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
-dependencies = [
- "hermit-abi",
- "libc",
- "windows-sys 0.48.0",
-]
-
 [[package]]
 name = "is-terminal"
 version = "0.4.10"
@@ -1020,7 +723,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455"
 dependencies = [
  "hermit-abi",
- "rustix 0.38.30",
+ "rustix",
  "windows-sys 0.52.0",
 ]
 
@@ -1102,12 +805,6 @@ dependencies = [
  "redox_syscall",
 ]
 
-[[package]]
-name = "linux-raw-sys"
-version = "0.3.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
-
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.13"
@@ -1279,12 +976,6 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
-[[package]]
-name = "parking"
-version = "2.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae"
-
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -1357,47 +1048,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
 
-[[package]]
-name = "piper"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "668d31b1c4eba19242f2088b2bf3316b82ca31082a8335764db4e083db7485d4"
-dependencies = [
- "atomic-waker",
- "fastrand 2.0.1",
- "futures-io",
-]
-
-[[package]]
-name = "polling"
-version = "2.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce"
-dependencies = [
- "autocfg",
- "bitflags 1.3.2",
- "cfg-if",
- "concurrent-queue",
- "libc",
- "log 0.4.20",
- "pin-project-lite",
- "windows-sys 0.48.0",
-]
-
-[[package]]
-name = "polling"
-version = "3.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "545c980a3880efd47b2e262f6a4bb6daad6555cf3367aa9c4e52895f69537a41"
-dependencies = [
- "cfg-if",
- "concurrent-queue",
- "pin-project-lite",
- "rustix 0.38.30",
- "tracing",
- "windows-sys 0.52.0",
-]
-
 [[package]]
 name = "pool"
 version = "0.1.4"
@@ -1601,9 +1251,9 @@ dependencies = [
 
 [[package]]
 name = "regex-automata"
-version = "0.4.4"
+version = "0.4.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b7fa1134405e2ec9353fd416b17f8dacd46c473d7d3fd1cf202706a14eb792a"
+checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -1645,20 +1295,6 @@ dependencies = [
  "nom",
 ]
 
-[[package]]
-name = "rustix"
-version = "0.37.27"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fea8ca367a3a01fe35e6943c400addf443c0f57670e6ec51196f71a4b8762dd2"
-dependencies = [
- "bitflags 1.3.2",
- "errno",
- "io-lifetimes",
- "libc",
- "linux-raw-sys 0.3.8",
- "windows-sys 0.48.0",
-]
-
 [[package]]
 name = "rustix"
 version = "0.38.30"
@@ -1668,7 +1304,7 @@ dependencies = [
  "bitflags 2.4.2",
  "errno",
  "libc",
- "linux-raw-sys 0.4.13",
+ "linux-raw-sys",
  "windows-sys 0.52.0",
 ]
 
@@ -1776,18 +1412,18 @@ dependencies = [
 
 [[package]]
 name = "serde"
-version = "1.0.195"
+version = "1.0.196"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63261df402c67811e9ac6def069e4786148c4563f4b50fd4bf30aa370d626b02"
+checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.195"
+version = "1.0.196"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c"
+checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1796,9 +1432,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.111"
+version = "1.0.113"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "176e46fa42316f18edd598015a5166857fc835ec732f5215eac6b7bdbf0a84f4"
+checksum = "69801b70b1c3dac963ecb03a364ba0ceda9cf60c71cfe475e99864759c8b8a79"
 dependencies = [
  "itoa",
  "ryu",
@@ -1850,15 +1486,6 @@ dependencies = [
  "digest",
 ]
 
-[[package]]
-name = "signal-hook-registry"
-version = "1.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "slab"
 version = "0.4.9"
@@ -1874,33 +1501,6 @@ version = "1.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"
 
-[[package]]
-name = "smol"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13f2b548cd8447f8de0fdf1c592929f70f4fc7039a05e47404b0d096ec6987a1"
-dependencies = [
- "async-channel 1.9.0",
- "async-executor",
- "async-fs",
- "async-io 1.13.0",
- "async-lock 2.8.0",
- "async-net",
- "async-process",
- "blocking",
- "futures-lite 1.13.0",
-]
-
-[[package]]
-name = "socket2"
-version = "0.4.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f7916fc008ca5542385b89a3d3ce689953c143e9304a9bf8beec1de48994c0d"
-dependencies = [
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "socket2"
 version = "0.5.5"
@@ -1915,13 +1515,7 @@ dependencies = [
 name = "sozu"
 version = "0.15.19"
 dependencies = [
- "anyhow",
- "async-dup",
- "async-io 1.13.0",
  "clap",
- "futures",
- "futures-lite 1.13.0",
- "hex",
  "jemallocator",
  "libc",
  "log 0.4.20",
@@ -1930,14 +1524,13 @@ dependencies = [
  "nom",
  "num_cpus",
  "paw",
- "regex",
  "serde",
  "serde_json",
- "smol",
  "sozu-command-lib",
  "sozu-lib",
  "tempfile",
  "termion",
+ "thiserror",
  "time",
 ]
 
@@ -2010,7 +1603,7 @@ dependencies = [
  "serial_test",
  "sha2",
  "slab",
- "socket2 0.5.5",
+ "socket2",
  "sozu-command-lib",
  "thiserror",
  "time",
@@ -2076,9 +1669,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
 dependencies = [
  "cfg-if",
- "fastrand 2.0.1",
+ "fastrand",
  "redox_syscall",
- "rustix 0.38.30",
+ "rustix",
  "windows-sys 0.52.0",
 ]
 
@@ -2192,7 +1785,7 @@ dependencies = [
  "mio",
  "num_cpus",
  "pin-project-lite",
- "socket2 0.5.5",
+ "socket2",
  "windows-sys 0.48.0",
 ]
 
@@ -2334,12 +1927,6 @@ version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
 
-[[package]]
-name = "waker-fn"
-version = "1.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f3c4517f54858c779bbcbf228f4fca63d121bf85fbecb2dc578cdf4a39395690"
-
 [[package]]
 name = "want"
 version = "0.3.1"
@@ -2370,7 +1957,7 @@ dependencies = [
  "either",
  "home",
  "once_cell",
- "rustix 0.38.30",
+ "rustix",
 ]
 
 [[package]]
@@ -2529,9 +2116,9 @@ checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04"
 
 [[package]]
 name = "winnow"
-version = "0.5.34"
+version = "0.5.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7cf47b659b318dccbd69cc4797a39ae128f533dce7902a1096044d1967b9c16"
+checksum = "1931d78a9c73861da0134f453bb1f790ce49b2e30eba8410b4b79bac72b46a2d"
 dependencies = [
  "memchr",
 ]
diff --git a/bin/Cargo.toml b/bin/Cargo.toml
index 317b61f3a..b66b4da1b 100644
--- a/bin/Cargo.toml
+++ b/bin/Cargo.toml
@@ -14,22 +14,12 @@ authors = [
   "Florentin Dubois ",
 ]
 categories = ["network-programming"]
-edition="2021"
+edition = "2021"
 rust-version = "1.70.0"
-include = [
-  "README.md",
-  "Cargo.toml",
-  "src/**/*",
-]
+include = ["README.md", "Cargo.toml", "src/**/*"]
 
 [dependencies]
-anyhow = "^1.0.79"
-async-dup = "^1.2.4"
-async-io = "^1.13.0"
 clap = { version = "^4.4.18", features = ["derive"] }
-futures = "^0.3.30"
-futures-lite = "^1.13.0"
-hex = "^0.4.3"
 jemallocator = { version = "^0.5.4", optional = true }
 libc = "^0.2.152"
 log = "^0.4.20"
@@ -40,13 +30,12 @@ paw = "^1.0.0"
 serde = { version = "^1.0.195", features = ["derive"] }
 serde_json = "^1.0.111"
 time = "^0.3.29"
-regex = "^1.10.3"
-smol = "^1.3.0"
 tempfile = "^3.9.0"
 termion = "^3.0.0"
-
 sozu-command-lib = { path = "../command", version = "^0.15.19" }
 sozu-lib = { path = "../lib", version = "^0.15.19" }
 
+thiserror = "^1.0.49"
+
 [target.'cfg(target_os="linux")'.dependencies]
 num_cpus = "^1.16.0"
diff --git a/bin/src/cli.rs b/bin/src/cli.rs
index cff0293ba..6fedf322a 100644
--- a/bin/src/cli.rs
+++ b/bin/src/cli.rs
@@ -109,7 +109,10 @@ pub enum SubCmd {
         #[clap(long = "hard", help = "do not wait for connections to finish")]
         hard: bool,
     },
-    #[clap(name = "upgrade", about = "upgrade the proxy")]
+    #[clap(
+        name = "upgrade",
+        about = "upgrade the main process OR a specific worker. Specify a longer timeout."
+    )]
     Upgrade {
         #[clap(long = "worker", help = "upgrade a specific worker")]
         worker: Option<u32>,
diff --git a/bin/src/command/mod.rs b/bin/src/command/mod.rs
index 20cb0d609..e811235b6 100644
--- a/bin/src/command/mod.rs
+++ b/bin/src/command/mod.rs
@@ -1,1088 +1,193 @@
+mod requests;
+pub mod server;
+pub mod sessions;
+pub mod upgrade;
+
 use std::{
-    collections::{HashMap, HashSet},
-    fs,
-    os::unix::{
-        fs::PermissionsExt,
-        io::{AsRawFd, FromRawFd, IntoRawFd},
-        net::{UnixListener, UnixStream},
-    },
-    path::PathBuf,
+    fs, io::Error as IoError, num::ParseIntError, os::unix::fs::PermissionsExt, path::PathBuf,
 };
 
-use anyhow::{bail, Context};
-use async_dup::Arc;
-use async_io::Async;
-use futures::{
-    channel::{
-        mpsc::{channel, Receiver, Sender},
-        oneshot,
-    },
-    {SinkExt, StreamExt},
-};
-use futures_lite::{
-    future,
-    io::{AsyncBufReadExt, AsyncWriteExt, BufReader},
-};
-use nix::{
-    sys::signal::{kill, Signal},
-    unistd::Pid,
-};
-use serde::{Deserialize, Serialize};
+use mio::net::UnixListener;
 
 use sozu_command_lib::{
-    config::Config,
+    config::{Config, ConfigError},
     logging::setup_logging_with_config,
-    proto::command::{
-        request::RequestType, response_content::ContentType, MetricsConfiguration, Request,
-        Response, ResponseContent, ResponseStatus, RunState, Status,
-    },
-    request::WorkerRequest,
-    response::WorkerResponse,
-    scm_socket::{Listeners, ScmSocket},
-    state::ConfigState,
 };
 
 use crate::{
-    get_executable_path,
-    upgrade::{SerializedWorker, UpgradeData},
-    util,
-    worker::{start_worker, Worker},
+    cli::Args,
+    command::{requests::load_static_config, server::CommandHub},
+    util::{get_config_file_path, get_executable_path, setup_metrics, write_pid_file, UtilError},
 };
 
-mod requests;
-
-/// The CommandServer receives these CommandMessages, either from within Sōzu,
-/// or from without, in which case they are ALWAYS of the Clientrequest variant.
-#[derive(Debug)]
-enum CommandMessage {
-    ClientNew {
-        client_id: String,
-        sender: Sender<Response>, // to send things back to the client
-    },
-    ClientClose {
-        client_id: String,
-    },
-    ClientRequest {
-        client_id: String,
-        request: Request,
-    },
-    WorkerResponse {
-        worker_id: u32,
-        response: WorkerResponse,
-    },
-    WorkerClose {
-        worker_id: u32,
-    },
-    Advancement {
-        client_id: String,
-        advancement: Advancement,
-    },
-    MasterStop,
-}
-
-#[derive(PartialEq, Eq, Clone, Debug)]
-pub enum Advancement {
-    Error(String),
-    Processing(String),
-    Ok(Success),
+use self::server::{HubError, ServerError};
+
+#[derive(thiserror::Error, Debug)]
+pub enum StartError {
+    #[error("failed to load config: {0}")]
+    LoadConfig(ConfigError),
+    #[error("could not get path of config file: {0}")]
+    GetConfigPath(UtilError),
+    #[error("could not delete previous socket at {0}: {1}")]
+    RemoveSocket(PathBuf, IoError),
+    #[error("could not bind to listener: {0}")]
+    BindToListener(IoError),
+    #[error("could not write PID file of main process: {0}")]
+    WritePidFile(UtilError),
+    #[error("failed to set metrics on the main process: {0}")]
+    SetupMetrics(UtilError),
+    #[error("failed to get executable path: {0}")]
+    GetExecutablePath(UtilError),
+    #[error("could not get path to the command socket: {0}")]
+    GetSocketPath(ConfigError),
+    #[error("could not create command hub: {0}")]
+    CreateCommandHub(HubError),
+    #[error("could not load file: {0}")]
+    LoadProcFile(ConfigError),
+    #[error("could not parse system max file descriptors: {0}")]
+    ParseSystemMaxFd(ParseIntError),
+    #[error("Too many allowed connections for a worker")]
+    TooManyAllowedConnections,
+    #[error("could not set the unix socket permissions: {0}")]
+    SetPermissions(IoError),
+    #[error("could not launch new worker: {0}")]
+    LaunchWorker(ServerError),
 }
 
-/// Indicates success of either inner Sōzu logic and of handling the ClientRequest,
-/// in which case Success caries the response data.
-#[derive(PartialEq, Eq, Clone, Debug)]
-pub enum Success {
-    CertificatesFromTheState(ResponseContent),
-    ClientClose(String), // the client id
-    ClientNew(String),   // the client id
-    HandledClientRequest,
-    ListFrontends(ResponseContent), // the list of frontends
-    ListListeners(ResponseContent), // the list of listeners
-    ListWorkers(ResponseContent),
-    LoadState(String, usize, usize), // state path, oks, errors
-    Logging(String),                 // new logging level
-    Metrics(MetricsConfiguration),   // enable / disable / clear metrics on the proxy
-    MasterStop,
-    // this should contain CommandResponseData but the logic does not return anything
-    // is this logic gone into sozu_command_lib::proxy::Query::Metrics(_) ?
-    // Metrics,
-    NotifiedClient(String), // client id
-    PropagatedWorkerEvent,
-    Query(ResponseContent),
-    ReloadConfiguration(usize, usize), // ok, errors
-    RequestCounts(ResponseContent),
-    SaveState(usize, String), // amount of written commands, path of the saved state
-    Status(ResponseContent),  // Vec<WorkerInfo>
-    SubscribeEvent(String),
-    UpgradeMain(i32),   // pid of the new main process
-    UpgradeWorker(u32), // worker id
-    WorkerKilled(u32),  // worker id
-    WorkerLaunched(u32), // worker id
-    WorkerRequest,
-    WorkerResponse,
-    WorkerRestarted(u32), // worker id
-    WorkerStopped(u32),   // worker id
-}
-
-// This is how success is logged on Sōzu, and, given the case, manifested to the client
-impl std::fmt::Display for Success {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self {
-            Self::CertificatesFromTheState(_) => {
-                write!(f, "Successfully queried certificates from the state")
-            }
-            Self::ClientClose(id) => write!(f, "Close client: {id}"),
-            Self::ClientNew(id) => write!(f, "New client successfully added: {id}"),
-            Self::HandledClientRequest => write!(f, "Successfully handled the client request"),
-            Self::ListFrontends(_) => write!(f, "Successfully gathered the list of frontends"),
-            Self::ListListeners(_) => write!(f, "Successfully listed all listeners"),
-            Self::ListWorkers(_) => write!(f, "Successfully listed all workers"),
-            Self::LoadState(path, ok, error) => write!(
-                f,
-                "Successfully loaded state from path {path}, {ok} ok messages, {error} errors"
-            ),
-            Self::Logging(logging_filter) => {
-                write!(f, "Successfully set the logging level to {logging_filter}")
-            }
-            Self::Metrics(metrics_cfg) => {
-                write!(f, "Successfully set the metrics to {metrics_cfg:?}")
-            }
-            Self::MasterStop => write!(f, "stopping main process"),
-            Self::NotifiedClient(id) => {
-                write!(f, "Successfully notified client {id} of the advancement")
-            }
-            Self::PropagatedWorkerEvent => {
-                write!(f, "Sent worker response to all subscribing clients")
-            }
-            Self::Query(_) => write!(f, "Ran the query successfully"),
-            Self::ReloadConfiguration(ok, error) => write!(
-                f,
-                "Successfully reloaded configuration, ok: {ok}, errors: {error}"
-            ),
-            Self::RequestCounts(_) => write!(f, "count requests"),
-            Self::SaveState(counter, path) => {
-                write!(f, "saved {counter} config messages to {path}")
-            }
-            Self::Status(_) => {
-                write!(f, "Sent a status response to client")
-            }
-            Self::SubscribeEvent(client_id) => {
-                write!(f, "Successfully Added {client_id} to subscribers")
-            }
-            Self::UpgradeMain(pid) => write!(
-                f,
-                "new main process launched with pid {pid}, closing the old one"
-            ),
-            Self::UpgradeWorker(id) => {
-                write!(f, "Successfully upgraded worker with new id: {id}")
-            }
-            Self::WorkerKilled(id) => write!(f, "Successfully killed worker {id}"),
-            Self::WorkerLaunched(id) => write!(f, "Successfully launched worker {id}"),
-            Self::WorkerRequest => write!(f, "Successfully executed the request on all workers"),
-            Self::WorkerResponse => write!(f, "Successfully handled worker response"),
-            Self::WorkerRestarted(id) => write!(f, "Successfully restarted worker {id}"),
-            Self::WorkerStopped(id) => write!(f, "Successfully stopped worker {id}"),
-        }
-    }
-}
-
-#[derive(Deserialize, Serialize, Debug)]
-pub struct ProxyConfiguration {
-    id: String,
-    state: ConfigState,
-}
-
-pub struct CommandServer {
-    /// file descriptor of the unix listener socket, usually "sozu.sock"
-    unix_listener_fd: i32,
-    /// this sender is cloned and distributed around, to send messages back
-    command_tx: Sender<CommandMessage>,
-    /// where the main loop receives messages
-    command_rx: Receiver<CommandMessage>,
-    /// All client loops. id -> cloned command_tx
-    clients: HashMap<String, Sender<Response>>,
-    /// handles to the workers as seen from the main process
-    workers: Vec<Worker>,
-    /// A map of requests sent to workers.
-    /// Any function requesting a worker will log the request id in here, associated
-    /// with a sender and the worker id. This sender will be used to notify the function of the worker's
-    /// response (and worker id).
-    /// In certain cases, the same response may need to be transmitted several
-    /// times over. Therefore, a number is recorded next to the sender in
-    /// the hashmap.
-    in_flight: HashMap<
-        String, // the request id
-        (
-            futures::channel::mpsc::Sender<(WorkerResponse, u32)>, // (response, worker id) to notify whoever sent the Request
-            usize, // the number of expected responses
-        ),
-    >,
-    event_subscribers: HashSet<String>,
-    state: ConfigState,
-    config: Config,
-    /// id of the next worker to be spawned
-    next_worker_id: u32,
-    /// the path to the sozu executable, used to spawn workers
-    executable_path: String,
-    /// caching the number of backends instead of going through the whole state.backends hashmap
-    backends_count: usize,
-    /// caching the number of frontends instead of going through the whole state.http/hhtps/tcp_fronts hashmaps
-    frontends_count: usize,
-    accept_cancel: Option<oneshot::Sender<()>>,
-}
-
-impl CommandServer {
-    fn new(
-        fd: i32,
-        config: Config,
-        command_tx: Sender<CommandMessage>,
-        command_rx: Receiver<CommandMessage>,
-        mut workers: Vec<Worker>,
-        accept_cancel: oneshot::Sender<()>,
-    ) -> anyhow::Result<Self> {
-        //FIXME
-        if config.metrics.is_some() {
-            /*METRICS.with(|metrics| {
-                if let Some(sock) = (*metrics.borrow_mut()).socket_mut() {
-                    poll.registry().register(sock, Token(1), Interest::WRITABLE).expect("should register the metrics socket");
-                } else {
-                    error!("could not register metrics socket");
-                }
-            });*/
-        }
-
-        let state: ConfigState = Default::default();
-
-        for worker in workers.iter_mut() {
-            let main_to_worker_channel = worker
-                .worker_channel
-                .take()
-                .with_context(|| format!("No channel present in worker {}", worker.id))?
-                .sock;
-            let (worker_tx, worker_rx) = channel(10000);
-            worker.sender = Some(worker_tx);
-
-            let main_to_worker_stream = Async::new(unsafe {
-                let fd = main_to_worker_channel.into_raw_fd();
-                UnixStream::from_raw_fd(fd)
-            })
-            .with_context(|| "Could not get a unix stream from the file descriptor")?;
-
-            let id = worker.id;
-            let command_tx = command_tx.clone();
-            smol::spawn(async move {
-                worker_loop(id, main_to_worker_stream, command_tx, worker_rx).await;
-            })
-            .detach();
-        }
-
-        let next_id = workers.len() as u32;
-        let executable_path = unsafe { get_executable_path()? };
-        let backends_count = state.count_backends();
-        let frontends_count = state.count_frontends();
-
-        Ok(CommandServer {
-            unix_listener_fd: fd,
-            config,
-            state,
-            command_tx,
-            command_rx,
-            clients: HashMap::new(),
-            workers,
-            event_subscribers: HashSet::new(),
-            in_flight: HashMap::new(),
-            next_worker_id: next_id,
-            executable_path,
-            backends_count,
-            frontends_count,
-            accept_cancel: Some(accept_cancel),
-        })
-    }
-
-    pub async fn run(&mut self) {
-        while let Some(command) = self.command_rx.next().await {
-            let result: anyhow::Result<Success> = match command {
-                CommandMessage::ClientNew { client_id, sender } => {
-                    // this appears twice, which is weird
-                    debug!("adding new client {}", client_id);
-                    self.clients.insert(client_id.to_owned(), sender);
-                    Ok(Success::ClientNew(client_id))
-                }
-                CommandMessage::ClientClose { client_id } => {
-                    debug!("removing client {}", client_id);
-                    self.clients.remove(&client_id);
-                    self.event_subscribers.remove(&client_id);
-                    Ok(Success::ClientClose(client_id))
-                }
-                CommandMessage::ClientRequest { client_id, request } => {
-                    self.handle_client_request(client_id, request).await
-                }
-                CommandMessage::WorkerClose { worker_id } => self
-                    .handle_worker_close(worker_id)
-                    .await
-                    .with_context(|| "Could not close worker"),
-                CommandMessage::WorkerResponse {
-                    worker_id,
-                    response,
-                } => self
-                    .handle_worker_response(worker_id, response)
-                    .await
-                    .with_context(|| "Could not handle worker response"),
-                CommandMessage::Advancement {
-                    client_id,
-                    advancement: response,
-                } => {
-                    let success_result = self
-                        .notify_advancement_to_client(client_id, response.clone())
-                        .await;
-                    if let Advancement::Ok(Success::UpgradeMain(_)) = response {
-                        std::thread::sleep(std::time::Duration::from_secs(2));
-                        info!("shutting down old main");
-                        std::process::exit(0);
-                    };
-                    success_result
-                }
-                CommandMessage::MasterStop => {
-                    info!("stopping main process");
-                    Ok(Success::MasterStop)
-                }
-            };
-
-            match result {
-                Ok(request_success) => {
-                    trace!("request OK: {}", request_success);
-
-                    // perform shutdowns
-                    if request_success == Success::MasterStop {
-                        // breaking the loop brings run() to return and ends Sōzu
-                        // shouldn't we have the same break for both shutdowns?
-                        break;
-                    }
-                }
-                Err(error) => {
-                    // log the error on the main process without stopping it
                    error!("Failed request: {:#?}", error);
-                }
-            }
-        }
-    }
-
-    pub fn generate_upgrade_data(&self) -> UpgradeData {
-        let workers: Vec<SerializedWorker> = self
-            .workers
-            .iter()
-            .map(SerializedWorker::from_worker)
-            .collect();
-        //FIXME: ensure there's at least one worker
-        let state = self.state.clone();
-
-        UpgradeData {
-            command_socket_fd: self.unix_listener_fd,
-            config: self.config.clone(),
-            workers,
-            state,
-            next_id: self.next_worker_id,
-            //token_count: self.token_count,
-        }
-    }
-
-    pub fn from_upgrade_data(upgrade_data: UpgradeData) -> anyhow::Result<CommandServer> {
-        let UpgradeData {
-            command_socket_fd,
-            config,
-            workers: serialized_workers,
-            state,
-            next_id,
-        } = upgrade_data;
-
-        debug!("listener is: {}", command_socket_fd);
-        let async_listener = Async::new(unsafe { UnixListener::from_raw_fd(command_socket_fd) })?;
-
-        let (accept_cancel_tx, accept_cancel_rx) = oneshot::channel();
-        let (command_tx, command_rx) = channel(10000);
-        let cloned_command_tx = command_tx.clone();
-        let cloned_config = config.clone();
-
-        smol::spawn(accept_clients(
-            cloned_command_tx,
-            async_listener,
-            accept_cancel_rx,
-            cloned_config,
-        ))
-        .detach();
-
-        let tx = command_tx.clone();
-
-        let mut workers: Vec<Worker> = Vec::new();
-
-        for serialized in serialized_workers.iter() {
-            if serialized.run_state == RunState::Stopped
-                || serialized.run_state == RunState::Stopping
-            {
-                continue;
-            }
-
-            let (worker_tx, worker_rx) = channel(10000);
-            let sender = Some(worker_tx);
-
-            debug!("deserializing worker: {:?}", serialized);
-            let worker_stream = Async::new(unsafe { UnixStream::from_raw_fd(serialized.fd) })
-                .with_context(|| "Could not create an async unix stream to spawn the worker")?;
-
-            let id = serialized.id;
-            let command_tx = tx.clone();
-            //async fn worker(id: u32, sock: Async<UnixStream>, tx: Sender<CommandMessage>, rx: Receiver<()>) -> std::io::Result<()> {
-            smol::spawn(async move {
-                worker_loop(id, worker_stream, command_tx, worker_rx).await;
-            })
-            .detach();
-
-            let scm_socket = ScmSocket::new(serialized.scm)
-                .with_context(|| "Could not get scm to create worker")?;
-
-            let worker = Worker {
-                worker_channel_fd: serialized.fd,
-                id: serialized.id,
-                worker_channel: None,
-                sender,
-                pid: serialized.pid,
-                run_state: serialized.run_state,
-                queue: serialized.queue.clone().into(),
-                scm_socket,
-            };
-            workers.push(worker);
-        }
-
-        let config_state = state.clone();
-
-        let backends_count = config_state.count_backends();
-        let frontends_count = config_state.count_frontends();
-
-        let executable_path = unsafe { get_executable_path()? };
-
-        Ok(CommandServer {
-            unix_listener_fd: command_socket_fd,
-            config,
-            state,
-            command_tx,
-            command_rx,
-            clients: HashMap::new(),
-            workers,
-            event_subscribers: HashSet::new(),
-            in_flight: HashMap::new(),
-            next_worker_id: next_id,
-            executable_path,
-            backends_count,
-            frontends_count,
-            accept_cancel: Some(accept_cancel_tx),
-        })
-    }
-
-    pub fn disable_cloexec_before_upgrade(&mut self) -> anyhow::Result<()> {
-        for worker in self.workers.iter_mut() {
-            if worker.run_state == RunState::Running {
-                let _ = util::disable_close_on_exec(worker.worker_channel_fd).map_err(|e| {
-                    error!(
-                        "could not disable close on exec for worker {}: {}",
-                        worker.id, e
-                    );
-                });
-            }
-        }
-        trace!(
-            "disabling cloexec on listener with file descriptor: {}",
-            self.unix_listener_fd
-        );
-        util::disable_close_on_exec(self.unix_listener_fd)?;
-        Ok(())
-    }
-
-    pub fn enable_cloexec_after_upgrade(&mut self) -> anyhow::Result<()> {
-        for worker in self.workers.iter_mut() {
-            if worker.run_state == RunState::Running {
-                let _ = util::enable_close_on_exec(worker.worker_channel_fd).map_err(|e| {
-                    error!(
-                        "could not enable close on exec for worker {}: {}",
-                        worker.id, e
-                    );
-                });
-            }
-        }
-        util::enable_close_on_exec(self.unix_listener_fd)?;
-        Ok(())
-    }
-
-    pub async fn load_static_cluster_configuration(&mut self) -> anyhow::Result<()> {
-        let (tx, mut rx) = futures::channel::mpsc::channel(self.workers.len() * 2);
-
-        let mut total_message_count = 0usize;
-
-        //FIXME: too many loops, this could be cleaner
-        for message in self.config.generate_config_messages()? {
-            let request = message.content;
-            if let Err(e) = self.state.dispatch(&request) {
-                error!("Could not execute request on state: {:#}", e);
-            }
-
-            if let &Some(RequestType::AddCertificate(_)) = &request.request_type {
-                debug!("config generated AddCertificate( ... )");
-            } else {
-                debug!("config generated {:?}", request);
-            }
-
-            let mut count = 0usize;
-            for worker in self.workers.iter_mut().filter(|worker| worker.is_active()) {
-                worker.send(message.id.clone(), request.clone()).await;
-                count += 1;
-            }
-
-            if count == 0 {
-                // FIXME: should send back error here
-                error!("no worker found");
-            } else {
-                self.in_flight
-                    .insert(message.id.clone(), (tx.clone(), count));
-                total_message_count += count;
-            }
-        }
-
-        self.backends_count = self.state.count_backends();
-        self.frontends_count = self.state.count_frontends();
-        gauge!("configuration.clusters", self.state.clusters.len());
-        gauge!("configuration.backends", self.backends_count);
-        gauge!("configuration.frontends", self.frontends_count);
-
-        smol::spawn(async move {
-            let mut ok = 0usize;
-            let mut error = 0usize;
-
-            let mut i = 0;
-            while let Some((proxy_response, _)) = rx.next().await {
-                match proxy_response.status {
-                    ResponseStatus::Ok => {
-                        ok += 1;
-                    }
-                    ResponseStatus::Processing => {
-                        //info!("metrics processing");
-                        continue;
-                    }
-                    ResponseStatus::Failure => {
-                        error!(
-                            "error handling configuration message {}: {}",
-                            proxy_response.id, proxy_response.message
-                        );
-                        error += 1;
-                    }
-                };
-
-                i += 1;
-                if i == total_message_count {
-                    break;
-                }
-            }
-
-            if error == 0 {
-                info!("loading state: {} ok messages, 0 errors", ok);
-            } else {
-                error!("loading state: {} ok messages, {} errors", ok, error);
-            }
-        })
-        .detach();
-        Ok(())
-    }
-
-    /// in case a worker has crashed while Running and automatic_worker_restart is set to true
-    pub async fn restart_worker(&mut self, worker_id: u32) -> anyhow::Result<()> {
-        let worker_to_upgrade = &mut (self
-            .workers
-            .get_mut(worker_id as usize)
-            .with_context(|| "there should be a worker at that token")?);
-
-        match kill(Pid::from_raw(worker_to_upgrade.pid), None) {
-            Ok(_) => {
-                error!(
-                    "worker process {} (PID = {}) is alive but the worker must have crashed. Killing and replacing",
-                    worker_to_upgrade.id, worker_to_upgrade.pid
-                );
-            }
-            Err(_) => {
-                error!(
-                    "worker process {} (PID = {}) not answering, killing and replacing",
-                    worker_to_upgrade.id, worker_to_upgrade.pid
-                );
-            }
-        }
-
-        kill(Pid::from_raw(worker_to_upgrade.pid), Signal::SIGKILL)
-            .with_context(|| "failed to kill the worker process")?;
-
-        worker_to_upgrade.run_state = RunState::Stopped;
-
-        incr!("worker_restart");
-
-        let new_worker_id = self.next_worker_id;
-        let listeners = Some(Listeners {
-            http: Vec::new(),
-            tls: Vec::new(),
-            tcp: Vec::new(),
-        });
-
-        let mut new_worker = start_worker(
-            new_worker_id,
-            &self.config,
-            self.executable_path.clone(),
-            &self.state,
-            listeners,
-        )
-        .with_context(|| format!("Could not start new worker {new_worker_id}"))?;
+pub fn begin_main_process(args: &Args) -> Result<(), StartError> {
+    let config_file_path = get_config_file_path(args).map_err(StartError::GetConfigPath)?;
 
-        info!("created new worker: {}", new_worker_id);
-        self.next_worker_id += 1;
+    let config = Config::load_from_path(config_file_path).map_err(StartError::LoadConfig)?;
 
-        let sock = new_worker
-            .worker_channel
-            .take()
-            .with_context(|| {
-                format!(
-                    "the new worker with id {} does not have a channel",
-                    new_worker.id
-                )
-            })? // this used to crash with unwrap(), do we still want to crash?
-            .sock;
-        let (worker_tx, worker_rx) = channel(10_000);
-        new_worker.sender = Some(worker_tx);
-
-        let stream = Async::new(unsafe {
-            let fd = sock.into_raw_fd();
-            UnixStream::from_raw_fd(fd)
-        })?;
-
-        let new_worker_id = new_worker.id;
-        let command_tx = self.command_tx.clone();
-        smol::spawn(async move {
-            worker_loop(new_worker_id, stream, command_tx, worker_rx).await;
-        })
-        .detach();
-
-        let mut requests = self.state.generate_activate_requests();
-        for (count, request) in requests.drain(..).enumerate() {
-            new_worker
-                .send(format!("RESTART-{new_worker_id}-ACTIVATE-{count}"), request)
-                .await;
-        }
-
-        new_worker
-            .send(
-                format!("RESTART-{new_worker_id}-STATUS"),
-                RequestType::Status(Status {}).into(),
-            )
-            .await;
-
-        self.workers.push(new_worker);
-
-        Ok(())
-    }
-
-    async fn handle_worker_close(&mut self, id: u32) -> anyhow::Result<Success> {
-        info!("removing worker {}", id);
-
-        if let Some(worker) = self.workers.iter_mut().find(|w| w.id == id) {
-            // In case a worker crashes and should be restarted
-            if self.config.worker_automatic_restart && worker.run_state == RunState::Running {
-                info!("Automatically restarting worker {}", id);
-                match self.restart_worker(id).await {
-                    Ok(()) => info!("Worker {} has automatically restarted!", id),
-                    Err(e) => error!("Could not restart worker {}: {}", id, e),
-                }
-                return Ok(Success::WorkerRestarted(id));
-            }
-
-            info!("Closing the worker {}.", worker.id);
-            if !worker.the_pid_is_alive() {
-                info!("Worker {} is dead, setting to Stopped.", worker.id);
-                worker.run_state = RunState::Stopped;
-                return Ok(Success::WorkerStopped(id));
-            }
-
-            info!(
-                "Worker {} is not dead but should be. Let's kill it.",
-                worker.id
-            );
-
-            match kill(Pid::from_raw(worker.pid), Signal::SIGKILL) {
-                Ok(()) => {
-                    info!("Worker {} was successfully killed", id);
-                    worker.run_state = RunState::Stopped;
-                    return Ok(Success::WorkerKilled(id));
-                }
-                Err(e) => {
-                    return Err(e).with_context(|| "failed to kill the worker process");
-                }
-            }
-        }
-        bail!(format!("Could not find worker {id}"))
-    }
-
-    async fn handle_worker_response(
-        &mut self,
-        worker_id: u32,
-        response: WorkerResponse,
-    ) -> anyhow::Result<Success> {
-        // Notify the client with Processing in case of a proxy event
-        if let Some(ResponseContent {
-            content_type: Some(ContentType::Event(event)),
-        }) = response.content
-        {
-            for client_id in self.event_subscribers.iter() {
-                if let Some(client_tx) = self.clients.get_mut(client_id) {
-                    let event = Response::new(
-                        ResponseStatus::Processing,
-                        format!("{worker_id}"),
-                        Some(ContentType::Event(event.clone()).into()),
-                    );
-                    client_tx
-                        .send(event)
-                        .await
-                        .with_context(|| format!("could not send message to client {client_id}"))?
-                }
-            }
-            return Ok(Success::PropagatedWorkerEvent);
-        }
+    setup_logging_with_config(&config, "MAIN");
+    info!("Starting up");
+    setup_metrics(&config).map_err(StartError::SetupMetrics)?;
+    write_pid_file(&config).map_err(StartError::WritePidFile)?;
 
-        // Notify the function that sent the request to which the worker responded.
-        // The in_flight map contains the id of each sent request, together with a sender
-        // we use to send the response to.
-        match self.in_flight.remove(&response.id) {
-            None => {
-                // FIXME: this message happens a lot at startup because AddCluster
-                // messages receive responses from each of the HTTP, HTTPS and TCP
-                // proxys. The clusters list should be merged
-                debug!("unknown response id: {}", response.id);
-            }
-            Some((mut requester_tx, mut expected_responses)) => {
-                let response_id = response.id.clone();
+    update_process_limits(&config)?;
 
-                // if a worker returned Ok or Error, we're not expecting any more
-                // messages with this id from it
-                match response.status {
-                    ResponseStatus::Ok | ResponseStatus::Failure => {
-                        expected_responses -= 1;
-                    }
-                    _ => {}
-                };
+    let executable_path = unsafe { get_executable_path().map_err(StartError::GetExecutablePath)? };
 
-                if requester_tx
-                    .send((response.clone(), worker_id))
-                    .await
-                    .is_err()
-                {
-                    error!("Failed to send worker response back: {}", response);
-                };
+    let command_socket_path = config
+        .command_socket_path()
+        .map_err(StartError::GetSocketPath)?;
 
-                // reinsert the message_id and sender into the hashmap, for later reuse
-                if expected_responses > 0 {
-                    self.in_flight
-                        .insert(response_id, (requester_tx, expected_responses));
-                }
-            }
-        }
-        Ok(Success::WorkerResponse)
-    }
-}
-
-pub fn start_server(
-    config: Config,
-    command_socket_path: String,
-    workers: Vec<Worker>,
-) -> anyhow::Result<()> {
     let path = PathBuf::from(&command_socket_path);
 
     if fs::metadata(&path).is_ok() {
         info!("A socket is already present. Deleting...");
-        fs::remove_file(&path)
-            .with_context(|| format!("could not delete previous socket at {path:?}"))?;
+        fs::remove_file(&path).map_err(|io_err| StartError::RemoveSocket(path.clone(), io_err))?;
     }
 
-    let unix_listener = match UnixListener::bind(&path) {
-        Ok(unix_listener) => unix_listener,
-        Err(e) => {
-            error!("could not create unix socket: {:?}", e);
-            // the workers did not even get the configuration, we can kill them right away
-            for worker in workers {
-                error!("killing worker n°{} (PID {})", worker.id, worker.pid);
-                let _ = kill(Pid::from_raw(worker.pid), Signal::SIGKILL).map_err(|e| {
-                    error!("could not kill worker: {:?}", e);
-                });
-            }
-            bail!("couldn't start server");
-        }
-    };
+    let unix_listener = UnixListener::bind(&path).map_err(StartError::BindToListener)?;
 
-    if let Err(e) = fs::set_permissions(&path, fs::Permissions::from_mode(0o600)) {
-        error!("could not set the unix socket permissions: {:?}", e);
-        let _ = fs::remove_file(&path).map_err(|e2| {
-            error!("could not remove the unix socket: {:?}", e2);
-        });
-        // the workers did not even get the configuration, we can kill them right away
-        for worker in workers {
-            error!("killing worker n°{} (PID {})", worker.id, worker.pid);
-            let _ = kill(Pid::from_raw(worker.pid), Signal::SIGKILL).map_err(|e| {
-                error!("could not kill worker: {:?}", e);
-            });
-        }
-        bail!("couldn't start server");
-    }
+    fs::set_permissions(&path, fs::Permissions::from_mode(0o600))
+        .map_err(StartError::SetPermissions)?;
 
-    future::block_on(async {
-        // Create a listener.
-        let listener_fd = unix_listener.as_raw_fd();
-        let async_listener = Async::new(unix_listener)?;
-        info!("Listening on {:?}", async_listener.get_ref().local_addr()?);
+    // Create a copy of the state path to load state later
+    let saved_state_path = config.saved_state.clone();
+    let worker_count = config.worker_count;
 
-        let (accept_cancel_tx, accept_cancel_rx) = oneshot::channel();
-        let (command_tx, command_rx) = channel(10000);
-        let cloned_command_tx = command_tx.clone();
-        let cloned_config = config.clone();
+    info!("Creating command hub");
+    let mut command_hub = CommandHub::new(unix_listener, config, executable_path)
+        .map_err(StartError::CreateCommandHub)?;
 
-        smol::spawn(accept_clients(
-            cloned_command_tx,
-            async_listener,
-            accept_cancel_rx,
-            cloned_config,
-        ))
-        .detach();
+    info!("Launching workers");
+    for _ in 0..worker_count {
+        command_hub
+            .launch_new_worker(None)
+            .map_err(StartError::LaunchWorker)?;
+    }
 
-        // Create a copy of the state path to load state later
-        let saved_state_path = config.saved_state.clone();
+    info!("Load static configuration");
+    load_static_config(&mut command_hub.server, None, None);
 
-        let mut server = CommandServer::new(
-            listener_fd,
-            config,
-            command_tx,
-            command_rx,
-            workers,
-            accept_cancel_tx,
-        )?;
+    if let Some(path) = saved_state_path {
+        requests::load_state(&mut command_hub.server, None, &path);
+    }
 
-        let _ = server
-            .load_static_cluster_configuration()
-            .await
-            .map_err(|load_error| {
-                error!(
-                    "Error loading static cluster configuration: {:#}",
-                    load_error
-                )
-            });
+    command_hub.run();
 
-        if let Some(path) = saved_state_path {
-            server
-                .load_state(None, &path)
-                .await
-                .with_context(|| format!("Loading {:?} failed", &path))?;
-        }
-
-        gauge!("configuration.clusters", server.state.clusters.len());
-        gauge!("configuration.backends", server.backends_count);
-        gauge!("configuration.frontends", server.frontends_count);
-
-        info!("waiting for configuration client connections");
-        server.run().await;
-        info!("main process stopped");
-        Ok(())
-    })
+    info!("main process stopped");
+    Ok(())
 }
 
-/// spawns a client loop whenever a client connects to the socket
-async fn accept_clients(
-    mut command_tx: Sender<CommandMessage>,
-    async_listener: Async<UnixListener>,
-    accept_cancel_rx: oneshot::Receiver<()>,
-    config: Config,
-) {
-    setup_logging_with_config(&config, "MAIN");
-    let mut counter = 0usize;
-    let mut accept_cancel_rx = Some(accept_cancel_rx);
-    info!("Accepting client connections");
-    loop {
-        let accept_client = async_listener.accept();
-        futures::pin_mut!(accept_client);
-        let (stream, _) =
-            match futures::future::select(accept_cancel_rx.take().unwrap(), accept_client).await {
-                futures::future::Either::Left((_canceled, _)) => {
-                    info!("stopping listener");
-                    break;
-                }
-                futures::future::Either::Right((stream_and_addr, cancel_rx)) => {
-                    accept_cancel_rx = Some(cancel_rx);
-                    stream_and_addr.expect("Can not get unix stream to create a client loop.")
-                }
-            };
-        let (client_tx, client_rx) = channel(10000);
-
-        let client_id = format!("CL-{counter}");
+#[cfg(target_os = "linux")]
+/// We check the hard_limit. The soft_limit can be changed at runtime
+/// by the process or any user. hard_limit can only be changed by root
+fn update_process_limits(config: &Config) -> Result<(), StartError> {
+    info!("Updating process limits");
+    let wanted_opened_files = (config.max_connections as u64) * 2;
 
-        smol::spawn(client_loop(
-            client_id.clone(),
-            stream,
-            command_tx.clone(),
-            client_rx,
-        ))
-        .detach();
+    let system_max_fd = get_system_max_fd("/proc/sys/fs/file-max")?;
 
-        command_tx
-            .send(CommandMessage::ClientNew {
-                client_id,
-                sender: client_tx,
-            })
-            .await
-            .expect("Failed at sending ClientNew message");
-        counter += 1;
+    if config.max_connections > system_max_fd {
+        error!(
+            "Proxies total max_connections can't be higher than system's file-max limit. \
+            Current limit: {}, current value: {}",
+            system_max_fd, config.max_connections
+        );
+        return Err(StartError::TooManyAllowedConnections);
     }
-}
-
-/// The client loop does two things:
-/// - write everything destined to the client onto the unix stream
-/// - parse CommandRequests from the unix stream and send them to the command server
-async fn client_loop(
-    client_id: String,
-    stream: Async<UnixStream>,
-    mut command_tx: Sender<CommandMessage>,
-    mut client_rx: Receiver<Response>,
-) {
-    let read_stream = Arc::new(stream);
-    let mut write_stream = read_stream.clone();
-
-    smol::spawn(async move {
-        while let Some(response) = client_rx.next().await {
-            trace!("sending back message to client: {:?}", response);
-            let mut message: Vec<u8> = serde_json::to_string(&response)
-                .map(|string| string.into_bytes())
-                .unwrap_or_else(|_| Vec::new());
-            // separate all messages with a 0 byte
-            message.push(0);
-            let _ = write_stream.write_all(&message).await;
-        }
-    })
-    .detach();
+    // Get the soft and hard limits for the current process
+    let mut limits = libc::rlimit {
+        rlim_cur: 0,
+        rlim_max: 0,
+    };
+    unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut limits) };
 
-    debug!("will start receiving messages from client {}", client_id);
+    // Ensure we don't exceed the hard limit
+    if limits.rlim_max < wanted_opened_files {
+        error!(
+            "at least one worker can't have that many connections. \
+            current max file descriptor hard limit is: {}, \
+            configured max_connections is {} (the worker needs two file descriptors \
+            per client connection)",
+            limits.rlim_max, config.max_connections
+        );
+        return Err(StartError::TooManyAllowedConnections);
+    }
 
-    // Read the stream by splitting it on 0 bytes
-    let mut split_iterator = BufReader::new(read_stream).split(0);
-    while let Some(message) = split_iterator.next().await {
-        let message = match message {
-            Err(e) => {
-                error!("could not split message: {:?}", e);
-                break;
-            }
-            Ok(msg) => msg,
-        };
+    if limits.rlim_cur < wanted_opened_files && limits.rlim_cur != limits.rlim_max {
+        // Try to get twice what we need to be safe, or rlim_max if we exceed that
+        limits.rlim_cur = limits.rlim_max.min(wanted_opened_files * 2);
+        unsafe {
+            libc::setrlimit(libc::RLIMIT_NOFILE, &limits);
 
-        match serde_json::from_slice::<Request>(&message) {
-            Err(e) => {
-                error!("could not decode client message: {:?}", e);
-                break;
-            }
-            Ok(request) => {
-                debug!("got command request: {:?}", request);
-                let client_id = client_id.clone();
-                if let Err(e) = command_tx
-                    .send(CommandMessage::ClientRequest { client_id, request })
-                    .await
-                {
-                    error!("error sending client request to command server: {:?}", e);
-                }
-            }
+            // Refresh the data we have
+            libc::getrlimit(libc::RLIMIT_NOFILE, &mut limits);
         }
     }
 
-    // If the loop breaks, request the command server to close the client
-    if let Err(send_error) = command_tx
-        .send(CommandMessage::ClientClose {
-            client_id: client_id.to_owned(),
-        })
-        .await
-    {
+    // Ensure we don't exceed the new soft limit
+    if limits.rlim_cur < wanted_opened_files {
         error!(
-            "The client loop {} could not send ClientClose to the command server: {:?}",
-            client_id, send_error
+            "at least one worker can't have that many connections. \
+            current max file descriptor soft limit is: {}, \
+            configured max_connections is {} (the worker needs two file descriptors \
+            per client connection)",
+            limits.rlim_cur, config.max_connections
         );
+        return Err(StartError::TooManyAllowedConnections);
     }
-}
-
-/// the worker loop does two things:
-/// - write everything destined to the worker onto the unix stream
-/// - parse ProxyResponses from the unix stream and send them to the CommandServer
-async fn worker_loop(
-    worker_id: u32,
-    stream: Async<UnixStream>,
-    mut command_tx: Sender<CommandMessage>,
-    mut worker_rx: Receiver<WorkerRequest>,
-) {
-    let read_stream = Arc::new(stream);
-    let mut write_stream = read_stream.clone();
+    Ok(())
+}
 
-    smol::spawn(async move {
-        debug!("will start sending messages to worker {}", worker_id);
-        while let Some(worker_request) = worker_rx.next().await {
-            debug!("sending to worker {}: {:?}", worker_id, worker_request);
-            let mut message: Vec<u8> = serde_json::to_string(&worker_request)
-                .map(|string| string.into_bytes())
-                .unwrap_or_else(|_| Vec::new());
-
-            // separate all messages with a 0 byte
-            message.push(0);
-            let _ = write_stream.write_all(&message).await;
-        }
-    })
-    .detach();
-
-    debug!("will start receiving messages from worker {}", worker_id);
+/// To ensure we don't exceed the system maximum capacity
+fn get_system_max_fd(max_file_path: &str) -> Result<usize, StartError> {
+    let max_file = Config::load_file(max_file_path).map_err(StartError::LoadProcFile)?;
 
-    // Read the stream by splitting it on 0 bytes
-    let mut split_iterator = BufReader::new(read_stream).split(0);
-    while let Some(message) = split_iterator.next().await {
-        let message = match message {
-            Err(e) => {
-                error!("could not split message: {:?}", e);
-                break;
-            }
-            Ok(msg) => msg,
-        };
+    trace!("{}: '{}'", max_file_path, max_file);
 
-        match serde_json::from_slice::<WorkerResponse>(&message) {
-            Err(e) => {
-                error!("could not decode worker message: {:?}", e);
-                break;
-            }
-            Ok(response) => {
-                debug!("worker {} replied message: {:?}", worker_id, response);
-                let worker_id = worker_id;
-                if let Err(e) = command_tx
-                    .send(CommandMessage::WorkerResponse {
-                        worker_id,
-                        response,
-                    })
-                    .await
-                {
-                    error!("error sending worker response to command server: {:?}", e);
-                }
-            }
-        }
-    }
+    max_file
+        .trim()
+        .parse::<usize>()
+        .map_err(StartError::ParseSystemMaxFd)
+}
 
-    error!("worker loop stopped, will close the worker {}", worker_id);
-
-    // if the loop breaks, request the command server to close the worker
-    if let Err(send_error) = command_tx
-        .send(CommandMessage::WorkerClose {
-            worker_id: worker_id.to_owned(),
-        })
-        .await
-    {
-        error!(
-            "The worker loop {} could not send WorkerClose to the CommandServer: {:?}",
-            worker_id, send_error
-        );
-    }
+#[cfg(not(target_os = "linux"))]
+fn update_process_limits(_: &Config) -> Result<(), StartError> {
+    Ok(())
 }
diff --git a/bin/src/command/requests.rs b/bin/src/command/requests.rs
index 5eae67c7b..ee8493e43 100644
--- a/bin/src/command/requests.rs
+++ b/bin/src/command/requests.rs
@@ -1,15 +1,10 @@
 use std::{
-    collections::{BTreeMap, HashSet},
+    collections::{BTreeMap, HashMap},
     fs::File,
     io::{ErrorKind, Read},
-    os::unix::io::{FromRawFd, IntoRawFd},
-    os::unix::net::UnixStream,
-    time::{Duration, Instant},
 };
 
-use anyhow::{bail, Context};
-use async_io::Async;
-use futures::{channel::mpsc::*, SinkExt, StreamExt};
+use mio::Token;
 use nom::{HexDisplay, Offset};
 
 use sozu_command_lib::{
@@ -20,1421 +15,826 @@ use sozu_command_lib::{
     proto::command::{
         request::RequestType, response_content::ContentType, AggregatedMetrics, AvailableMetrics,
         CertificatesWithFingerprints, ClusterHashes, ClusterInformations, FrontendFilters,
-        MetricsConfiguration, QueryCertificatesFilters, Request, Response, ResponseContent,
-        ResponseStatus, ReturnListenSockets, RunState, SoftStop, Status, WorkerInfo, WorkerInfos,
-        WorkerResponses,
+        HardStop, QueryCertificatesFilters, QueryMetricsOptions, Request, ResponseContent,
+        ResponseStatus, RunState, SoftStop, Status, WorkerInfo, WorkerInfos, WorkerResponses,
     },
     request::WorkerRequest,
-    scm_socket::Listeners,
 };
+use sozu_lib::metrics::METRICS;
 
-use sozu::metrics::METRICS;
-
-use crate::{
-    command::{Advancement, CommandMessage, CommandServer, Success},
-    upgrade::fork_main_into_new_main,
-    worker::{start_worker, Worker},
+use crate::command::{
+    server::{
+        DefaultGatherer, Gatherer, GatheringTask, MessageClient, Server, ServerState, Timeout,
+        WorkerId,
+    },
+    sessions::{ClientSession, OptionalClient},
+    upgrade::{upgrade_main, upgrade_worker},
 };
 
-impl CommandServer {
-    pub async fn handle_client_request(
-        &mut self,
-        client_id: String,
-        request: Request,
-    ) -> anyhow::Result<Success> {
-        trace!("Received request {:?}", request);
-
-        let cloned_client_id = client_id.clone();
-        let cloned_request = request.clone();
-
-        let result: anyhow::Result<Option<Success>> = match request.request_type {
-            Some(RequestType::SaveState(path)) => self.save_state(&path).await,
-            Some(RequestType::ListWorkers(_)) => self.list_workers().await,
-            Some(RequestType::ListFrontends(filters)) => self.list_frontends(filters).await,
-            Some(RequestType::ListListeners(_)) => self.list_listeners(),
-            Some(RequestType::LoadState(path)) => self.load_state(Some(client_id), &path).await,
-            Some(RequestType::LaunchWorker(tag)) => self.launch_worker(client_id, &tag).await,
-            Some(RequestType::UpgradeMain(_)) => self.upgrade_main(client_id).await,
-            Some(RequestType::UpgradeWorker(worker_id)) => {
-                self.upgrade_worker(client_id, worker_id).await
-            }
-            Some(RequestType::ConfigureMetrics(config)) => {
-                match MetricsConfiguration::try_from(config) {
-                    Ok(config) => self.configure_metrics(client_id, config).await,
-                    Err(_) => Err(anyhow::Error::msg("wrong i32 for metrics configuration")),
-                }
-            }
-            Some(RequestType::Logging(logging_filter)) => {
-                self.set_logging_level(logging_filter, client_id).await
+impl Server {
+    pub fn handle_client_request(&mut self, client: &mut ClientSession, request: Request) {
+        let request_type = match request.request_type {
+            Some(req) => req,
+            None => {
+                error!("empty request sent by client {:?}", client);
+                return;
             }
-            Some(RequestType::SubscribeEvents(_)) => {
-                self.event_subscribers.insert(client_id.clone());
-                Ok(Some(Success::SubscribeEvent(client_id.clone())))
-            }
-            Some(RequestType::ReloadConfiguration(path)) => {
-                self.reload_configuration(client_id, path).await
-            }
-            Some(RequestType::Status(_)) => self.status(client_id).await,
-            Some(RequestType::QueryCertificatesFromTheState(filters)) => {
-                self.query_certificates_from_the_state(filters)
-            }
-            Some(RequestType::CountRequests(_)) => self.query_request_count(),
-            Some(RequestType::QueryClusterById(_))
-            | Some(RequestType::QueryCertificatesFromWorkers(_))
-            | Some(RequestType::QueryClustersByDomain(_))
-            | Some(RequestType::QueryClustersHashes(_))
-            | Some(RequestType::QueryMetrics(_)) => self.query(client_id, request).await,
-
-            // any other case is an request for the workers, except for SoftStop and HardStop.
- // TODO: we should have something like: - // RequestContent::SoftStop => self.do_something(), - // RequestContent::HardStop => self.do_nothing_and_return_early(), - // but it goes in there instead: - Some(_request_for_workers) => self.worker_requests(client_id, cloned_request).await, - None => Err(anyhow::Error::msg("Empty request")), }; - - // Notify the command server by sending using his command_tx - match result { - Ok(Some(success)) => { - info!("{}", success); - trace!("details success of the client request: {:?}", success); - return_success(self.command_tx.clone(), cloned_client_id, success).await; - } - Err(anyhow_error) => { - let formatted = format!("{anyhow_error:#}"); - error!("{:#}", formatted); - return_error(self.command_tx.clone(), cloned_client_id, formatted).await; - } - Ok(None) => { - // do nothing here. Ok(None) means the function has already returned its result - // on its own to the command server - } + match request_type { + RequestType::SaveState(path) => save_state(self, client, &path), + RequestType::LoadState(path) => load_state(self, Some(client), &path), + RequestType::ListWorkers(_) => list_workers(self, client), + RequestType::ListFrontends(inner) => list_frontend_command(self, client, inner), + RequestType::ListListeners(_) => list_listeners(self, client), + RequestType::UpgradeMain(_) => upgrade_main(self, client), + RequestType::UpgradeWorker(worker_id) => upgrade_worker(self, client, worker_id), + RequestType::SubscribeEvents(_) => subscribe_client_to_events(self, client), + RequestType::ReloadConfiguration(path) => { + load_static_config(self, Some(client), Some(&path)) + } + RequestType::Status(_) => status(self, client), + RequestType::AddCluster(_) + | RequestType::ActivateListener(_) + | RequestType::AddBackend(_) + | RequestType::AddCertificate(_) + | RequestType::AddHttpFrontend(_) + | RequestType::AddHttpListener(_) + | RequestType::AddHttpsFrontend(_) + | RequestType::AddHttpsListener(_) + | RequestType::AddTcpFrontend(_) + | RequestType::AddTcpListener(_) + | RequestType::ConfigureMetrics(_) + | RequestType::DeactivateListener(_) + | RequestType::RemoveBackend(_) + | RequestType::RemoveCertificate(_) + | RequestType::RemoveCluster(_) + | RequestType::RemoveHttpFrontend(_) + | RequestType::RemoveHttpsFrontend(_) + | RequestType::RemoveListener(_) + | RequestType::RemoveTcpFrontend(_) + | RequestType::ReplaceCertificate(_) => { + worker_request(self, client, request_type); + } + RequestType::QueryClustersHashes(_) + | RequestType::QueryClustersByDomain(_) + | RequestType::QueryCertificatesFromWorkers(_) + | RequestType::QueryClusterById(_) => { + query_clusters(self, client, request_type); + } + RequestType::QueryMetrics(inner) => query_metrics(self, client, inner), + RequestType::SoftStop(_) => stop(self, client, false), + RequestType::HardStop(_) => stop(self, client, true), + RequestType::Logging(logging_filter) => set_logging_level(self, client, logging_filter), + RequestType::QueryCertificatesFromTheState(filters) => { + query_certificates_from_main(self, client, filters) + } + RequestType::CountRequests(_) => count_requests(self, client), + + RequestType::LaunchWorker(_) => {} // not yet implemented, nor used, anywhere + RequestType::ReturnListenSockets(_) => {} // This is only implemented by workers, } - - Ok(Success::HandledClientRequest) } - pub fn query_request_count(&mut self) -> anyhow::Result> { - let request_counts = self.state.get_request_counts(); - Ok(Some(Success::RequestCounts( - 
ContentType::RequestCounts(request_counts).into(), - ))) + /// get infos from the state of the main process + fn query_main(&self, request: RequestType) -> Option { + match request { + RequestType::QueryClusterById(cluster_id) => Some( + ContentType::Clusters(ClusterInformations { + vec: self.state.cluster_state(&cluster_id).into_iter().collect(), + }) + .into(), + ), + RequestType::QueryClustersByDomain(domain) => { + let cluster_ids = self + .state + .get_cluster_ids_by_domain(domain.hostname, domain.path); + let vec = cluster_ids + .iter() + .filter_map(|cluster_id| self.state.cluster_state(cluster_id)) + .collect(); + Some(ContentType::Clusters(ClusterInformations { vec }).into()) + } + RequestType::QueryClustersHashes(_) => Some( + ContentType::ClusterHashes(ClusterHashes { + map: self.state.hash_state(), + }) + .into(), + ), + _ => None, + } } +} - pub async fn save_state(&mut self, path: &str) -> anyhow::Result> { - let mut file = File::create(path) - .with_context(|| format!("could not open file at path: {}", &path))?; - - let counter = self - .state - .write_requests_to_file(&mut file) - .with_context(|| "failed writing state to file")?; +//=============================================== +// non-scattered commands - info!("wrote {} commands to {}", counter, path); +pub fn query_certificates_from_main( + server: &mut Server, + client: &mut ClientSession, + filters: QueryCertificatesFilters, +) { + debug!( + "querying certificates in the state with filters {}", + filters + ); - Ok(Some(Success::SaveState(counter, path.into()))) - } + let certs = server.state.get_certificates(filters); - pub async fn load_state( - &mut self, - client_id: Option, - path: &str, - ) -> anyhow::Result> { - let mut file = match File::open(path) { - Ok(file) => file, - Err(err) if matches!(err.kind(), ErrorKind::NotFound) => { - info!("The state file does not exists, skipping the loading."); - self.backends_count = self.state.count_backends(); - self.frontends_count = self.state.count_frontends(); - return Ok(None); - } - Err(err) => { - return Err(err).with_context(|| format!("Cannot open file at path {path}")); - } - }; - - let mut buffer = Buffer::with_capacity(200000); + client.finish_ok_with_content( + ContentType::CertificatesWithFingerprints(CertificatesWithFingerprints { certs }).into(), + "Successfully queried certificates from the state of main process", + ); +} - info!("starting to load state from {}", path); +/// return how many requests were received by Sōzu since startup +fn count_requests(server: &mut Server, client: &mut ClientSession) { + let request_counts = server.state.get_request_counts(); - let mut message_counter = 0usize; - let mut diff_counter = 0usize; + client.finish_ok_with_content( + ContentType::RequestCounts(request_counts).into(), + "Successfully counted requests received by the state", + ); +} - let (load_state_tx, mut load_state_rx) = futures::channel::mpsc::channel(10000); - loop { - let previous = buffer.available_data(); +pub fn list_frontend_command( + server: &mut Server, + client: &mut ClientSession, + filters: FrontendFilters, +) { + match server.query_main(RequestType::ListFrontends(filters)) { + Some(response) => client.finish_ok_with_content(response, "Successfully listed frontends"), + None => client.finish_failure("main process could not list frontends"), + } +} - //FIXME: we should read in streaming here - let bytes_read = file - .read(buffer.space()) - .with_context(|| "Error reading the saved state file")?; +fn list_workers(server: &mut Server, client: &mut 
ClientSession) { + let vec = server + .workers + .values() + .map(|worker| WorkerInfo { + id: worker.id, + pid: worker.pid, + run_state: worker.run_state as i32, + }) + .collect(); - buffer.fill(bytes_read); + debug!("workers: {:?}", vec); + client.finish_ok_with_content( + ContentType::Workers(WorkerInfos { vec }).into(), + "Successfully listed workers", + ); +} - if buffer.available_data() == 0 { - debug!("Empty buffer"); - break; - } +fn list_listeners(server: &mut Server, client: &mut ClientSession) { + let vec = server.state.list_listeners(); + client.finish_ok_with_content( + ContentType::ListenersList(vec).into(), + "Successfully listed listeners", + ); +} - let mut offset = 0usize; - match parse_several_requests::(buffer.data()) { - Ok((i, requests)) => { - if !i.is_empty() { - debug!("could not parse {} bytes", i.len()); - if previous == buffer.available_data() { - bail!("error consuming load state message"); - } - } - offset = buffer.data().offset(i); - - for request in requests { - message_counter += 1; - - if self.state.dispatch(&request.content).is_ok() { - diff_counter += 1; - - let mut found = false; - let id = format!("LOAD-STATE-{}-{diff_counter}", request.id); - - for worker in - self.workers.iter_mut().filter(|worker| worker.is_active()) - { - let worker_message_id = format!("{}-{}", id, worker.id); - worker - .send(worker_message_id.clone(), request.content.clone()) - .await; - self.in_flight - .insert(worker_message_id, (load_state_tx.clone(), 1)); - - found = true; - } - - if !found { - bail!("no worker found"); - } - } - } - } - Err(nom::Err::Incomplete(_)) => { - if buffer.available_data() == buffer.capacity() { - error!( - "message too big, stopping parsing:\n{}", - buffer.data().to_hex(16) - ); - break; - } - } - Err(parse_error) => { - bail!("saved state parse error: {:?}", parse_error); - } - } - buffer.consume(offset); +fn save_state(server: &mut Server, client: &mut ClientSession, path: &str) { + debug!("saving state to file {}", path); + let mut file = match File::create(path) { + Ok(file) => file, + Err(error) => { + client.finish_failure(format!("Cannot create file at path {path}: {error}")); + return; } + }; - info!( - "stopped loading data from file, remaining: {} bytes, saw {} messages, generated {} diff messages", - buffer.available_data(), message_counter, diff_counter - ); - - if diff_counter > 0 { - info!( - "state loaded from {}, will start sending {} messages to workers", - path, diff_counter - ); - - let command_tx = self.command_tx.to_owned(); - let path = path.to_owned(); - - smol::spawn(async move { - let mut ok = 0usize; - let mut error = 0usize; - while let Some((proxy_response, _)) = load_state_rx.next().await { - match proxy_response.status { - ResponseStatus::Ok => { - ok += 1; - } - ResponseStatus::Processing => {} - ResponseStatus::Failure => { - error!("{}", proxy_response.message); - error += 1; - } - }; - debug!("ok:{}, error: {}", ok, error); - } - - let client_id = match client_id { - Some(client_id) => client_id, - None => { - match error { - 0 => info!("loading state: {} ok messages, 0 errors", ok), - _ => error!("loading state: {} ok messages, {} errors", ok, error), - } - return; - } - }; - - // notify the command server - match error { - 0 => { - return_success( - command_tx, - client_id, - Success::LoadState(path.to_string(), ok, error), - ) - .await; - } - _ => { - return_error( - command_tx, - client_id, - format!("Loading state failed, ok: {ok}, error: {error}, path: {path}"), - ) - .await; - } - } - }) - .detach(); - } 
else { - info!("no messages sent to workers: local state already had those messages"); - if let Some(client_id) = client_id { - return_success( - self.command_tx.clone(), - client_id, - Success::LoadState(path.to_string(), 0, 0), - ) - .await; - } + match server.state.write_requests_to_file(&mut file) { + Ok(counter) => { + client.finish_ok(format!("Saved {counter} config messages to {path}")); + } + Err(error) => { + client.finish_failure(format!("Failed writing state to file: {error}")); } - - self.backends_count = self.state.count_backends(); - self.frontends_count = self.state.count_frontends(); - Ok(None) } +} - pub async fn list_frontends( - &mut self, - filters: FrontendFilters, - ) -> anyhow::Result<Option<Success>> { - info!( - "Received a request to list frontends, along these filters: {:?}", - filters - ); - - let listed_frontends = self.state.list_frontends(filters); +/// change logging level on the main process, and on all workers +fn set_logging_level(server: &mut Server, client: &mut ClientSession, logging_filter: String) { + debug!("Changing main process log level to {}", logging_filter); + logging::LOGGER.with(|l| { + let directives = logging::parse_logging_spec(&logging_filter); + l.borrow_mut().set_directives(directives); + }); + + // also change / set the content of RUST_LOG so future workers / main thread + // will have the new logging filter value + ::std::env::set_var("RUST_LOG", &logging_filter); + debug!( + "Logging level now: {}", + ::std::env::var("RUST_LOG").unwrap_or("could not get RUST_LOG from env".to_string()) + ); - Ok(Some(Success::ListFrontends( - ContentType::FrontendList(listed_frontends).into(), - ))) - } + worker_request(server, client, RequestType::Logging(logging_filter)); +} - fn list_listeners(&self) -> anyhow::Result<Option<Success>> { - let listeners_list = self.state.list_listeners(); +fn subscribe_client_to_events(server: &mut Server, client: &mut ClientSession) { + info!("Subscribing client {:?} to listen to events", client.token); + server.event_subscribers.insert(client.token); +} - Ok(Some(Success::ListListeners( - ContentType::ListenersList(listeners_list).into(), - ))) - } +//=============================================== +// Query clusters - pub async fn list_workers(&mut self) -> anyhow::Result<Option<Success>> { - let workers: Vec<WorkerInfo> = self - .workers - .iter() - .map(|worker| WorkerInfo { - id: worker.id, - pid: worker.pid, - run_state: worker.run_state as i32, - }) - .collect(); +#[derive(Debug)] +pub struct QueryClustersTask { + pub client_token: Token, + pub request_type: RequestType, + pub gatherer: DefaultGatherer, + main_process_response: Option<ResponseContent>, +} - debug!("workers: {:#?}", workers); +pub fn query_clusters( + server: &mut Server, + client: &mut ClientSession, + request_content: RequestType, +) { + client.return_processing("Querying clusters..."); + + server.scatter( + request_content.clone().into(), + Box::new(QueryClustersTask { + client_token: client.token, + gatherer: DefaultGatherer::default(), + main_process_response: server.query_main(request_content.clone()), + request_type: request_content, + }), + Timeout::Default, + None, + ) +} - Ok(Some(Success::ListWorkers( - ContentType::Workers(WorkerInfos { vec: workers }).into(), - ))) +impl GatheringTask for QueryClustersTask { + fn client_token(&self) -> Option<Token> { + Some(self.client_token) } - pub fn query_certificates_from_the_state( - &self, - filters: QueryCertificatesFilters, - ) -> anyhow::Result<Option<Success>> { - debug!( - "querying certificates in the state with filters {}", - filters - ); - - let certs =
self.state.get_certificates(filters); - - Ok(Some(Success::CertificatesFromTheState( - ContentType::CertificatesWithFingerprints(CertificatesWithFingerprints { certs }) - .into(), - ))) + fn get_gatherer(&mut self) -> &mut dyn Gatherer { + &mut self.gatherer } - pub async fn launch_worker( - &mut self, - client_id: String, - _tag: &str, - ) -> anyhow::Result> { - let mut worker = start_worker( - self.next_worker_id, - &self.config, - self.executable_path.clone(), - &self.state, - None, - ) - .with_context(|| format!("Failed at creating worker {}", self.next_worker_id))?; - - return_processing( - self.command_tx.clone(), - client_id.clone(), - "Sending configuration requests to the new worker...", - ) - .await; - - info!("created new worker: {}", worker.id); - - self.next_worker_id += 1; - - let sock = worker - .worker_channel - .take() - .expect("No channel on the worker being launched") - .sock; - let (worker_tx, worker_rx) = channel(10000); - worker.sender = Some(worker_tx); - - let stream = Async::new(unsafe { - let fd = sock.into_raw_fd(); - UnixStream::from_raw_fd(fd) - })?; - - let id = worker.id; - let command_tx = self.command_tx.clone(); - - smol::spawn(async move { - super::worker_loop(id, stream, command_tx, worker_rx).await; - }) - .detach(); - - info!( - "sending listeners: to the new worker: {:?}", - worker.scm_socket.send_listeners(&Listeners { - http: Vec::new(), - tls: Vec::new(), - tcp: Vec::new(), + fn on_finish( + self: Box, + _server: &mut Server, + client: &mut OptionalClient, + _timed_out: bool, + ) { + let mut worker_responses: BTreeMap = self + .gatherer + .responses + .into_iter() + .filter_map(|(worker_id, proxy_response)| { + proxy_response + .content + .map(|response_content| (worker_id.to_string(), response_content)) }) - ); + .collect(); - let activate_requests = self.state.generate_activate_requests(); - for (count, request) in activate_requests.into_iter().enumerate() { - worker.send(format!("{id}-ACTIVATE-{count}"), request).await; + if let Some(main_response) = self.main_process_response { + worker_responses.insert(String::from("main"), main_response); } - self.workers.push(worker); - - return_success( - self.command_tx.clone(), - client_id, - Success::WorkerLaunched(id), - ) - .await; - Ok(None) + client.finish_ok_with_content( + ContentType::WorkerResponses(WorkerResponses { + map: worker_responses, + }) + .into(), + "Successfully queried clusters", + ); } +} - pub async fn upgrade_main(&mut self, client_id: String) -> anyhow::Result> { - self.disable_cloexec_before_upgrade()?; +//=============================================== +// Load static configuration - return_processing( - self.command_tx.clone(), - client_id, - "The proxy is processing the upgrade command.", - ) - .await; +#[derive(Debug)] +struct LoadStaticConfigTask { + gatherer: DefaultGatherer, + client_token: Option, +} - let upgrade_data = self.generate_upgrade_data(); +pub fn load_static_config(server: &mut Server, mut client: OptionalClient, path: Option<&str>) { + let task_id = server.new_task( + Box::new(LoadStaticConfigTask { + gatherer: DefaultGatherer::default(), + client_token: client.as_ref().map(|c| c.token), + }), + Timeout::None, + ); - let (new_main_pid, mut fork_confirmation_channel) = - fork_main_into_new_main(self.executable_path.clone(), upgrade_data) - .with_context(|| "Could not start a new main process")?; + let new_config; - if let Err(e) = fork_confirmation_channel.blocking() { - error!( - "Could not block the fork confirmation channel: {}. 
This is not normal, you may need to restart sozu", - e - ); + let config = match path { + Some(path) if !path.is_empty() => { + info!("loading static configuration at path {}", path); + new_config = Config::load_from_path(path) + .unwrap_or_else(|_| panic!("cannot load configuration from '{path}'")); + &new_config } - let received_ok_from_new_process = fork_confirmation_channel.read_message(); - debug!("upgrade channel sent {:?}", received_ok_from_new_process); - - // signaling the accept loop that it should stop - if let Err(e) = self - .accept_cancel - .take() // we should create a method on Self for this frequent procedure - .expect("No channel on the main process") - .send(()) - { - error!("could not close the accept loop: {:?}", e); + _ => { + info!("reloading static configuration"); + &server.config } + }; - if !received_ok_from_new_process - .with_context(|| "Did not receive fork confirmation from new worker")? - { - bail!("forking the new worker failed") - } - info!("wrote final message, closing"); - Ok(Some(Success::UpgradeMain(new_main_pid))) - } - - pub async fn upgrade_worker( - &mut self, - client_id: String, - worker_id: u32, - ) -> anyhow::Result> { - info!( - "client[{}] msg wants to upgrade worker {}", - client_id, worker_id - ); + client.return_processing(format!( + "Reloading static configuration at path {}", + config.config_path + )); - if !self - .workers - .iter() - .any(|worker| worker.id == worker_id && worker.is_active()) - { - bail!(format!( - "The worker {} does not exist, or is stopped / stopping.", - &worker_id - )); + let config_messages = match config.generate_config_messages() { + Ok(messages) => messages, + Err(config_err) => { + client.finish_failure(format!("could not generate new config: {}", config_err)); + return; } + }; - // same as launch_worker - let next_id = self.next_worker_id; - let mut new_worker = start_worker( - next_id, - &self.config, - self.executable_path.clone(), - &self.state, - None, - ) - .with_context(|| "failed at creating worker")?; - - return_processing( - self.command_tx.clone(), - client_id.clone(), - "Sending configuration requests to the worker", - ) - .await; - - info!("created new worker: {}", next_id); - - self.next_worker_id += 1; - - let sock = new_worker - .worker_channel - .take() - .with_context(|| "No channel on new worker".to_string())? - .sock; - let (worker_tx, worker_rx) = channel(10000); - new_worker.sender = Some(worker_tx); - - new_worker - .sender - .as_mut() - .with_context(|| "No sender on new worker".to_string())? 
- .send(WorkerRequest { - id: format!("UPGRADE-{worker_id}-STATUS"), - content: RequestType::Status(Status {}).into(), - }) - .await - .with_context(|| { - format!( - "could not send status message to worker {:?}", - new_worker.id, - ) - })?; - - let mut listeners = None; - { - let old_worker: &mut Worker = self - .workers - .iter_mut() - .find(|worker| worker.id == worker_id) - .unwrap(); - - /* - old_worker.channel.set_blocking(true); - old_worker.channel.write_message(&ProxyRequest { id: String::from(message_id), request: RequestContent::ReturnListenSockets }); - info!("sent returnlistensockets message to worker"); - old_worker.channel.set_blocking(false); - */ - let (sockets_return_tx, mut sockets_return_rx) = futures::channel::mpsc::channel(3); - let id = format!("{client_id}-return-sockets"); - self.in_flight.insert(id.clone(), (sockets_return_tx, 1)); - old_worker - .send( - id.clone(), - RequestType::ReturnListenSockets(ReturnListenSockets {}).into(), - ) - .await; - - info!("sent ReturnListenSockets to old worker"); - - let cloned_command_tx = self.command_tx.clone(); - let cloned_req_id = client_id.clone(); - smol::spawn(async move { - while let Some((proxy_response, _)) = sockets_return_rx.next().await { - match proxy_response.status { - ResponseStatus::Ok => { - info!("returnsockets OK"); - break; - } - ResponseStatus::Processing => { - info!("returnsockets processing"); - } - ResponseStatus::Failure => { - return_error(cloned_command_tx, cloned_req_id, proxy_response.message) - .await; - break; - } - }; - } - }) - .detach(); - - let mut counter = 0usize; - - loop { - info!("waiting for listen sockets from the old worker"); - if let Err(e) = old_worker.scm_socket.set_blocking(true) { - error!("Could not set the old worker socket to blocking: {}", e); - }; - match old_worker.scm_socket.receive_listeners() { - Ok(l) => { - listeners = Some(l); - break; - } - Err(error) => { - error!( - "Could not receive listerners from scm socket with file descriptor {}:\n{:?}", - old_worker.scm_socket.fd, error - ); - counter += 1; - if counter == 50 { - break; - } - std::thread::sleep(Duration::from_millis(100)); - } - } - } - info!("got the listen sockets from the old worker"); - old_worker.run_state = RunState::Stopping; - - let (softstop_tx, mut softstop_rx) = futures::channel::mpsc::channel(10); - let softstop_id = format!("{client_id}-softstop"); - self.in_flight.insert(softstop_id.clone(), (softstop_tx, 1)); - old_worker - .send( - softstop_id.clone(), - RequestType::SoftStop(SoftStop {}).into(), - ) - .await; - - let mut command_tx = self.command_tx.clone(); - let cloned_client_id = client_id.clone(); - let worker_id = old_worker.id; - smol::spawn(async move { - while let Some((proxy_response, _)) = softstop_rx.next().await { - match proxy_response.status { - // should we send all this to the command server? 
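The hand-off visible in this removed block is the heart of the worker upgrade: the old worker is asked to `ReturnListenSockets`, the main process reads the listener file descriptors back over the SCM socket, and then forwards them to the new worker. A rough, hedged sketch of that hand-off alone, with `ScmSocket` signatures approximated from the calls visible here (`set_blocking`, `receive_listeners`, `send_listeners`, `close`):

```rust
use sozu_command_lib::scm_socket::{Listeners, ScmSocket};

// Sketch only: the ScmSocket method signatures are approximated from the
// calls in the surrounding hunk, not taken from the library's documentation.
fn hand_off_listeners(
    old_worker: &mut ScmSocket,
    new_worker: &mut ScmSocket,
) -> Result<(), Box<dyn std::error::Error>> {
    // block until the old worker has written its listen sockets
    old_worker.set_blocking(true)?;
    let listeners: Listeners = old_worker.receive_listeners()?;

    // forward the file descriptors to the new worker, then drop our copies
    new_worker.send_listeners(&listeners)?;
    listeners.close();
    Ok(())
}
```

The retry loop above exists because the old worker may still be draining requests when the main process starts waiting on the SCM socket.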
- ResponseStatus::Ok => { - info!("softstop OK"); // this doesn't display :-( - if let Err(e) = command_tx - .send(CommandMessage::WorkerClose { worker_id }) - .await - { - error!( - "could not send worker close message to {}: {:?}", - worker_id, e - ); - } - break; - } - ResponseStatus::Processing => { - info!("softstop processing"); - } - ResponseStatus::Failure => { - info!("softstop error: {:?}", proxy_response.message); - break; - } - }; - } - return_processing( - command_tx.clone(), - cloned_client_id, - "Processing softstop responses from the workers...", - ) - .await; - }) - .detach(); + for (request_index, message) in config_messages.into_iter().enumerate() { + let request = message.content; + if let Err(error) = server.state.dispatch(&request) { + client.return_processing(format!("Could not execute request on state: {:#}", error)); + continue; } - match listeners { - Some(l) => { - info!( - "sending listeners: to the new worker: {:?}", - new_worker.scm_socket.send_listeners(&l) - ); - l.close(); - } - None => error!("could not get the list of listeners from the previous worker"), - }; - - let stream = Async::new(unsafe { - let fd = sock.into_raw_fd(); - UnixStream::from_raw_fd(fd) - })?; - - let id = new_worker.id; - let command_tx = self.command_tx.clone(); - smol::spawn(async move { - super::worker_loop(id, stream, command_tx, worker_rx).await; - }) - .detach(); - - let activate_requests = self.state.generate_activate_requests(); - for (count, request) in activate_requests.into_iter().enumerate() { - new_worker - .send(format!("{client_id}-ACTIVATE-{count}"), request) - .await; + if let &Some(RequestType::AddCertificate(_)) = &request.request_type { + debug!("config generated AddCertificate( ... )"); + } else { + debug!("config generated {:?}", request); } - info!("sent config messages to the new worker"); - self.workers.push(new_worker); - - info!("finished upgrade"); - Ok(Some(Success::UpgradeWorker(id))) + server.scatter_on(request, task_id, request_index, None); } +} - pub async fn reload_configuration( - &mut self, - client_id: String, - config_path: String, - ) -> anyhow::Result> { - // check that this works - let path = match config_path.is_empty() { - true => &self.config.config_path, - false => &config_path, - }; - // config_path.as_deref().unwrap_or(&self.config.config_path); - let new_config = Config::load_from_path(path) - .with_context(|| format!("cannot load configuration from '{path}'"))?; - - let mut diff_counter = 0usize; - - let (load_state_tx, mut load_state_rx) = futures::channel::mpsc::channel(10000); - - return_processing( - self.command_tx.clone(), - client_id.clone(), - "Reloading configuration, sending config messages to workers...", - ) - .await; - - for request in new_config.generate_config_messages()? 
{ - if self.state.dispatch(&request.content).is_ok() { - diff_counter += 1; - - let mut found = false; - let id = format!("LOAD-STATE-{}-{}", &request.id, diff_counter); - - for worker in self.workers.iter_mut().filter(|worker| worker.is_active()) { - let worker_message_id = format!("{}-{}", id, worker.id); - worker - .send(worker_message_id.clone(), request.content.clone()) - .await; - self.in_flight - .insert(worker_message_id, (load_state_tx.clone(), 1)); +impl GatheringTask for LoadStaticConfigTask { + fn client_token(&self) -> Option { + self.client_token + } - found = true; - } + fn get_gatherer(&mut self) -> &mut dyn Gatherer { + &mut self.gatherer + } - if !found { - bail!("no worker found"); + fn on_finish( + self: Box, + server: &mut Server, + client: &mut OptionalClient, + _timed_out: bool, + ) { + let mut messages = vec![]; + for (worker_id, response) in self.gatherer.responses { + match response.status { + ResponseStatus::Ok => {} + ResponseStatus::Failure => { + messages.push(format!("worker {worker_id}: {}", response.message)) } + ResponseStatus::Processing => {} } } - // clone everything we will need in the detached thread - let command_tx = self.command_tx.clone(); - let cloned_identifier = client_id.clone(); - - if diff_counter > 0 { - info!( - "state loaded from {}, will start sending {} messages to workers", - new_config.config_path, diff_counter - ); - smol::spawn(async move { - let mut ok = 0usize; - let mut error = 0usize; - while let Some((proxy_response, _)) = load_state_rx.next().await { - match proxy_response.status { - ResponseStatus::Ok => { - ok += 1; - } - ResponseStatus::Processing => {} - ResponseStatus::Failure => { - error!("{}", proxy_response.message); - error += 1; - } - }; - debug!("ok:{}, error: {}", ok, error); - } - - if error == 0 { - return_success( - command_tx, - cloned_identifier, - Success::ReloadConfiguration(ok, error), - ) - .await; - } else { - return_error( - command_tx, - cloned_identifier, - format!( - "Reloading configuration failed. 
ok: {ok} messages, error: {error}" - ), - ) - .await; - } - }) - .detach(); + if self.gatherer.errors > 0 { + client.finish_failure(format!( + "\nloading static configuration failed: {} OK, {} errors:\n- {}", + self.gatherer.ok, + self.gatherer.errors, + messages.join("\n- ") + )); } else { - info!("no messages sent to workers: local state already had those messages"); + client.finish_ok(format!( + "Successfully loaded the config: {} ok, {} errors", + self.gatherer.ok, self.gatherer.errors, + )); } - self.backends_count = self.state.count_backends(); - self.frontends_count = self.state.count_frontends(); - gauge!("configuration.clusters", self.state.clusters.len()); - gauge!("configuration.backends", self.backends_count); - gauge!("configuration.frontends", self.frontends_count); - - self.config = new_config; - - Ok(None) + server.update_counts(); } +} - pub async fn status(&mut self, client_id: String) -> anyhow::Result> { - info!("Requesting the status of all workers."); - - let (status_tx, mut status_rx) = futures::channel::mpsc::channel(self.workers.len() * 2); - - // create a status list with the available info of the main process - let mut worker_info_map: BTreeMap = BTreeMap::new(); - - let prefix = format!("{client_id}-status-"); - - return_processing( - self.command_tx.clone(), - client_id.clone(), - "Sending status requests to workers...", - ) - .await; - - let mut count = 0usize; - for worker in self.workers.iter_mut() { - info!("Worker {} is {}", worker.id, worker.run_state); - - // create request ids even if we don't send any request, as keys in the tree map - let worker_request_id = format!("{}{}", prefix, worker.id); - // send a status request to supposedly running workers to update the list afterwards - if worker.run_state == RunState::Running { - info!("Summoning status of worker {}", worker.id); - worker - .send( - worker_request_id.clone(), - RequestType::Status(Status {}).into(), - ) - .await; - count += 1; - self.in_flight - .insert(worker_request_id.clone(), (status_tx.clone(), 1)); - } - worker_info_map.insert(worker_request_id, worker.querying_info()); - } +// ========================================================= +// Worker request - let command_tx = self.command_tx.clone(); - let thread_client_id = client_id.clone(); - let worker_timeout = self.config.worker_timeout; - let now = Instant::now(); - - smol::spawn(async move { - let mut i = 0; - - while let Some((proxy_response, _)) = status_rx.next().await { - info!( - "received response with id {}: {:?}", - proxy_response.id, proxy_response - ); - let new_run_state = match proxy_response.status { - ResponseStatus::Ok => RunState::Running, - ResponseStatus::Processing => continue, - ResponseStatus::Failure => RunState::NotAnswering, - }; - worker_info_map - .entry(proxy_response.id) - .and_modify(|worker_info| worker_info.run_state = new_run_state as i32); - - i += 1; - if i == count || now.elapsed() > Duration::from_secs(worker_timeout as u64) { - break; - } - } +#[derive(Debug)] +struct WorkerTask { + pub client_token: Token, + pub gatherer: DefaultGatherer, +} - let worker_info_vec = WorkerInfos { - vec: worker_info_map - .values() - .map(|worker_info| worker_info.to_owned()) - .collect(), - }; +pub fn worker_request( + server: &mut Server, + client: &mut ClientSession, + request_content: RequestType, +) { + let request = request_content.into(); - return_success( - command_tx, - thread_client_id, - Success::Status(ContentType::Workers(worker_info_vec).into()), - ) - .await; - }) - .detach(); - Ok(None) + if let 
Err(error) = server.state.dispatch(&request) { + client.finish_failure(format!( + "could not dispatch request on the main process state: {error}", + )); + return; } + client.return_processing("Processing worker request..."); + + server.scatter( + request, + Box::new(WorkerTask { + client_token: client.token, + gatherer: DefaultGatherer::default(), + }), + Timeout::Default, + None, + ) +} - // This handles the CLI's "metrics enable", "metrics disable", "metrics clear" - // To get the proxy's metrics, the cli command is "metrics get", handled by the query() function - pub async fn configure_metrics( - &mut self, - client_id: String, - config: MetricsConfiguration, - ) -> anyhow::Result> { - let (metrics_tx, mut metrics_rx) = futures::channel::mpsc::channel(self.workers.len() * 2); - let mut count = 0usize; - for worker in self - .workers - .iter_mut() - .filter(|worker| worker.run_state != RunState::Stopped) - { - let req_id = format!("{}-metrics-{}", client_id, worker.id); - worker - .send( - req_id.clone(), - RequestType::ConfigureMetrics(config as i32).into(), - ) - .await; - count += 1; - self.in_flight.insert(req_id, (metrics_tx.clone(), 1)); - } - - let prefix = format!("{client_id}-metrics-"); - - let command_tx = self.command_tx.clone(); - let thread_client_id = client_id.clone(); - smol::spawn(async move { - let mut responses = Vec::new(); - let mut i = 0; - while let Some((proxy_response, _)) = metrics_rx.next().await { - match proxy_response.status { - ResponseStatus::Ok => { - let tag = proxy_response.id.trim_start_matches(&prefix).to_string(); - responses.push((tag, proxy_response)); - } - ResponseStatus::Processing => { - //info!("metrics processing"); - continue; - } - ResponseStatus::Failure => { - let tag = proxy_response.id.trim_start_matches(&prefix).to_string(); - responses.push((tag, proxy_response)); - } - }; +impl GatheringTask for WorkerTask { + fn client_token(&self) -> Option { + Some(self.client_token) + } - i += 1; - if i == count { - break; - } - } + fn get_gatherer(&mut self) -> &mut dyn Gatherer { + &mut self.gatherer + } - let mut messages = vec![]; - let mut has_error = false; - for response in responses.iter() { - match response.1.status { - ResponseStatus::Failure => { - messages.push(format!("{}: {}", response.0, response.1.message)); - has_error = true; - } - _ => messages.push(format!("{}: OK", response.0)), + fn on_finish( + self: Box, + _server: &mut Server, + client: &mut OptionalClient, + timed_out: bool, + ) { + let mut messages = vec![]; + + for (worker_id, response) in self.gatherer.responses { + match response.status { + ResponseStatus::Ok => messages.push(format!("{worker_id}: OK")), + ResponseStatus::Failure => { + messages.push(format!("{worker_id}: {}", response.message)) } + ResponseStatus::Processing => {} } + } - if has_error { - return_error(command_tx, thread_client_id, messages.join(", ")).await; - } else { - return_success(command_tx, thread_client_id, Success::Metrics(config)).await; - } - }) - .detach(); - Ok(None) + if self.gatherer.errors > 0 || timed_out { + client.finish_failure(messages.join(", ")); + } else { + client.finish_ok("Successfully applied request to all workers"); + } } +} - pub async fn query( - &mut self, - client_id: String, - request: Request, - ) -> anyhow::Result> { - debug!("Received this query: {:?}", request); - let (query_tx, mut query_rx) = futures::channel::mpsc::channel(self.workers.len() * 2); - let mut count = 0usize; - for worker in self - .workers - .iter_mut() - .filter(|worker| worker.run_state != 
RunState::Stopped) - { - let req_id = format!("{}-query-{}", client_id, worker.id); - worker.send(req_id.clone(), request.clone()).await; - count += 1; - self.in_flight.insert(req_id, (query_tx.clone(), 1)); - } +// ========================================================= +// Query Metrics - return_processing( - self.command_tx.clone(), - client_id.clone(), - "Query was sent to the workers...", - ) - .await; +#[derive(Debug)] +struct QueryMetricsTask { + pub client_token: Token, + pub gatherer: DefaultGatherer, + options: QueryMetricsOptions, +} - let main_response_content = match &request.request_type { - Some(RequestType::QueryClustersHashes(_)) => Some( - ContentType::ClusterHashes(ClusterHashes { - map: self.state.hash_state(), - }) - .into(), - ), - Some(RequestType::QueryClusterById(cluster_id)) => Some( - ContentType::Clusters(ClusterInformations { - vec: self.state.cluster_state(cluster_id).into_iter().collect(), - }) - .into(), - ), - Some(RequestType::QueryClustersByDomain(domain)) => { - let cluster_ids = self - .state - .get_cluster_ids_by_domain(domain.hostname.clone(), domain.path.clone()); - let vec = cluster_ids - .iter() - .filter_map(|cluster_id| self.state.cluster_state(cluster_id)) - .collect(); - Some(ContentType::Clusters(ClusterInformations { vec }).into()) - } - _ => None, - }; +fn query_metrics(server: &mut Server, client: &mut ClientSession, options: QueryMetricsOptions) { + client.return_processing("Querying metrics..."); + + server.scatter( + RequestType::QueryMetrics(options.clone()).into(), + Box::new(QueryMetricsTask { + client_token: client.token, + gatherer: DefaultGatherer::default(), + options, + }), + Timeout::Default, + None, + ); +} - // all these are passed to the thread - let command_tx = self.command_tx.clone(); - let cloned_identifier = client_id.clone(); +impl GatheringTask for QueryMetricsTask { + fn client_token(&self) -> Option<Token> { + Some(self.client_token) + } + + fn get_gatherer(&mut self) -> &mut dyn Gatherer { + &mut self.gatherer + } - // this may waste resources and time in case of queries others than Metrics + fn on_finish( + self: Box<Self>, + _server: &mut Server, + client: &mut OptionalClient, + _timed_out: bool, + ) { let main_metrics = METRICS.with(|metrics| (*metrics.borrow_mut()).dump_local_proxy_metrics()); - smol::spawn(async move { - let mut responses = Vec::new(); - let mut i = 0; - while let Some((proxy_response, worker_id)) = query_rx.next().await { - match proxy_response.status { - ResponseStatus::Ok => { - responses.push((worker_id, proxy_response)); - } - ResponseStatus::Processing => { - info!("metrics processing"); - continue; - } - ResponseStatus::Failure => { - responses.push((worker_id, proxy_response)); - } - }; - - i += 1; - if i == count { - break; + if self.options.list { + let mut summed_proxy_metrics = Vec::new(); + let mut summed_cluster_metrics = Vec::new(); + for (_, response) in self.gatherer.responses { + if let Some(ResponseContent { + content_type: + Some(ContentType::AvailableMetrics(AvailableMetrics { + proxy_metrics: listed_proxy_metrics, + cluster_metrics: listed_cluster_metrics, + })), + }) = response.content + { + summed_proxy_metrics.append(&mut listed_proxy_metrics.clone()); + summed_cluster_metrics.append(&mut listed_cluster_metrics.clone()); } } - - debug!("Received these worker responses: {:?}", responses); - - let mut worker_responses: BTreeMap<String, ResponseContent> = responses - .into_iter() - .filter_map(|(worker_id, proxy_response)| { - proxy_response - .content - .map(|response_content| (worker_id.to_string(),
response_content)) + return client.finish_ok_with_content( + ContentType::AvailableMetrics(AvailableMetrics { + proxy_metrics: summed_proxy_metrics, + cluster_metrics: summed_cluster_metrics, }) - .collect(); - - let response_content = match &request.request_type { - &Some(RequestType::QueryClustersHashes(_)) - | &Some(RequestType::QueryClusterById(_)) - | &Some(RequestType::QueryClustersByDomain(_)) => { - if let Some(main_response) = main_response_content { - worker_responses.insert(String::from("main"), main_response); - } - ContentType::WorkerResponses(WorkerResponses { - map: worker_responses, - }) - .into() - } - &Some(RequestType::QueryCertificatesFromWorkers(_)) => { - info!( - "Received a response to the certificates query: {:?}", - worker_responses - ); - ContentType::WorkerResponses(WorkerResponses { - map: worker_responses, - }) - .into() - } - Some(RequestType::QueryMetrics(options)) => { - if options.list { - let mut summed_proxy_metrics = Vec::new(); - let mut summed_cluster_metrics = Vec::new(); - for (_, response) in worker_responses { - if let Some(ContentType::AvailableMetrics(AvailableMetrics { - proxy_metrics, - cluster_metrics, - })) = response.content_type - { - summed_proxy_metrics.append(&mut proxy_metrics.clone()); - summed_cluster_metrics.append(&mut cluster_metrics.clone()); - } - } - ContentType::AvailableMetrics(AvailableMetrics { - proxy_metrics: summed_proxy_metrics, - cluster_metrics: summed_cluster_metrics, - }) - .into() - } else { - let workers_metrics = worker_responses - .into_iter() - .filter_map(|(worker_id, worker_response)| match worker_response { - ResponseContent { - content_type: Some(ContentType::WorkerMetrics(worker_metrics)), - } => Some((worker_id, worker_metrics)), - _ => None, - }) - .collect(); - ContentType::Metrics(AggregatedMetrics { - main: main_metrics, - workers: workers_metrics, - }) - .into() - } - } - _ => return, // very very unlikely - }; + .into(), + "Successfully listed available metrics", + ); + } - return_success( - command_tx, - cloned_identifier, - Success::Query(response_content), + let workers_metrics = self + .gatherer + .responses + .into_iter() + .filter_map( + |(worker_id, worker_response)| match worker_response.content { + Some(ResponseContent { + content_type: Some(ContentType::WorkerMetrics(worker_metrics)), + }) => Some((worker_id.to_string(), worker_metrics)), + _ => None, + }, ) - .await; - }) - .detach(); + .collect(); - Ok(None) + client.finish_ok_with_content( + ContentType::Metrics(AggregatedMetrics { + main: main_metrics, + workers: workers_metrics, + }) + .into(), + "Successfully aggregated all metrics", + ); } +} - pub async fn set_logging_level( - &mut self, - logging_filter: String, - client_id: String, - ) -> anyhow::Result> { - debug!("Changing main process log level to {}", logging_filter); - logging::LOGGER.with(|l| { - let directives = logging::parse_logging_spec(&logging_filter); - l.borrow_mut().set_directives(directives); - }); - // also change / set the content of RUST_LOG so future workers / main thread - // will have the new logging filter value - ::std::env::set_var("RUST_LOG", &logging_filter); - debug!("Logging level now: {}", ::std::env::var("RUST_LOG")?); - - // notify the workers too - let _worker_success = self - .worker_requests( - client_id, - RequestType::Logging(logging_filter.clone()).into(), - ) - .await?; - Ok(Some(Success::Logging(logging_filter))) - } +// ========================================================= +// Load state - pub async fn worker_requests( - &mut self, - 
client_id: String, - request: Request, - ) -> anyhow::Result> { - if let &Some(RequestType::AddCertificate(_)) = &request.request_type { - debug!("workerconfig client request AddCertificate()"); - } else { - debug!("workerconfig client request {:?}", request); - } +#[derive(Debug)] +struct LoadStateTask { + /// this task may be called by the main process, without a client + pub client_token: Option, + pub gatherer: DefaultGatherer, + path: String, +} - self.state - .dispatch(&request) - .with_context(|| "Could not execute request on the state")?; - - if self.config.automatic_state_save & !request.is_a_stop() { - if let Some(path) = self.config.saved_state.clone() { - return_processing( - self.command_tx.clone(), - client_id.clone(), - "Saving state to file", - ) - .await; - - let mut file = File::create(&path) - .with_context(|| "Could not create file to automatically save the state")?; - - self.state - .write_requests_to_file(&mut file) - .with_context(|| format!("could not save state automatically to {path}"))?; - } +pub fn load_state(server: &mut Server, mut client: OptionalClient, path: &str) { + info!("loading state at path {}", path); + + let mut file = match File::open(path) { + Ok(file) => file, + Err(err) if matches!(err.kind(), ErrorKind::NotFound) => { + client.finish_failure(format!("Cannot find file at path {path}")); + return; } + Err(error) => { + client.finish_failure(format!("Cannot open file at path {path}: {error}")); + return; + } + }; - return_processing( - self.command_tx.clone(), - client_id.clone(), - "Sending the request to all workers".to_owned(), - ) - .await; - - let (worker_request_tx, mut worker_request_rx) = - futures::channel::mpsc::channel(self.workers.len() * 2); - let mut found = false; - let mut stopping_workers = HashSet::new(); - let mut worker_count = 0usize; - for worker in self.workers.iter_mut().filter(|worker| worker.is_active()) { - if request.is_a_stop() { - worker.run_state = RunState::Stopping; - stopping_workers.insert(worker.id); - } + client.return_processing(format!("Parsing state file from {path}...")); + + let task_id = server.new_task( + Box::new(LoadStateTask { + client_token: client.as_ref().map(|c| c.token), + gatherer: DefaultGatherer::default(), + path: path.to_owned(), + }), + Timeout::None, + ); - let req_id = format!("{}-worker-{}", client_id, worker.id); - worker.send(req_id.clone(), request.clone()).await; - self.in_flight - .insert(req_id, (worker_request_tx.clone(), 1)); + let mut buffer = Buffer::with_capacity(200000); + let mut scatter_request_counter = 0usize; - found = true; - worker_count += 1; + let status = loop { + let previous = buffer.available_data(); + + match file.read(buffer.space()) { + Ok(bytes_read) => buffer.fill(bytes_read), + Err(error) => break Err(format!("Error reading the saved state file: {error}")), + }; + + if buffer.available_data() == 0 { + trace!("load_state: empty buffer"); + break Ok(()); } - let should_stop_main = request.is_a_stop(); - - let mut command_tx = self.command_tx.clone(); - let thread_client_id = client_id.clone(); - - smol::spawn(async move { - let mut responses = Vec::new(); - let mut response_count = 0usize; - while let Some((proxy_response, worker_id)) = worker_request_rx.next().await { - match proxy_response.status { - ResponseStatus::Ok => { - responses.push((worker_id, proxy_response)); - - if stopping_workers.contains(&worker_id) { - if let Err(e) = command_tx - .send(CommandMessage::WorkerClose { worker_id }) - .await - { - error!( - "could not send worker close message 
to {}: {:?}", - worker_id, e - ); - } - } - } - ResponseStatus::Processing => { - info!("request is processing"); - continue; + let mut offset = 0usize; + match parse_several_requests::(buffer.data()) { + Ok((i, requests)) => { + if !i.is_empty() { + debug!("load_state: could not parse {} bytes", i.len()); + if previous == buffer.available_data() { + break Err("Error consuming load state message".into()); } - ResponseStatus::Failure => { - responses.push((worker_id, proxy_response)); - } - }; - - response_count += 1; - if response_count == worker_count { - break; } - } + offset = buffer.data().offset(i); - // send the request to kill the main process only after all workers responded - if should_stop_main { - if let Err(e) = command_tx.send(CommandMessage::MasterStop).await { - error!("could not send main stop message: {:?}", e); + for request in requests { + if server.state.dispatch(&request.content).is_ok() { + scatter_request_counter += 1; + server.scatter_on(request.content, task_id, scatter_request_counter, None); + } } } - - let mut messages = vec![]; - let mut has_error = false; - for response in responses.iter() { - match response.1.status { - ResponseStatus::Failure => { - messages.push(format!("{}: {}", response.0, response.1.message)); - has_error = true; - } - _ => messages.push(format!("{}: OK", response.0)), + Err(nom::Err::Incomplete(_)) => { + if buffer.available_data() == buffer.capacity() { + break Err(format!( + "message too big, stopping parsing:\n{}", + buffer.data().to_hex(16) + )); } } - - if has_error { - return_error(command_tx, thread_client_id, messages.join(", ")).await; - } else { - return_success(command_tx, thread_client_id, Success::WorkerRequest).await; + Err(parse_error) => { + break Err(format!("saved state parse error: {:?}", parse_error)); } - }) - .detach(); + } + buffer.consume(offset); + }; - if !found { - bail!("no worker found"); + match status { + Ok(()) => { + client.return_processing("Applying state file..."); + } + Err(message) => { + client.finish_failure(message); + server.cancel_task(task_id); } + } +} - match request.request_type { - Some(RequestType::AddBackend(_)) | Some(RequestType::RemoveBackend(_)) => { - self.backends_count = self.state.count_backends() - } - Some(RequestType::AddHttpFrontend(_)) - | Some(RequestType::AddHttpsFrontend(_)) - | Some(RequestType::AddTcpFrontend(_)) - | Some(RequestType::RemoveHttpFrontend(_)) - | Some(RequestType::RemoveHttpsFrontend(_)) - | Some(RequestType::RemoveTcpFrontend(_)) => { - self.frontends_count = self.state.count_frontends() - } - _ => {} - }; +impl GatheringTask for LoadStateTask { + fn client_token(&self) -> Option { + self.client_token + } - gauge!("configuration.clusters", self.state.clusters.len()); - gauge!("configuration.backends", self.backends_count); - gauge!("configuration.frontends", self.frontends_count); + fn get_gatherer(&mut self) -> &mut dyn Gatherer { + &mut self.gatherer + } - Ok(None) + fn on_finish( + self: Box, + _server: &mut Server, + client: &mut OptionalClient, + _timed_out: bool, + ) { + let DefaultGatherer { ok, errors, .. 
} = self.gatherer; + if errors == 0 { + client.finish_ok(format!( + "Successfully loaded state from path {}, {} ok messages, {} errors", + self.path, ok, errors + )); + return; + } + client.finish_failure(format!("loading state: {ok} ok messages, {errors} errors")); } +} - pub async fn notify_advancement_to_client( - &mut self, - client_id: String, - response: Advancement, - ) -> anyhow::Result { - let command_response = match response { - Advancement::Ok(success) => { - let success_message = success.to_string(); - - let command_response_data = match success { - Success::ListFrontends(crd) - | Success::RequestCounts(crd) - | Success::ListWorkers(crd) - | Success::CertificatesFromTheState(crd) - | Success::Query(crd) - | Success::ListListeners(crd) - | Success::Status(crd) => Some(crd), - _ => None, - }; +// ========================================================== +// status - Response::new(ResponseStatus::Ok, success_message, command_response_data) - } - Advancement::Processing(processing_message) => { - Response::new(ResponseStatus::Processing, processing_message, None) - } - Advancement::Error(error_message) => { - Response::new(ResponseStatus::Failure, error_message, None) - } - }; +#[derive(Debug)] +struct StatusTask { + pub client_token: Token, + pub gatherer: DefaultGatherer, + worker_infos: HashMap, +} - trace!( - "Sending response to request sent by client {}: {:?}", - client_id, - command_response - ); +fn status(server: &mut Server, client: &mut ClientSession) { + client.return_processing("Querying status of workers..."); + + let worker_infos = server + .workers + .values() + .map(|worker| (worker.id, worker.querying_info())) + .collect(); + + server.scatter( + RequestType::Status(Status {}).into(), + Box::new(StatusTask { + client_token: client.token, + gatherer: DefaultGatherer::default(), + worker_infos, + }), + Timeout::Default, + None, + ); +} - match self.clients.get_mut(&client_id) { - Some(client_tx) => { - trace!("sending from main process to client loop"); - client_tx.send(command_response).await.with_context(|| { - format!("Could not notify client {client_id} about request") - })?; - } - None => bail!(format!("Could not find client {client_id}")), +impl GatheringTask for StatusTask { + fn client_token(&self) -> Option { + Some(self.client_token) + } + + fn get_gatherer(&mut self) -> &mut dyn Gatherer { + &mut self.gatherer + } + + fn on_finish( + mut self: Box, + _server: &mut Server, + client: &mut OptionalClient, + _timed_out: bool, + ) { + for (worker_id, response) in self.gatherer.responses { + let new_run_state = match response.status { + ResponseStatus::Ok => RunState::Running, + ResponseStatus::Processing => continue, + ResponseStatus::Failure => RunState::NotAnswering, + }; + + self.worker_infos + .entry(worker_id) + .and_modify(|worker_info| worker_info.run_state = new_run_state as i32); } - Ok(Success::NotifiedClient(client_id)) + let worker_info_vec = WorkerInfos { + vec: self.worker_infos.into_values().collect(), + }; + + client.finish_ok_with_content( + ContentType::Workers(worker_info_vec).into(), + "Successfully collected the status of workers", + ); } } -// Those return functions are meant to be called in detached threads -// to notify the command server of an request's advancement. 
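`LoadStateTask`, `StatusTask` and the other tasks in this file all follow the same scatter/gather shape: scatter a request to every worker, let a `Gatherer` count OK and failure responses as they arrive, and build the final client answer in `on_finish`. A self-contained toy model of that flow (simplified types, not the real trait definitions from `server.rs` below):

```rust
// Toy model of the scatter/gather flow; the real traits also receive a
// Server, an OptionalClient and full WorkerResponse values.
#[derive(Debug, Default)]
struct DefaultGatherer {
    ok: usize,
    errors: usize,
    expected_responses: usize,
}

impl DefaultGatherer {
    fn on_message(&mut self, ok: bool) {
        if ok { self.ok += 1 } else { self.errors += 1 }
    }
    fn has_finished(&self) -> bool {
        self.ok + self.errors >= self.expected_responses
    }
}

trait GatheringTask {
    fn gatherer(&mut self) -> &mut DefaultGatherer;
    fn on_finish(self: Box<Self>);
}

struct StatusTask {
    gatherer: DefaultGatherer,
}

impl GatheringTask for StatusTask {
    fn gatherer(&mut self) -> &mut DefaultGatherer {
        &mut self.gatherer
    }
    fn on_finish(self: Box<Self>) {
        println!("status done: {} ok, {} errors", self.gatherer.ok, self.gatherer.errors);
    }
}

fn main() {
    // scatter: pretend a Status request went out to three workers
    let mut task: Box<dyn GatheringTask> = Box::new(StatusTask {
        gatherer: DefaultGatherer { expected_responses: 3, ..Default::default() },
    });
    // gather: worker responses trickle in from the event loop
    for ok in [true, true, false] {
        task.gatherer().on_message(ok);
    }
    if task.gatherer().has_finished() {
        task.on_finish(); // builds the final answer for the client
    }
}
```

The real `DefaultGatherer` additionally stores each `WorkerResponse`, which is how `QueryClustersTask` and `QueryMetricsTask` above aggregate response contents in `on_finish`.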
-async fn return_error<T>( - mut command_tx: Sender<CommandMessage>, - client_id: String, - error_message: T, -) where - T: ToString, -{ - let advancement = CommandMessage::Advancement { - client_id, - advancement: Advancement::Error(error_message.to_string()), - }; +// ========================================================== +// Soft stop and hard stop - trace!("return_error: sending event to the command server"); - if let Err(e) = command_tx.send(advancement).await { - error!("Error while return error to the command server: {}", e) +#[derive(Debug)] +struct StopTask { + pub client_token: Token, + pub gatherer: DefaultGatherer, + pub hardness: bool, +} + +/// stop the main process and workers, true for hard stop +fn stop(server: &mut Server, client: &mut ClientSession, hardness: bool) { + let task = Box::new(StopTask { + client_token: client.token, + gatherer: DefaultGatherer::default(), + hardness, + }); + + server.run_state = ServerState::WorkersStopping; + if hardness { + client.return_processing("Performing hard stop..."); + server.scatter( + RequestType::HardStop(HardStop {}).into(), + task, + Timeout::Default, + None, + ); + } else { + client.return_processing("Performing soft stop..."); + server.scatter( + RequestType::SoftStop(SoftStop {}).into(), + task, + Timeout::None, + None, + ); } } -async fn return_processing<T>( - mut command_tx: Sender<CommandMessage>, - client_id: String, - processing_message: T, -) where - T: ToString, -{ - let advancement = CommandMessage::Advancement { - client_id, - advancement: Advancement::Processing(processing_message.to_string()), - }; +impl GatheringTask for StopTask { + fn client_token(&self) -> Option<Token> { + Some(self.client_token) + } - trace!("return_processing: sending event to the command server"); - if let Err(e) = command_tx.send(advancement).await { - error!( - "Error while returning processing to the command server: {}", - e - ) + fn get_gatherer(&mut self) -> &mut dyn Gatherer { + &mut self.gatherer } -} -async fn return_success( - mut command_tx: Sender<CommandMessage>, - client_id: String, - success: Success, -) { - let advancement = CommandMessage::Advancement { - client_id, - advancement: Advancement::Ok(success), - }; - trace!( - "return_success: sending event to the command server: {:?}", - advancement - ); - if let Err(e) = command_tx.send(advancement).await { - error!("Error while returning success to the command server: {}", e) + fn on_finish( + self: Box<Self>, + server: &mut Server, + client: &mut OptionalClient, + timed_out: bool, + ) { + server.run_state = ServerState::Stopping; + if timed_out && self.hardness { + client.finish_failure(format!( + "Workers took too long to stop ({} ok, {} errors), stopping the main process to sever the link", + self.gatherer.ok, self.gatherer.errors + )); + return; + } + client.finish_ok(format!( + "Successfully closed {} workers, {} errors, stopping the main process...", + self.gatherer.ok, self.gatherer.errors + )); } } diff --git a/bin/src/command/server.rs b/bin/src/command/server.rs new file mode 100644 index 000000000..77e598543 --- /dev/null +++ b/bin/src/command/server.rs @@ -0,0 +1,860 @@ +use std::{ + collections::{HashMap, HashSet}, + fmt::Debug, + io::Error as IoError, + ops::{Deref, DerefMut}, + os::fd::{AsRawFd, FromRawFd}, + time::{Duration, Instant}, +}; + +use libc::pid_t; +use mio::{ + net::{UnixListener, UnixStream}, + Events, Interest, Poll, Token, +}; +use nix::{ + sys::signal::{kill, Signal}, + unistd::Pid, +}; + +use sozu_command_lib::{ + channel::Channel, + config::Config, + proto::command::{ + request::RequestType,
response_content::ContentType, Request, ResponseContent, + ResponseStatus, RunState, Status, + }, + ready::Ready, + request::WorkerRequest, + response::WorkerResponse, + scm_socket::{Listeners, ScmSocket, ScmSocketError}, + state::ConfigState, +}; + +use crate::{ + command::{ + sessions::{ + wants_to_tick, ClientResult, ClientSession, OptionalClient, WorkerResult, WorkerSession, + }, + upgrade::UpgradeData, + }, + util::{disable_close_on_exec, enable_close_on_exec, get_executable_path, UtilError}, + worker::{fork_main_into_worker, WorkerError}, +}; + +use super::upgrade::SerializedWorkerSession; + +pub type ClientId = u32; +pub type SessionId = usize; +pub type TaskId = usize; +pub type WorkerId = u32; +pub type RequestId = String; + +/// Gathers messages and notifies when there are no more left to read. +#[allow(unused)] +pub trait Gatherer { + /// increment how many responses we expect + fn inc_expected_responses(&mut self, count: usize); + + /// Return true if enough responses have been gathered + fn has_finished(&self) -> bool; + + /// Aggregate a response + fn on_message( + &mut self, + server: &mut Server, + client: &mut OptionalClient, + worker_id: WorkerId, + message: WorkerResponse, + ); +} + +/// Must be satisfied by commands that need to wait for worker responses +#[allow(unused)] +pub trait GatheringTask: Debug { + /// get access to the client that sent the command (if any) + fn client_token(&self) -> Option<Token>; + + /// get access to the gatherer for this task (each task can implement its own gathering strategy) + fn get_gatherer(&mut self) -> &mut dyn Gatherer; + + /// This is called once every worker has answered. + /// It allows operating both on the server (launching workers...) and the client (sending an answer...) + fn on_finish( + self: Box<Self>, + server: &mut Server, + client: &mut OptionalClient, + timed_out: bool, + ); +} + +/// Implemented by all objects that can behave like a client (for instance: notify of processing request) +pub trait MessageClient { + /// return an OK to the client + fn finish_ok<T: Into<String>>(&mut self, message: T); + + /// return response content to the client + fn finish_ok_with_content<T: Into<String>>(&mut self, content: ResponseContent, message: T); + + /// return failure to the client + fn finish_failure<T: Into<String>>(&mut self, message: T); + + /// notify the client about an ongoing task + fn return_processing<T: Into<String>>(&mut self, message: T); + + /// transmit response content to the client, even though a task is not finished + fn return_processing_with_content<S: Into<String>>( + &mut self, + message: S, + content: ResponseContent, + ); +} + +/// A timeout for the tasks of the main process server +pub enum Timeout { + None, + Default, + #[allow(unused)] + Custom(Duration), +} + +/// Contains a task and its execution timeout +#[derive(Debug)] +struct TaskContainer { + job: Box<dyn GatheringTask>, + timeout: Option<Instant>, +} + +/// Default strategy when gathering responses from workers +#[derive(Debug, Default)] +pub struct DefaultGatherer { + /// number of OK responses received from workers + pub ok: usize, + /// number of failures received from workers + pub errors: usize, + /// worker responses are accumulated here + pub responses: Vec<(WorkerId, WorkerResponse)>, + /// number of expected responses, excluding processing responses + pub expected_responses: usize, +} + +#[allow(unused)] +impl Gatherer for DefaultGatherer { + fn inc_expected_responses(&mut self, count: usize) { + self.expected_responses += count; + } + + fn has_finished(&self) -> bool { + self.ok + self.errors >= self.expected_responses + } + + fn on_message( + &mut self,
server: &mut Server, + client: &mut OptionalClient, + worker_id: WorkerId, + message: WorkerResponse, + ) { + match message.status { + ResponseStatus::Ok => self.ok += 1, + ResponseStatus::Failure => self.errors += 1, + ResponseStatus::Processing => client.return_processing(format!( + "Worker {} is processing {}. {}", + worker_id, message.id, message.message + )), + } + self.responses.push((worker_id, message)); + } +} + +#[derive(thiserror::Error, Debug)] +pub enum HubError { + #[error("could not create main server: {0}")] + CreateServer(ServerError), + #[error("could not get executable path")] + GetExecutablePath(UtilError), + #[error("could not create SCM socket for worker {0}: {1}")] + CreateScmSocket(u32, ScmSocketError), +} + +/// A platform to receive client connections, pass orders to workers, +/// gather data, etc. +#[derive(Debug)] +pub struct CommandHub { + /// contains workers and the event loop + pub server: Server, + /// keeps track of agents that contacted Sōzu on the UNIX socket + clients: HashMap, + /// register tasks, for parallel execution + tasks: HashMap, +} + +impl Deref for CommandHub { + type Target = Server; + + fn deref(&self) -> &Self::Target { + &self.server + } +} +impl DerefMut for CommandHub { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.server + } +} + +impl CommandHub { + pub fn new( + unix_listener: UnixListener, + config: Config, + executable_path: String, + ) -> Result { + Ok(Self { + server: Server::new(unix_listener, config, executable_path) + .map_err(HubError::CreateServer)?, + clients: HashMap::new(), + tasks: HashMap::new(), + }) + } + + fn register_client(&mut self, mut stream: UnixStream) { + let token = self.next_session_token(); + if let Err(err) = self.register(token, &mut stream) { + error!("Could not register client: {}", err); + } + let channel = Channel::new(stream, 4096, usize::MAX); + let id = self.next_client_id(); + let session = ClientSession::new(channel, id, token); + info!("register new client: {}", id); + debug!("new client: {:?}", session); + self.clients.insert(token, session); + } + + fn get_client_mut(&mut self, token: &Token) -> Option<(&mut Server, &mut ClientSession)> { + self.clients + .get_mut(token) + .map(|client| (&mut self.server, client)) + } + + /// recreate the command hub when upgrading the main process + pub fn from_upgrade_data(upgrade_data: UpgradeData) -> Result { + let UpgradeData { + command_socket_fd, + config, + workers, + state, + next_client_id, + next_session_id, + next_task_id, + next_worker_id, + } = upgrade_data; + + let executable_path = + unsafe { get_executable_path().map_err(HubError::GetExecutablePath)? 
}; + + let unix_listener = unsafe { UnixListener::from_raw_fd(command_socket_fd) }; + + let command_buffer_size = config.command_buffer_size; + let max_command_buffer_size = config.max_command_buffer_size; + + let mut server = + Server::new(unix_listener, config, executable_path).map_err(HubError::CreateServer)?; + + server.state = state; + server.update_counts(); + server.next_client_id = next_client_id; + server.next_session_id = next_session_id; + server.next_task_id = next_task_id; + server.next_worker_id = next_worker_id; + + for worker in workers + .iter() + .filter(|w| w.run_state != RunState::Stopped && w.run_state != RunState::Stopping) + { + let worker_stream = unsafe { UnixStream::from_raw_fd(worker.channel_fd) }; + let channel: Channel = + Channel::new(worker_stream, command_buffer_size, max_command_buffer_size); + + let scm_socket = ScmSocket::new(worker.scm_fd) + .map_err(|scm_err| HubError::CreateScmSocket(worker.id, scm_err))?; + + if let Err(err) = server.register_worker(worker.id, worker.pid, channel, scm_socket) { + error!("could not register worker: {}", err); + } + } + + Ok(CommandHub { + server, + clients: HashMap::new(), + tasks: HashMap::new(), + }) + } + + /// contains the main event loop + /// - accept clients + /// - receive requests from clients and responses from workers + /// - dispatch these message to the [Server] + /// - manage timeouts of tasks + pub fn run(&mut self) { + let mut events = Events::with_capacity(100); + debug!("running the command hub: {:?}", self); + + loop { + let run_state = self.run_state; + let now = Instant::now(); + + let mut tasks = std::mem::take(&mut self.tasks); + let mut queued_tasks = std::mem::take(&mut self.server.queued_tasks); + self.tasks = tasks + .drain() + .chain(queued_tasks.drain()) + .filter_map(|(task_id, mut task)| { + if task.job.get_gatherer().has_finished() { + self.handle_finishing_task(task_id, task, false); + return None; + } + if let Some(timeout) = task.timeout { + if timeout < now { + self.handle_finishing_task(task_id, task, true); + return None; + } + } + Some((task_id, task)) + }) + .collect(); + + let next_timeout = self.tasks.values().filter_map(|t| t.timeout).max(); + let mut poll_timeout = next_timeout.map(|t| t.saturating_duration_since(now)); + + if self.run_state == ServerState::Stopping { + // when closing, close all ClientSession which are not transfering data + self.clients + .retain(|_, s| s.channel.back_buf.available_data() > 0); + // when all ClientSession are closed, the CommandServer stops + if self.clients.is_empty() { + break; + } + } + + let sessions_to_tick = self + .clients + .iter() + .filter_map(|(t, s)| { + if wants_to_tick(&s.channel) { + Some((*t, Ready::EMPTY, None)) + } else { + None + } + }) + .chain(self.workers.iter().filter_map(|(token, session)| { + if session.run_state != RunState::Stopped && wants_to_tick(&session.channel) { + Some((*token, Ready::EMPTY, None)) + } else { + None + } + })) + .collect::>(); + + let workers_to_spawn = self.workers_to_spawn(); + + // if we have sessions to tick or workers to spawn, we don't want to block on poll + if !sessions_to_tick.is_empty() || workers_to_spawn > 0 { + poll_timeout = Some(Duration::default()); + } + + events.clear(); + trace!("Tasks: {:?}", self.tasks); + trace!("Sessions to tick: {:?}", sessions_to_tick); + trace!("Polling timeout: {:?}", poll_timeout); + match self.poll.poll(&mut events, poll_timeout) { + Ok(()) => {} + Err(error) => error!("Error while polling: {:?}", error), + } + + 
self.automatic_worker_spawn(workers_to_spawn); + + let events = sessions_to_tick.into_iter().chain( + events + .into_iter() + .map(|event| (event.token(), Ready::from(event), Some(event))), + ); + for (token, ready, event) in events { + match token { + Token(0) => { + if run_state == ServerState::Stopping { + // do not accept new clients when stopping + continue; + } + if ready.is_readable() { + while let Ok((stream, _addr)) = self.unix_listener.accept() { + self.register_client(stream); + } + } + } + token => { + trace!("{:?} got event: {:?}", token, event); + if let Some((server, client)) = self.get_client_mut(&token) { + client.update_readiness(ready); + match client.ready() { + ClientResult::NothingToDo => {} + ClientResult::NewRequest(request) => { + debug!("Received new request: {:?}", request); + server.handle_client_request(client, request); + } + ClientResult::CloseSession => { + info!("Closing client {:#?}", client); + self.event_subscribers.remove(&token); + self.clients.remove(&token); + } + } + } else if let Some(worker) = self.workers.get_mut(&token) { + if run_state == ServerState::Stopping { + // do not read responses from workers when stopping + continue; + } + worker.update_readiness(ready); + let worker_id = worker.id; + match worker.ready() { + WorkerResult::NothingToDo => {} + WorkerResult::NewResponses(responses) => { + for response in responses { + self.handle_worker_response(worker_id, response); + } + } + WorkerResult::CloseSession => self.handle_worker_close(&token), + } + } + } + } + } + } + } + + fn handle_worker_response(&mut self, worker_id: WorkerId, response: WorkerResponse) { + // transmit backend events to subscribing clients + if let Some(ResponseContent { + content_type: Some(ContentType::Event(event)), + }) = response.content + { + for client_token in &self.server.event_subscribers { + if let Some(client) = self.clients.get_mut(client_token) { + client.return_processing_with_content( + format!("{worker_id}"), + ContentType::Event(event.clone()).into(), + ); + } + } + return; + } + + let Some(task_id) = self.in_flight.get(&response.id).copied() else { + error!("Got a response for an unknown task: {}", response); + return; + }; + + let task = match self.tasks.get_mut(&task_id) { + Some(task) => task, + None => { + error!("Got a response for an unknown task"); + return; + } + }; + + let client = &mut task + .job + .client_token() + .and_then(|token| self.clients.get_mut(&token)); + task.job + .get_gatherer() + .on_message(&mut self.server, client, worker_id, response); + } + + fn handle_finishing_task(&mut self, task_id: TaskId, task: TaskContainer, timed_out: bool) { + if timed_out { + debug!("Task timeout: {:?}", task); + } else { + debug!("Task finish: {:?}", task); + } + let client = &mut task + .job + .client_token() + .and_then(|token| self.clients.get_mut(&token)); + task.job.on_finish(&mut self.server, client, false); + self.in_flight + .retain(|_, in_flight_task_id| *in_flight_task_id != task_id); + } +} + +#[derive(thiserror::Error, Debug)] +pub enum ServerError { + #[error("Could not create Poll with MIO: {0:?}")] + CreatePoll(IoError), + #[error("Could not register channel in MIO registry: {0:?}")] + RegisterChannel(IoError), + #[error("Could not fork the main into a new worker: {0}")] + ForkMain(WorkerError), + #[error("Did not find worker. 
This should NOT happen.")] + WorkerNotFound, + #[error("could not enable cloexec: {0}")] + EnableCloexec(UtilError), + #[error("could not disable cloexec: {0}")] + DisableCloexec(UtilError), +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum ServerState { + Running, + WorkersStopping, + Stopping, +} + +/// Manages workers +/// Functions as an executer for tasks that have two steps: +/// - scatter to workers +/// - gather worker responses +/// - trigger a finishing function when all responses are gathered +#[derive(Debug)] +pub struct Server { + pub config: Config, + /// Sōzu clients that subscribed to events + pub event_subscribers: HashSet, + /// path to the executable binary of Sōzu (for upgrading) + pub executable_path: String, + /// keep track of the tasks + in_flight: HashMap, + next_client_id: ClientId, + next_session_id: SessionId, + next_task_id: TaskId, + next_worker_id: WorkerId, + /// the MIO structure that registers sockets and polls them all + poll: Poll, + /// all tasks created in one tick, to be propagated to the Hub at each tick + queued_tasks: HashMap, + /// contains all business logic of Sōzu (frontends, backends, routing, etc.) + pub state: ConfigState, + /// used to shut down gracefully + pub run_state: ServerState, + /// the UNIX socket on which to receive clients + unix_listener: UnixListener, + /// the Sōzu processes running parallel to the main process. + /// The workers perform the whole business of proxying and must be + /// synchronized at all times. + pub workers: HashMap, +} + +impl Server { + fn new( + mut unix_listener: UnixListener, + config: Config, + executable_path: String, + ) -> Result { + let poll = mio::Poll::new().map_err(ServerError::CreatePoll)?; + poll.registry() + .register( + &mut unix_listener, + Token(0), + Interest::READABLE | Interest::WRITABLE, + ) + .map_err(ServerError::RegisterChannel)?; + + Ok(Self { + config, + event_subscribers: HashSet::new(), + executable_path, + in_flight: HashMap::new(), + next_client_id: 0, + next_session_id: 1, // 0 is reserved for the UnixListener + next_task_id: 0, + next_worker_id: 0, + poll, + queued_tasks: HashMap::new(), + state: ConfigState::new(), + run_state: ServerState::Running, + unix_listener, + workers: HashMap::new(), + }) + } + + /// - fork the main process into a new worker + /// - register the worker in mio + /// - send a Status request to the new worker + pub fn launch_new_worker( + &mut self, + listeners: Option, + ) -> Result<&mut WorkerSession, ServerError> { + let worker_id = self.next_worker_id(); + let (worker_pid, main_to_worker_channel, main_to_worker_scm) = fork_main_into_worker( + &worker_id.to_string(), + &self.config, + self.executable_path.clone(), + &self.state, + Some(listeners.unwrap_or_default()), + ) + .map_err(ServerError::ForkMain)?; + + let worker_session = self.register_worker( + worker_id, + worker_pid, + main_to_worker_channel, + main_to_worker_scm, + )?; + + // TODO: make sure the worker is registered as NotAnswering, + // and create a task that will pass it to Running when it respond OK to this request: + worker_session.send(&WorkerRequest { + id: format!("INITIAL-STATUS-{worker_id}"), + content: RequestType::Status(Status {}).into(), + }); + + Ok(worker_session) + } + + /// count backends and frontends in the cache, update gauge metrics + pub fn update_counts(&mut self) { + gauge!("configuration.clusters", self.state.clusters.len()); + gauge!("configuration.backends", self.state.count_backends()); + gauge!("configuration.frontends", 
+
+    fn next_session_token(&mut self) -> Token {
+        let token = Token(self.next_session_id);
+        self.next_session_id += 1;
+        token
+    }
+
+    fn next_client_id(&mut self) -> ClientId {
+        let id = self.next_client_id;
+        self.next_client_id += 1;
+        id
+    }
+
+    fn next_task_id(&mut self) -> TaskId {
+        let id = self.next_task_id;
+        self.next_task_id += 1;
+        id
+    }
+
+    fn next_worker_id(&mut self) -> WorkerId {
+        let id = self.next_worker_id;
+        self.next_worker_id += 1;
+        id
+    }
+
+    fn register(&mut self, token: Token, stream: &mut UnixStream) -> Result<(), ServerError> {
+        self.poll
+            .registry()
+            .register(stream, token, Interest::READABLE | Interest::WRITABLE)
+            .map_err(ServerError::RegisterChannel)
+    }
+
+    /// returns None if the worker is not alive
+    pub fn get_active_worker_by_id(&self, id: WorkerId) -> Option<&WorkerSession> {
+        self.workers
+            .values()
+            .find(|worker| worker.id == id && worker.is_active())
+    }
+
+    /// register a worker session in the server, return the mutable worker session
+    pub fn register_worker(
+        &mut self,
+        worker_id: WorkerId,
+        pid: pid_t,
+        mut channel: Channel<WorkerRequest, WorkerResponse>,
+        scm_socket: ScmSocket,
+    ) -> Result<&mut WorkerSession, ServerError> {
+        let token = self.next_session_token();
+        self.register(token, &mut channel.sock)?;
+        self.workers.insert(
+            token,
+            WorkerSession::new(channel, worker_id, pid, token, scm_socket),
+        );
+        self.workers
+            .get_mut(&token)
+            .ok_or(ServerError::WorkerNotFound)
+    }
+
+    /// Add a task in a queue to make it accessible until the next tick
+    pub fn new_task(&mut self, job: Box<dyn GatheringTask>, timeout: Timeout) -> TaskId {
+        let task_id = self.next_task_id();
+        let timeout = match timeout {
+            Timeout::None => None,
+            Timeout::Default => Some(Duration::from_secs(self.config.worker_timeout as u64)),
+            Timeout::Custom(duration) => Some(duration),
+        }
+        .map(|duration| Instant::now() + duration);
+        self.queued_tasks
+            .insert(task_id, TaskContainer { job, timeout });
+        task_id
+    }
+
+    pub fn scatter(
+        &mut self,
+        request: Request,
+        job: Box<dyn GatheringTask>,
+        timeout: Timeout,
+        target: Option<WorkerId>, // if None, scatter to all workers
+    ) {
+        let task_id = self.new_task(job, timeout);
+
+        self.scatter_on(request, task_id, 0, target);
+    }
+
+    pub fn scatter_on(
+        &mut self,
+        request: Request,
+        task_id: TaskId,
+        request_id: usize,
+        target: Option<WorkerId>,
+    ) {
+        let task = match self.queued_tasks.get_mut(&task_id) {
+            Some(task) => task,
+            None => {
+                error!("no task found with id {}", task_id);
+                return;
+            }
+        };
+
+        let mut worker_count = 0;
+        let mut worker_request = WorkerRequest {
+            id: String::new(),
+            content: request,
+        };
+
+        for worker in self.workers.values_mut().filter(|w| {
+            target
+                .map(|id| id == w.id && w.run_state != RunState::Stopped)
+                .unwrap_or(w.run_state != RunState::Stopped)
+        }) {
+            worker_count += 1;
+            worker_request.id = format!(
+                "{}-{}-{}-{}",
+                worker_request.content.short_name(),
+                worker.id,
+                task_id,
+                request_id,
+            );
+            debug!("scattering to worker {}: {:?}", worker.id, worker_request);
+            worker.send(&worker_request);
+            self.in_flight.insert(worker_request.id, task_id);
+        }
+        task.job.get_gatherer().inc_expected_responses(worker_count);
+    }
+
+    pub fn cancel_task(&mut self, task_id: TaskId) {
+        self.queued_tasks.remove(&task_id);
+    }
+
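+    // A note on the id scheme built in scatter_on above: each in-flight
+    // request id follows "{short_name}-{worker_id}-{task_id}-{request_id}"
+    // (for example, a hypothetical "SOFT_STOP-0-3-0"), which is what lets
+    // handle_worker_response route a WorkerResponse back to its
+    // TaskContainer through the in_flight map.
+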
+    /// Called when the main process can no longer communicate with a worker
+    /// (its channel closed).
+    /// Calls Self::close_worker, which makes sure the worker is killed to
+    /// prevent it from going rogue if it is not already dead
+    pub fn handle_worker_close(&mut self, token: &Token) {
+        match self.workers.get(token) {
+            Some(worker) => {
+                info!("closing session of worker {}", worker.id);
+                trace!("closing worker session {:?}", worker);
+            }
+            None => {
+                error!("No worker exists with token {:?}", token);
+                return;
+            }
+        };
+
+        self.close_worker(token);
+    }
+
+    /// returns how many workers should be started to reach config count
+    pub fn workers_to_spawn(&self) -> u16 {
+        if self.config.worker_automatic_restart && self.run_state == ServerState::Running {
+            self.config
+                .worker_count
+                .saturating_sub(self.alive_workers() as u16)
+        } else {
+            0
+        }
+    }
+
+    /// spawn brand new workers
+    pub fn automatic_worker_spawn(&mut self, count: u16) {
+        if count == 0 {
+            return;
+        }
+
+        info!("Automatically restarting {} workers", count);
+        for _ in 0..count {
+            if let Err(err) = self.launch_new_worker(None) {
+                error!("could not launch new worker: {}", err);
+            }
+        }
+    }
+
+    fn alive_workers(&self) -> usize {
+        self.workers
+            .values()
+            .filter(|worker| worker.is_active())
+            .count()
+    }
+
+    /// kill the worker process
+    pub fn close_worker(&mut self, token: &Token) {
+        let worker = match self.workers.get_mut(token) {
+            Some(w) => w,
+            None => {
+                error!("No worker exists with token {:?}", token);
+                return;
+            }
+        };
+
+        match kill(Pid::from_raw(worker.pid), Signal::SIGKILL) {
+            Ok(()) => info!("Worker {} was successfully killed", worker.id),
+            Err(_) => info!("worker {} was already dead", worker.id),
+        }
+        worker.run_state = RunState::Stopped;
+    }
+
+    /// Make the file descriptors of the channel survive the upgrade
+    pub fn disable_cloexec_before_upgrade(&mut self) -> Result<i32, ServerError> {
+        trace!(
+            "disabling cloexec on listener with file descriptor: {}",
+            self.unix_listener.as_raw_fd()
+        );
+
+        disable_close_on_exec(self.unix_listener.as_raw_fd()).map_err(ServerError::DisableCloexec)
+    }
+
+    /// This enables workers to be notified in case the main process dies
+    pub fn enable_cloexec_after_upgrade(&mut self) -> Result<i32, ServerError> {
+        for worker in self.workers.values_mut() {
+            if worker.run_state == RunState::Running {
+                let _ = enable_close_on_exec(worker.channel.fd()).map_err(|e| {
+                    error!(
+                        "could not enable close on exec for worker {}: {}",
+                        worker.id, e
+                    );
+                });
+            }
+        }
+        enable_close_on_exec(self.unix_listener.as_raw_fd()).map_err(ServerError::EnableCloexec)
+    }
+
+    /// summarize the server into what is needed to recreate it, when upgrading
+    pub fn generate_upgrade_data(&self) -> UpgradeData {
+        UpgradeData {
+            command_socket_fd: self.unix_listener.as_raw_fd(),
+            config: self.config.clone(),
+            workers: self
+                .workers
+                .values()
+                .filter_map(|session| match SerializedWorkerSession::try_from(session) {
+                    Ok(serialized_session) => Some(serialized_session),
+                    Err(err) => {
+                        error!("failed to serialize worker session: {}", err);
+                        None
+                    }
+                })
+                .collect(),
+            state: self.state.clone(),
+            next_client_id: self.next_client_id,
+            next_session_id: self.next_session_id,
+            next_task_id: self.next_task_id,
+            next_worker_id: self.next_worker_id,
+        }
+    }
+}
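+
+// A minimal upgrade round-trip sketch, assuming a running `server` (both
+// halves appear in this patch: generate_upgrade_data here, and
+// CommandHub::from_upgrade_data in the new main process):
+//
+//     let data = server.generate_upgrade_data();
+//     // serialized, then handed to the new binary by fork_main_into_new_main
+//     let hub = CommandHub::from_upgrade_data(data)?;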
diff --git a/bin/src/command/sessions.rs b/bin/src/command/sessions.rs
new file mode 100644
index 000000000..aa1bd305f
--- /dev/null
+++ b/bin/src/command/sessions.rs
@@ -0,0 +1,292 @@
+use std::fmt::Debug;
+
+use libc::pid_t;
+use mio::Token;
+use serde::{de::DeserializeOwned, Serialize};
+
+use sozu_command_lib::{
+    channel::Channel,
+    proto::command::{Request, Response, ResponseContent, ResponseStatus, RunState, WorkerInfo},
+    ready::Ready,
+    request::WorkerRequest,
+    response::WorkerResponse,
+    scm_socket::ScmSocket,
+};
+
+use crate::command::server::{ClientId, MessageClient, WorkerId};
+
+/// Track a client from start to finish
+#[derive(Debug)]
+pub struct ClientSession {
+    pub channel: Channel<Response, Request>,
+    pub id: ClientId,
+    pub token: Token,
+}
+
+/// The return type of the ready method
+#[derive(Debug)]
+pub enum ClientResult {
+    NothingToDo,
+    NewRequest(Request),
+    CloseSession,
+}
+
+impl ClientSession {
+    pub fn new(mut channel: Channel<Response, Request>, id: ClientId, token: Token) -> Self {
+        channel.interest = Ready::READABLE | Ready::ERROR | Ready::HUP;
+        Self { channel, id, token }
+    }
+
+    /// queue a response for the client (the event loop does the send)
+    fn send(&mut self, response: Response) {
+        if let Err(e) = self.channel.write_message(&response) {
+            error!("error writing on channel: {}", e);
+            self.channel.readiness = Ready::ERROR;
+            return;
+        }
+        self.channel.interest.insert(Ready::WRITABLE);
+    }
+
+    pub fn update_readiness(&mut self, events: Ready) {
+        self.channel.handle_events(events);
+    }
+
+    /// drive the channel read and write
+    pub fn ready(&mut self) -> ClientResult {
+        if self.channel.readiness.is_error() || self.channel.readiness.is_hup() {
+            return ClientResult::CloseSession;
+        }
+
+        let status = self.channel.writable();
+        trace!("client writable: {:?}", status);
+        let mut requests = extract_messages(&mut self.channel);
+        match requests.pop() {
+            Some(request) => {
+                if !requests.is_empty() {
+                    error!("more than one request at a time");
+                }
+                ClientResult::NewRequest(request)
+            }
+            None => ClientResult::NothingToDo,
+        }
+    }
+}
+
+impl MessageClient for ClientSession {
+    fn finish_ok<T: Into<String>>(&mut self, message: T) {
+        let message = message.into();
+        info!("{}", message);
+        self.send(Response {
+            status: ResponseStatus::Ok.into(),
+            message,
+            content: None,
+        })
+    }
+
+    fn finish_ok_with_content<T: Into<String>>(&mut self, content: ResponseContent, message: T) {
+        let message = message.into();
+        info!("{}", message);
+        self.send(Response {
+            status: ResponseStatus::Ok.into(),
+            message,
+            content: Some(content),
+        })
+    }
+
+    fn finish_failure<T: Into<String>>(&mut self, message: T) {
+        let message = message.into();
+        error!("{}", message);
+        self.send(Response {
+            status: ResponseStatus::Failure.into(),
+            message,
+            content: None,
+        })
+    }
+
+    fn return_processing<S: Into<String>>(&mut self, message: S) {
+        let message = message.into();
+        info!("{}", message);
+        self.send(Response {
+            status: ResponseStatus::Processing.into(),
+            message,
+            content: None,
+        });
+    }
+
+    fn return_processing_with_content<S: Into<String>>(
+        &mut self,
+        message: S,
+        content: ResponseContent,
+    ) {
+        let message = message.into();
+        info!("{}", message);
+        self.send(Response {
+            status: ResponseStatus::Processing.into(),
+            message,
+            content: Some(content),
+        });
+    }
+}
+
+pub type OptionalClient<'a> = Option<&'a mut ClientSession>;
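+
+// Since OptionalClient is just Option<&mut ClientSession>, tasks can report
+// progress whether or not a client is still connected; with None, every
+// message falls back to the logs. A minimal sketch:
+//
+//     let mut client: OptionalClient = None;
+//     client.return_processing("still working..."); // only logged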
+
+impl MessageClient for OptionalClient<'_> {
+    fn finish_ok<T: Into<String>>(&mut self, message: T) {
+        match self {
+            None => info!("{}", message.into()),
+            Some(client) => client.finish_ok(message),
+        }
+    }
+
+    fn finish_ok_with_content<T: Into<String>>(&mut self, content: ResponseContent, message: T) {
+        match self {
+            None => info!("{}", message.into()),
+            Some(client) => client.finish_ok_with_content(content, message),
+        }
+    }
+
+    fn finish_failure<T: Into<String>>(&mut self, message: T) {
+        match self {
+            None => error!("{}", message.into()),
+            Some(client) => client.finish_failure(message),
+        }
+    }
+
+    fn return_processing<T: Into<String>>(&mut self, message: T) {
+        match self {
+            None => info!("{}", message.into()),
+            Some(client) => client.return_processing(message),
+        }
+    }
+
+    fn return_processing_with_content<S: Into<String>>(
+        &mut self,
+        message: S,
+        content: ResponseContent,
+    ) {
+        match self {
+            None => info!("{}", message.into()),
+            Some(client) => client.return_processing_with_content(message, content),
+        }
+    }
+}
+
+/// Follow a worker throughout its lifetime (launching, communication, soft stop/hard stop)
+#[derive(Debug)]
+pub struct WorkerSession {
+    pub channel: Channel<WorkerRequest, WorkerResponse>,
+    pub id: WorkerId,
+    pub pid: pid_t,
+    pub run_state: RunState,
+    /// meant to send listeners to the worker upon start
+    pub scm_socket: ScmSocket,
+    pub token: Token,
+}
+
+/// The return type of the ready method
+#[derive(Debug)]
+pub enum WorkerResult {
+    NothingToDo,
+    NewResponses(Vec<WorkerResponse>),
+    CloseSession,
+}
+
+impl WorkerSession {
+    pub fn new(
+        mut channel: Channel<WorkerRequest, WorkerResponse>,
+        id: WorkerId,
+        pid: pid_t,
+        token: Token,
+        scm_socket: ScmSocket,
+    ) -> Self {
+        channel.interest = Ready::READABLE | Ready::ERROR | Ready::HUP;
+        Self {
+            channel,
+            id,
+            pid,
+            run_state: RunState::Running,
+            scm_socket,
+            token,
+        }
+    }
+
+    /// queue a request for the worker (the event loop does the send)
+    pub fn send(&mut self, request: &WorkerRequest) {
+        trace!("Sending to worker: {:?}", request);
+        if let Err(e) = self.channel.write_message(request) {
+            error!("Could not send request to worker: {}", e);
+            self.channel.readiness = Ready::ERROR;
+            return;
+        }
+        self.channel.interest.insert(Ready::WRITABLE);
+    }
+
+    pub fn update_readiness(&mut self, events: Ready) {
+        self.channel.handle_events(events);
+    }
+
+    /// drive the channel read and write
+    pub fn ready(&mut self) -> WorkerResult {
+        let status = self.channel.writable();
+        trace!("Worker writable: {:?}", status);
+        let responses = extract_messages(&mut self.channel);
+        if !responses.is_empty() {
+            return WorkerResult::NewResponses(responses);
+        }
+
+        if self.channel.readiness.is_error() || self.channel.readiness.is_hup() {
+            debug!("worker {} is unresponsive, closing the session", self.id);
+            return WorkerResult::CloseSession;
+        }
+
+        WorkerResult::NothingToDo
+    }
+
+    /// get the run state of the worker (defaults to NotAnswering)
+    pub fn querying_info(&self) -> WorkerInfo {
+        let run_state = match self.run_state {
+            RunState::Stopping => RunState::Stopping,
+            RunState::Stopped => RunState::Stopped,
+            RunState::Running | RunState::NotAnswering => RunState::NotAnswering,
+        };
+        WorkerInfo {
+            id: self.id,
+            pid: self.pid,
+            run_state: run_state as i32,
+        }
+    }
+
+    pub fn is_active(&self) -> bool {
+        self.run_state != RunState::Stopping && self.run_state != RunState::Stopped
+    }
+}
+
+/// read and parse messages (Requests or Responses) from the channel
+pub fn extract_messages<Tx, Rx>(channel: &mut Channel<Tx, Rx>) -> Vec<Rx>
+where
+    Tx: Debug + Serialize,
+    Rx: Debug + DeserializeOwned,
+{
+    let mut messages = Vec::new();
+    loop {
+        let status = channel.readable();
+        trace!("Channel readable: {:?}", status);
+        let old_capacity = channel.front_buf.capacity();
+        let message = channel.read_message();
+        match message {
+            Ok(message) => messages.push(message),
+            Err(_) => {
+                if old_capacity == channel.front_buf.capacity() {
+                    return messages;
+                }
+            }
+        }
+    }
+}
+
+/// used by the event loop to know whether to call ready on a session,
+/// given the state of its channel
+pub fn wants_to_tick<Tx, Rx>(channel: &Channel<Tx, Rx>) -> bool {
+    (channel.readiness.is_writable() && channel.back_buf.available_data() > 0)
+        || (channel.readiness.is_hup() || channel.readiness.is_error())
+}
diff --git a/bin/src/command/upgrade.rs b/bin/src/command/upgrade.rs
new file mode 100644
index 000000000..d3ca66cd7
--- /dev/null
+++ b/bin/src/command/upgrade.rs
@@ -0,0 +1,350 @@ +use std::os::fd::AsRawFd; + +use libc::pid_t; +use mio::Token; +use serde::{Deserialize, Serialize}; + +use sozu_command_lib::{ + config::Config, + proto::command::{ + request::RequestType, ResponseStatus, ReturnListenSockets, RunState, SoftStop, + }, + response::WorkerResponse, + state::ConfigState, +}; + +use crate::{ + command::{ + server::{ + ClientId, Gatherer, GatheringTask, MessageClient, Server, ServerState, SessionId, + TaskId, Timeout, WorkerId, + }, + sessions::{ClientSession, OptionalClient}, + }, + upgrade::{fork_main_into_new_main, UpgradeError}, + util::disable_close_on_exec, +}; + +use super::sessions::WorkerSession; + +#[derive(Debug)] +enum UpgradeWorkerProgress { + /// 1. request listeners from the old worker + /// 2. store listeners to pass them to new worker, + RequestingListenSockets { + old_worker_token: Token, + old_worker_id: WorkerId, + }, + /// 3. soft stop the old worker + /// 4. activate the listeners of the new worker + StopOldActivateNew { + old_worker_id: WorkerId, + new_worker_id: WorkerId, + }, +} + +#[derive(Debug)] +struct UpgradeWorkerTask { + pub client_token: Token, + progress: UpgradeWorkerProgress, + + ok: usize, + errors: usize, + responses: Vec<(WorkerId, WorkerResponse)>, + expected_responses: usize, +} + +pub fn upgrade_worker(server: &mut Server, client: &mut ClientSession, old_worker_id: WorkerId) { + info!( + "client[{:?}] msg wants to upgrade worker {}", + client.token, old_worker_id + ); + + let old_worker_token = match server.get_active_worker_by_id(old_worker_id) { + Some(session) => session.token, + None => { + client.finish_failure(format!( + "Worker {} does not exist, or is stopping / stopped", + old_worker_id + )); + return; + } + }; + + client.return_processing(format!( + "Requesting listen sockets from worker {old_worker_id}" + )); + server.scatter( + RequestType::ReturnListenSockets(ReturnListenSockets {}).into(), + Box::new(UpgradeWorkerTask { + client_token: client.token, + progress: UpgradeWorkerProgress::RequestingListenSockets { + old_worker_token, + old_worker_id, + }, + ok: 0, + errors: 0, + responses: Vec::new(), + expected_responses: 0, + }), + Timeout::Default, + Some(old_worker_id), + ); +} + +impl UpgradeWorkerTask { + fn receive_listen_sockets( + self, + server: &mut Server, + client: &mut OptionalClient, + old_worker_token: Token, + old_worker_id: WorkerId, + ) { + let old_worker = match server.workers.get_mut(&old_worker_token) { + Some(old_worker) => old_worker, + None => { + client.finish_failure(format!("Worker {old_worker_id} died while upgrading, it should be restarted automatically")); + return; + } + }; + let old_worker_id = old_worker.id; + + match old_worker.scm_socket.set_blocking(true) { + Ok(_) => {} + Err(error) => { + client.finish_failure(format!("Could not set SCM sockets to blocking: {error:?}")); + return; + } + } + + let listeners = match old_worker.scm_socket.receive_listeners() { + Ok(listeners) => listeners, + Err(_) => { + client.finish_failure( + "Could not upgrade worker: did not get back listeners from the old worker", + ); + return; + } + }; + + old_worker.run_state = RunState::Stopping; + + // lauch new worker + let new_worker = match server.launch_new_worker(Some(listeners)) { + Ok(worker) => worker, + Err(worker_err) => { + return client.finish_failure(format!("could not launch new worker: {worker_err}")) + } + }; + client.return_processing(format!("Launched a new worker with id {}", new_worker.id)); + let new_worker_id = new_worker.id; + + let finish_task = 
server.new_task( + Box::new(UpgradeWorkerTask { + client_token: self.client_token, + progress: UpgradeWorkerProgress::StopOldActivateNew { + old_worker_id, + new_worker_id, + }, + + ok: 0, + errors: 0, + responses: Vec::new(), + expected_responses: 0, + }), + Timeout::None, + ); + + // Stop the old worker + client.return_processing(format!("Soft stopping worker with id {}", old_worker_id)); + server.scatter_on( + RequestType::SoftStop(SoftStop {}).into(), + finish_task, + 0, + Some(old_worker_id), + ); + + // activate new worker + for (count, request) in server + .state + .generate_activate_requests() + .into_iter() + .enumerate() + { + server.scatter_on(request, finish_task, count + 1, Some(new_worker_id)); + } + } +} + +impl GatheringTask for UpgradeWorkerTask { + fn client_token(&self) -> Option { + Some(self.client_token) + } + + fn get_gatherer(&mut self) -> &mut dyn super::server::Gatherer { + self + } + + fn on_finish( + self: Box, + server: &mut Server, + client: &mut OptionalClient, + _timed_out: bool, + ) { + match self.progress { + UpgradeWorkerProgress::RequestingListenSockets { + old_worker_token, + old_worker_id, + } => { + if self.ok == 1 { + self.receive_listen_sockets(server, client, old_worker_token, old_worker_id); + } else { + client.finish_failure(format!( + "Could not get listen sockets from old worker:{:?}", + self.responses + )); + } + } + UpgradeWorkerProgress::StopOldActivateNew { + old_worker_id, + new_worker_id, + } => { + client.finish_ok( + format!( + "Upgrade successful:\n- finished soft stop of worker {:?}\n- finished activation of new worker {:?}", + old_worker_id, new_worker_id + ) + ); + } + } + } +} + +impl Gatherer for UpgradeWorkerTask { + fn inc_expected_responses(&mut self, count: usize) { + self.expected_responses += count; + } + + fn has_finished(&self) -> bool { + self.ok + self.errors >= self.expected_responses + } + + fn on_message( + &mut self, + _server: &mut Server, + client: &mut OptionalClient, + worker_id: WorkerId, + message: WorkerResponse, + ) { + match message.status { + ResponseStatus::Ok => { + self.ok += 1; + match self.progress { + UpgradeWorkerProgress::RequestingListenSockets { .. } => {} + UpgradeWorkerProgress::StopOldActivateNew { .. } => { + client.return_processing(format!( + "Worker {} answered OK to {}. {}", + worker_id, message.id, message.message + )) + } + } + } + ResponseStatus::Failure => self.errors += 1, + ResponseStatus::Processing => client.return_processing(format!( + "Worker {} is processing {}. 
{}", + worker_id, message.id, message.message + )), + } + self.responses.push((worker_id, message)); + } +} + +//=============================================== +// Upgrade the main process + +/// Summary of a worker session, meant to be passed to a new main process +/// during an upgrade, in order to recreate the worker +#[derive(Deserialize, Serialize, Debug)] +pub struct SerializedWorkerSession { + /// file descriptor of the UNIX channel + pub channel_fd: i32, + pub pid: pid_t, + pub id: WorkerId, + pub run_state: RunState, + /// file descriptor of the SCM socket + pub scm_fd: i32, +} + +impl TryFrom<&WorkerSession> for SerializedWorkerSession { + type Error = UpgradeError; + + fn try_from(worker: &WorkerSession) -> Result { + disable_close_on_exec(worker.channel.fd()).map_err(|util_err| { + UpgradeError::DisableCloexec { + fd_name: format!("main-to-worker-{}-channel", worker.id), + util_err, + } + })?; + + Ok(Self { + channel_fd: worker.channel.sock.as_raw_fd(), + pid: worker.pid, + id: worker.id, + run_state: worker.run_state, + scm_fd: worker.scm_socket.raw_fd(), + }) + } +} + +#[derive(Deserialize, Serialize, Debug)] +pub struct UpgradeData { + /// file descriptor of the unix command socket + pub command_socket_fd: i32, + pub config: Config, + pub next_client_id: ClientId, + pub next_session_id: SessionId, + pub next_task_id: TaskId, + pub next_worker_id: WorkerId, + /// JSON serialized workers + pub workers: Vec, + pub state: ConfigState, +} + +pub fn upgrade_main(server: &mut Server, client: &mut ClientSession) { + if let Err(err) = server.disable_cloexec_before_upgrade() { + client.finish_failure(err.to_string()); + } + + client.return_processing("Upgrading the main process..."); + + let upgrade_data = server.generate_upgrade_data(); + + let (new_main_pid, mut fork_confirmation_channel) = + match fork_main_into_new_main(server.executable_path.clone(), upgrade_data) { + Ok(tuple) => tuple, + Err(fork_error) => { + client.finish_failure(format!( + "Could not start a new main process by forking: {}", + fork_error + )); + return; + } + }; + + let received_ok_from_new_process = fork_confirmation_channel.read_message().unwrap_or(false); + + debug!( + "new main process sent a fork confirmation: {:?}", + received_ok_from_new_process + ); + + if !received_ok_from_new_process { + client.finish_failure("Upgrade of main process failed: no feedback from the new main"); + } else { + client.finish_ok(format!( + "Upgrade successful, closing main process. 
New main process has pid {}", + new_main_pid + )); + server.run_state = ServerState::Stopping; + } +} diff --git a/bin/src/ctl/command.rs b/bin/src/ctl/command.rs index 518eea513..59e0b07a0 100644 --- a/bin/src/ctl/command.rs +++ b/bin/src/ctl/command.rs @@ -1,62 +1,45 @@ -use anyhow::{self, bail, Context}; - -use sozu_command_lib::proto::command::{ - request::RequestType, response_content::ContentType, ListWorkers, QueryMetricsOptions, Request, - Response, ResponseContent, ResponseStatus, RunState, UpgradeMain, +use std::time::Duration; + +use sozu_command_lib::{ + logging::setup_logging_with_config, + proto::command::{ + request::RequestType, response_content::ContentType, ListWorkers, QueryMetricsOptions, + Request, Response, ResponseContent, ResponseStatus, UpgradeMain, + }, }; -use crate::ctl::{create_channel, CommandManager}; +use crate::ctl::{create_channel, CommandManager, CtlError}; impl CommandManager { - fn write_request_on_channel(&mut self, request: Request) -> anyhow::Result<()> { + fn write_request_on_channel(&mut self, request: Request) -> Result<(), CtlError> { self.channel .write_message(&request) - .with_context(|| "Could not write the request") + .map_err(CtlError::WriteRequest) } - fn read_channel_message_with_timeout(&mut self) -> anyhow::Result { + fn read_channel_message_with_timeout(&mut self) -> Result { self.channel .read_message_blocking_timeout(Some(self.timeout)) - .with_context(|| "Command timeout. The proxy didn't send an answer") + .map_err(CtlError::ReadBlocking) } - pub fn send_request(&mut self, request: Request) -> Result<(), anyhow::Error> { + fn send_request_get_response( + &mut self, + request: Request, + timeout: bool, + ) -> Result { self.channel .write_message(&request) - .with_context(|| "Could not write the request")?; - - loop { - let response = self.read_channel_message_with_timeout()?; - - match response.status() { - ResponseStatus::Processing => { - if !self.json { - debug!("Proxy is processing: {}", response.message); - } - } - ResponseStatus::Failure => bail!("Request failed: {}", response.message), - ResponseStatus::Ok => { - if !self.json { - info!("{}", response.message); - } - response.display(self.json)?; - break; - } - } - } - Ok(()) - } - - // 1. Request a list of workers - // 2. Send an UpgradeMain - // 3. Send an UpgradeWorker to each worker - pub fn upgrade_main(&mut self) -> Result<(), anyhow::Error> { - info!("Preparing to upgrade proxy..."); - - self.write_request_on_channel(RequestType::ListWorkers(ListWorkers {}).into())?; + .map_err(CtlError::WriteRequest)?; loop { - let response = self.read_channel_message_with_timeout()?; + let response = if timeout { + self.read_channel_message_with_timeout()? + } else { + self.channel + .read_message_blocking_timeout(None) + .map_err(CtlError::ReadBlocking)? 
+ }; match response.status() { ResponseStatus::Processing => { @@ -64,109 +47,28 @@ impl CommandManager { debug!("Processing: {}", response.message); } } - ResponseStatus::Failure => { - bail!( - "Error: failed to get the list of worker: {}", - response.message - ); - } - ResponseStatus::Ok => { - if let Some(ResponseContent { - content_type: Some(ContentType::Workers(ref worker_infos)), - }) = response.content - { - // display worker status - response.display(false)?; - - self.write_request_on_channel( - RequestType::UpgradeMain(UpgradeMain {}).into(), - )?; - - info!("Upgrading main process"); - - loop { - let response = self.read_channel_message_with_timeout()?; - - match response.status() { - ResponseStatus::Processing => { - debug!("Main process is upgrading"); - } - ResponseStatus::Failure => { - bail!( - "Error: failed to upgrade the main: {}", - response.message - ); - } - ResponseStatus::Ok => { - info!("Main process upgrade succeeded: {}", response.message); - break; - } - } - } - - // Reconnect to the new main - info!("Reconnecting to new main process..."); - self.channel = create_channel(&self.config) - .with_context(|| "could not reconnect to the command unix socket")?; - - // Do a rolling restart of the workers - let running_workers = worker_infos - .vec - .iter() - .filter(|worker| worker.run_state == RunState::Running as i32) - .collect::>(); - let running_count = running_workers.len(); - for (i, worker) in running_workers.iter().enumerate() { - info!( - "Upgrading worker {} (#{} out of {})", - worker.id, - i + 1, - running_count - ); - - self.upgrade_worker(worker.id) - .with_context(|| "Upgrading the worker failed")?; - //thread::sleep(Duration::from_millis(1000)); - } - - info!("Proxy successfully upgraded!"); - } else { - info!("Received a response of the wrong kind: {:?}", response); - } - break; - } + ResponseStatus::Failure => return Err(CtlError::Failure(response.message)), + ResponseStatus::Ok => return Ok(response), } } - Ok(()) } - pub fn upgrade_worker(&mut self, worker_id: u32) -> Result<(), anyhow::Error> { - trace!("upgrading worker {}", worker_id); - - //FIXME: we should be able to soft stop one specific worker - self.write_request_on_channel(RequestType::UpgradeWorker(worker_id).into())?; + fn send_request_display_response( + &mut self, + request: Request, + timeout: bool, + ) -> Result<(), CtlError> { + self.send_request_get_response(request, timeout)? 
+ .display(self.json) + .map_err(CtlError::Display) + } - loop { - let response = self.read_channel_message_with_timeout()?; + pub fn send_request(&mut self, request: Request) -> Result<(), CtlError> { + self.send_request_display_response(request, true) + } - match response.status() { - ResponseStatus::Processing => { - if !self.json { - info!("Proxy is processing: {}", response.message); - } - } - ResponseStatus::Failure => bail!( - "could not stop the worker {}: {}", - worker_id, - response.message - ), - ResponseStatus::Ok => { - info!("Success: {}", response.message); - break; - } - } - } - Ok(()) + pub fn send_request_no_timeout(&mut self, request: Request) -> Result<(), CtlError> { + self.send_request_display_response(request, false) } pub fn get_metrics( @@ -177,7 +79,7 @@ impl CommandManager { cluster_ids: Vec, backend_ids: Vec, no_clusters: bool, - ) -> Result<(), anyhow::Error> { + ) -> Result<(), CtlError> { let request: Request = RequestType::QueryMetrics(QueryMetricsOptions { list, cluster_ids, @@ -200,11 +102,11 @@ impl CommandManager { match response.status() { ResponseStatus::Processing => { if !self.json { - debug!("Proxy is processing: {}", response.message); + debug!("Processing: {}", response.message); } } ResponseStatus::Failure | ResponseStatus::Ok => { - response.display(self.json)?; + response.display(self.json).map_err(CtlError::Display)?; break; } } @@ -224,4 +126,72 @@ impl CommandManager { Ok(()) } + + pub fn upgrade_main(&mut self) -> Result<(), CtlError> { + debug!("updating main process"); + self.send_request(RequestType::UpgradeMain(UpgradeMain {}).into())?; + + info!("recreating a channel to reconnect with the new main process..."); + self.channel = create_channel(&self.config)?; + + info!("requesting the list of workers from the new main"); + let response = + self.send_request_get_response(RequestType::ListWorkers(ListWorkers {}).into(), true)?; + + let workers = match response.content { + Some(ResponseContent { + content_type: Some(ContentType::Workers(worker_infos)), + }) => worker_infos, + _ => return Err(CtlError::WrongResponse(response)), + }; + + info!("About to upgrade these workers: {:?}", workers); + + let mut upgrade_jobs = Vec::new(); + + for worker in workers.vec { + info!("trying to upgrade worker {}", worker.id); + let config = self.config.clone(); + + upgrade_jobs.push(std::thread::spawn(move || { + setup_logging_with_config(&config, &format!("UPGRADE-WRK-{}", worker.id)); + + info!("creating channel to upgrade worker {}", worker.id); + let channel = match create_channel(&config) { + Ok(channel) => channel, + Err(e) => { + error!( + "could not create channel to worker {}, this is critical: {}", + worker.id, e + ); + return; + } + }; + + info!("created channel to upgrade worker {}", worker.id); + + let mut command_manager = CommandManager { + channel, + timeout: Duration::from_secs(60), // overriden by upgrade_timeout anyway + config, + json: false, + }; + + match command_manager.upgrade_worker(worker.id) { + Ok(()) => info!("successfully upgraded worker {}", worker.id), + Err(e) => error!("error upgrading worker {}: {}", worker.id, e), + } + })); + } + + for job in upgrade_jobs { + if let Err(e) = job.join() { + error!("an upgrading job panicked: {:?}", e) + } + } + + info!("Finished upgrading"); + + Ok(()) + } } diff --git a/bin/src/ctl/mod.rs b/bin/src/ctl/mod.rs index ce360fa4e..22a6c9ff6 100644 --- a/bin/src/ctl/mod.rs +++ b/bin/src/ctl/mod.rs @@ -1,20 +1,61 @@ +mod command; +mod request_builder; + use std::time::Duration; -use 
anyhow::Context; use sozu_command_lib::{ - channel::Channel, - config::Config, + certificate::CertificateError, + channel::{Channel, ChannelError}, + config::{Config, ConfigError}, logging::setup_logging_with_config, - proto::command::{Request, Response}, + proto::{ + command::{Request, Response}, + DisplayError, + }, }; use crate::{ cli::{self, *}, - get_config_file_path, load_configuration, + util::{get_config_file_path, UtilError}, }; -mod command; -mod request_builder; +#[derive(thiserror::Error, Debug)] +pub enum CtlError { + #[error("failed to get config: {0}")] + GetConfig(UtilError), + #[error("failed to load config: {0}")] + LoadConfig(ConfigError), + #[error("could not create channel to Sōzu. Are you sure the proxy is up?: {0}")] + CreateChannel(ChannelError), + #[error("failed to find the path of the command socket: {0}")] + GetCommandSocketPath(ConfigError), + #[error("failed to block channel to Sōzu: {0}")] + BlockChannel(ChannelError), + #[error("could not display response: {0}")] + Display(DisplayError), + #[error("could not read message on a blocking channel: {0}")] + ReadBlocking(ChannelError), + #[error("Request failed: {0}")] + Failure(String), + #[error("could not write request on channel: {0}")] + WriteRequest(ChannelError), + #[error("could not get certificate fingerprint")] + GetFingerprint(CertificateError), + #[error("could not decode fingerprint")] + DecodeFingerprint(CertificateError), + #[error("Please provide either one, {0} OR {1}")] + ArgsNeeded(String, String), + #[error("could not load certificate")] + LoadCertificate(CertificateError), + #[error("wrong address {0}: {1}")] + WrongAddress(String, UtilError), + #[error("wrong input to create listener")] + CreateListener(ConfigError), + #[error("domain can not be empty")] + NeedClusterDomain, + #[error("wrong response from Sōzu: {0:?}")] + WrongResponse(Response), +} pub struct CommandManager { channel: Channel, @@ -24,11 +65,15 @@ pub struct CommandManager { json: bool, } -pub fn ctl(args: cli::Args) -> anyhow::Result<()> { - let config_file_path = get_config_file_path(&args)?; - let config = load_configuration(config_file_path)?; +pub fn ctl(args: cli::Args) -> Result<(), CtlError> { + let config_path = get_config_file_path(&args).map_err(CtlError::GetConfig)?; - setup_logging_with_config(&config, "CTL"); + let config = Config::load_from_path(config_path).map_err(CtlError::LoadConfig)?; + + // prevent logging for json responses for a clean output + if !args.json { + setup_logging_with_config(&config, "CTL"); + } // If the command is `config check` then exit because if we are here, the configuration is valid if let SubCmd::Config { @@ -39,9 +84,7 @@ pub fn ctl(args: cli::Args) -> anyhow::Result<()> { std::process::exit(0); } - let channel = create_channel(&config).with_context(|| { - "could not connect to the command unix socket. Are you sure the proxy is up?" 
- })?; + let channel = create_channel(&config)?; let timeout = Duration::from_millis(args.timeout.unwrap_or(config.ctl_command_timeout)); if !args.json { @@ -59,7 +102,7 @@ pub fn ctl(args: cli::Args) -> anyhow::Result<()> { } impl CommandManager { - fn handle_command(&mut self, command: SubCmd) -> anyhow::Result<()> { + fn handle_command(&mut self, command: SubCmd) -> Result<(), CtlError> { debug!("Executing command {:?}", command); match command { SubCmd::Shutdown { hard } => { @@ -167,16 +210,18 @@ impl CommandManager { } /// creates a blocking channel -pub fn create_channel(config: &Config) -> anyhow::Result> { +pub fn create_channel(config: &Config) -> Result, CtlError> { + let command_socket_path = &config + .command_socket_path() + .map_err(CtlError::GetCommandSocketPath)?; + let mut channel = Channel::from_path( - &config.command_socket_path()?, + command_socket_path, config.command_buffer_size, config.max_command_buffer_size, ) - .with_context(|| "Could not create Channel from the given path")?; + .map_err(CtlError::CreateChannel)?; - channel - .blocking() - .with_context(|| "Could not block the channel used to communicate with Sōzu")?; + channel.blocking().map_err(CtlError::BlockChannel)?; Ok(channel) } diff --git a/bin/src/ctl/request_builder.rs b/bin/src/ctl/request_builder.rs index ce0ff3ec1..6389ab625 100644 --- a/bin/src/ctl/request_builder.rs +++ b/bin/src/ctl/request_builder.rs @@ -1,7 +1,5 @@ use std::collections::BTreeMap; -use anyhow::{bail, Context}; - use sozu_command_lib::{ certificate::{ decode_fingerprint, get_fingerprint_from_certificate_path, load_full_certificate, @@ -23,57 +21,60 @@ use crate::{ MetricsCmd, TcpFrontendCmd, TcpListenerCmd, }, ctl::CommandManager, + util::parse_socket_address, }; +use super::CtlError; + impl CommandManager { - pub fn save_state(&mut self, path: String) -> anyhow::Result<()> { + pub fn save_state(&mut self, path: String) -> Result<(), CtlError> { debug!("Saving the state to file {}", path); self.send_request(RequestType::SaveState(path).into()) } - pub fn load_state(&mut self, path: String) -> anyhow::Result<()> { + pub fn load_state(&mut self, path: String) -> Result<(), CtlError> { debug!("Loading the state on path {}", path); self.send_request(RequestType::LoadState(path).into()) } - pub fn count_requests(&mut self) -> anyhow::Result<()> { + pub fn count_requests(&mut self) -> Result<(), CtlError> { self.send_request(RequestType::CountRequests(CountRequests {}).into()) } - pub fn soft_stop(&mut self) -> anyhow::Result<()> { + pub fn soft_stop(&mut self) -> Result<(), CtlError> { debug!("shutting down proxy softly"); self.send_request(RequestType::SoftStop(SoftStop {}).into()) } - pub fn hard_stop(&mut self) -> anyhow::Result<()> { + pub fn hard_stop(&mut self) -> Result<(), CtlError> { debug!("shutting down proxy the hard way"); self.send_request(RequestType::HardStop(HardStop {}).into()) } - pub fn status(&mut self) -> anyhow::Result<()> { + pub fn status(&mut self) -> Result<(), CtlError> { debug!("Requesting status…"); self.send_request(RequestType::Status(Status {}).into()) } - pub fn configure_metrics(&mut self, cmd: MetricsCmd) -> anyhow::Result<()> { + pub fn configure_metrics(&mut self, cmd: MetricsCmd) -> Result<(), CtlError> { debug!("Configuring metrics: {:?}", cmd); let configuration = match cmd { MetricsCmd::Enable => MetricsConfiguration::Enabled, MetricsCmd::Disable => MetricsConfiguration::Disabled, MetricsCmd::Clear => MetricsConfiguration::Clear, - _ => bail!("The command passed to the configure_metrics 
function is wrong."), + _ => return Ok(()), // completely unlikely }; self.send_request(RequestType::ConfigureMetrics(configuration as i32).into()) } - pub fn reload_configuration(&mut self, path: Option) -> anyhow::Result<()> { + pub fn reload_configuration(&mut self, path: Option) -> Result<(), CtlError> { debug!("Reloading configuration…"); let path = match path { Some(p) => p, @@ -88,7 +89,7 @@ impl CommandManager { https: bool, tcp: bool, domain: Option, - ) -> anyhow::Result<()> { + ) -> Result<(), CtlError> { debug!("Listing frontends"); self.send_request( @@ -102,11 +103,11 @@ impl CommandManager { ) } - pub fn events(&mut self) -> anyhow::Result<()> { - self.send_request(RequestType::SubscribeEvents(SubscribeEvents {}).into()) + pub fn events(&mut self) -> Result<(), CtlError> { + self.send_request_no_timeout(RequestType::SubscribeEvents(SubscribeEvents {}).into()) } - pub fn backend_command(&mut self, cmd: BackendCmd) -> anyhow::Result<()> { + pub fn backend_command(&mut self, cmd: BackendCmd) -> Result<(), CtlError> { match cmd { BackendCmd::Add { id, @@ -140,7 +141,7 @@ impl CommandManager { } } - pub fn cluster_command(&mut self, cmd: ClusterCmd) -> anyhow::Result<()> { + pub fn cluster_command(&mut self, cmd: ClusterCmd) -> Result<(), CtlError> { match cmd { ClusterCmd::Add { id, @@ -174,7 +175,10 @@ impl CommandManager { domain, } => { if cluster_id.is_some() && domain.is_some() { - bail!("Error: Either request an cluster ID or a domain name"); + return Err(CtlError::ArgsNeeded( + "a cluster id".to_string(), + "a domain name".to_string(), + )); } let request = if let Some(ref cluster_id) = cluster_id { @@ -184,14 +188,11 @@ impl CommandManager { domain.splitn(2, '/').map(|elem| elem.to_string()).collect(); if splitted.is_empty() { - bail!("Domain can't be empty"); + return Err(CtlError::NeedClusterDomain)?; } let query_domain = QueryClusterByDomain { - hostname: splitted - .get(0) - .with_context(|| "Domain can't be empty")? 
- .clone(), + hostname: splitted.get(0).ok_or(CtlError::NeedClusterDomain)?.clone(), path: splitted.get(1).cloned().map(|path| format!("/{path}")), // We add the / again because of the splitn removing it }; @@ -205,7 +206,7 @@ impl CommandManager { } } - pub fn tcp_frontend_command(&mut self, cmd: TcpFrontendCmd) -> anyhow::Result<()> { + pub fn tcp_frontend_command(&mut self, cmd: TcpFrontendCmd) -> Result<(), CtlError> { match cmd { TcpFrontendCmd::Add { id, address, tags } => self.send_request( RequestType::AddTcpFrontend(RequestTcpFrontend { @@ -226,7 +227,7 @@ impl CommandManager { } } - pub fn http_frontend_command(&mut self, cmd: HttpFrontendCmd) -> anyhow::Result<()> { + pub fn http_frontend_command(&mut self, cmd: HttpFrontendCmd) -> Result<(), CtlError> { match cmd { HttpFrontendCmd::Add { hostname, @@ -274,7 +275,7 @@ impl CommandManager { } } - pub fn https_frontend_command(&mut self, cmd: HttpFrontendCmd) -> anyhow::Result<()> { + pub fn https_frontend_command(&mut self, cmd: HttpFrontendCmd) -> Result<(), CtlError> { match cmd { HttpFrontendCmd::Add { hostname, @@ -322,7 +323,7 @@ impl CommandManager { } } - pub fn https_listener_command(&mut self, cmd: HttpsListenerCmd) -> anyhow::Result<()> { + pub fn https_listener_command(&mut self, cmd: HttpsListenerCmd) -> Result<(), CtlError> { match cmd { HttpsListenerCmd::Add { address, @@ -351,7 +352,7 @@ impl CommandManager { .with_request_timeout(request_timeout) .with_connect_timeout(connect_timeout) .to_tls(Some(&self.config)) - .with_context(|| "Error creating HTTPS listener")?; + .map_err(CtlError::CreateListener)?; self.send_request(RequestType::AddHttpsListener(https_listener).into()) } @@ -367,7 +368,7 @@ impl CommandManager { } } - pub fn http_listener_command(&mut self, cmd: HttpListenerCmd) -> anyhow::Result<()> { + pub fn http_listener_command(&mut self, cmd: HttpListenerCmd) -> Result<(), CtlError> { match cmd { HttpListenerCmd::Add { address, @@ -392,7 +393,8 @@ impl CommandManager { .with_back_timeout(back_timeout) .with_connect_timeout(connect_timeout) .to_http(Some(&self.config)) - .with_context(|| "Error creating HTTP listener")?; + .map_err(CtlError::CreateListener)?; + self.send_request(RequestType::AddHttpListener(http_listener).into()) } HttpListenerCmd::Remove { address } => { @@ -407,7 +409,7 @@ impl CommandManager { } } - pub fn tcp_listener_command(&mut self, cmd: TcpListenerCmd) -> anyhow::Result<()> { + pub fn tcp_listener_command(&mut self, cmd: TcpListenerCmd) -> Result<(), CtlError> { match cmd { TcpListenerCmd::Add { address, @@ -418,7 +420,7 @@ impl CommandManager { .with_public_address(public_address) .with_expect_proxy(expect_proxy) .to_tcp(Some(&self.config)) - .with_context(|| "Could not create TCP listener")?; + .map_err(CtlError::CreateListener)?; self.send_request(RequestType::AddTcpListener(listener).into()) } @@ -434,7 +436,7 @@ impl CommandManager { } } - pub fn list_listeners(&mut self) -> anyhow::Result<()> { + pub fn list_listeners(&mut self) -> Result<(), CtlError> { self.send_request(RequestType::ListListeners(ListListeners {}).into()) } @@ -442,10 +444,13 @@ impl CommandManager { &mut self, address: String, listener_type: ListenerType, - ) -> anyhow::Result<()> { + ) -> Result<(), CtlError> { + let address = parse_socket_address(&address) + .map_err(|util_err| CtlError::WrongAddress(address, util_err))?; + self.send_request( RequestType::RemoveListener(RemoveListener { - address: address.parse().with_context(|| "wrong socket address")?, + address: address.to_string(), proxy: 
listener_type.into(), }) .into(), @@ -456,10 +461,13 @@ impl CommandManager { &mut self, address: String, listener_type: ListenerType, - ) -> anyhow::Result<()> { + ) -> Result<(), CtlError> { + let address = parse_socket_address(&address) + .map_err(|util_err| CtlError::WrongAddress(address, util_err))?; + self.send_request( RequestType::ActivateListener(ActivateListener { - address: address.parse().with_context(|| "wrong socket address")?, + address: address.to_string(), proxy: listener_type.into(), from_scm: false, }) @@ -471,10 +479,13 @@ impl CommandManager { &mut self, address: String, listener_type: ListenerType, - ) -> anyhow::Result<()> { + ) -> Result<(), CtlError> { + let address = parse_socket_address(&address) + .map_err(|util_err| CtlError::WrongAddress(address, util_err))?; + self.send_request( RequestType::DeactivateListener(DeactivateListener { - address: address.parse().with_context(|| "wrong socket address")?, + address: address.to_string(), proxy: listener_type.into(), to_scm: false, }) @@ -482,7 +493,7 @@ impl CommandManager { ) } - pub fn logging_filter(&mut self, filter: &LoggingLevel) -> anyhow::Result<()> { + pub fn logging_filter(&mut self, filter: &LoggingLevel) -> Result<(), CtlError> { self.send_request(RequestType::Logging(filter.to_string().to_lowercase()).into()) } @@ -493,7 +504,7 @@ impl CommandManager { certificate_chain_path: &str, key_path: &str, versions: Vec, - ) -> anyhow::Result<()> { + ) -> Result<(), CtlError> { let new_certificate = load_full_certificate( certificate_path, certificate_chain_path, @@ -501,7 +512,7 @@ impl CommandManager { versions, vec![], ) - .with_context(|| "Could not load the full certificate")?; + .map_err(CtlError::LoadCertificate)?; self.send_request( RequestType::AddCertificate(AddCertificate { @@ -523,18 +534,21 @@ impl CommandManager { old_certificate_path: Option<&str>, old_fingerprint: Option<&str>, versions: Vec, - ) -> anyhow::Result<()> { + ) -> Result<(), CtlError> { let old_fingerprint = match (old_certificate_path, old_fingerprint) { (None, None) | (Some(_), Some(_)) => { - bail!("Error: Please provide either one, the old certificate's path OR its fingerprint") + return Err(CtlError::ArgsNeeded( + "the path to the old certificate".to_string(), + "the path to the old fingerprint".to_string(), + )) } (Some(old_certificate_path), None) => { - get_fingerprint_from_certificate_path(old_certificate_path).with_context(|| { - "Could not retrieve the fingerprint from the given certificate path" - })? + get_fingerprint_from_certificate_path(old_certificate_path) + .map_err(CtlError::GetFingerprint)? + } + (None, Some(fingerprint)) => { + decode_fingerprint(fingerprint).map_err(CtlError::DecodeFingerprint)? 
} - (None, Some(fingerprint)) => decode_fingerprint(fingerprint) - .with_context(|| "Error decoding the given fingerprint")?, }; let new_certificate = load_full_certificate( @@ -544,7 +558,7 @@ impl CommandManager { versions, vec![], ) - .with_context(|| "Could not load the full certificate")?; + .map_err(CtlError::LoadCertificate)?; self.send_request( RequestType::ReplaceCertificate(ReplaceCertificate { @@ -564,18 +578,21 @@ impl CommandManager { address: String, certificate_path: Option<&str>, fingerprint: Option<&str>, - ) -> anyhow::Result<()> { + ) -> Result<(), CtlError> { let fingerprint = match (certificate_path, fingerprint) { (None, None) | (Some(_), Some(_)) => { - bail!("Error: Please provide either one, the path OR the fingerprint of the certificate") + return Err(CtlError::ArgsNeeded( + "the path to the certificate".to_string(), + "the fingerprint of the certificate".to_string(), + )) } (Some(certificate_path), None) => { - get_fingerprint_from_certificate_path(certificate_path).with_context(|| { - "Could not retrieve the finger print from the given certificate path" - })? + get_fingerprint_from_certificate_path(certificate_path) + .map_err(CtlError::GetFingerprint)? + } + (None, Some(fingerprint)) => { + decode_fingerprint(fingerprint).map_err(CtlError::DecodeFingerprint)? } - (None, Some(fingerprint)) => decode_fingerprint(fingerprint) - .with_context(|| "Error decoding the given fingerprint")?, }; self.send_request( @@ -592,7 +609,7 @@ impl CommandManager { fingerprint: Option, domain: Option, query_workers: bool, - ) -> Result<(), anyhow::Error> { + ) -> Result<(), CtlError> { let filters = QueryCertificatesFilters { domain, fingerprint, @@ -604,4 +621,9 @@ impl CommandManager { self.send_request(RequestType::QueryCertificatesFromTheState(filters).into()) } } + + pub fn upgrade_worker(&mut self, worker_id: u32) -> Result<(), CtlError> { + debug!("upgrading worker {}", worker_id); + self.send_request(RequestType::UpgradeWorker(worker_id).into()) + } } diff --git a/bin/src/main.rs b/bin/src/main.rs index 77969aa0b..cbbce85ad 100644 --- a/bin/src/main.rs +++ b/bin/src/main.rs @@ -37,40 +37,48 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; /// the arguments to the sozu command line mod cli; /// Receives orders from the CLI, transmits to workers +// mod command; mod command; /// The command line logic mod ctl; /// Forking & restarting the main process using a more recent executable of Sōzu mod upgrade; /// Some unix helper functions -mod util; +pub mod util; /// Start and restart the worker UNIX processes mod worker; -#[cfg(target_os = "linux")] -use anyhow::bail; -use anyhow::Context; -use cli::Args; +use std::panic; + #[cfg(target_os = "linux")] use libc::{cpu_set_t, pid_t}; -#[cfg(target_os = "linux")] -use regex::Regex; + use sozu::metrics::METRICS; -use sozu_command_lib::{config::Config, logging::setup_logging_with_config}; -use std::panic; -use crate::worker::{get_executable_path, start_workers, Worker}; +use cli::Args; +use command::{begin_main_process, sessions::WorkerSession, StartError}; +use ctl::CtlError; +use upgrade::UpgradeError; +use worker::WorkerError; + +#[derive(thiserror::Error, Debug)] +enum MainError { + #[error("failed to start Sōzu: {0}")] + StartMain(StartError), + #[error("failed to start new worker: {0}")] + BeginWorker(WorkerError), + #[error("failed to start new main process: {0}")] + BeginNewMain(UpgradeError), + #[error("{0}")] + Cli(CtlError), +} #[paw::main] -fn main(args: Args) -> anyhow::Result<()> { +fn main(args: Args) { 
    register_panic_hook();

-    match args.cmd {
-        cli::SubCmd::Start => {
-            start(&args)?;
-            info!("main process stopped");
-            Ok(())
-        }
+    let result = match args.cmd {
+        cli::SubCmd::Start => begin_main_process(&args).map_err(MainError::StartMain),
         // this is used only by the CLI when upgrading
         cli::SubCmd::Worker {
             fd: worker_to_main_channel_fd,
@@ -90,6 +98,7 @@ fn main(args: Args) -> anyhow::Result<()> {
                 command_buffer_size,
                 max_command_buffer_size,
             )
+            .map_err(MainError::BeginWorker)
         }
         // this is used only by the CLI when upgrading
         cli::SubCmd::Main {
@@ -106,61 +115,22 @@ fn main(args: Args) -> anyhow::Result<()> {
                 command_buffer_size,
                 max_command_buffer_size,
             )
+            .map_err(MainError::BeginNewMain)
         }
-        _ => ctl::ctl(args),
-    }
-}
-
-fn start(args: &cli::Args) -> Result<(), anyhow::Error> {
-    let config_file_path = get_config_file_path(args)?;
-    let config = load_configuration(config_file_path)?;
-
-    setup_logging_with_config(&config, "MAIN");
-    info!("Starting up");
-    util::setup_metrics(&config).with_context(|| "Could not setup metrics")?;
-    util::write_pid_file(&config).with_context(|| "PID file is not writeable")?;
-
-    update_process_limits(&config)?;
-
-    let executable_path =
-        unsafe { get_executable_path().with_context(|| "Could not get executable path")? };
-    let workers =
-        start_workers(executable_path, &config).with_context(|| "Failed at spawning workers")?;
-
-    if config.handle_process_affinity {
-        set_workers_affinity(&workers);
-    }
-
-    let command_socket_path = config.command_socket_path()?;
-
-    command::start_server(config, command_socket_path, workers)
-        .with_context(|| "could not start Sozu")?;
-
-    Ok(())
-}
-
-pub fn get_config_file_path(args: &cli::Args) -> Result<&str, anyhow::Error> {
-    match args.config.as_ref() {
-        Some(config_file) => Ok(config_file.as_str()),
-        None => option_env!("SOZU_CONFIG").ok_or_else(|| {
-            anyhow::Error::msg(
-                "Configuration file hasn't been specified. Either use -c with the start command \
-                or use the SOZU_CONFIG environment variable when building sozu.",
-            )
-        }),
+        _ => ctl::ctl(args).map_err(MainError::Cli),
+    };
+    match result {
+        Ok(_) => {}
+        Err(main_error) => println!("{}", main_error),
     }
 }

-pub fn load_configuration(config_file: &str) -> Result<Config, anyhow::Error> {
-    Config::load_from_path(config_file).with_context(|| "Invalid configuration file.")
-}
-
 /// Set workers process affinity, see man sched_setaffinity
 /// Bind each worker (including the main) process to a CPU core.
 /// Can bind multiple processes to a CPU core if there are more processes
 /// than CPU cores. Only works on Linux.
 #[cfg(target_os = "linux")]
-fn set_workers_affinity(workers: &Vec<Worker>) {
+fn set_workers_affinity(workers: &Vec<WorkerSession>) {
     let mut cpu_count = 0;
     let max_cpu = num_cpus::get();

@@ -210,80 +180,6 @@ fn set_process_affinity(pid: pid_t, cpu: usize) {
     };
 }

-#[cfg(target_os = "linux")]
-// We check the hard_limit. The soft_limit can be changed at runtime
-// by the process or any user.
hard_limit can only be changed by root -fn update_process_limits(config: &Config) -> Result<(), anyhow::Error> { - let wanted_opened_files = (config.max_connections as u64) * 2; - - // Ensure we don't exceed the system maximum capacity - let f = Config::load_file("/proc/sys/fs/file-max") - .with_context(|| "Couldn't read /proc/sys/fs/file-max")?; - let re_max = Regex::new(r"(\d*)")?; - let system_max_fd = re_max - .captures(&f) - .and_then(|c| c.get(1)) - .and_then(|m| m.as_str().parse::().ok()) - .with_context(|| "Couldn't parse /proc/sys/fs/file-max")?; - if config.max_connections > system_max_fd { - error!( - "Proxies total max_connections can't be higher than system's file-max limit. \ - Current limit: {}, current value: {}", - system_max_fd, config.max_connections - ); - bail!("Too many allowed connections"); - } - - // Get the soft and hard limits for the current process - let mut limits = libc::rlimit { - rlim_cur: 0, - rlim_max: 0, - }; - unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut limits) }; - - // Ensure we don't exceed the hard limit - if limits.rlim_max < wanted_opened_files { - error!( - "at least one worker can't have that many connections. \ - current max file descriptor hard limit is: {}, \ - configured max_connections is {} (the worker needs two file descriptors \ - per client connection)", - limits.rlim_max, config.max_connections - ); - bail!("Too many allowed connection for a worker"); - } - - if limits.rlim_cur < wanted_opened_files && limits.rlim_cur != limits.rlim_max { - // Try to get twice what we need to be safe, or rlim_max if we exceed that - limits.rlim_cur = limits.rlim_max.min(wanted_opened_files * 2); - unsafe { - libc::setrlimit(libc::RLIMIT_NOFILE, &limits); - - // Refresh the data we have - libc::getrlimit(libc::RLIMIT_NOFILE, &mut limits); - } - } - - // Ensure we don't exceed the new soft limit - if limits.rlim_cur < wanted_opened_files { - error!( - "at least one worker can't have that many connections. 
\
-            current max file descriptor soft limit is: {}, \
-            configured max_connections is {} (the worker needs two file descriptors \
-            per client connection)",
-            limits.rlim_cur, config.max_connections
-        );
-        bail!("Too many allowed connection for a worker");
-    }
-
-    Ok(())
-}
-
-#[cfg(not(target_os = "linux"))]
-fn update_process_limits(_: &Config) -> Result<(), anyhow::Error> {
-    Ok(())
-}
-
 fn register_panic_hook() {
     // We save the original panic hook so we can call it later
     // to have the original behavior
diff --git a/bin/src/upgrade.rs b/bin/src/upgrade.rs
index 1b05a6639..e1ccc1d59 100644
--- a/bin/src/upgrade.rs
+++ b/bin/src/upgrade.rs
@@ -1,60 +1,74 @@
 use std::{
     fs::File,
-    io::Seek,
+    io::{Error as IoError, Write},
+    io::{Read, Seek},
     os::unix::io::{AsRawFd, FromRawFd},
     os::unix::process::CommandExt,
     process::Command,
 };

-use anyhow::{bail, Context};
-use futures_lite::future;
 use libc::{self, pid_t};
 use mio::net::UnixStream;
-use nix::unistd::{fork, ForkResult};
-use serde::{Deserialize, Serialize};
-
+use nix::{
+    errno::Errno,
+    unistd::{fork, ForkResult},
+};
+use serde_json::Error as SerdeError;
 use tempfile::tempfile;

 use sozu_command_lib::{
-    channel::Channel, config::Config, logging::setup_logging_with_config, proto::command::RunState,
-    request::WorkerRequest, state::ConfigState,
+    channel::{Channel, ChannelError},
+    logging::setup_logging_with_config,
 };

-use crate::{command::CommandServer, util, worker::Worker};
-
-#[derive(Deserialize, Serialize, Debug)]
-pub struct SerializedWorker {
-    pub fd: i32,
-    pub pid: i32,
-    pub id: u32,
-    pub run_state: RunState,
-    pub queue: Vec<WorkerRequest>,
-    pub scm: i32,
-}
-
-impl SerializedWorker {
-    pub fn from_worker(worker: &Worker) -> SerializedWorker {
-        SerializedWorker {
-            fd: worker.worker_channel_fd,
-            pid: worker.pid,
-            id: worker.id,
-            run_state: worker.run_state,
-            queue: worker.queue.clone().into(),
-            scm: worker.scm_socket.raw_fd(),
-        }
-    }
-}
+use crate::{
+    command::{
+        server::{CommandHub, HubError, ServerError},
+        upgrade::UpgradeData,
+    },
+    util::{self, UtilError},
+};

-/// the data needed to start a new main process
-#[derive(Deserialize, Serialize, Debug)]
-pub struct UpgradeData {
-    /// file descriptor of the unix command socket
-    pub command_socket_fd: i32,
-    pub config: Config,
-    /// JSON serialized workers
-    pub workers: Vec<SerializedWorker>,
-    pub state: ConfigState,
-    pub next_id: u32,
+#[derive(thiserror::Error, Debug)]
+pub enum UpgradeError {
+    #[error("could not create temporary state file for the upgrade: {0}")]
+    CreateUpgradeFile(IoError),
+    #[error("could not disable cloexec on {fd_name}'s file descriptor: {util_err}")]
+    DisableCloexec {
+        fd_name: String,
+        util_err: UtilError,
+    },
+    #[error("could not create MIO pair of unix stream: {0}")]
+    CreateUnixStream(IoError),
+    #[error("could not rewind the temporary upgrade file: {0}")]
+    Rewind(IoError),
+    #[error("could not serialize upgrade data to JSON: {0}")]
+    SerdeWriteError(SerdeError),
+    #[error("could not write upgrade data to temporary file: {0}")]
+    WriteFile(IoError),
+    #[error("could not read upgrade data from file: {0}")]
+    ReadFile(IoError),
+    #[error("could not deserialize upgrade data from file: {0}")]
+    SerdeReadError(SerdeError),
+    #[error("unix fork failed: {0}")]
+    Fork(Errno),
+    #[error("failed to set metrics on the new main process: {0}")]
+    SetupMetrics(UtilError),
+    #[error("could not write PID file of new main process: {0}")]
+    WritePidFile(UtilError),
+    #[error(
+        "the channel failed to send confirmation of upgrade {result} to the old main process:
{channel_err}" + )] + SendConfirmation { + result: String, + channel_err: ChannelError, + }, + #[error("Could not block the fork confirmation channel: {0}. This is not normal, you may need to restart sozu")] + BlockChannel(ChannelError), + #[error("could not create a command hub from the upgrade data: {0}")] + CreateHub(HubError), + #[error("could not enable cloexec after upgrade: {0}")] + EnableCloexec(ServerError), } /// unix-forks the main process @@ -66,23 +80,34 @@ pub struct UpgradeData { pub fn fork_main_into_new_main( executable_path: String, upgrade_data: UpgradeData, -) -> Result<(pid_t, Channel<(), bool>), anyhow::Error> { +) -> Result<(pid_t, Channel<(), bool>), UpgradeError> { trace!("parent({})", unsafe { libc::getpid() }); - let mut upgrade_file = - tempfile().with_context(|| "could not create temporary file for upgrade")?; + let mut upgrade_file = tempfile().map_err(UpgradeError::CreateUpgradeFile)?; - util::disable_close_on_exec(upgrade_file.as_raw_fd())?; + util::disable_close_on_exec(upgrade_file.as_raw_fd()).map_err(|util_err| { + UpgradeError::DisableCloexec { + fd_name: "upgrade-file".to_string(), + util_err, + } + })?; - serde_json::to_writer(&mut upgrade_file, &upgrade_data) - .with_context(|| "could not write upgrade data to temporary file")?; + info!("Writing upgrade data to file"); + let upgrade_data_string = + serde_json::to_string(&upgrade_data).map_err(UpgradeError::SerdeWriteError)?; upgrade_file - .rewind() - .with_context(|| "could not seek to beginning of file")?; + .write_all(upgrade_data_string.as_bytes()) + .map_err(UpgradeError::WriteFile)?; + upgrade_file.rewind().map_err(UpgradeError::Rewind)?; - let (old_to_new, new_to_old) = UnixStream::pair()?; + let (old_to_new, new_to_old) = UnixStream::pair().map_err(UpgradeError::CreateUnixStream)?; - util::disable_close_on_exec(new_to_old.as_raw_fd())?; + util::disable_close_on_exec(new_to_old.as_raw_fd()).map_err(|util_err| { + UpgradeError::DisableCloexec { + fd_name: "new-main-to-old-main-channel".to_string(), + util_err, + } + })?; let mut fork_confirmation_channel: Channel<(), bool> = Channel::new( old_to_new, @@ -90,24 +115,14 @@ pub fn fork_main_into_new_main( upgrade_data.config.max_command_buffer_size, ); - if let Err(e) = fork_confirmation_channel.blocking() { - error!( - "Could not block the fork confirmation channel: {}. This is not normal, you may need to restart sozu", - e - ); - } + fork_confirmation_channel + .blocking() + .map_err(UpgradeError::BlockChannel)?; info!("launching new main"); - match unsafe { fork().with_context(|| "fork failed")? } { + match unsafe { fork().map_err(UpgradeError::Fork)? } { ForkResult::Parent { child } => { - info!("main launched: {}", child); - - if let Err(e) = fork_confirmation_channel.nonblocking() { - error!( - "Could not unblock the fork confirmation channel: {}. 
This is not normal, you may need to restart sozu", - e - ); - } + info!("new main launched, with pid {}", child); Ok((child.into(), fork_confirmation_channel)) } @@ -138,7 +153,7 @@ pub fn begin_new_main_process( upgrade_file_fd: i32, command_buffer_size: usize, max_command_buffer_size: usize, -) -> anyhow::Result<()> { +) -> Result<(), UpgradeError> { let mut fork_confirmation_channel: Channel = Channel::new( unsafe { UnixStream::from_raw_fd(new_to_old_channel_fd) }, command_buffer_size, @@ -150,36 +165,43 @@ pub fn begin_new_main_process( error!("Could not block the fork confirmation channel: {}", e); } - let upgrade_file = unsafe { File::from_raw_fd(upgrade_file_fd) }; + println!("reading upgrade data from file"); + + let mut upgrade_file = unsafe { File::from_raw_fd(upgrade_file_fd) }; + let mut content = String::new(); + let _ = upgrade_file + .read_to_string(&mut content) + .map_err(UpgradeError::ReadFile)?; + let upgrade_data: UpgradeData = - serde_json::from_reader(upgrade_file).with_context(|| "could not parse upgrade data")?; + serde_json::from_str(&content).map_err(UpgradeError::SerdeReadError)?; + let config = upgrade_data.config.clone(); + println!("Setting up logging"); + setup_logging_with_config(&config, "MAIN"); - util::setup_metrics(&config).with_context(|| "Could not setup metrics")?; - //info!("new main got upgrade data: {:?}", upgrade_data); + util::setup_metrics(&config).map_err(UpgradeError::SetupMetrics)?; + + let mut command_hub = + CommandHub::from_upgrade_data(upgrade_data).map_err(UpgradeError::CreateHub)?; + + command_hub + .enable_cloexec_after_upgrade() + .map_err(UpgradeError::EnableCloexec)?; + + util::write_pid_file(&config).map_err(UpgradeError::WritePidFile)?; + + fork_confirmation_channel + .write_message(&true) + .map_err(|channel_err| UpgradeError::SendConfirmation { + result: "success".to_string(), + channel_err, + })?; - let mut server = CommandServer::from_upgrade_data(upgrade_data)?; - server.enable_cloexec_after_upgrade()?; info!("starting new main loop"); - match util::write_pid_file(&config) { - Ok(()) => { - fork_confirmation_channel - .write_message(&true) - .with_context(|| "Could not send confirmation of fork using the channel")?; - future::block_on(async { - server.run().await; - }); - info!("main process stopped"); - Ok(()) - } - Err(e) => { - fork_confirmation_channel - .write_message(&false) - .with_context(|| "Could not send fork failure message using the channel")?; - error!("Couldn't write PID file. 
Error: {:?}", e); - error!("Couldn't upgrade main process"); - bail!("begin_new_main_process() failed"); - } - } + command_hub.run(); + + info!("main process stopped"); + Ok(()) } diff --git a/bin/src/util.rs b/bin/src/util.rs index 1ddde32a7..5f1c09ab3 100644 --- a/bin/src/util.rs +++ b/bin/src/util.rs @@ -1,64 +1,185 @@ -use std::{fs::File, io::Write, os::unix::io::RawFd}; +use std::{ + ffi::OsString, + fs::{read_link, File}, + io::{Error as IoError, Write}, + net::{AddrParseError, SocketAddr}, + os::unix::io::RawFd, + path::PathBuf, +}; + +use nix::{ + errno::Errno, + fcntl::{fcntl, FcntlArg, FdFlag}, +}; +use thiserror; -use anyhow::Context; - -use nix::fcntl::{fcntl, FcntlArg, FdFlag}; - -use sozu::metrics; use sozu_command_lib::config::Config; +use sozu_lib::metrics::{self, MetricError}; + +use crate::cli; + +#[derive(thiserror::Error, Debug)] +pub enum UtilError { + #[error("could not get flags (F_GETFD) on file descriptor {0}: {1}")] + GetFlags(RawFd, Errno), + #[error("could not convert flags for file descriptor {0}")] + ConvertFlags(RawFd), + #[error("could not set flags for file descriptor {0}: {1}")] + SetFlags(RawFd, Errno), + #[error("could not create pid file {0}: {1}")] + CreatePidFile(String, IoError), + #[error("could not write pid file {0}: {1}")] + WritePidFile(String, IoError), + #[error("could not sync pid file {0}: {1}")] + SyncPidFile(String, IoError), + #[error("Failed to convert PathBuf {0} to String: {1:?}")] + OsString(PathBuf, OsString), + #[error("could not read file {0}: {1}")] + Read(String, IoError), + #[error("failed to retrieve current executable path: {0}")] + CurrentExe(IoError), + #[error("could not setup metrics: {0}")] + SetupMetrics(MetricError), + #[error( + "Configuration file hasn't been specified. Either use -c with the start command, + or use the SOZU_CONFIG environment variable when building sozu." 
+    )]
+    GetConfigFilePath,
+    #[error("could not parse socket address: {0}")]
+    ParseSocketAddress(AddrParseError),
 }

-pub fn enable_close_on_exec(fd: RawFd) -> Result<i32, anyhow::Error> {
+/// Set the FD_CLOEXEC flag back on a file descriptor, so that it is closed
+/// automatically on exec(); used after the flag was disabled to pass
+/// the descriptor to a child process
+pub fn enable_close_on_exec(fd: RawFd) -> Result<i32, UtilError> {
     let file_descriptor =
-        fcntl(fd, FcntlArg::F_GETFD).with_context(|| "could not get file descriptor flags")?;
+        fcntl(fd, FcntlArg::F_GETFD).map_err(|err_no| UtilError::GetFlags(fd, err_no))?;

-    let mut new_flags = FdFlag::from_bits(file_descriptor)
-        .ok_or_else(|| anyhow::format_err!("could not convert flags for file descriptor"))?;
+    let mut new_flags = FdFlag::from_bits(file_descriptor).ok_or(UtilError::ConvertFlags(fd))?;

     new_flags.insert(FdFlag::FD_CLOEXEC);

-    fcntl(fd, FcntlArg::F_SETFD(new_flags)).with_context(|| "could not set file descriptor flags")
+    fcntl(fd, FcntlArg::F_SETFD(new_flags)).map_err(|err_no| UtilError::SetFlags(fd, err_no))
 }

 /// FD_CLOEXEC is set by default on every fd in Rust standard lib,
 /// so we need to remove the flag on the client, otherwise
 /// it won't be accessible
-pub fn disable_close_on_exec(fd: RawFd) -> Result<i32, anyhow::Error> {
+pub fn disable_close_on_exec(fd: RawFd) -> Result<i32, UtilError> {
     let old_flags =
-        fcntl(fd, FcntlArg::F_GETFD).with_context(|| "could not get file descriptor flags")?;
+        fcntl(fd, FcntlArg::F_GETFD).map_err(|err_no| UtilError::GetFlags(fd, err_no))?;

-    let mut new_flags = FdFlag::from_bits(old_flags)
-        .ok_or_else(|| anyhow::format_err!("could not convert flags for file descriptor"))?;
+    let mut new_flags = FdFlag::from_bits(old_flags).ok_or(UtilError::ConvertFlags(fd))?;

     new_flags.remove(FdFlag::FD_CLOEXEC);

-    fcntl(fd, FcntlArg::F_SETFD(new_flags)).with_context(|| "could not set file descriptor flags")
+    fcntl(fd, FcntlArg::F_SETFD(new_flags)).map_err(|err_no| UtilError::SetFlags(fd, err_no))
 }

-pub fn setup_metrics(config: &Config) -> anyhow::Result<()> {
+pub fn setup_metrics(config: &Config) -> Result<(), UtilError> {
     if let Some(metrics) = config.metrics.as_ref() {
-        return Ok(metrics::setup(
+        return metrics::setup(
             &metrics.address,
             "MAIN",
             metrics.tagged_metrics,
             metrics.prefix.clone(),
-        )?);
+        )
+        .map_err(UtilError::SetupMetrics);
     }
     Ok(())
 }

-pub fn write_pid_file(config: &Config) -> Result<(), anyhow::Error> {
+pub fn write_pid_file(config: &Config) -> Result<(), UtilError> {
     let pid_file_path: Option<&str> = config
         .pid_file_path
         .as_ref()
         .map(|pid_file_path| pid_file_path.as_ref());

-    if let Some(pid_file_path) = pid_file_path {
-        let mut file = File::create(pid_file_path)?;
+    if let Some(path) = pid_file_path {
+        let mut file = File::create(path)
+            .map_err(|io_err| UtilError::CreatePidFile(path.to_owned(), io_err))?;

         let pid = unsafe { libc::getpid() };

-        file.write_all(format!("{pid}").as_bytes())?;
-        file.sync_all()?;
+        file.write_all(format!("{pid}").as_bytes())
+            .map_err(|write_err| UtilError::WritePidFile(path.to_owned(), write_err))?;
+        file.sync_all()
+            .map_err(|sync_err| UtilError::SyncPidFile(path.to_owned(), sync_err))?;
     }

     Ok(())
 }
+
+pub fn get_config_file_path(args: &cli::Args) -> Result<&str, UtilError> {
+    match args.config.as_ref() {
+        Some(config_file) => Ok(config_file.as_str()),
+        None => option_env!("SOZU_CONFIG").ok_or(UtilError::GetConfigFilePath),
+    }
+}
+
+pub fn parse_socket_address(address: &str) -> Result<SocketAddr, UtilError> {
+    address
+        .parse::<SocketAddr>()
+        .map_err(UtilError::ParseSocketAddress)
+}
+
+#[cfg(target_os = "freebsd")]
+pub unsafe fn
get_executable_path() -> Result { + let mut capacity = PATH_MAX as usize; + let mut path: Vec = Vec::with_capacity(capacity); + path.extend(repeat(0).take(capacity)); + + let mib: Vec = vec![CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1]; + let len = mib.len() * size_of::(); + let element_size = size_of::(); + + let res = sysctl( + mib.as_ptr(), + (len / element_size) as u32, + path.as_mut_ptr() as *mut c_void, + &mut capacity, + std::ptr::null() as *const c_void, + 0, + ); + if res != 0 { + panic!("Could not retrieve the path of the executable"); + } + + Ok(String::from_raw_parts( + path.as_mut_ptr(), + capacity - 1, + path.len(), + )) +} + +#[cfg(target_os = "linux")] +pub unsafe fn get_executable_path() -> Result { + let path = read_link("/proc/self/exe") + .map_err(|io_err| UtilError::Read("/proc/self/exe".to_string(), io_err))?; + + let mut path_str = path + .clone() + .into_os_string() + .into_string() + .map_err(|string_err| UtilError::OsString(path, string_err))?; + + if path_str.ends_with(" (deleted)") { + // The kernel appends " (deleted)" to the symlink when the original executable has been replaced + let len = path_str.len(); + path_str.truncate(len - 10) + } + + Ok(path_str) +} + +#[cfg(target_os = "macos")] +extern "C" { + pub fn _NSGetExecutablePath(buf: *mut c_char, size: *mut u32) -> i32; +} + +#[cfg(target_os = "macos")] +pub unsafe fn get_executable_path() -> Result { + let path = std::env::current_exe().map_err(|io_err| UtilError::CurrentExe(io_err))?; + + Ok(path.to_string_lossy().to_string()) +} diff --git a/bin/src/worker.rs b/bin/src/worker.rs index 2c3a57c27..a403c9897 100644 --- a/bin/src/worker.rs +++ b/bin/src/worker.rs @@ -1,207 +1,81 @@ +#[cfg(target_os = "freebsd")] +use std::{ffi::c_void, iter::repeat, mem::size_of}; use std::{ - collections::VecDeque, - fmt, fs::File, + io::Error as IoError, io::Seek, os::unix::io::{AsRawFd, FromRawFd, IntoRawFd}, os::unix::process::CommandExt, process::Command, }; -#[cfg(target_os = "freebsd")] -use std::{ffi::c_void, iter::repeat, mem::size_of}; -#[cfg(target_os = "linux")] -use anyhow::bail; -use anyhow::Context; -use futures::SinkExt; #[cfg(target_os = "macos")] use libc::c_char; use libc::{self, pid_t}; #[cfg(target_os = "freebsd")] use libc::{sysctl, CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, PATH_MAX}; use mio::net::UnixStream; -use nix::{self, unistd::*}; -use nix::{sys::signal::kill, unistd::Pid}; - +use nix::{ + self, + errno::Errno, + unistd::{fork, ForkResult}, +}; use tempfile::tempfile; -use sozu::{metrics, server::Server}; use sozu_command_lib::{ - channel::Channel, + channel::{Channel, ChannelError}, config::Config, logging::setup_logging_with_config, - proto::command::{request::RequestType, Request, RunState, Status, WorkerInfo}, ready::Ready, - request::{read_requests_from_file, WorkerRequest}, + request::{read_requests_from_file, RequestError, WorkerRequest}, response::WorkerResponse, - scm_socket::{Listeners, ScmSocket}, - state::ConfigState, + scm_socket::{Listeners, ScmSocket, ScmSocketError}, + state::{ConfigState, StateError}, +}; +use sozu_lib::{ + metrics::{self, MetricError}, + server::{Server, ServerError as LibServerError}, }; -use crate::util; - -/// An instance of Sōzu, as seen from the main process -pub struct Worker { - pub id: u32, - /// for the worker to receive requests and respond to the main process - pub worker_channel: Option>, - /// file descriptor of the command channel - pub worker_channel_fd: i32, - pub pid: pid_t, - pub run_state: RunState, - pub queue: VecDeque, - /// Used to send 
and receive listeners (socket addresses and file descriptors) - pub scm_socket: ScmSocket, - /// Used to send proxyrequests to the worker loop - pub sender: Option>, -} - -impl Worker { - pub fn new( - id: u32, - pid: pid_t, - command_channel: Channel, - scm_socket: ScmSocket, - _: &Config, - ) -> Worker { - Worker { - id, - worker_channel_fd: command_channel.sock.as_raw_fd(), - worker_channel: Some(command_channel), - sender: None, - pid, - run_state: RunState::Running, - queue: VecDeque::new(), - scm_socket, - } - } - - /// send proxy request to the worker, via the mpsc sender - pub async fn send(&mut self, order_id: String, content: Request) { - if let Some(worker_tx) = self.sender.as_mut() { - if let Err(e) = worker_tx - .send(WorkerRequest { - id: order_id.clone(), - content, - }) - .await - { - error!( - "error sending message {} to worker {:?}: {:?}", - order_id, self.id, e - ); - } - } - } - - /// send a kill -0 to check on the pid, if it's dead it should be an error - pub fn the_pid_is_alive(&self) -> bool { - kill(Pid::from_raw(self.pid), None).is_ok() - } - - /// get info about a worker, with a NotAnswering run state by default, - /// to be updated when the worker responds - pub fn querying_info(&self) -> WorkerInfo { - let run_state = match self.run_state { - RunState::Stopping => RunState::Stopping, - RunState::Stopped => RunState::Stopped, - RunState::Running | RunState::NotAnswering => RunState::NotAnswering, - }; - WorkerInfo { - id: self.id, - pid: self.pid, - run_state: run_state as i32, - } - } - - pub fn is_active(&self) -> bool { - self.run_state != RunState::Stopping && self.run_state != RunState::Stopped - } - - /* - pub fn push_message(&mut self, message: ProxyRequest) { - self.queue.push_back(message); - self.channel.interest.insert(Ready::WRITABLE); - } - - pub fn can_handle_events(&self) -> bool { - self.channel.readiness().is_readable() || (!self.queue.is_empty() && self.channel.readiness().is_writable()) - }*/ -} - -impl fmt::Debug for Worker { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "Worker {{ id: {}, run_state: {:?} }}", - self.id, self.run_state - ) - } -} - -/// Called once at the beginning of the main process, this forks main into as many workers -pub fn start_workers(executable_path: String, config: &Config) -> anyhow::Result> { - let state = ConfigState::new(); - let mut workers = Vec::new(); - for index in 0..config.worker_count { - let listeners = Some(Listeners { - http: Vec::new(), - tls: Vec::new(), - tcp: Vec::new(), - }); - - let (pid, command_channel, scm_socket) = fork_main_into_worker( - &index.to_string(), - config, - executable_path.clone(), - &state, - listeners, - )?; - let mut worker = Worker::new(index as u32, pid, command_channel, scm_socket, config); - - // the new worker expects a status message at startup - if let Some(worker_channel) = worker.worker_channel.as_mut() { - if let Err(e) = worker_channel.blocking() { - error!("Could not block the worker channel: {}", e); - } - - worker_channel - .write_message(&WorkerRequest { - id: format!("start-status-{index}"), - content: RequestType::Status(Status {}).into(), - }) - .with_context(|| "Could not send status request to the worker")?; - - if let Err(e) = worker_channel.nonblocking() { - error!("Could not unblock the worker channel: {}", e); - } - } - - workers.push(worker); - } - info!("Created workers"); - Ok(workers) -} - -/// called by the CommandServer to start an individual worker -/// returns a handle of the worker, with channels to write to it 
-pub fn start_worker( - id: u32, - config: &Config, - executable_path: String, - state: &ConfigState, - listeners: Option, -) -> anyhow::Result { - let (worker_pid, main_to_worker_channel, main_to_worker_scm) = - fork_main_into_worker(&id.to_string(), config, executable_path, state, listeners)?; - - Ok(Worker::new( - id, - worker_pid, - main_to_worker_channel, - main_to_worker_scm, - config, - )) +use crate::util::{self, UtilError}; + +#[derive(thiserror::Error, Debug)] +pub enum WorkerError { + #[error("could not read on the channel")] + ReadChannel(ChannelError), + #[error("could not parse configuration from temporary file: {0}")] + ReadRequestsFromFile(RequestError), + #[error("could not setup metrics on new worker: {0}")] + SetupMetrics(MetricError), + #[error("could not create new worker from config: {0}")] + NewServerFromConfig(LibServerError), + #[error("could not create {kind} scm socket: {scm_err}")] + CreateScmSocket { + kind: String, + scm_err: ScmSocketError, + }, + #[error("could not create temporary file to pass the state to the new worker: {0}")] + CreateStateFile(IoError), + #[error("could not disable cloexec on {fd_name}'s file descriptor: {util_err}")] + DisableCloexec { + fd_name: String, + util_err: UtilError, + }, + #[error("could not write state to temporary file: {0}")] + WriteStateFile(StateError), + #[error("could not rewind the temporary state file: {0}")] + Rewind(IoError), + #[error("could not create MIO pair of unix stream: {0}")] + CreateUnixStream(IoError), + #[error("could not send config to the new worker: {0}")] + SendConfig(ChannelError), + #[error("unix fork failed: {0}")] + Fork(Errno), + #[error("Could not set the worker-to-main channel to {state}: {channel_err}")] + SetChannel { + state: String, + channel_err: ChannelError, + }, } /// called within a worker process, this starts the actual proxy @@ -212,22 +86,25 @@ pub fn begin_worker_process( id: i32, command_buffer_size: usize, max_command_buffer_size: usize, -) -> Result<(), anyhow::Error> { +) -> Result<(), WorkerError> { let mut worker_to_main_channel: Channel = Channel::new( unsafe { UnixStream::from_raw_fd(worker_to_main_channel_fd) }, command_buffer_size, max_command_buffer_size, ); - if let Err(e) = worker_to_main_channel.blocking() { - error!("Could not block the worker-to-main channel: {}", e); - } + worker_to_main_channel + .blocking() + .map_err(|channel_err| WorkerError::SetChannel { + state: "blocking".to_string(), + channel_err, + })?; let mut configuration_state_file = unsafe { File::from_raw_fd(configuration_state_fd) }; let worker_config = worker_to_main_channel .read_message() - .with_context(|| "worker could not read configuration from socket")?; + .map_err(WorkerError::ReadChannel)?; let worker_id = format!("{}-{:02}", "WRK", id); @@ -241,11 +118,14 @@ pub fn begin_worker_process( ); info!("worker {} starting...", id); let initial_state = read_requests_from_file(&mut configuration_state_file) - .with_context(|| "could not parse configuration state data")?; + .map_err(WorkerError::ReadRequestsFromFile)?; - if let Err(e) = worker_to_main_channel.nonblocking() { - error!("Could not unblock the worker-to-main channel: {}", e); - } + worker_to_main_channel + .nonblocking() + .map_err(|channel_err| WorkerError::SetChannel { + state: "nonblocking".to_string(), + channel_err, + })?; let mut worker_to_main_channel: Channel = worker_to_main_channel.into(); @@ -258,10 +138,14 @@ pub fn begin_worker_process( metrics.tagged_metrics, metrics.prefix.clone(), ) - .with_context(|| "Could not 
setup metrics")?; + .map_err(WorkerError::SetupMetrics)?; } - let worker_to_main_scm_socket = ScmSocket::new(worker_to_main_scm_fd) - .with_context(|| "could not create worker-to-main scm socket")?; + + let worker_to_main_scm_socket = + ScmSocket::new(worker_to_main_scm_fd).map_err(|scm_err| WorkerError::CreateScmSocket { + kind: "worker-to-main".to_string(), + scm_err, + })?; let mut server = Server::try_new_from_config( worker_to_main_channel, @@ -270,7 +154,7 @@ pub fn begin_worker_process( initial_state, true, ) - .with_context(|| "Could not create server from config")?; + .map_err(WorkerError::NewServerFromConfig)?; info!("starting event loop"); server.run(); @@ -280,7 +164,7 @@ pub fn begin_worker_process( /// unix-forks the main process /// -/// - Parent: sends config and listeners to the new worker +/// - Parent: sends config, state and listeners to the new worker /// - Child: calls the sozu executable path like so: `sozu worker --id [...]` /// /// returns the child process pid, and channels to talk to it. @@ -290,29 +174,48 @@ pub fn fork_main_into_worker( executable_path: String, state: &ConfigState, listeners: Option, -) -> anyhow::Result<(pid_t, Channel, ScmSocket)> { +) -> Result<(pid_t, Channel, ScmSocket), WorkerError> { trace!("parent({})", unsafe { libc::getpid() }); - let mut state_file = - tempfile().with_context(|| "could not create temporary file for configuration state")?; - util::disable_close_on_exec(state_file.as_raw_fd())?; + let mut state_file = tempfile().map_err(WorkerError::CreateStateFile)?; + util::disable_close_on_exec(state_file.as_raw_fd()).map_err(|util_err| { + WorkerError::DisableCloexec { + fd_name: "state_file".to_string(), + util_err, + } + })?; state .write_requests_to_file(&mut state_file) - .with_context(|| "Could not write state to file")?; + .map_err(WorkerError::WriteStateFile)?; - state_file - .rewind() - .with_context(|| "could not seek to beginning of file")?; + state_file.rewind().map_err(WorkerError::Rewind)?; - let (main_to_worker, worker_to_main) = UnixStream::pair()?; - let (main_to_worker_scm, worker_to_main_scm) = UnixStream::pair()?; + let (main_to_worker, worker_to_main) = + UnixStream::pair().map_err(WorkerError::CreateUnixStream)?; + let (main_to_worker_scm, worker_to_main_scm) = + UnixStream::pair().map_err(WorkerError::CreateUnixStream)?; - let main_to_worker_scm = ScmSocket::new(main_to_worker_scm.into_raw_fd()) - .with_context(|| "Could not create main-to-worker scm socket")?; + let main_to_worker_scm = + ScmSocket::new(main_to_worker_scm.into_raw_fd()).map_err(|scm_err| { + WorkerError::CreateScmSocket { + kind: "main-to-worker".to_string(), + scm_err, + } + })?; - util::disable_close_on_exec(worker_to_main.as_raw_fd())?; - util::disable_close_on_exec(worker_to_main_scm.as_raw_fd())?; + util::disable_close_on_exec(worker_to_main.as_raw_fd()).map_err(|util_err| { + WorkerError::DisableCloexec { + fd_name: "worker-to-main".to_string(), + util_err, + } + })?; + util::disable_close_on_exec(worker_to_main_scm.as_raw_fd()).map_err(|util_err| { + WorkerError::DisableCloexec { + fd_name: "worker-to-main-scm".to_string(), + util_err, + } + })?; let mut main_to_worker_channel: Channel = Channel::new( main_to_worker, @@ -325,18 +228,22 @@ pub fn fork_main_into_worker( error!("Could not block the main-to-worker channel: {}", e); } - info!("{} launching worker", worker_id); + info!("launching worker {}", worker_id); debug!("executable path is {}", executable_path); - match unsafe { fork() } { - Ok(ForkResult::Parent { child: worker_pid 
}) => { - info!("{} worker launched: {}", worker_id, worker_pid); + + match unsafe { fork().map_err(WorkerError::Fork)? } { + ForkResult::Parent { child: worker_pid } => { + info!("launching worker {} with pid {}", worker_id, worker_pid); main_to_worker_channel .write_message(config) - .with_context(|| "Could not send config to the new worker using the channel")?; + .map_err(WorkerError::SendConfig)?; - if let Err(e) = main_to_worker_channel.nonblocking() { - error!("Could not unblock the main-to-worker channel: {}", e); - } + main_to_worker_channel + .nonblocking() + .map_err(|channel_err| WorkerError::SetChannel { + state: "nonblocking".to_string(), + channel_err, + })?; if let Some(listeners) = listeners { info!("sending listeners to new worker: {:?}", listeners); @@ -344,7 +251,13 @@ pub fn fork_main_into_worker( info!("sent listeners from main: {:?}", result); listeners.close(); }; - util::disable_close_on_exec(main_to_worker_scm.fd)?; + + util::disable_close_on_exec(main_to_worker_scm.fd).map_err(|util_err| { + WorkerError::DisableCloexec { + fd_name: "main-to-worker-main-scm".to_string(), + util_err, + } + })?; Ok(( worker_pid.into(), @@ -352,7 +265,7 @@ pub fn fork_main_into_worker( main_to_worker_scm, )) } - Ok(ForkResult::Child) => { + ForkResult::Child => { trace!("child({}):\twill spawn a child", unsafe { libc::getpid() }); Command::new(executable_path) .arg("worker") @@ -372,70 +285,5 @@ pub fn fork_main_into_worker( unreachable!(); } - Err(e) => { - error!("Error during fork(): {}", e); - Err(anyhow::Error::from(e)) - } } } - -#[cfg(target_os = "linux")] -pub unsafe fn get_executable_path() -> anyhow::Result { - use std::fs; - - let path = fs::read_link("/proc/self/exe").with_context(|| "/proc/self/exe doesn't exist")?; - - let mut path_str = match path.into_os_string().into_string() { - Ok(s) => s, - Err(_) => bail!("Failed to convert PathBuf to String"), - }; - - if path_str.ends_with(" (deleted)") { - // The kernel appends " (deleted)" to the symlink when the original executable has been replaced - let len = path_str.len(); - path_str.truncate(len - 10) - } - - Ok(path_str) -} - -#[cfg(target_os = "macos")] -extern "C" { - pub fn _NSGetExecutablePath(buf: *mut c_char, size: *mut u32) -> i32; -} - -#[cfg(target_os = "macos")] -pub unsafe fn get_executable_path() -> anyhow::Result { - let path = - std::env::current_exe().with_context(|| "failed to retrieve current executable path")?; - Ok(path.to_string_lossy().to_string()) -} - -#[cfg(target_os = "freebsd")] -pub unsafe fn get_executable_path() -> anyhow::Result { - let mut capacity = PATH_MAX as usize; - let mut path: Vec = Vec::with_capacity(capacity); - path.extend(repeat(0).take(capacity)); - - let mib: Vec = vec![CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1]; - let len = mib.len() * size_of::(); - let element_size = size_of::(); - - let res = sysctl( - mib.as_ptr(), - (len / element_size) as u32, - path.as_mut_ptr() as *mut c_void, - &mut capacity, - std::ptr::null() as *const c_void, - 0, - ); - if res != 0 { - panic!("Could not retrieve the path of the executable"); - } - - Ok(String::from_raw_parts( - path.as_mut_ptr(), - capacity - 1, - path.len(), - )) -} diff --git a/command/src/certificate.rs b/command/src/certificate.rs index cc27f7808..c7697a5b4 100644 --- a/command/src/certificate.rs +++ b/command/src/certificate.rs @@ -214,8 +214,7 @@ pub fn get_fingerprint_from_certificate_path( } pub fn decode_fingerprint(fingerprint: &str) -> Result { - let bytes = - hex::decode(fingerprint).map_err(|hex_error| 
CertificateError::DecodeError(hex_error))?;
+    let bytes = hex::decode(fingerprint).map_err(CertificateError::DecodeError)?;
     Ok(Fingerprint(bytes))
 }

diff --git a/command/src/channel.rs b/command/src/channel.rs
index b85788041..d33c58b73 100644
--- a/command/src/channel.rs
+++ b/command/src/channel.rs
@@ -52,7 +52,7 @@ pub enum ChannelError {
 /// Used in pairs to communicate, in a blocking or non-blocking way.
 pub struct Channel<Tx, Rx> {
     pub sock: MioUnixStream,
-    front_buf: Buffer,
+    pub front_buf: Buffer,
     pub back_buf: Buffer,
     max_buffer_size: usize,
     pub readiness: Ready,
@@ -62,6 +62,24 @@ pub struct Channel<Tx, Rx> {
     phantom_rx: PhantomData<Rx>,
 }

+impl<Tx, Rx> std::fmt::Debug for Channel<Tx, Rx> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct(&format!(
+            "Channel<{}, {}>",
+            std::any::type_name::<Tx>(),
+            std::any::type_name::<Rx>()
+        ))
+        .field("sock", &self.sock.as_raw_fd())
+        // .field("front_buf", &self.front_buf)
+        // .field("back_buf", &self.back_buf)
+        // .field("max_buffer_size", &self.max_buffer_size)
+        .field("readiness", &self.readiness)
+        .field("interest", &self.interest)
+        .field("blocking", &self.blocking)
+        .finish()
+    }
+}
+
 impl<Tx: Debug + Serialize, Rx: Debug + DeserializeOwned> Channel<Tx, Rx> {
     /// Creates a nonblocking channel on a given socket path
     pub fn from_path(
@@ -187,6 +205,7 @@ impl<Tx: Debug + Serialize, Rx: Debug + DeserializeOwned> Channel<Tx, Rx> {
         let mut count = 0usize;
         loop {
             let size = self.front_buf.available_space();
+            trace!("channel available space: {}", size);
             if size == 0 {
                 self.interest.remove(Ready::READABLE);
                 break;
             }
@@ -284,6 +303,7 @@ impl<Tx: Debug + Serialize, Rx: Debug + DeserializeOwned> Channel<Tx, Rx> {
         if self.front_buf.capacity() == self.max_buffer_size {
             error!("command buffer full, cannot grow more, ignoring");
         } else {
+            println!("growing channel");
             let new_size = min(self.front_buf.capacity() + 5000, self.max_buffer_size);
             self.front_buf.grow(new_size);
         }
diff --git a/command/src/command.proto b/command/src/command.proto
index dcdc048a7..391f67449 100644
--- a/command/src/command.proto
+++ b/command/src/command.proto
@@ -19,7 +19,10 @@ message Request {
     FrontendFilters list_frontends = 5;
     // list all listeners
     ListListeners list_listeners = 6;
-    // launch a new worker, giving its tag
+    // launch a new worker
+    // never implemented, the tag is unused and probably not needed
+    // we may still implement it later with no parameter
+    // the main process will automatically assign a new id to a new worker
     string launch_worker = 7;
     // upgrade the main process
     UpgradeMain upgrade_main = 8;
diff --git a/command/src/logging.rs b/command/src/logging.rs
index f6fbd5de2..0d6c3d5aa 100644
--- a/command/src/logging.rs
+++ b/command/src/logging.rs
@@ -21,6 +21,9 @@ thread_local!
{
     pub static TAG: String = LOGGER.with(|logger| {logger.borrow().tag.clone()});
 }

+// TODO: check if this error is critical:
+// could not register compat logger: SetLoggerError(())
+// The CompatLogger may need a variable that tells whether it has been initiated already
 pub static COMPAT_LOGGER: CompatLogger = CompatLogger;

 pub struct Logger {
diff --git a/command/src/proto/display.rs b/command/src/proto/display.rs
index 6220f3ff1..4bc8df774 100644
--- a/command/src/proto/display.rs
+++ b/command/src/proto/display.rs
@@ -58,50 +58,50 @@ pub fn concatenate_vector(vec: &[String]) -> String {
     vec.join(", ")
 }

-pub fn format_request_type(request_type: &RequestType) -> String {
+pub fn format_request_type(request_type: &RequestType) -> &str {
     match request_type {
-        RequestType::SaveState(_) => "SaveState".to_owned(),
-        RequestType::LoadState(_) => "LoadState".to_owned(),
-        RequestType::CountRequests(_) => "CountRequests".to_owned(),
-        RequestType::ListWorkers(_) => "ListWorkers".to_owned(),
-        RequestType::ListFrontends(_) => "ListFrontends".to_owned(),
-        RequestType::ListListeners(_) => "ListListeners".to_owned(),
-        RequestType::LaunchWorker(_) => "LaunchWorker".to_owned(),
-        RequestType::UpgradeMain(_) => "UpgradeMain".to_owned(),
-        RequestType::UpgradeWorker(_) => "UpgradeWorker".to_owned(),
-        RequestType::SubscribeEvents(_) => "SubscribeEvents".to_owned(),
-        RequestType::ReloadConfiguration(_) => "ReloadConfiguration".to_owned(),
-        RequestType::Status(_) => "Status".to_owned(),
-        RequestType::AddCluster(_) => "AddCluster".to_owned(),
-        RequestType::RemoveCluster(_) => "RemoveCluster".to_owned(),
-        RequestType::AddHttpFrontend(_) => "AddHttpFrontend".to_owned(),
-        RequestType::RemoveHttpFrontend(_) => "RemoveHttpFrontend".to_owned(),
-        RequestType::AddHttpsFrontend(_) => "AddHttpsFrontend".to_owned(),
-        RequestType::RemoveHttpsFrontend(_) => "RemoveHttpsFrontend".to_owned(),
-        RequestType::AddCertificate(_) => "AddCertificate".to_owned(),
-        RequestType::ReplaceCertificate(_) => "ReplaceCertificate".to_owned(),
-        RequestType::RemoveCertificate(_) => "RemoveCertificate".to_owned(),
-        RequestType::AddTcpFrontend(_) => "AddTcpFrontend".to_owned(),
-        RequestType::RemoveTcpFrontend(_) => "RemoveTcpFrontend".to_owned(),
-        RequestType::AddBackend(_) => "AddBackend".to_owned(),
-        RequestType::RemoveBackend(_) => "RemoveBackend".to_owned(),
-        RequestType::AddHttpListener(_) => "AddHttpListener".to_owned(),
-        RequestType::AddHttpsListener(_) => "AddHttpsListener".to_owned(),
-        RequestType::AddTcpListener(_) => "AddTcpListener".to_owned(),
-        RequestType::RemoveListener(_) => "RemoveListener".to_owned(),
-        RequestType::ActivateListener(_) => "ActivateListener".to_owned(),
-        RequestType::DeactivateListener(_) => "DeactivateListener".to_owned(),
-        RequestType::QueryClusterById(_) => "QueryClusterById".to_owned(),
-        RequestType::QueryClustersByDomain(_) => "QueryClustersByDomain".to_owned(),
-        RequestType::QueryClustersHashes(_) => "QueryClustersHashes".to_owned(),
-        RequestType::QueryMetrics(_) => "QueryMetrics".to_owned(),
-        RequestType::SoftStop(_) => "SoftStop".to_owned(),
-        RequestType::HardStop(_) => "HardStop".to_owned(),
-        RequestType::ConfigureMetrics(_) => "ConfigureMetrics".to_owned(),
-        RequestType::Logging(_) => "Logging".to_owned(),
-        RequestType::ReturnListenSockets(_) => "ReturnListenSockets".to_owned(),
-        RequestType::QueryCertificatesFromTheState(_) => "QueryCertificatesFromTheState".to_owned(),
-        RequestType::QueryCertificatesFromWorkers(_) => "QueryCertificatesFromWorkers".to_owned(),
+
RequestType::SaveState(_) => "SaveState", + RequestType::LoadState(_) => "LoadState", + RequestType::CountRequests(_) => "CountRequests", + RequestType::ListWorkers(_) => "ListWorkers", + RequestType::ListFrontends(_) => "ListFrontends", + RequestType::ListListeners(_) => "ListListeners", + RequestType::LaunchWorker(_) => "LaunchWorker", + RequestType::UpgradeMain(_) => "UpgradeMain", + RequestType::UpgradeWorker(_) => "UpgradeWorker", + RequestType::SubscribeEvents(_) => "SubscribeEvents", + RequestType::ReloadConfiguration(_) => "ReloadConfiguration", + RequestType::Status(_) => "Status", + RequestType::AddCluster(_) => "AddCluster", + RequestType::RemoveCluster(_) => "RemoveCluster", + RequestType::AddHttpFrontend(_) => "AddHttpFrontend", + RequestType::RemoveHttpFrontend(_) => "RemoveHttpFrontend", + RequestType::AddHttpsFrontend(_) => "AddHttpsFrontend", + RequestType::RemoveHttpsFrontend(_) => "RemoveHttpsFrontend", + RequestType::AddCertificate(_) => "AddCertificate", + RequestType::ReplaceCertificate(_) => "ReplaceCertificate", + RequestType::RemoveCertificate(_) => "RemoveCertificate", + RequestType::AddTcpFrontend(_) => "AddTcpFrontend", + RequestType::RemoveTcpFrontend(_) => "RemoveTcpFrontend", + RequestType::AddBackend(_) => "AddBackend", + RequestType::RemoveBackend(_) => "RemoveBackend", + RequestType::AddHttpListener(_) => "AddHttpListener", + RequestType::AddHttpsListener(_) => "AddHttpsListener", + RequestType::AddTcpListener(_) => "AddTcpListener", + RequestType::RemoveListener(_) => "RemoveListener", + RequestType::ActivateListener(_) => "ActivateListener", + RequestType::DeactivateListener(_) => "DeactivateListener", + RequestType::QueryClusterById(_) => "QueryClusterById", + RequestType::QueryClustersByDomain(_) => "QueryClustersByDomain", + RequestType::QueryClustersHashes(_) => "QueryClustersHashes", + RequestType::QueryMetrics(_) => "QueryMetrics", + RequestType::SoftStop(_) => "SoftStop", + RequestType::HardStop(_) => "HardStop", + RequestType::ConfigureMetrics(_) => "ConfigureMetrics", + RequestType::Logging(_) => "Logging", + RequestType::ReturnListenSockets(_) => "ReturnListenSockets", + RequestType::QueryCertificatesFromTheState(_) => "QueryCertificatesFromTheState", + RequestType::QueryCertificatesFromWorkers(_) => "QueryCertificatesFromWorkers", } } @@ -128,12 +128,17 @@ impl Response { } } - let content = match &self.content { - Some(content) => content, - None => return Ok(println!("No content")), - }; - - content.display(json) + match &self.content { + Some(content) => content.display(json), + None => { + if json { + println!("{{}}"); + } else { + println!("No content"); + } + Ok(()) + } + } } } @@ -153,9 +158,9 @@ impl ResponseContent { ContentType::Metrics(aggr_metrics) => print_metrics(aggr_metrics), ContentType::FrontendList(frontends) => print_frontends(frontends), ContentType::ListenersList(listeners) => print_listeners(listeners), - ContentType::WorkerMetrics(worker_metrics) => print_worker_metrics(&worker_metrics), - ContentType::AvailableMetrics(list) => print_available_metrics(&list), - ContentType::RequestCounts(request_counts) => print_request_counts(&request_counts), + ContentType::WorkerMetrics(worker_metrics) => print_worker_metrics(worker_metrics), + ContentType::AvailableMetrics(list) => print_available_metrics(list), + ContentType::RequestCounts(request_counts) => print_request_counts(request_counts), ContentType::CertificatesWithFingerprints(certs) => { print_certificates_with_validity(certs) } @@ -205,12 +210,15 @@ pub fn 
print_status(worker_infos: &WorkerInfos) -> Result<(), DisplayError> { table.set_format(*prettytable::format::consts::FORMAT_BOX_CHARS); table.add_row(row!["worker id", "pid", "run state"]); - for worker_info in &worker_infos.vec { + let mut sorted_infos = worker_infos.vec.clone(); + sorted_infos.sort_by_key(|worker| worker.id); + + for worker_info in &sorted_infos { let row = row!( worker_info.id, worker_info.pid, RunState::try_from(worker_info.run_state) - .map_err(|e| DisplayError::DecodeError(e))? + .map_err(DisplayError::DecodeError)? .as_str_name() ); table.add_row(row); diff --git a/command/src/request.rs b/command/src/request.rs index c8350b23b..459cc1abc 100644 --- a/command/src/request.rs +++ b/command/src/request.rs @@ -12,9 +12,12 @@ use nom::{HexDisplay, Offset}; use crate::{ buffer::fixed::Buffer, parser::parse_several_requests, - proto::command::{ - request::RequestType, LoadBalancingAlgorithms, PathRuleKind, Request, RequestHttpFrontend, - RulePosition, + proto::{ + command::{ + request::RequestType, LoadBalancingAlgorithms, PathRuleKind, Request, + RequestHttpFrontend, RulePosition, + }, + display::format_request_type, }, response::{HttpFrontend, MessageId}, }; @@ -112,6 +115,13 @@ impl Request { Some(RequestType::SoftStop(_)) | Some(RequestType::HardStop(_)) ) } + + pub fn short_name(&self) -> &str { + match &self.request_type { + Some(request_type) => format_request_type(request_type), + None => "Unallowed", + } + } } /// This is sent only from Sōzu to Sōzu @@ -139,14 +149,12 @@ pub fn read_requests_from_file(file: &mut File) -> Result, Re loop { let previous = buffer.available_data(); - let bytes_read = file - .read(buffer.space()) - .map_err(|e| RequestError::FileError(e))?; + let bytes_read = file.read(buffer.space()).map_err(RequestError::FileError)?; buffer.fill(bytes_read); if buffer.available_data() == 0 { - debug!("Empty buffer"); + trace!("read_requests_from_file: empty buffer"); break; } @@ -154,7 +162,7 @@ pub fn read_requests_from_file(file: &mut File) -> Result, Re match parse_several_requests::(buffer.data()) { Ok((i, requests)) => { if !i.is_empty() { - debug!("could not parse {} bytes", i.len()); + trace!("read_requests_from_file: could not parse {} bytes", i.len()); if previous == buffer.available_data() { break; } @@ -166,7 +174,7 @@ pub fn read_requests_from_file(file: &mut File) -> Result, Re Err(nom::Err::Incomplete(_)) => { if buffer.available_data() == buffer.capacity() { error!( - "message too big, stopping parsing:\n{}", + "read_requests_from_file: message too big, stopping parsing:\n{}", buffer.data().to_hex(16) ); break; diff --git a/command/src/scm_socket.rs b/command/src/scm_socket.rs index ec91bce8c..1ea74135c 100644 --- a/command/src/scm_socket.rs +++ b/command/src/scm_socket.rs @@ -209,7 +209,7 @@ impl ScmSocket { } /// Socket addresses and file descriptors needed by a Proxy to start listening -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq)] pub struct Listeners { pub http: Vec<(SocketAddr, RawFd)>, pub tls: Vec<(SocketAddr, RawFd)>, @@ -305,11 +305,7 @@ mod tests { let receiving_scm_socket = ScmSocket::new(stream_2.as_raw_fd()).expect("Could not create scm socket"); - let listeners = Listeners { - http: vec![], - tcp: vec![], - tls: vec![], - }; + let listeners = Listeners::default(); sending_scm_socket .send_listeners(&listeners) diff --git a/command/src/state.rs b/command/src/state.rs index 2df12a2e5..9532f34b2 100644 --- a/command/src/state.rs +++ 
b/command/src/state.rs @@ -155,7 +155,7 @@ impl ConfigState { if let Some(request_type) = &request.request_type { let count = self .request_counts - .entry(format_request_type(request_type)) + .entry(format_request_type(request_type).to_owned()) .or_insert(1); *count += 1; } @@ -1175,9 +1175,7 @@ impl ConfigState { /// Types like `HttpFrontend` are converted into protobuf ones, like `RequestHttpFrontend` pub fn cluster_state(&self, cluster_id: &str) -> Option { let configuration = self.clusters.get(cluster_id).cloned(); - if configuration.is_none() { - return None; - } + configuration.as_ref()?; let http_frontends: Vec = self .http_fronts @@ -1402,7 +1400,7 @@ impl ConfigState { .map_err(StateError::FileError)?; if counter % 1000 == 0 { - info!("writing command {}", counter); + info!("writing {} commands to file", counter); file.sync_all().map_err(StateError::FileError)?; } counter += 1; diff --git a/doc/managing_workers.md b/doc/managing_workers.md deleted file mode 100644 index 680403036..000000000 --- a/doc/managing_workers.md +++ /dev/null @@ -1,159 +0,0 @@ -# How are Sōzu's workers managed? - -Sōzu's main process starts and manages _workers_, which are subinstances of itself. -This core feature makes Sōzu pretty efficient, but raises the question of managing state across a whole cluster of processes. - -How do we solve this challenge? Unix sockets and channels. - -## Architecture - -`sozu` command line sends commands on a unix socket. -In the `command::start_server()` function of the main process, -there is a thread running in the background where a unix listener accepts new -connection and spawns client loops. - -The client loops parse client requests and forward them to the Command Server -through mpsc channels. **mpsc** = Multiple Producer, Single Consumer. -The sending end of the channel, called `command_tx`, is cloned and used many times over, -but the messages are all received by a single `command_rx` in the Command Server main loop. - -``` - UNIX UNIX - SOCKET SOCKET - | ^ | ^ - | | | | - +----v-+-----+ +----v-+-----+ - | client | | client | as many more - | loop | | loop | clients as we - +-+-------^--+ +--+-----^---+ want - | | | | - | | mpsc channels | | - | | | | -+----v-------+---------------v-----+------+ -| | -| Command | -| Server | -| | -+----+-------^---------------+-----^------+ - | | | | - | | mpsc channels | | - | | | | - +-v-------+--+ +--v-----+---+ - | worker | | worker | as many more - | loop | | loop | workers as we - +----+-^-----+ +----+-^-----+ want - | | | | - v | v | - UNIX UNIX - SOCKET SOCKET -``` - -As you can guess from the drawing, the exact same logic applies when workers send messages -to the CommandServer. - -The Command Server is able to send messages to clients and to workers by -keeping track of the sending ends of their mpsc channels, `client_tx` and `worker_tx`. - -In turn, clients and workers listen on their own receivers, `client_rx` and `worker_rx`, and -write everything onto their respective unix streams, to notify clients and workers. - -# Asynchronous handling of commands - -It is impossible to manage commands synchronously. -Some tasks are fast enough (for example, dumping the state), some are way too long. -For instance, loading a new state implies to: - -- parse a state file to derive instructions from it -- send ALL instructions to ALL workers -- wait for ALL workers to reply - -Blocking the main thread is unthinkable. 
-Therefore, Sōzu detaches threads by doing, for instance:
-
-```rust
-smol::spawn(
-    client_loop(id, unix_stream, command_tx, client_rx)
-).detach();
-```
-
-This makes the client loop run in the background.
-Using similar syntax, we can wait for worker responses in the background.
-But how can we bring data back from those threads? => **more channels**.
-
-# The flow of requests, responses, and detached threads
-
-What the Command Server does to perform a task:
-
-[ASCII flow diagram: the main thread receives a request from a client loop
-and sends it to the worker loops; it creates an mpsc task channel, stores
-the sender in the `in_flight` hash map keyed by request id, and gives the
-receiver to a detached thread; worker responses arriving on the main thread
-are matched by id against `in_flight`, and the retrieved sender forwards
-them to the detached thread, which waits for all responses, applies its
-logic, and sends the final result back to the main thread]
-
-## What the main thread does with client requests
-
-- **Receive a client request** through the client loop, and if the request requires talking to the workers,
-- **send requests to the workers** through the worker loops. This goes fast.
-- **create an mpsc task channel** with two ends, the _sender_ and the _receiver_.
-- in a hash map called `in_flight`, keep track of:
-  - the `request_id`
-  - the _sender_
-- give the _receiver_ to a **detached thread**
-
-## What the main thread does with worker responses
-
-- **receive worker responses** through the worker loops
-- **look at the `response_id`**, which is the same as the `request_id` seen above
-- search the `in_flight` hash map to **retrieve the associated _sender_**
-- use the _sender_ to **send the response into the detached thread**
-
-## What the detached thread does
-
-- **wait for worker responses** on the _receiver_
-- complete the logic
-- send the final response **back to the Command Server** using `command_tx`,
-  _just like client loops and worker loops do_, because they are detached threads too
-
-The Command Server then forwards this final response to the client loop, and _voilà_!
-
-## To sum up
-
-Here is what is delegated to the background (all those boxes around the main thread):
-
-1. reading and writing from/onto the unix sockets
-2. waiting for and processing worker responses
-
-The Command Server can be described as event-based, because everything comes back
-to the main loop through channels, in no precise order, asynchronously.
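
Since the document above is being deleted, it may help to keep a distilled version of its `in_flight` pattern on record. The sketch below is self-contained and runnable; it substitutes `std::sync::mpsc` and a plain thread for Sōzu's smol-based detached tasks, and all names (`Response`, `in_flight`, `req-42`) are illustrative, not Sōzu's actual API:

```rust
use std::{
    collections::HashMap,
    sync::mpsc::{channel, Sender},
    thread,
};

// Illustrative stand-in for Sōzu's WorkerResponse.
struct Response {
    id: String,
    ok: bool,
}

fn main() {
    // in_flight maps a request id to the sending end of a dedicated channel.
    let mut in_flight: HashMap<String, Sender<Response>> = HashMap::new();

    let (tx, rx) = channel::<Response>();
    in_flight.insert("req-42".to_string(), tx);

    // Detached task: waits for every worker response to this request,
    // then applies its logic (here: counting the OKs).
    let task = thread::spawn(move || {
        let oks = rx.iter().filter(|response| response.ok).count();
        println!("request finished, {oks} worker(s) answered OK");
    });

    // Main loop: a worker response arrives; look up its id in in_flight
    // and forward it through the stored sender.
    let response = Response {
        id: "req-42".to_string(),
        ok: true,
    };
    if let Some(sender) = in_flight.get(&response.id) {
        sender.send(response).expect("the detached task hung up");
    }

    // Dropping the senders closes the channel, which ends rx.iter().
    in_flight.clear();
    task.join().unwrap();
}
```

The essential move is the same as in the doc: the main loop never blocks on workers; it only routes each response to whichever task is waiting on the matching request id.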
diff --git a/e2e/src/sozu/worker.rs b/e2e/src/sozu/worker.rs
index 9dd30c6ff..1527a5115 100644
--- a/e2e/src/sozu/worker.rs
+++ b/e2e/src/sozu/worker.rs
@@ -51,18 +51,6 @@ pub fn set_no_close_exec(fd: i32) {
 }
 
 impl Worker {
-    pub fn empty_file_config() -> FileConfig {
-        FileConfig::default()
-    }
-
-    pub fn empty_listeners() -> Listeners {
-        Listeners {
-            http: Vec::new(),
-            tls: Vec::new(),
-            tcp: Vec::new(),
-        }
-    }
-
     pub fn into_config(file_config: FileConfig) -> Config {
         ConfigBuilder::new(file_config, "")
             .into_config()
@@ -70,8 +58,8 @@ impl Worker {
     }
 
     pub fn empty_config() -> (Config, Listeners, ConfigState) {
-        let listeners = Worker::empty_listeners();
-        let config = Worker::empty_file_config();
+        let listeners = Listeners::default();
+        let config = FileConfig::default();
         let config = Worker::into_config(config);
         let state = ConfigState::new();
         (config, listeners, state)
diff --git a/e2e/src/tests/tests.rs b/e2e/src/tests/tests.rs
index 65ae1cbfa..27e35737b 100644
--- a/e2e/src/tests/tests.rs
+++ b/e2e/src/tests/tests.rs
@@ -12,6 +12,7 @@ use sozu_command_lib::{
         request::RequestType, ActivateListener, AddCertificate, CertificateAndKey, ListenerType,
         RemoveBackend, RequestHttpFrontend,
     },
+    scm_socket::Listeners,
     state::ConfigState,
 };
 
@@ -171,9 +172,9 @@ pub fn try_backend_stop(nb_requests: usize, zombie: Option<u32>) -> State {
     let config = Worker::into_config(FileConfig {
         zombie_check_interval: zombie,
-        ..Worker::empty_file_config()
+        ..FileConfig::default()
     });
-    let listeners = Worker::empty_listeners();
+    let listeners = Listeners::default();
     let state = ConfigState::new();
 
     let (mut worker, mut backends) = setup_async_test(
         "BACKSTOP",
diff --git a/lib/src/http.rs b/lib/src/http.rs
index 3e52ad579..71504367a 100644
--- a/lib/src/http.rs
+++ b/lib/src/http.rs
@@ -1065,11 +1065,7 @@ pub fn start_http_worker(
     let server_scm_socket =
         ScmSocket::new(scm_server.as_raw_fd()).with_context(|| "Could not create scm socket")?;
 
-    if let Err(e) = client_scm_socket.send_listeners(&Listeners {
-        http: Vec::new(),
-        tls: Vec::new(),
-        tcp: Vec::new(),
-    }) {
+    if let Err(e) = client_scm_socket.send_listeners(&Listeners::default()) {
         error!("error sending empty listeners: {:?}", e);
     }
diff --git a/lib/src/metrics/local_drain.rs b/lib/src/metrics/local_drain.rs
index ea29ef009..3ee77ed40 100644
--- a/lib/src/metrics/local_drain.rs
+++ b/lib/src/metrics/local_drain.rs
@@ -269,7 +269,8 @@ impl LocalDrain {
             return Ok(ContentType::WorkerMetrics(WorkerMetrics {
                 proxy: proxy_metrics,
                 clusters: BTreeMap::new(),
-            }).into());
+            })
+            .into());
         }
 
         let worker_metrics = match (cluster_ids.is_empty(), backend_ids.is_empty()) {
diff --git a/lib/src/server.rs b/lib/src/server.rs
index 5431d3c78..454bca8e5 100644
--- a/lib/src/server.rs
+++ b/lib/src/server.rs
@@ -3,11 +3,11 @@ use std::{
     cell::RefCell,
     collections::{HashSet, VecDeque},
     convert::TryFrom,
+    io::Error as IoError,
     os::unix::io::{AsRawFd, FromRawFd},
     rc::Rc,
 };
 
-use anyhow::Context;
 use mio::{
     net::{TcpListener as MioTcpListener, TcpStream},
     Events, Interest, Poll, Token,
@@ -22,13 +22,13 @@ use sozu_command::{
         request::RequestType, response_content::ContentType, ActivateListener, AddBackend,
         CertificatesWithFingerprints, Cluster, ClusterHashes, ClusterInformations,
         DeactivateListener, Event, HttpListenerConfig, HttpsListenerConfig, ListenerType,
-        LoadBalancingAlgorithms, LoadMetric, MetricsConfiguration, RemoveBackend, ResponseStatus,
-        TcpListenerConfig as CommandTcpListener,
+        LoadBalancingAlgorithms, LoadMetric, MetricsConfiguration, RemoveBackend, Request,
+        ResponseStatus, TcpListenerConfig as CommandTcpListener,
     },
     ready::Ready,
     request::WorkerRequest,
     response::{MessageId, WorkerResponse},
-    scm_socket::{Listeners, ScmSocket},
+    scm_socket::{Listeners, ScmSocket, ScmSocketError},
     state::ConfigState,
 };
 
@@ -220,6 +220,21 @@ impl SessionManager {
     }
 }
 
+#[derive(thiserror::Error, Debug)]
+pub enum ServerError {
+    #[error("could not create event loop with MIO poll: {0}")]
+    CreatePoll(IoError),
+    #[error("could not clone the MIO registry: {0}")]
+    CloneRegistry(IoError),
+    #[error("could not register the channel: {0}")]
+    RegisterChannel(IoError),
+    #[error("{msg}:{scm_err}")]
+    ScmSocket {
+        msg: String,
+        scm_err: ScmSocketError,
+    },
+}
+
 /// `Server` handles the event loop, the listeners, the sessions and
 /// communication with the configuration channel.
 ///
@@ -271,8 +286,8 @@ impl Server {
         config: Config,
         initial_state: Vec<WorkerRequest>,
         expects_initial_status: bool,
-    ) -> anyhow::Result<Server> {
-        let event_loop = Poll::new().with_context(|| "could not create event loop")?;
+    ) -> Result<Server, ServerError> {
+        let event_loop = Poll::new().map_err(ServerError::CreatePoll)?;
         let pool = Rc::new(RefCell::new(Pool::with_capacity(
             config.min_buffers,
             config.max_buffers,
@@ -315,7 +330,7 @@
             let registry = event_loop
                 .registry()
                 .try_clone()
-                .with_context(|| "could not clone the mio Registry")?;
+                .map_err(ServerError::CloneRegistry)?;
 
             let https =
                 https::HttpsProxy::new(registry, sessions.clone(), pool.clone(), backends.clone());
@@ -350,7 +365,7 @@
         server_config: ServerConfig,
         initial_state: Option<Vec<WorkerRequest>>,
         expects_initial_status: bool,
-    ) -> anyhow::Result<Server> {
+    ) -> Result<Server, ServerError> {
         FEATURES.with(|_features| {
             // initializing feature flags
         });
@@ -361,7 +376,7 @@
                 Token(0),
                 Interest::READABLE | Interest::WRITABLE,
             )
-            .with_context(|| "should register the channel")?;
+            .map_err(ServerError::RegisterChannel)?;
 
         METRICS.with(|metrics| {
             if let Some(sock) = (*metrics.borrow_mut()).socket_mut() {
@@ -379,7 +394,8 @@
                 let registry = poll
                     .registry()
                     .try_clone()
-                    .with_context(|| "could not clone the mio Registry")?;
+                    .map_err(ServerError::CloneRegistry)?;
+
                 http::HttpProxy::new(registry, sessions.clone(), pool.clone(), backends.clone())
             }
         }));
@@ -390,7 +406,7 @@
             let registry = poll
                 .registry()
                 .try_clone()
-                .with_context(|| "could not clone the mio Registry")?;
+                .map_err(ServerError::CloneRegistry)?;
 
             https::HttpsProxy::new(registry, sessions.clone(), pool.clone(), backends.clone())
         }
@@ -402,7 +418,8 @@
             let registry = poll
                 .registry()
                 .try_clone()
-                .with_context(|| "could not clone the mio Registry")?;
+                .map_err(ServerError::CloneRegistry)?;
+
             tcp::TcpProxy::new(registry, sessions.clone(), backends.clone())
         }
     }));
@@ -457,10 +474,19 @@
         let msg = server.channel.read_message();
         debug!("got message: {:?}", msg);
-        if let Ok(msg) = msg {
-            if let Err(e) = server.channel.write_message(&WorkerResponse::ok(msg.id)) {
+        if let Ok(WorkerRequest {
+            id,
+            content:
+                Request {
+                    request_type: Some(RequestType::Status(_)),
+                },
+        }) = msg
+        {
+            if let Err(e) = server.channel.write_message(&WorkerResponse::ok(id)) {
                 error!("Could not send an ok to the main process: {}", e);
             }
+        } else {
+            panic!("plz give me a status request first when I start, you sent me this instead: {:?}", msg);
         }
         server.unblock_channel();
     }
@@ -469,15 +495,25 @@
         server
             .scm
             .set_blocking(true)
-            .with_context(|| "Could not set the scm socket to blocking")?;
-        let listeners = server
-            .scm
-            .receive_listeners()
-            .with_context(|| "could not receive listeners from the scm socket")?;
+            .map_err(|scm_err| ServerError::ScmSocket {
+                msg: "Could not set the scm socket to blocking".to_string(),
+                scm_err,
+            })?;
+        let listeners =
+            server
+                .scm
+                .receive_listeners()
+                .map_err(|scm_err| ServerError::ScmSocket {
+                    msg: "could not receive listeners from the scm socket".to_string(),
+                    scm_err,
+                })?;
         server
             .scm
             .set_blocking(false)
-            .with_context(|| "Could not set the scm socket to unblocking")?;
+            .map_err(|scm_err| ServerError::ScmSocket {
+                msg: "Could not set the scm socket to unblocking".to_string(),
+                scm_err,
+            })?;
 
         info!("received listeners: {:?}", listeners);
         server.scm_listeners = Some(listeners);
@@ -525,7 +561,7 @@
             }
             if event.is_read_closed() || event.is_write_closed() {
                 error!("command channel was closed");
-                continue;
+                return;
             }
             let ready = Ready::from(event);
             self.channel.handle_events(ready);
@@ -686,7 +722,13 @@
                 }
                 Some(RequestType::ReturnListenSockets(_)) => {
                     info!("received ReturnListenSockets order");
-                    self.return_listen_sockets();
+                    match self.return_listen_sockets() {
+                        Ok(_) => push_queue(WorkerResponse::ok(request.id.clone())),
+                        Err(error) => push_queue(WorkerResponse::error(
+                            request.id.clone(),
+                            format!("Could not send listeners on scm socket: {error:?}"),
+                        )),
+                    }
                 }
                 _ => self.notify(request),
             },
@@ -812,15 +854,21 @@
             if let Err(e) = self.channel.run() {
                 error!("Error while running the server channel: {}", e);
             }
-            self.block_channel();
+            // self.block_channel();
 
             let id = self
                 .shutting_down
                 .take()
                 .expect("should have shut down correctly"); // panicking here makes sense actually
+
+            debug!("Responding OK to main process for request {}", id);
+            let proxy_response = WorkerResponse::ok(id);
             if let Err(e) = self.channel.write_message(&proxy_response) {
                 error!("Could not write response to the main process: {}", e);
             }
+            if let Err(e) = self.channel.run() {
+                error!("Error while running the server channel: {}", e);
+            }
 
             return true;
         }
@@ -940,6 +988,7 @@
                     )
                 };
                 push_queue(response);
+                return;
             }
             // if all certificates are queried, or filtered by domain name,
             // the request will be handled by the https proxy
@@ -1397,7 +1446,7 @@
     }
 
     /// Send all socket addresses and file descriptors of all proxies, via the scm socket
-    pub fn return_listen_sockets(&mut self) {
+    pub fn return_listen_sockets(&mut self) -> Result<(), ScmSocketError> {
         self.unblock_scm_socket();
 
         let mut http_listeners = self.http.borrow_mut().give_back_listeners();
@@ -1448,6 +1497,7 @@
         self.block_scm_socket();
 
         info!("sent default listeners: {:?}", res);
+        res
     }
 
     fn block_scm_socket(&mut self) {
diff --git a/lib/src/tcp.rs b/lib/src/tcp.rs
index f33040883..bdf70a66e 100644
--- a/lib/src/tcp.rs
+++ b/lib/src/tcp.rs
@@ -7,7 +7,7 @@ use std::{
     rc::Rc,
 };
 
-use anyhow::{bail, Context};
+use anyhow::Context;
 use mio::{
     net::TcpListener as MioTcpListener,
     net::{TcpStream as MioTcpStream, UnixStream},
@@ -1233,11 +1233,15 @@ impl TcpProxy {
         Ok(())
     }
 
-    pub fn remove_tcp_front(&mut self, front: RequestTcpFrontend) -> anyhow::Result<()> {
-        let address = front
-            .address
-            .parse()
-            .with_context(|| "wrong socket address")?;
+    pub fn remove_tcp_front(&mut self, front: RequestTcpFrontend) -> Result<(), ProxyError> {
+        let address =
+            front
+                .address
+                .parse::<SocketAddr>()
+                .map_err(|parse_error| ProxyError::SocketParse {
+                    address: front.address.clone(),
+                    error: parse_error.to_string(),
+                })?;
 
         let mut listener = match self
             .listeners
@@ -1245,7 +1249,7 @@ impl TcpProxy {
             .find(|l| l.borrow().address == address)
         {
             Some(l) => l.borrow_mut(),
-            None => bail!(format!("no such listener for '{}'", front.address)),
+            None => return Err(ProxyError::NoListenerFound(address)),
         };
 
         listener.set_tags(front.address, None);
@@ -1761,11 +1765,7 @@ mod tests {
         let server_scm_socket =
             ScmSocket::new(scm_server.as_raw_fd()).expect("Could not create scm socket");
 
         client_scm_socket
-            .send_listeners(&Listeners {
-                http: Vec::new(),
-                tls: Vec::new(),
-                tcp: Vec::new(),
-            })
+            .send_listeners(&Listeners::default())
             .unwrap();
 
         let server_config = server::ServerConfig {
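
Taken together, the `server.rs` and `tcp.rs` hunks show this PR's broader pattern: replacing `anyhow` context strings with typed `thiserror` enums (`ServerError`, `ProxyError`, `ScmSocketError`) so callers can match on failure kinds. A minimal, runnable sketch of that pattern, assuming the `thiserror` crate is in `Cargo.toml` (the variant names copy the new `ServerError`; everything else is illustrative):

```rust
use std::io::{Error as IoError, ErrorKind};

// Reduced analog of the new ServerError: one named variant per fallible
// step, instead of a stringly-typed anyhow context.
#[derive(thiserror::Error, Debug)]
pub enum ServerError {
    #[error("could not create event loop with MIO poll: {0}")]
    CreatePoll(IoError),
    #[error("could not clone the MIO registry: {0}")]
    CloneRegistry(IoError),
}

// Stand-in for mio::Poll::new(); fails on purpose to exercise Display.
fn create_poll() -> Result<(), IoError> {
    Err(IoError::new(ErrorKind::Other, "simulated failure"))
}

fn start_server() -> Result<(), ServerError> {
    // A tuple-variant constructor is a plain fn(IoError) -> ServerError,
    // so it slots directly into map_err, replacing
    // .with_context(|| "could not create event loop").
    create_poll().map_err(ServerError::CreatePoll)?;
    Ok(())
}

fn main() {
    if let Err(e) = start_server() {
        // prints: could not create event loop with MIO poll: simulated failure
        eprintln!("{e}");
    }
}
```

The trade-off is visible in the `ScmSocket` variant used in `server.rs` above: typed errors cost a few lines per call site (the closure building `ServerError::ScmSocket`), but the error kind stays inspectable instead of being flattened into a string.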