diff --git a/.gitignore b/.gitignore index 0592392..e42056d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ /target .DS_Store + +# Per-session runtime lockfile from local scheduling tooling — never source-controlled. +.claude/scheduled_tasks.lock diff --git a/CHANGELOG.md b/CHANGELOG.md index 13148ca..ecb1e9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- **Periodic peer-status log line.** Every 30s the node logs a single + `peer status` line with `connected_count` / `known_count` and the short + endpoint ids of both sets: `connected_peers` (live CRDT-sync connections, + from the transport) and `known_peers` (peers this node dialed — the exact + set used for distribution targeting and blob-provider lookup). A receiver + missing from a sender's `known_peers` is precisely why a synced attachment + document never becomes a delivered file; this makes that visible at a + glance. Fires immediately at startup, then on the interval. + +### Changed + +- **Default log filter now covers the whole sync stack.** Was + `peat_node=info,peat_mesh=info`; now also includes `peat_protocol=info` + (the crate that owns the attachment send/receive watchers — targeting and + blob fetch — so a failed delivery is no longer silent) and `iroh=warn` + (surfaces QUIC dial / connection failures without info-level packet spam). + `RUST_LOG` still overrides the entire default. + ## [0.4.5] - 2026-06-19 ### Added diff --git a/examples/compose/attachments/README.md b/examples/compose/attachments/README.md index 8debb5d..42fb28e 100644 --- a/examples/compose/attachments/README.md +++ b/examples/compose/attachments/README.md @@ -146,7 +146,23 @@ B_ID=$(peat-node derive-id --shared-key "$K" --node-id node-b) Each machine runs one node with its own `.env` (node id, shared key, iroh UDP port, and the peer's derived id @ its IP:port). 
No `peer.sh`, no `GetStatus` -round-trip, no mDNS. See the header of `docker-compose.multi-host.yml` for the +round-trip, no mDNS. + +> **Both machines must list each other** (`A_ID` in B's `PEAT_NODE_PEERS`, `B_ID` +> in A's) — and this is mandatory, not symmetry for its own sake. A node's +> `known_peers` is populated *only when it dials out*; accepting an inbound +> connection doesn't register the peer. Attachment delivery reads `known_peers` +> on both ends — the **sender's** set decides who a distribution targets, the +> **receiver's** set is where it fetches the blob. List only one side and the +> distribution *document* still syncs (CRDT gossip is transitive) but the file +> is never written: the "synced but nothing delivered" symptom. One iroh QUIC +> connection carries the bytes either way, so this is two *dials*, not two +> connections. The `peer status` log line (`connected_peers` vs `known_peers`, +> every 30s) shows whether each side actually dialed the other. The requirement +> for adjacent peers is tracked for removal upstream in +> [peat-node#170](https://github.com/defenseunicorns/peat-node/issues/170). + +See the header of `docker-compose.multi-host.yml` for the full per-machine `.env` and the firewall/UDP-publish note, and [`docs/CONFIGURATION.md` → Deterministic identity](../../../docs/CONFIGURATION.md#deterministic-identity--offline-peer-id-derivation) for the full reference. diff --git a/examples/compose/attachments/docker-compose.multi-host.yml b/examples/compose/attachments/docker-compose.multi-host.yml index f1669ff..3859d4f 100644 --- a/examples/compose/attachments/docker-compose.multi-host.yml +++ b/examples/compose/attachments/docker-compose.multi-host.yml @@ -36,6 +36,17 @@ # PEER_ADDR The OTHER node's reachable address as host:udp-port, where # udp-port is that node's IROH_UDP_PORT. e.g. 10.0.0.20:51072 # +# ⚠ BOTH MACHINES MUST SET PEER_ID/PEER_ADDR pointing at each other — this is a +# TWO-WAY dial, not a convenience. 
A node's `known_peers` is populated ONLY +# when IT dials out (the PEAT_NODE_PEERS entry); accepting an inbound +# connection does NOT register the peer. Attachment delivery reads `known_peers` +# on both ends: the SENDER's set decides who a distribution targets, and the +# RECEIVER's set is where it looks to fetch the blob. If only one side lists the +# other, the distribution *document* still syncs but the file is never +# delivered — the classic "synced but nothing written" symptom. (One iroh QUIC +# connection still carries the bytes both ways; the requirement is two dials, +# not two connections. Tracked upstream: peat-node#170.) +# # # Optional — attachment delivery (off by default). Uncomment the matching # # `environment:`/`volumes:` lines below to enable, and create the host dir. # ATTACHMENT_ROOT Readable outbox root, name=path. e.g. outbox=/var/lib/peat/outbox @@ -89,7 +100,13 @@ services: # mDNS can't cross subnets; rely purely on the static peer above. PEAT_NODE_DISABLE_MDNS: "true" PEAT_NODE_AUTO_SYNC: "true" - RUST_LOG: peat_node=info + # Log the whole sync stack, not just peat-node. `peat_protocol` carries the + # attachment targeting + blob-fetch logs; `peat_node` itself emits the + # peer-status heartbeat (connected_peers vs known_peers) — what you need to + # see whether a peer is actually peered. `iroh=warn` surfaces QUIC dial + # failures. (peat-node v0.4.6+ uses this as the built-in default; setting + # it explicitly here makes the example self-contained on older images too.)
+ RUST_LOG: peat_node=info,peat_mesh=info,peat_protocol=info,iroh=warn # Optional attachment delivery — uncomment with the volume(s) below: # PEAT_NODE_ATTACHMENT_ROOT: ${ATTACHMENT_ROOT} # PEAT_NODE_ATTACHMENT_INBOX: ${ATTACHMENT_INBOX} diff --git a/src/main.rs b/src/main.rs index 5cbe89e..bce795e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -365,7 +365,16 @@ async fn main() -> anyhow::Result<()> { tracing_subscriber::fmt() .with_env_filter( tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| "peat_node=info,peat_mesh=info".into()), + // Include the whole sync stack at info by default, not just + // peat-node itself. `peat_protocol` carries the attachment + // send/receive watcher logs (targeting + blob fetch) — without + // it a failed delivery is invisible. `iroh=warn` surfaces QUIC + // dial / connection failures (the usual reason a peer never + // enters `known_peers`) without the info-level packet spam. + // Override the whole thing with `RUST_LOG`. + .unwrap_or_else(|_| { + "peat_node=info,peat_mesh=info,peat_protocol=info,iroh=warn".into() + }), ) .init(); diff --git a/src/node.rs b/src/node.rs index 2abd109..79a9d42 100644 --- a/src/node.rs +++ b/src/node.rs @@ -202,6 +202,12 @@ struct PeerRegistration { /// scenario surfaces a different sweet spot. const RECONNECT_WATCHDOG_INTERVAL: Duration = Duration::from_secs(5); +/// Interval for the periodic peer-status heartbeat log. Coarse on purpose: +/// it's an operator-facing "who am I connected to / who can I target" +/// breadcrumb, not a fast control loop, so it shouldn't compete with the +/// watchdog's 5 s cadence or spam the log. +const PEER_STATUS_LOG_INTERVAL: Duration = Duration::from_secs(30); + /// Initial backoff for a freshly-registered peer or a peer whose last /// dial succeeded. Equal to the watchdog interval so the first retry /// fires on the very next tick after a transient drop. @@ -603,6 +609,55 @@ impl SidecarNode { }); } + // Periodic peer-status heartbeat. 
Operators diagnosing sync or + // attachment problems need a single line answering "who is this node + // actually connected to, and which peers can it target?". Two sets + // matter and they can legitimately differ: + // * `connected_peers()` (transport) — live CRDT-sync connections. + // * `known_peers()` (blob store) — peers THIS node dialed. This is + // the exact set `resolve_targets` uses for distribution targeting + // and `fetch_blob` uses to locate blob providers, so it's the set + // that governs whether an attachment can be delivered. + // A peer in `known` but not `connected` means a dial is failing; a + // receiver missing from a sender's `known` set is why a synced + // distribution doc never turns into a delivered file. Logged at info + // so it shows under the default filter. Holds a `Weak` ref so it exits + // cleanly when the node is dropped (mirrors the reconnect watchdog). + { + let backend_weak = Arc::downgrade(&backend); + tokio::spawn(async move { + let mut ticker = tokio::time::interval(PEER_STATUS_LOG_INTERVAL); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + ticker.tick().await; + let Some(backend) = backend_weak.upgrade() else { + debug!("peer-status logger: backend dropped, exiting"); + break; + }; + let connected: Vec<String> = backend + .transport() + .connected_peers() + .into_iter() + .map(|id| id.fmt_short().to_string()) + .collect(); + let known: Vec<String> = backend + .blob_store() + .known_peers() + .await + .into_iter() + .map(|id| id.fmt_short().to_string()) + .collect(); + info!( + connected_count = connected.len(), + known_count = known.len(), + connected_peers = ?connected, + known_peers = ?known, + "peer status" + ); + } + }); + } + // ── mDNS peer discovery ────────────────────────────────────────────── // On by default; `disable_mdns: true` (or `--disable-mdns`) opts out. // In environments without multicast (Kubernetes, most containers) init