From a7a706cff7c9e11ac93d6ed487988c3dd0b74569 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Thu, 30 Jan 2025 11:09:43 +0200 Subject: [PATCH 01/78] Fix submodule reference after #10473 (#10577) --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index b654fa88b6fd..f0ffc8279dbc 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit b654fa88b6fd2ad24a03a14a7cd417ec66e518f9 +Subproject commit f0ffc8279dbcbbc439981a4fd001a9687e5d665d diff --git a/vendor/revisions.json b/vendor/revisions.json index 982f53769238..c3eaeac927ce 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "b654fa88b6fd2ad24a03a14a7cd417ec66e518f9" + "f0ffc8279dbcbbc439981a4fd001a9687e5d665d" ], "v16": [ "16.6", From b24727134c5eebd9965297c1a65618cfda5c4a52 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 30 Jan 2025 10:27:40 +0100 Subject: [PATCH 02/78] pageserver: improve read amp metric (#10573) ## Problem The current global `pageserver_layers_visited_per_vectored_read_global` metric does not appear to accurately measure read amplification. It divides the layer count by the number of reads in a batch, but this means that e.g. 10 reads with 100 L0 layers will only measure a read amp of 10 per read, while the actual read amp was 100. While the cost of layer visits are amortized across the batch, and some layers may not intersect with a given key, each visited layer contributes directly to the observed latency for every read in the batch, which is what we care about. Touches https://github.com/neondatabase/cloud/issues/23283. Extracted from #10566. ## Summary of changes * Count the number of layers visited towards each read in the batch, instead of the average across the batch. * Rename `pageserver_layers_visited_per_vectored_read_global` to `pageserver_layers_per_read_global`. * Reduce the read amp log warning threshold down from 512 to 100. --- pageserver/src/metrics.rs | 16 ++++++++----- pageserver/src/tenant/timeline.rs | 31 ++++++++++++++------------ test_runner/regress/test_compaction.py | 10 ++++----- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 77c0967afc4b..30f3a49a9dea 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -116,11 +116,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { +/// Measures layers visited per read (i.e. read amplification). +/// +/// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits +/// are amortized across the batch, and some layers may not intersect with a given key, each visited +/// layer contributes directly to the observed latency for every read in the batch, which is what we +/// care about. +pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_layers_visited_per_vectored_read_global", - "Average number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + "pageserver_layers_per_read_global", + "Layers visited to serve a single read (read amplification). 
In a batch, all visited layers count towards every read.", + vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); @@ -3912,7 +3918,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ - &VEC_READ_NUM_LAYERS_VISITED, + &LAYERS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b4b30fcd23e2..71f0b4c9bfd8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -51,6 +51,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::rate_limit::RateLimit; use utils::{ fs_ext, guard_arc_swap::GuardArcSwap, @@ -115,7 +116,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::TimelineMetrics; +use crate::metrics::{TimelineMetrics, LAYERS_PER_READ_GLOBAL}; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -1044,7 +1045,7 @@ impl Timeline { } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; - pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; + pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN /// @@ -1221,25 +1222,27 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { - let avg = layers_visited as f64 / results.len() as f64; - if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = + // Record the total number of layers visited towards each key in the batch. While some + // layers may not intersect with a given read, and the cost of layer visits are + // amortized across the batch, each visited layer contributes directly to the observed + // latency for every read in the batch, which is what we care about. + if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { + static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { + LOG_PACER.lock().unwrap().call(|| { + let num_keys = keyspace.total_raw_size(); + let num_pages = results.len(); tracing::info!( shard_id = %self.tenant_shard_id.shard_slug(), lsn = %lsn, - "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", - keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", + ); }); } - // Note that this is an approximation. Tracking the exact number of layers visited - // per key requires virtually unbounded memory usage and is inefficient - // (i.e. 
segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); + for _ in &results { + LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); + } } Ok(results) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 2edfc884add6..763a63c2e56e 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -86,9 +86,9 @@ def test_pageserver_compaction_smoke( log.info("Checking layer access metrics ...") layer_access_metric_names = [ - "pageserver_layers_visited_per_vectored_read_global_sum", - "pageserver_layers_visited_per_vectored_read_global_count", - "pageserver_layers_visited_per_vectored_read_global_bucket", + "pageserver_layers_per_read_global_sum", + "pageserver_layers_per_read_global_count", + "pageserver_layers_per_read_global_bucket", ] metrics = env.pageserver.http_client().get_metrics() @@ -96,8 +96,8 @@ def test_pageserver_compaction_smoke( layer_access_metrics = metrics.query_all(name) log.info(f"Got metrics: {layer_access_metrics}") - vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") - vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_sum = metrics.query_one("pageserver_layers_per_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_per_read_global_count") if vectored_count.value > 0: assert vectored_sum.value > 0 vectored_average = vectored_sum.value / vectored_count.value From d3db96c2112381e7a0601719e2e7d55d11d946ed Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 30 Jan 2025 11:55:07 +0100 Subject: [PATCH 03/78] pageserver: add `pageserver_deltas_per_read_global` metric (#10570) ## Problem We suspect that Postgres checkpoints will limit the number of page deltas necessary to reconstruct a page, but don't know for certain. Touches https://github.com/neondatabase/cloud/issues/23283. ## Summary of changes Add `pageserver_deltas_per_read_global` metric. This pairs with `pageserver_layers_per_read_global` from #10573. --- pageserver/src/metrics.rs | 11 +++++++++++ pageserver/src/tenant/storage_layer.rs | 10 ++++++++++ pageserver/src/tenant/timeline.rs | 3 ++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 30f3a49a9dea..bf75ea3bc6b9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -131,6 +131,16 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { + // We expect this to be low because of Postgres checkpoints. Let's see if that holds. 
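+    // (Assumption, not verified here: the first write to a page after a checkpoint is
+    // typically WAL-logged with a full-page image, which would reset that page's delta
+    // chain. This histogram exists to check that hypothesis.)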
+ register_histogram!( + "pageserver_deltas_per_read_global", + "Number of delta pages applied to image page per read", + vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], + ) + .expect("failed to define a metric") +}); + pub(crate) static CONCURRENT_INITDBS: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_concurrent_initdb", @@ -3919,6 +3929,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ &LAYERS_PER_READ_GLOBAL, + &DELTAS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index c1fe67c87c5d..3800852ccc3e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -80,6 +80,16 @@ pub(crate) struct ValueReconstructState { pub(crate) img: Option<(Lsn, Bytes)>, } +impl ValueReconstructState { + /// Returns the number of page deltas applied to the page image. + pub fn num_deltas(&self) -> usize { + match self.img { + Some(_) => self.records.len(), + None => self.records.len() - 1, // omit will_init record + } + } +} + #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub(crate) enum ValueReconstructSituation { Complete, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 71f0b4c9bfd8..4a69376239a1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -116,7 +116,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::{TimelineMetrics, LAYERS_PER_READ_GLOBAL}; +use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -1195,6 +1195,7 @@ impl Timeline { return (key, Err(err)); } }; + DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64); // The walredo module expects the records to be descending in terms of Lsn. // And we submit the IOs in that order, so, there shuold be no need to sort here. 
From 8804d5894336762d077147e2dac9b780653f2a8c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 30 Jan 2025 11:18:07 +0000 Subject: [PATCH 04/78] Nightly Benchmarks: use pgbench from artifacts (#10370) We don't use statically linked OpenSSL anymore (#10302), it's ok to switch to Neon's pgbench for pgvector benchmarks --- .github/workflows/benchmarking.yml | 57 +++++++++--------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 32747d825caa..9446b4d17b6e 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -319,7 +319,7 @@ jobs: { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' @@ -458,7 +458,7 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB + # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB # without (neonvm-captest-new) # and with (neonvm-captest-new-many-tables) many relations in the database - name: Create many relations before the run @@ -590,36 +590,20 @@ jobs: steps: - uses: actions/checkout@v4 - # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16 - # instead of using Neon artifacts containing pgbench - - name: Install postgresql-16 where pytest expects it - run: | - # Just to make it easier to test things locally on macOS (with arm64) - arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g') - - cd /home/nonroot - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb" - dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg - dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg - dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg - - mkdir -p /tmp/neon/pg_install/v16/bin - mkdir -p /tmp/neon/pg_install/v17/bin - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench - ln -s 
/home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql - ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v17/bin/pgbench - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v17/bin/psql - ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v17/lib - - LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}" - export LD_LIBRARY_PATH - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV} - - /tmp/neon/pg_install/v16/bin/pgbench --version - /tmp/neon/pg_install/v16/bin/psql --version + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -642,13 +626,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours - - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set with: @@ -906,7 +883,7 @@ jobs: exit 1 ;; esac - + echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV - name: Set up Connection String From 6a2afa0c0216c183e51d68d2730ed67862247de2 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 30 Jan 2025 12:24:49 +0100 Subject: [PATCH 05/78] pageserver: add per-timeline read amp histogram (#10566) ## Problem We don't have per-timeline observability for read amplification. Touches https://github.com/neondatabase/cloud/issues/23283. ## Summary of changes Add a per-timeline `pageserver_layers_per_read` histogram. NB: per-timeline histograms are expensive, but probably worth it in this case. --- pageserver/src/metrics.rs | 19 +++++++++++++++++++ pageserver/src/tenant/timeline.rs | 1 + test_runner/fixtures/metrics.py | 3 +++ 3 files changed, 23 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index bf75ea3bc6b9..f9edf8855333 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -122,6 +122,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { /// are amortized across the batch, and some layers may not intersect with a given key, each visited /// layer contributes directly to the observed latency for every read in the batch, which is what we /// care about. +pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_layers_per_read", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + &["tenant_id", "shard_id", "timeline_id"], + // Low resolution to reduce cardinality. 
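+        // (One histogram series exists per tenant/shard/timeline label combination, so
+        // the bucket count is kept deliberately small.)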
+ vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0], + ) + .expect("failed to define a metric") +}); + pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_per_read_global", @@ -2648,6 +2659,7 @@ pub(crate) struct TimelineMetrics { pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub layers_per_read: Histogram, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, @@ -2745,6 +2757,10 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layers_per_read = LAYERS_PER_READ + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2809,6 +2825,7 @@ impl TimelineMetrics { disk_consistent_lsn_gauge, pitr_history_size, archival_size, + layers_per_read, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, @@ -2978,6 +2995,8 @@ impl TimelineMetrics { } } + let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4a69376239a1..ab4b3cac6304 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1242,6 +1242,7 @@ impl Timeline { } for _ in &results { + self.metrics.layers_per_read.observe(layers_visited as f64); LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fd7e193778bb..83a1a8761153 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -158,6 +158,9 @@ def counter(name: str) -> str: "pageserver_pitr_history_size", "pageserver_layer_bytes", "pageserver_layer_count", + "pageserver_layers_per_read_bucket", + "pageserver_layers_per_read_count", + "pageserver_layers_per_read_sum", "pageserver_visible_physical_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", From ab627ad9fd355e6a84a7403accee307284f8e017 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 11:54:02 +0000 Subject: [PATCH 06/78] storcon_cli: fix spurious error setting preferred AZ (#10568) ## Problem The client code for `tenant-set-preferred-az` declared response type `()`, so printed a spurious error on each use: ``` Error: receive body: error decoding response body: invalid type: map, expected unit at line 1 column 0 ``` The requests were successful anyway. ## Summary of changes - Declare the proper return type, so that the command succeeds quietly. 
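
As an illustration (not code from this patch), the spurious error comes from asking serde
to decode a JSON object into the unit type. A minimal reproduction, assuming only
`serde_json` and a made-up response body shape:

```rust
// Hypothetical reproduction of the storcon_cli symptom: the server returns a JSON
// object, but the client declared its response type as `()`.
fn main() {
    let body = r#"{"updated": 2}"#; // made-up response body shape
    let decoded: Result<(), _> = serde_json::from_str(body);
    // Prints an "invalid type: map, expected unit" error like the one quoted above.
    println!("{:?}", decoded);
}
```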
--- control_plane/storcon_cli/src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index d9b76b960093..985fe6b3b1d6 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -10,8 +10,8 @@ use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, - TenantPolicyRequest, + ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -800,7 +800,7 @@ async fn main() -> anyhow::Result<()> { .collect(), }; storcon_client - .dispatch::( + .dispatch::( Method::PUT, "control/v1/preferred_azs".to_string(), Some(req), From 93714c4c7bc472a80eb0e13c6e33ea3bf19389d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 30 Jan 2025 13:03:36 +0100 Subject: [PATCH 07/78] secondary downloader: load metadata on loading of timeline (#10539) Related to #10308, we might have legitimate changes in file size or generation. Those changes should not cause warn log lines. In order to detect changes of the generation number while the file size stayed the same, load the metadata that we store on disk on loading of the timeline. Still do a comparison with the on-disk layer sizes to find any discrepancies that might occur due to race conditions (new metadata file gets written but layer file has not been updated yet, and PS shuts down). However, as it's possible to hit it in a race conditon, downgrade it to a warning. Also fix a mistake in #10529: we want to compare the old with the new metadata, not the old metadata with itself. --- pageserver/src/tenant/secondary/downloader.rs | 101 ++++++++++++++---- pageserver/src/virtual_file.rs | 53 +++++---- 2 files changed, 114 insertions(+), 40 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index cf524fcb25e2..2e8c3946bd69 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -673,12 +673,30 @@ impl<'a> TenantDownloader<'a> { HeatMapDownload::Modified(m) => m, }; - let heatmap = serde_json::from_slice::(&heatmap_bytes)?; - - // Save the heatmap: this will be useful on restart, allowing us to reconstruct - // layer metadata without having to re-download it. 
+ // Heatmap storage location let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id); + let last_heatmap = if last_download.is_none() { + match load_heatmap(&heatmap_path, ctx).await { + Ok(htm) => htm, + Err(e) => { + tracing::warn!("Couldn't load heatmap from {heatmap_path}: {e:?}"); + None + } + } + } else { + None + }; + + let last_heatmap_timelines = last_heatmap.as_ref().map(|htm| { + htm.timelines + .iter() + .map(|tl| (tl.timeline_id, tl)) + .collect::>() + }); + + let heatmap = serde_json::from_slice::(&heatmap_bytes)?; + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); @@ -707,10 +725,17 @@ impl<'a> TenantDownloader<'a> { let timeline_state = match timeline_state { Some(t) => t, None => { + let last_heatmap = + last_heatmap_timelines + .as_ref() + .and_then(|last_heatmap_timelines| { + last_heatmap_timelines.get(&timeline.timeline_id).copied() + }); // We have no existing state: need to scan local disk for layers first. let timeline_state = init_timeline_state( self.conf, tenant_shard_id, + last_heatmap, timeline, &self.secondary_state.resident_size_metric, ) @@ -1079,12 +1104,12 @@ impl<'a> TenantDownloader<'a> { } } - if on_disk.metadata.generation_file_size() != on_disk.metadata.generation_file_size() { + if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() { tracing::info!( "Re-downloading layer {} with changed size or generation: {:?}->{:?}", layer.name, on_disk.metadata.generation_file_size(), - on_disk.metadata.generation_file_size() + layer.metadata.generation_file_size() ); return LayerAction::Download; } @@ -1277,6 +1302,7 @@ impl<'a> TenantDownloader<'a> { async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, + last_heatmap: Option<&HeatMapTimeline>, heatmap: &HeatMapTimeline, resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { @@ -1306,6 +1332,13 @@ async fn init_timeline_state( let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = + if let Some(last_heatmap) = last_heatmap { + last_heatmap.layers.iter().map(|l| (&l.name, l)).collect() + } else { + HashMap::new() + }; + while let Some(dentry) = dir .next_entry() .await @@ -1339,18 +1372,32 @@ async fn init_timeline_state( match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); + let last_meta = last_heatmap_metadata.get(&name); + let mut remove = false; match remote_meta { Some(remote_meta) => { + let last_meta_generation_file_size = last_meta + .map(|m| m.metadata.generation_file_size()) + .unwrap_or(remote_meta.metadata.generation_file_size()); // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784) - if local_meta.len() != remote_meta.metadata.file_size { - // This should not happen, because we do crashsafe write-then-rename when downloading - // layers, and layers in remote storage are immutable. Remove the local file because - // we cannot trust it. 
- tracing::warn!( + if remote_meta.metadata.generation_file_size() + != last_meta_generation_file_size + { + tracing::info!( + "Removing local layer {name} as on-disk json metadata has different generation or file size from remote: {:?} -> {:?}", + last_meta_generation_file_size, + remote_meta.metadata.generation_file_size() + ); + remove = true; + } else if local_meta.len() != remote_meta.metadata.file_size { + // This can happen in the presence of race conditions: the remote and on-disk metadata have changed, but we haven't had + // the chance yet to download the new layer to disk, before the process restarted. + tracing::info!( "Removing local layer {name} with unexpected local size {} != {}", local_meta.len(), remote_meta.metadata.file_size ); + remove = true; } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. @@ -1372,15 +1419,18 @@ async fn init_timeline_state( "Removing secondary local layer {} because it's absent in heatmap", name ); - tokio::fs::remove_file(&dentry.path()) - .await - .or_else(fs_ext::ignore_not_found) - .fatal_err(&format!( - "Removing layer {}", - dentry.path().to_string_lossy() - )); + remove = true; } } + if remove { + tokio::fs::remove_file(&dentry.path()) + .await + .or_else(fs_ext::ignore_not_found) + .fatal_err(&format!( + "Removing layer {}", + dentry.path().to_string_lossy() + )); + } } Err(_) => { // Ignore it. @@ -1391,3 +1441,18 @@ async fn init_timeline_state( detail } + +/// Loads a json-encoded heatmap file from the provided on-disk path +async fn load_heatmap( + path: &Utf8PathBuf, + ctx: &RequestContext, +) -> Result, anyhow::Error> { + let mut file = match VirtualFile::open(path, ctx).await { + Ok(file) => file, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None), + Err(e) => Err(e)?, + }; + let st = file.read_to_string(ctx).await?; + let htm = serde_json::from_str(&st)?; + Ok(Some(htm)) +} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 8a7f4a4bf5fd..9d539198c7ae 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -234,6 +234,19 @@ impl VirtualFile { ) -> (FullSlice, Result) { self.inner.write_all(buf, ctx).await } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + self.inner.read_to_end(buf, ctx).await + } + + pub(crate) async fn read_to_string( + &mut self, + ctx: &RequestContext, + ) -> Result { + let mut buf = Vec::new(); + self.read_to_end(&mut buf, ctx).await?; + Ok(String::from_utf8(buf)?) 
+ } } /// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing @@ -993,6 +1006,24 @@ impl VirtualFileInner { (buf, result) }) } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + let mut tmp = vec![0; 128]; + loop { + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, ctx).await; + match res { + Ok(0) => return Ok(()), + Ok(n) => { + self.pos += n as u64; + buf.extend_from_slice(&slice[..n]); + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + tmp = slice.into_inner(); + } + } } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 @@ -1237,10 +1268,6 @@ impl VirtualFile { ) -> Result, std::io::Error> { self.inner.read_blk(blknum, ctx).await } - - async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - self.inner.read_to_end(buf, ctx).await - } } #[cfg(test)] @@ -1260,24 +1287,6 @@ impl VirtualFileInner { slice.into_inner(), )) } - - async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - let mut tmp = vec![0; 128]; - loop { - let slice = tmp.slice(..128); - let (slice, res) = self.read_at(slice, self.pos, ctx).await; - match res { - Ok(0) => return Ok(()), - Ok(n) => { - self.pos += n as u64; - buf.extend_from_slice(&slice[..n]); - } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), - } - tmp = slice.into_inner(); - } - } } impl Drop for VirtualFileInner { From be51b10da73cf0e7c3109e8b6b2f5f3ad9f219b9 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 30 Jan 2025 15:31:49 +0100 Subject: [PATCH 08/78] chore(compute): Print some compute_ctl errors in debug mode (#10586) ## Problem In some cases, we were returning a very shallow error like `error sending request for url (XXX)`, which made it very hard to figure out the actual error. ## Summary of changes Use `{:?}` in a few places, and remove it from places where we were printing a string anyway. 
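
A contrived, self-contained example (not from this patch) of why `{:?}` helps for wrapped
errors: `Display` often stops at the outer message, while `Debug` also shows the
underlying cause. The `RequestError` type and URL below are made up for illustration.

```rust
use std::fmt;

// Hypothetical error type standing in for e.g. a reqwest error: Display is shallow,
// Debug (via derive) also prints the wrapped source error.
#[derive(Debug)]
#[allow(dead_code)]
struct RequestError {
    url: String,
    source: std::io::Error,
}

impl fmt::Display for RequestError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "error sending request for url ({})", self.url)
    }
}

fn main() {
    let e = RequestError {
        url: "http://control-plane.invalid/spec".into(), // made-up URL
        source: std::io::Error::new(std::io::ErrorKind::ConnectionRefused, "connection refused"),
    };
    println!("{}", e);   // only the shallow outer message
    println!("{:?}", e); // includes the ConnectionRefused source as well
}
```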
--- compute_tools/src/extension_server.rs | 14 +++++++------- compute_tools/src/migration.rs | 2 +- compute_tools/src/spec.rs | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 64c338f4d7ad..00f46386e71b 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -258,14 +258,11 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { let uri = format!("{}/{}", ext_remote_storage, ext_path); - info!("Download extension {:?} from uri {:?}", ext_path, uri); + info!("Download extension {} from uri {}", ext_path, uri); match do_extension_server_request(&uri).await { Ok(resp) => { - info!( - "Successfully downloaded remote extension data {:?}", - ext_path - ); + info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL .with_label_values(&[&StatusCode::OK.to_string()]) .inc(); @@ -285,7 +282,10 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res async fn do_extension_server_request(uri: &str) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( - format!("could not perform remote extensions server request: {}", e), + format!( + "could not perform remote extensions server request: {:?}", + e + ), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -295,7 +295,7 @@ async fn do_extension_server_request(uri: &str) -> Result match resp.bytes().await { Ok(resp) => Ok(resp), Err(e) => Err(( - format!("could not read remote extensions server response: {}", e), + format!("could not read remote extensions server response: {:?}", e), // It's fine to return and report error with status as 200 OK, // because we still failed to read the response. status.to_string(), diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index aa3c6b01f035..7b7b042d848c 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -125,7 +125,7 @@ impl<'m> MigrationRunner<'m> { info!("Finished migration id={}", migration_id); } Err(e) => { - error!("Failed to run migration id={}: {}", migration_id, e); + error!("Failed to run migration id={}: {:?}", migration_id, e); DB_MIGRATION_FAILED .with_label_values(&[migration_id.to_string().as_str()]) .inc(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 43a820885bcd..37d5d3a1a65f 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -28,7 +28,7 @@ fn do_control_plane_request( .map_err(|e| { ( true, - format!("could not perform spec request to control plane: {}", e), + format!("could not perform spec request to control plane: {:?}", e), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -39,7 +39,7 @@ fn do_control_plane_request( Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, - format!("could not deserialize control plane response: {}", e), + format!("could not deserialize control plane response: {:?}", e), status.to_string(), )), }, From cf6dee946efa212078cbc6781549f4d5e27e8255 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 30 Jan 2025 10:25:29 -0500 Subject: [PATCH 09/78] fix(pageserver): gc-compaction race with read (#10543) ## Problem close https://github.com/neondatabase/neon/issues/10482 ## Summary of changes Add an extra lock on the read path to protect against races. 
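
A minimal sketch of the locking pattern, with names simplified (the production change
adds a `gc_compaction_layer_update_lock: tokio::sync::RwLock<()>` field to `Timeline`, as
in the diff below); readers hold the shared guard for the whole layer traversal, and
gc-compaction takes the exclusive guard only around the layer-map swap:

```rust
use std::sync::Arc;
use tokio::sync::RwLock;

// Simplified, hypothetical model of the Timeline struct for this sketch.
struct Timeline {
    gc_compaction_layer_update_lock: RwLock<()>,
}

async fn read_path(tl: &Timeline) {
    // Readers hold the shared guard while collecting reconstruct data, so they never
    // observe a half-swapped layer map.
    let _guard = tl.gc_compaction_layer_update_lock.read().await;
    // ... traverse the current set of layers ...
}

async fn apply_gc_compaction(tl: &Timeline) {
    // Gc-compaction takes the exclusive guard only around the layer-map update; new
    // reads may start only after the update is finished.
    let update_guard = tl.gc_compaction_layer_update_lock.write().await;
    // ... replace the compacted layers with the newly produced ones ...
    drop(update_guard);
}

#[tokio::main]
async fn main() {
    let tl = Arc::new(Timeline {
        gc_compaction_layer_update_lock: RwLock::new(()),
    });
    tokio::join!(read_path(&tl), apply_gc_compaction(&tl));
}
```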
The read path has an implication that only certain kind of compactions can be performed. Garbage keys must first have an image layer covering the range, and then being gc-ed -- they cannot be done in one operation. An alternative to fix this is to move the layers read guard to be acquired at the beginning of `get_vectored_reconstruct_data_timeline`, but that was intentionally optimized out and I don't want to regress. The race is not limited to image layers. Gc-compaction will consolidate deltas automatically and produce a flat delta layer (i.e., when we have retain_lsns below the gc-horizon). The same race would also cause behaviors like getting an un-replayable key history as in https://github.com/neondatabase/neon/issues/10049. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 6 ++++ pageserver/src/tenant/timeline/compaction.rs | 37 +++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ab4b3cac6304..f387c81c29bc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -341,6 +341,8 @@ pub struct Timeline { // Needed to ensure that we can't create a branch at a point that was already garbage collected pub latest_gc_cutoff_lsn: Rcu, + pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>, + // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. pub(crate) gc_info: std::sync::RwLock, @@ -2437,6 +2439,7 @@ impl Timeline { shard_identity, pg_version, layers: Default::default(), + gc_compaction_layer_update_lock: tokio::sync::RwLock::new(()), walredo_mgr, walreceiver: Mutex::new(None), @@ -3480,6 +3483,9 @@ impl Timeline { // image layer). let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn(); + // See `compaction::compact_with_gc` for why we need this. + let _guard = timeline.gc_compaction_layer_update_lock.read().await; + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5f7b5f1af584..7242f73a82d8 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2919,10 +2919,45 @@ impl Timeline { // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only // operate on L1 layers. { + // Gc-compaction will rewrite the history of a key. This could happen in two ways: + // + // 1. We create an image layer to replace all the deltas below the compact LSN. In this case, assume + // we have 2 delta layers A and B, both below the compact LSN. We create an image layer I to replace + // A and B at the compact LSN. If the read path finishes reading A, yields, and now we update the layer + // map, the read path then cannot find any keys below A, reporting a missing key error, while the key + // now gets stored in I at the compact LSN. + // + // --------------- --------------- + // delta1@LSN20 image1@LSN20 + // --------------- (read path collects delta@LSN20, => --------------- (read path cannot find anything + // delta1@LSN10 yields) below LSN 20) + // --------------- + // + // 2. We create a delta layer to replace all the deltas below the compact LSN, and in the delta layers, + // we combines the history of a key into a single image. 
For example, we have deltas at LSN 1, 2, 3, 4, + // Assume one delta layer contains LSN 1, 2, 3 and the other contains LSN 4. + // + // We let gc-compaction combine delta 2, 3, 4 into an image at LSN 4, which produces a delta layer that + // contains the delta at LSN 1, the image at LSN 4. If the read path finishes reading the original delta + // layer containing 4, yields, and we update the layer map to put the delta layer. + // + // --------------- --------------- + // delta1@LSN4 image1@LSN4 + // --------------- (read path collects delta@LSN4, => --------------- (read path collects LSN4 and LSN1, + // delta1@LSN1-3 yields) delta1@LSN1 which is an invalid history) + // --------------- --------------- + // + // Therefore, the gc-compaction layer update operation should wait for all ongoing reads, block all pending reads, + // and only allow reads to continue after the update is finished. + + let update_guard = self.gc_compaction_layer_update_lock.write().await; + // Acquiring the update guard ensures current read operations end and new read operations are blocked. + // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect? let mut guard = self.layers.write().await; guard .open_mut()? - .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) + .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics); + drop(update_guard); // Allow new reads to start ONLY after we finished updating the layer map. }; // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json. From efe42db264fc00e9ea13d16851452a2c7c1e58a5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 30 Jan 2025 18:11:26 +0200 Subject: [PATCH 10/78] tests: test_pgdata_import_smoke requires the 'testing' cargo feature (#10569) It took me ages to figure out why it was failing on my laptop. What I saw was that when the test makes the 'import_pgdata' in the pageserver, the pageserver actually performs a regular 'bootstrap' timeline creation by running initdb, with no importing. It boiled down to the json request that the test uses: ``` { "new_timeline_id": str(timeline_id), "import_pgdata": { "idempotency_key": str(idempotency), "location": {"LocalFs": {"path": str(importbucket.absolute())}}, }, }, ``` and how serde deserializes into rust structs. The 'LocalFs' enum variant in `models.rs` is gated on the 'testing' cargo feature. On a non-testing build, that got deserialized into the default Bootstrap enum variant, as a valid TimelineCreateRequestModeImportPgdata variant could not be formed. PS. IMHO we should get rid of the testing feature, compile in all the functionality, and have a runtime flag to disable anything dangeorous. With that, you would've gotten a nice "feature only enabled in testing mode" error in this case, or the test would've simply worked. But that's another story. --- test_runner/regress/test_import_pgdata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 182f715b0ef4..086d4b67c9e3 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -59,6 +59,9 @@ def handler(request: Request) -> Response: neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() + # The test needs LocalFs support, which is only built in testing mode. 
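+    # (Without the testing feature, the "LocalFs" variant in the creation request cannot
+    # be deserialized, and the pageserver silently performs a plain bootstrap via initdb
+    # instead of an import.)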
+ env.pageserver.is_testing_enabled_or_skip() + env.pageserver.patch_config_toml_nonrecursive( { "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api" From 6c8fc909d68f1e541c42f00162526ff1e59f6237 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 30 Jan 2025 17:41:46 +0100 Subject: [PATCH 11/78] Benchmarking PostgreSQL17: for OLAP need specific connstr secrets (#10587) ## Problem for OLAP benchmarks we need specific connstr secrets with different database names for each job step This is a follow-up for https://github.com/neondatabase/neon/pull/10536 In previous PR we used a common GitHub secret for a shared re-use project that has 4 databases: neondb, tpch, clickbench and userexamples. [Failure example](https://neon-github-public-dev.s3.amazonaws.com/reports/main/13044872855/index.html#suites/54d0af6f403f1d8611e8894c2e07d023/fc029330265e9f6e/): ```log # /tmp/neon/pg_install/v17/bin/psql user=neondb_owner dbname=neondb host=ep-broad-brook-w2luwzzv.us-east-2.aws.neon.build sslmode=require options='-cstatement_timeout=0 ' -c -- $ID$ -- TPC-H/TPC-R Pricing Summary Report Query (Q1) -- Functional Query Definition -- Approved February 1998 ... ERROR: relation "lineitem" does not exist ``` ## Summary of changes We need dedicated GitHub secrets and dedicated connection strings for each of the use cases. ## Test run https://github.com/neondatabase/neon/actions/runs/13053968231 --- .github/workflows/benchmarking.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 9446b4d17b6e..49f23e895b22 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -741,10 +741,10 @@ jobs: neonvm-captest-reuse) case "${PG_VERSION}" in 16) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR_V16 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; 17) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_CONNSTR_PG17 }} ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" @@ -864,7 +864,7 @@ jobs: CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR" ;; 17) - CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_CONNSTR_PG17" + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_CONNSTR_PG17" ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" @@ -984,7 +984,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; 17) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_USER_EXAMPLE_CONNSTR_PG17 }} ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" From 8293b252b2a7aaf82f0e108997483387ff2106be Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 30 Jan 2025 10:33:25 -0800 Subject: [PATCH 12/78] chore(compute): pg_mooncake v0.1.1 (#10578) ## Problem Upgrade pg_mooncake to v0.1.1 ## Summary of changes https://github.com/Mooncake-Labs/pg_mooncake/blob/main/CHANGELOG.md#011-2025-01-29 --- compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index a428c61f3458..e9f6c03768e1 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1140,8 +1140,8 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz FROM rust-extensions-build AS pg-mooncake-build ARG 
PG_VERSION -RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \ - echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \ + echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ From bae0de643e75bc4e9154bc6e6e62368ba5c41d9d Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 19:22:59 +0000 Subject: [PATCH 13/78] tests: relax constraints on test_timeline_archival_chaos (#10595) ## Problem The test asserts that it completes at least 10 full timeline lifecycles, but the noisy CI environment sometimes doesn't meet that goal. Related: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - Sleep for longer between pageserver restarts, so that the timeline workers have more chance to make progress - Sleep for shorter between retries from timeline worker, so that they have better chance to get in while a pageserver is up between restarts - Relax the success condition to complete at least 5 iterations instead of 10 --- test_runner/regress/test_timeline_archive.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index bec827058270..306e9716578e 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -582,12 +582,12 @@ def worker(): # This is expected: we are injecting chaos, API calls will sometimes fail. # TODO: can we narrow this to assert we are getting friendly 503s? 
log.info(f"Iteration error, will retry: {e}") - shutdown.wait(random.random()) + shutdown.wait(random.random() * 0.5) except requests.exceptions.RetryError as e: # Retryable error repeated more times than `requests` is configured to tolerate, this # is expected when a pageserver remains unavailable for a couple seconds log.info(f"Iteration error, will retry: {e}") - shutdown.wait(random.random()) + shutdown.wait(random.random() * 0.5) except Exception as e: log.warning( f"Unexpected worker exception (current timeline {state.timeline_id}): {e}" @@ -632,7 +632,7 @@ def worker(): # Make sure we're up for as long as we spent restarting, to ensure operations can make progress log.info(f"Staying alive for {restart_duration}s") - time.sleep(restart_duration) + time.sleep(restart_duration * 2) else: # Migrate our tenant between pageservers origin_ps = env.get_tenant_pageserver(tenant_shard_id) @@ -651,7 +651,7 @@ def worker(): # Sanity check that during our run we did exercise some full timeline lifecycles, in case # one of our workers got stuck - assert len(timelines_deleted) > 10 + assert len(timelines_deleted) > 5 # That no invariant-violations were reported by workers assert violations == [] From 4d2c2e946007444b8b4bbeb5348d20986174ee16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 30 Jan 2025 20:23:25 +0100 Subject: [PATCH 14/78] Revert "storcon: switch to diesel-async and tokio-postgres (#10280)" (#10592) There was a regression of #10280, tracked in [#23583](https://github.com/neondatabase/cloud/issues/23583). I have ideas how to fix the issue, but we are too close to the release cutoff, so revert #10280 for now. We can revert the revert later :). --- .github/workflows/_build-and-test-locally.yml | 4 + .github/workflows/build-macos.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- Cargo.lock | 159 ++-- Dockerfile | 2 +- Makefile | 2 + storage_controller/Cargo.toml | 5 +- storage_controller/src/main.rs | 2 +- storage_controller/src/persistence.rs | 799 ++++++++---------- 9 files changed, 405 insertions(+), 572 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index f97402a90b92..2daed9038688 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -158,6 +158,8 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -215,6 +217,8 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 347a511e980f..01d82a1ed2e7 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -235,7 +235,7 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Run cargo build (only for v17) - run: cargo build --all --release -j$(sysctl -n hw.ncpu) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) - name: Check that no warnings are produced (only for v17) run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index f077e04d1c7b..5b5910badf37 100644 --- a/.github/workflows/neon_extra_builds.yml +++ 
b/.github/workflows/neon_extra_builds.yml @@ -114,7 +114,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/Cargo.lock b/Cargo.lock index 9ba90355df19..359f989a7645 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -941,18 +941,6 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" -[[package]] -name = "bb8" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8" -dependencies = [ - "async-trait", - "futures-util", - "parking_lot 0.12.1", - "tokio", -] - [[package]] name = "bcder" version = "0.7.4" @@ -1312,7 +1300,7 @@ dependencies = [ "tar", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-util", "tower 0.5.2", @@ -1421,7 +1409,7 @@ dependencies = [ "storage_broker", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-util", "toml", "toml_edit", @@ -1797,24 +1785,11 @@ dependencies = [ "chrono", "diesel_derives", "itoa", + "pq-sys", + "r2d2", "serde_json", ] -[[package]] -name = "diesel-async" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb" -dependencies = [ - "async-trait", - "bb8", - "diesel", - "futures-util", - "scoped-futures", - "tokio", - "tokio-postgres 0.7.12", -] - [[package]] name = "diesel_derives" version = "2.2.1" @@ -4060,8 +4035,8 @@ dependencies = [ "pageserver_compaction", "pin-project-lite", "postgres", - "postgres-protocol 0.6.6", - "postgres-types 0.2.6", + "postgres-protocol", + "postgres-types", "postgres_backend", "postgres_connection", "postgres_ffi", @@ -4092,7 +4067,7 @@ dependencies = [ "tokio", "tokio-epoll-uring", "tokio-io-timeout", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-tar", "tokio-util", @@ -4150,7 +4125,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-util", "utils", @@ -4456,7 +4431,7 @@ dependencies = [ "futures-util", "log", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", ] [[package]] @@ -4477,24 +4452,6 @@ dependencies = [ "stringprep", ] -[[package]] -name = "postgres-protocol" -version = "0.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23" -dependencies = [ - "base64 0.22.1", - "byteorder", - "bytes", - "fallible-iterator", - "hmac", - "md-5", - "memchr", - "rand 0.8.5", - "sha2", - "stringprep", -] - [[package]] name = "postgres-protocol2" version = "0.1.0" @@ -4519,18 +4476,7 @@ dependencies = [ "bytes", "chrono", "fallible-iterator", - "postgres-protocol 0.6.6", -] - -[[package]] -name = "postgres-types" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f" -dependencies = [ - "bytes", - "fallible-iterator", - "postgres-protocol 0.6.7", + "postgres-protocol", ] [[package]] @@ -4555,7 +4501,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - 
"tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.26.0", "tokio-util", @@ -4570,7 +4516,7 @@ dependencies = [ "itertools 0.10.5", "once_cell", "postgres", - "tokio-postgres 0.7.9", + "tokio-postgres", "url", ] @@ -4657,6 +4603,15 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "pq-sys" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793" +dependencies = [ + "vcpkg", +] + [[package]] name = "pq_proto" version = "0.1.0" @@ -4664,7 +4619,7 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "postgres-protocol 0.6.6", + "postgres-protocol", "rand 0.8.5", "serde", "thiserror 1.0.69", @@ -4912,7 +4867,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite 0.21.0", @@ -4969,6 +4924,17 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r2d2" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" +dependencies = [ + "log", + "parking_lot 0.12.1", + "scheduled-thread-pool", +] + [[package]] name = "rand" version = "0.7.3" @@ -5700,7 +5666,7 @@ dependencies = [ "pageserver_api", "parking_lot 0.12.1", "postgres", - "postgres-protocol 0.6.6", + "postgres-protocol", "postgres_backend", "postgres_ffi", "pprof", @@ -5724,7 +5690,7 @@ dependencies = [ "tikv-jemallocator", "tokio", "tokio-io-timeout", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-tar", "tokio-util", @@ -5783,12 +5749,12 @@ dependencies = [ ] [[package]] -name = "scoped-futures" -version = "0.1.4" +name = "scheduled-thread-pool" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091" +checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" dependencies = [ - "pin-project-lite", + "parking_lot 0.12.1", ] [[package]] @@ -6323,7 +6289,6 @@ dependencies = [ "clap", "control_plane", "diesel", - "diesel-async", "diesel_migrations", "fail", "futures", @@ -6338,10 +6303,10 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", + "r2d2", "rand 0.8.5", "reqwest", "routerify", - "scoped-futures", "scopeguard", "serde", "serde_json", @@ -6394,7 +6359,7 @@ dependencies = [ "serde_json", "storage_controller_client", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres-rustls", "tokio-stream", "tokio-util", @@ -6873,34 +6838,8 @@ dependencies = [ "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol 0.6.6", - "postgres-types 0.2.6", - "rand 0.8.5", - "socket2", - "tokio", - "tokio-util", - "whoami", -] - -[[package]] -name = "tokio-postgres" -version = "0.7.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb" -dependencies = [ - "async-trait", - "byteorder", - "bytes", - "fallible-iterator", - "futures-channel", - "futures-util", - "log", - "parking_lot 0.12.1", - "percent-encoding", - "phf", - "pin-project-lite", - "postgres-protocol 0.6.7", - "postgres-types 0.2.8", + "postgres-protocol", + "postgres-types", "rand 0.8.5", 
"socket2", "tokio", @@ -6917,7 +6856,7 @@ dependencies = [ "ring", "rustls 0.23.18", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-rustls 0.26.0", "x509-certificate", ] @@ -7576,6 +7515,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -7595,7 +7540,7 @@ dependencies = [ "serde_json", "sysinfo", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-util", "tracing", "tracing-subscriber", diff --git a/Dockerfile b/Dockerfile index 7ba54c8ca5fa..f80666529bb5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY --chown=nonroot . . ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/Makefile b/Makefile index d1238caebf5a..22ebfea7d571 100644 --- a/Makefile +++ b/Makefile @@ -64,6 +64,8 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 +# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) +CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 9860bd5d0e44..caaa22d0a5d7 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -45,11 +45,12 @@ strum_macros.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", + "postgres", + "r2d2", "chrono", ] } -diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } diesel_migrations = { version = "2.2.0" } -scoped-futures = "0.1.4" +r2d2 = { version = "0.8.10" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 659c088d5143..801409d61292 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> { // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url).await); + let persistence = Arc::new(Persistence::new(secrets.database_url)); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 35eb15b29791..37bfaf1139a5 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,12 +5,9 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; +use diesel::pg::PgConnection; use diesel::prelude::*; -use 
diesel_async::async_connection_wrapper::AsyncConnectionWrapper; -use diesel_async::pooled_connection::bb8::Pool; -use diesel_async::pooled_connection::AsyncDieselConnectionManager; -use diesel_async::RunQueryDsl; -use diesel_async::{AsyncConnection, AsyncPgConnection}; +use diesel::Connection; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; @@ -23,7 +20,6 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; @@ -64,7 +60,7 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// updated, and reads of nodes are always from memory, not the database. We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - connection_pool: Pool, + connection_pool: diesel::r2d2::Pool>, } /// Legacy format, for use in JSON compat objects in test environment @@ -80,7 +76,7 @@ pub(crate) enum DatabaseError { #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), #[error(transparent)] - ConnectionPool(#[from] diesel_async::pooled_connection::bb8::RunError), + ConnectionPool(#[from] r2d2::Error), #[error("Logical error: {0}")] Logical(String), #[error("Migration error: {0}")] @@ -128,7 +124,6 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; /// Some methods can operate on either a whole tenant or a single shard -#[derive(Clone)] pub(crate) enum TenantFilter { Tenant(TenantId), Shard(TenantShardId), @@ -141,11 +136,6 @@ pub(crate) struct ShardGenerationState { pub(crate) generation_pageserver: Option, } -// A generous allowance for how many times we may retry serializable transactions -// before giving up. This is not expected to be hit: it is a defensive measure in case we -// somehow engineer a situation where duelling transactions might otherwise live-lock. -const MAX_RETRIES: usize = 128; - impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -155,12 +145,12 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub async fn new(database_url: String) -> Self { - let manager = AsyncDieselConnectionManager::::new(database_url); + pub fn new(database_url: String) -> Self { + let manager = diesel::r2d2::ConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). 
- let connection_pool = Pool::builder() + let connection_pool = diesel::r2d2::Pool::builder() .max_size(Self::MAX_CONNECTIONS) .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) @@ -168,7 +158,6 @@ impl Persistence { .min_idle(Some(1)) .test_on_check_out(true) .build(manager) - .await .expect("Could not build connection pool"); Self { connection_pool } @@ -182,7 +171,7 @@ impl Persistence { ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); loop { - match AsyncPgConnection::establish(database_url).await { + match PgConnection::establish(database_url) { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -203,66 +192,20 @@ impl Persistence { pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - // Can't use self.with_conn here as we do spawn_blocking which requires static. - let conn = self - .connection_pool - .dedicated_connection() - .await - .map_err(|e| DatabaseError::Migration(e.to_string()))?; - let mut async_wrapper: AsyncConnectionWrapper = - AsyncConnectionWrapper::from(conn); - tokio::task::spawn_blocking(move || { - let mut retry_count = 0; - loop { - let result = HarnessWithOutput::write_to_stdout(&mut async_wrapper) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| DatabaseError::Migration(e.to_string())); - match result { - Ok(r) => break Ok(r), - Err( - err @ DatabaseError::Query(diesel::result::Error::DatabaseError( - diesel::result::DatabaseErrorKind::SerializationFailure, - _, - )), - ) => { - retry_count += 1; - if retry_count > MAX_RETRIES { - tracing::error!( - "Exceeded max retries on SerializationFailure errors: {err:?}" - ); - break Err(err); - } else { - // Retry on serialization errors: these are expected, because even though our - // transactions don't fight for the same rows, they will occasionally collide - // on index pages (e.g. increment_generation for unrelated shards can collide) - tracing::debug!( - "Retrying transaction on serialization failure {err:?}" - ); - continue; - } - } - Err(e) => break Err(e), - } - } + self.with_conn(move |conn| -> DatabaseResult<()> { + HarnessWithOutput::write_to_stdout(conn) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())) }) .await - .map_err(|e| DatabaseError::Migration(e.to_string()))??; - Ok(()) } /// Wraps `with_conn` in order to collect latency and error metrics - async fn with_measured_conn<'a, 'b, F, R>( - &self, - op: DatabaseOperation, - func: F, - ) -> DatabaseResult + async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where - F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> - + Send - + std::marker::Sync - + 'a, - R: Send + 'b, + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, { let latency = &METRICS_REGISTRY .metrics_group @@ -284,75 +227,77 @@ impl Persistence { res } - /// Call the provided function with a Diesel database connection in a retry loop - async fn with_conn<'a, 'b, F, R>(&self, func: F) -> DatabaseResult + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. 
+ async fn with_conn(&self, func: F) -> DatabaseResult where - F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> - + Send - + std::marker::Sync - + 'a, - R: Send + 'b, + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, { - let mut retry_count = 0; - loop { - let mut conn = self.connection_pool.get().await?; - match conn - .build_transaction() - .serializable() - .run(|c| func(c)) - .await - { - Ok(r) => break Ok(r), - Err( - err @ DatabaseError::Query(diesel::result::Error::DatabaseError( - diesel::result::DatabaseErrorKind::SerializationFailure, - _, - )), - ) => { - retry_count += 1; - if retry_count > MAX_RETRIES { - tracing::error!( - "Exceeded max retries on SerializationFailure errors: {err:?}" - ); - break Err(err); - } else { - // Retry on serialization errors: these are expected, because even though our - // transactions don't fight for the same rows, they will occasionally collide - // on index pages (e.g. increment_generation for unrelated shards can collide) - tracing::debug!("Retrying transaction on serialization failure {err:?}"); - continue; + // A generous allowance for how many times we may retry serializable transactions + // before giving up. This is not expected to be hit: it is a defensive measure in case we + // somehow engineer a situation where duelling transactions might otherwise live-lock. + const MAX_RETRIES: usize = 128; + + let mut conn = self.connection_pool.get()?; + tokio::task::spawn_blocking(move || -> DatabaseResult { + let mut retry_count = 0; + loop { + match conn.build_transaction().serializable().run(|c| func(c)) { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. increment_generation for unrelated shards can collide) + tracing::debug!( + "Retrying transaction on serialization failure {err:?}" + ); + continue; + } } + Err(e) => break Err(e), } - Err(e) => break Err(e), } - } + }) + .await + .expect("Task panic") } /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { - let np = &node.to_persistent(); - self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { - Box::pin(async move { + let np = node.to_persistent(); + self.with_measured_conn( + DatabaseOperation::InsertNode, + move |conn| -> DatabaseResult<()> { diesel::insert_into(crate::schema::nodes::table) - .values(np) - .execute(conn) - .await?; + .values(&np) + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { - Box::pin(async move { - Ok(crate::schema::nodes::table - .load::(conn) - .await?) - }) - }) + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::nodes::table.load::(conn)?) 
+ }, + ) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -368,14 +313,11 @@ impl Persistence { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { - Box::pin(async move { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) - .execute(conn) - .await?; - Ok(updated) - }) + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn)?; + Ok(updated) }) .await?; @@ -397,16 +339,17 @@ impl Persistence { &self, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::ListTenantShards, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { let query = tenant_shards.filter( placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), ); - let result = query.load::(conn).await?; + let result = query.load::(conn)?; Ok(result) - }) - }) + }, + ) .await } @@ -416,14 +359,15 @@ impl Persistence { filter_tenant_id: TenantId, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::LoadTenant, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::LoadTenant, + move |conn| -> DatabaseResult<_> { let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); - let result = query.load::(conn).await?; + let result = query.load::(conn)?; Ok(result) - }) - }) + }, + ) .await } @@ -449,22 +393,19 @@ impl Persistence { }) .collect::>(); - let shards = &shards; - let metadata_health_records = &metadata_health_records; - self.with_measured_conn(DatabaseOperation::InsertTenantShards, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::InsertTenantShards, + move |conn| -> DatabaseResult<()> { diesel::insert_into(tenant_shards::table) - .values(shards) - .execute(conn) - .await?; + .values(&shards) + .execute(conn)?; diesel::insert_into(metadata_health::table) - .values(metadata_health_records) - .execute(conn) - .await?; + .values(&metadata_health_records) + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } @@ -472,31 +413,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::DeleteTenant, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::DeleteTenant, + move |conn| -> DatabaseResult<()> { // `metadata_health` status (if exists) is also deleted based on the cascade behavior. 
diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn) - .await?; + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::DeleteNode, + move |conn| -> DatabaseResult<()> { diesel::delete(nodes) .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn) - .await?; + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } @@ -513,41 +454,34 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { - Box::pin(async move { - let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .set(generation.eq(generation + 1)) - .execute(conn) - .await?; - - tracing::info!("Incremented {} tenants' generations", rows_updated); - - // TODO: UPDATE+SELECT in one query - - let updated = tenant_shards - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .select(TenantShardPersistence::as_select()) - .load(conn) - .await?; - - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy - .eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy - .eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn) - .await?; + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn)?; + + tracing::info!("Incremented {} tenants' generations", rows_updated); + + // TODO: UPDATE+SELECT in one query + + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn)?; + + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn)?; - Ok(updated) - }) + Ok(updated) }) .await?; @@ -584,22 +518,19 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { - Box::pin(async move { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation.eq(generation + 1), - generation_pageserver.eq(node_id.0 as i64), - )) - // TODO: only returning() the generation column - .returning(TenantShardPersistence::as_returning()) - .get_result(conn) - .await?; + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn)?; - Ok(updated) - }) + Ok(updated) }) .await?; @@ -631,15 +562,12 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { - Box::pin(async move { - let result = tenant_shards - .filter(tenant_id.eq(filter_tenant_id.to_string())) - .select(TenantShardPersistence::as_select()) - .order(shard_number) - .load(conn) - .await?; - Ok(result) - }) + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn)?; + Ok(result) }) .await?; @@ -687,18 +615,15 @@ impl Persistence { break; } - let in_clause = &in_clause; let chunk_rows = self .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { - Box::pin(async move { - // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because - // the inputs are strongly typed and cannot carry any user-supplied raw string content. - let result : Vec = diesel::sql_query( - format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() - ).load(conn).await?; - - Ok(result) - }) + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. 
+ let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn)?; + + Ok(result) }) .await?; rows.extend(chunk_rows.into_iter()) @@ -732,58 +657,51 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - let tenant = &tenant; - let input_placement_policy = &input_placement_policy; - let input_config = &input_config; - let input_generation = &input_generation; - let input_scheduling_policy = &input_scheduling_policy; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - Box::pin(async move { - let query = match tenant { - TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .into_boxed(), - TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .into_boxed(), - }; + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - // Clear generation_pageserver if we are moving into a state where we won't have - // any attached pageservers. - let input_generation_pageserver = match input_placement_policy { - None | Some(PlacementPolicy::Attached(_)) => None, - Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), - }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. 
+ let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; - #[derive(AsChangeset)] - #[diesel(table_name = crate::schema::tenant_shards)] - struct ShardUpdate { - generation: Option, - placement_policy: Option, - config: Option, - scheduling_policy: Option, - generation_pageserver: Option>, - } + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + generation_pageserver: Option>, + } - let update = ShardUpdate { - generation: input_generation.map(|g| g.into().unwrap() as i32), - placement_policy: input_placement_policy - .as_ref() - .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config - .as_ref() - .map(|c| serde_json::to_string(&c).unwrap()), - scheduling_policy: input_scheduling_policy - .map(|p| serde_json::to_string(&p).unwrap()), - generation_pageserver: input_generation_pageserver, - }; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, + }; - query.set(update).execute(conn).await?; + query.set(update).execute(conn)?; - Ok(()) - }) + Ok(()) }) .await?; @@ -797,27 +715,23 @@ impl Persistence { ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; - let preferred_azs = preferred_azs.as_slice(); self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { - Box::pin(async move { - let mut shards_updated = Vec::default(); - - for (tenant_shard_id, preferred_az) in preferred_azs.iter() { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) - .execute(conn) - .await?; - - if updated == 1 { - shards_updated.push((*tenant_shard_id, preferred_az.clone())); - } + let mut shards_updated = Vec::default(); + + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) + .execute(conn)?; + + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); } + } - Ok(shards_updated) - }) + Ok(shards_updated) }) .await } @@ -825,21 +739,17 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { - Box::pin(async move { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - 
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation_pageserver.eq(Option::::None), - placement_policy - .eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), - )) - .execute(conn) - .await?; - - Ok(updated) - }) + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn)?; + + Ok(updated) }) .await?; @@ -858,16 +768,14 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - let parent_to_children = parent_to_children.as_slice(); - self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| { - Box::pin(async move { + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { // Mark parent shards as splitting let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) - .execute(conn).await?; + .execute(conn)?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) @@ -880,7 +788,7 @@ impl Persistence { } // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.to_vec(); + let parent_to_children = parent_to_children.clone(); // Insert child shards for (parent_shard_id, children) in parent_to_children { @@ -888,7 +796,7 @@ impl Persistence { .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn).await?; + .load::(conn)?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( "Parent shard {parent_shard_id} not found" @@ -903,13 +811,12 @@ impl Persistence { debug_assert!(shard.splitting == SplitState::Splitting); diesel::insert_into(tenant_shards) .values(shard) - .execute(conn).await?; + .execute(conn)?; } } Ok(()) }) - }) .await } @@ -921,26 +828,25 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::CompleteShardSplit, + move |conn| -> DatabaseResult<()> { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn) - .await?; + .execute(conn)?; // Clear sharding flag let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .set((splitting.eq(0),)) - .execute(conn) - .await?; + .execute(conn)?; debug_assert!(updated > 0); Ok(()) - }) - }) + }, + ) .await } @@ -952,15 +858,15 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::AbortShardSplit, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::AbortShardSplit, + move |conn| -> DatabaseResult { // Clear 
the splitting state on parent shards let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.ne(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) - .execute(conn) - .await?; + .execute(conn)?; // Parent shards are already gone: we cannot abort. if updated == 0 { @@ -980,12 +886,11 @@ impl Persistence { diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn) - .await?; + .execute(conn)?; Ok(AbortShardSplitStatus::Aborted) - }) - }) + }, + ) .await } @@ -1001,28 +906,25 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::metadata_health::dsl::*; - let healthy_records = healthy_records.as_slice(); - let unhealthy_records = unhealthy_records.as_slice(); - self.with_measured_conn(DatabaseOperation::UpdateMetadataHealth, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::UpdateMetadataHealth, + move |conn| -> DatabaseResult<_> { diesel::insert_into(metadata_health) - .values(healthy_records) + .values(&healthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(true), last_scrubbed_at.eq(now))) - .execute(conn) - .await?; + .execute(conn)?; diesel::insert_into(metadata_health) - .values(unhealthy_records) + .values(&unhealthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(false), last_scrubbed_at.eq(now))) - .execute(conn) - .await?; + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } @@ -1031,13 +933,15 @@ impl Persistence { pub(crate) async fn list_metadata_health_records( &self, ) -> DatabaseResult> { - self.with_measured_conn(DatabaseOperation::ListMetadataHealth, move |conn| { - Box::pin(async { - Ok(crate::schema::metadata_health::table - .load::(conn) - .await?) - }) - }) + self.with_measured_conn( + DatabaseOperation::ListMetadataHealth, + move |conn| -> DatabaseResult<_> { + Ok( + crate::schema::metadata_health::table + .load::(conn)?, + ) + }, + ) .await } @@ -1049,15 +953,10 @@ impl Persistence { use crate::schema::metadata_health::dsl::*; self.with_measured_conn( DatabaseOperation::ListMetadataHealthUnhealthy, - move |conn| { - Box::pin(async { - DatabaseResult::Ok( - crate::schema::metadata_health::table - .filter(healthy.eq(false)) - .load::(conn) - .await?, - ) - }) + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn)?) }, ) .await @@ -1071,14 +970,15 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn(DatabaseOperation::ListMetadataHealthOutdated, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthOutdated, + move |conn| -> DatabaseResult<_> { let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); - let res = query.load::(conn).await?; + let res = query.load::(conn)?; Ok(res) - }) - }) + }, + ) .await } @@ -1086,13 +986,12 @@ impl Persistence { /// It is an error for the table to contain more than one entry. pub(crate) async fn get_leader(&self) -> DatabaseResult> { let mut leader: Vec = self - .with_measured_conn(DatabaseOperation::GetLeader, move |conn| { - Box::pin(async move { - Ok(crate::schema::controllers::table - .load::(conn) - .await?) 
- }) - }) + .with_measured_conn( + DatabaseOperation::GetLeader, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::controllers::table.load::(conn)?) + }, + ) .await?; if leader.len() > 1 { @@ -1115,33 +1014,26 @@ impl Persistence { use crate::schema::controllers::dsl::*; let updated = self - .with_measured_conn(DatabaseOperation::UpdateLeader, move |conn| { - let prev = prev.clone(); - let new = new.clone(); - Box::pin(async move { + .with_measured_conn( + DatabaseOperation::UpdateLeader, + move |conn| -> DatabaseResult { let updated = match &prev { - Some(prev) => { - diesel::update(controllers) - .filter(address.eq(prev.address.clone())) - .filter(started_at.eq(prev.started_at)) - .set(( - address.eq(new.address.clone()), - started_at.eq(new.started_at), - )) - .execute(conn) - .await? - } - None => { - diesel::insert_into(controllers) - .values(new.clone()) - .execute(conn) - .await? - } + Some(prev) => diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn)?, + None => diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn)?, }; Ok(updated) - }) - }) + }, + ) .await?; if updated == 0 { @@ -1156,13 +1048,12 @@ impl Persistence { /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { let safekeepers: Vec = self - .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { - Box::pin(async move { - Ok(crate::schema::safekeepers::table - .load::(conn) - .await?) - }) - }) + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::safekeepers::table.load::(conn)?) + }, + ) .await?; tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); @@ -1175,14 +1066,11 @@ impl Persistence { id: i64, ) -> Result { use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| { - Box::pin(async move { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn) - .await?) - }) + self.with_conn(move |conn| -> DatabaseResult { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn)?) 
}) .await } @@ -1193,30 +1081,26 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| { - let record = record.clone(); - Box::pin(async move { - let bind = record - .as_insert_or_update() - .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - - let inserted_updated = diesel::insert_into(safekeepers) - .values(&bind) - .on_conflict(id) - .do_update() - .set(&bind) - .execute(conn) - .await?; - - if inserted_updated != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated - ))); - } + self.with_conn(move |conn| -> DatabaseResult<()> { + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; + + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn)?; + + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } - Ok(()) - }) + Ok(()) }) .await } @@ -1228,29 +1112,26 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| { - Box::pin(async move { - #[derive(Insertable, AsChangeset)] - #[diesel(table_name = crate::schema::safekeepers)] - struct UpdateSkSchedulingPolicy<'a> { - id: i64, - scheduling_policy: &'a str, - } - let scheduling_policy_ = String::from(scheduling_policy_); + self.with_conn(move |conn| -> DatabaseResult<()> { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); - let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) - .set(scheduling_policy.eq(scheduling_policy_)) - .execute(conn) - .await?; + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn)?; - if rows_affected != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({rows_affected})", - ))); - } + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } - Ok(()) - }) + Ok(()) }) .await } From bf6d5e93baa176f1f0015833beb108f9995a5142 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 30 Jan 2025 20:32:35 +0100 Subject: [PATCH 15/78] Run tests of the contrib extensions (#10392) ## Problem We don't test the extensions, shipped with contrib ## Summary of changes The tests are now running --- compute/compute-node.Dockerfile | 1 + compute/patches/contrib_pg16.patch | 242 ++++++++++++++++++++++ compute/patches/contrib_pg17.patch | 196 ++++++++++++++++++ docker-compose/compute_wrapper/Dockerfile | 2 +- docker-compose/docker_compose_test.sh | 35 +++- docker-compose/run-tests.sh | 6 +- 6 files changed, 469 insertions(+), 13 deletions(-) create mode 100644 compute/patches/contrib_pg16.patch create mode 100644 compute/patches/contrib_pg17.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index e9f6c03768e1..1ef449f0b06c 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1345,6 +1345,7 @@ FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION RUN mkdir /ext-src +COPY --from=pg-build /postgres /postgres #COPY --from=postgis-build /postgis.tar.gz 
/ext-src/ #COPY --from=postgis-build /sfcgal/* /usr COPY --from=plv8-build /plv8.tar.gz /ext-src/ diff --git a/compute/patches/contrib_pg16.patch b/compute/patches/contrib_pg16.patch new file mode 100644 index 000000000000..71adaabe7d4b --- /dev/null +++ b/compute/patches/contrib_pg16.patch @@ -0,0 +1,242 @@ +diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out +index 979e5e8..2375b45 100644 +--- a/contrib/amcheck/expected/check_heap.out ++++ b/contrib/amcheck/expected/check_heap.out +@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); + + (1 row) + +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; +- ?column? +----------- +- t +-(1 row) +- + CREATE ROLE regress_heaptest_role; + -- verify permissions are checked (error due to function not callable) + SET ROLE regress_heaptest_role; +@@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" + DETAIL: This operation is not supported for foreign tables. + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql +index 1745bae..3b429c3 100644 +--- a/contrib/amcheck/sql/check_heap.sql ++++ b/contrib/amcheck/sql/check_heap.sql +@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -58,9 +55,6 @@ COMMIT; + -- ALTER TABLE ... SET TABLESPACE ... + -- causing an additional bulkread, which should be reflected in pg_stat_io. 
+ SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; + + CREATE ROLE regress_heaptest_role; + +@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', + + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out +index 33be13a..70a406c 100644 +--- a/contrib/citext/expected/create_index_acl.out ++++ b/contrib/citext/expected/create_index_acl.out +@@ -5,9 +5,6 @@ + -- owner having as few applicable privileges as possible. (The privileges.sql + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -66,11 +61,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + ERROR: 42704 + \set VERBOSITY default + ROLLBACK; +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql +index 10b5225..ae442e1 100644 +--- a/contrib/citext/sql/create_index_acl.sql ++++ b/contrib/citext/sql/create_index_acl.sql +@@ -6,10 +6,6 @@ + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) 
+ +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; +- + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -68,11 +62,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + \set VERBOSITY default + ROLLBACK; + +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out +index 72304e0..ebe131b 100644 +--- a/contrib/file_fdw/expected/file_fdw.out ++++ b/contrib/file_fdw/expected/file_fdw.out +@@ -4,6 +4,7 @@ + -- directory paths are passed to us in environment variables + \getenv abs_srcdir PG_ABS_SRCDIR + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql +index f0548e1..848a08c 100644 +--- a/contrib/file_fdw/sql/file_fdw.sql ++++ b/contrib/file_fdw/sql/file_fdw.sql +@@ -6,6 +6,7 @@ + \getenv abs_srcdir PG_ABS_SRCDIR + + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out +index d1adbab..38b52ac 100644 +--- a/contrib/pageinspect/expected/gist.out ++++ b/contrib/pageinspect/expected/gist.out +@@ -10,25 +10,6 @@ BEGIN; + CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM + generate_series(1,1000) i; + CREATE INDEX test_gist_idx ON test_gist USING gist (p); +--- Page 0 is the root, the rest are leaf pages +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); +- lsn | nsn | rightlink | flags +------+-----+------------+------- +- 0/1 | 0/0 | 4294967295 | {} +-(1 row) +- +-SELECT * FROM 
gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); +- lsn | nsn | rightlink | flags +------+-----+------------+-------- +- 0/1 | 0/0 | 4294967295 | {leaf} +-(1 row) +- +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); +- lsn | nsn | rightlink | flags +------+-----+-----------+-------- +- 0/1 | 0/0 | 1 | {leaf} +-(1 row) +- + COMMIT; + SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); + itemoffset | ctid | itemlen | dead | keys +diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql +index d263542..607992f 100644 +--- a/contrib/pageinspect/sql/gist.sql ++++ b/contrib/pageinspect/sql/gist.sql +@@ -12,11 +12,6 @@ CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM + generate_series(1,1000) i; + CREATE INDEX test_gist_idx ON test_gist USING gist (p); + +--- Page 0 is the root, the rest are leaf pages +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); +- + COMMIT; + + SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); diff --git a/compute/patches/contrib_pg17.patch b/compute/patches/contrib_pg17.patch new file mode 100644 index 000000000000..0d6c1203b0cc --- /dev/null +++ b/compute/patches/contrib_pg17.patch @@ -0,0 +1,196 @@ +diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out +index 979e5e8..2375b45 100644 +--- a/contrib/amcheck/expected/check_heap.out ++++ b/contrib/amcheck/expected/check_heap.out +@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); + + (1 row) + +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; +- ?column? +----------- +- t +-(1 row) +- + CREATE ROLE regress_heaptest_role; + -- verify permissions are checked (error due to function not callable) + SET ROLE regress_heaptest_role; +@@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" + DETAIL: This operation is not supported for foreign tables. + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql +index 1745bae..3b429c3 100644 +--- a/contrib/amcheck/sql/check_heap.sql ++++ b/contrib/amcheck/sql/check_heap.sql +@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. 
A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -58,9 +55,6 @@ COMMIT; + -- ALTER TABLE ... SET TABLESPACE ... + -- causing an additional bulkread, which should be reflected in pg_stat_io. + SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; + + CREATE ROLE regress_heaptest_role; + +@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', + + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out +index 33be13a..70a406c 100644 +--- a/contrib/citext/expected/create_index_acl.out ++++ b/contrib/citext/expected/create_index_acl.out +@@ -5,9 +5,6 @@ + -- owner having as few applicable privileges as possible. (The privileges.sql + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -66,11 +61,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. 
Suppress the +@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + ERROR: 42704 + \set VERBOSITY default + ROLLBACK; +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql +index 10b5225..ae442e1 100644 +--- a/contrib/citext/sql/create_index_acl.sql ++++ b/contrib/citext/sql/create_index_acl.sql +@@ -6,10 +6,6 @@ + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) + +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; +- + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -68,11 +62,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. 
Suppress the +@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + \set VERBOSITY default + ROLLBACK; + +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out +index 86c148a..81bdb2c 100644 +--- a/contrib/file_fdw/expected/file_fdw.out ++++ b/contrib/file_fdw/expected/file_fdw.out +@@ -4,6 +4,7 @@ + -- directory paths are passed to us in environment variables + \getenv abs_srcdir PG_ABS_SRCDIR + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql +index f0548e1..848a08c 100644 +--- a/contrib/file_fdw/sql/file_fdw.sql ++++ b/contrib/file_fdw/sql/file_fdw.sql +@@ -6,6 +6,7 @@ + \getenv abs_srcdir PG_ABS_SRCDIR + + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index 61f44681dabf..b5f0f47ceb88 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ jq \ netcat-openbsd #This is required for the pg_hintplan test -RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src +RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw USER postgres diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index a05d6c043dc4..e0c537edf37b 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -61,17 +61,32 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR + # The following block does the same for the contrib/file_fdw test + TMPDIR=$(mktemp -d) + docker cp $TEST_CONTAINER_NAME:/postgres/contrib/file_fdw/data $TMPDIR/data + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/postgres/contrib/file_fdw/data + rm -rf $TMPDIR + # Apply patches + cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)" # We are running tests now - if ! 
docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ - $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt - then - FAILED=$(tail -1 testout.txt) - for d in $FAILED - do - mkdir $d - docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true - docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true - cat $d/regression.out $d/regression.diffs || true + rm -f testout.txt testout_contrib.txt + docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ + $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 + docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ + $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 + if [ $EXT_SUCCESS -eq 0 ] || [ $CONTRIB_SUCCESS -eq 0 ]; then + CONTRIB_FAILED= + FAILED= + [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}') + [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}') + for d in $FAILED $CONTRIB_FAILED; do + dn="$(basename $d)" + rm -rf $dn + mkdir $dn + docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ] + docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ] + cat $dn/regression.out $dn/regression.diffs || true + rm -rf $dn done rm -rf $FAILED exit 1 diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 1e794a42a1f6..72ae61b0327a 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -1,9 +1,11 @@ #!/bin/bash set -x -cd /ext-src || exit 2 +extdir=${1} + +cd "${extdir}" || exit 2 FAILED= -LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) +LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u) for d in ${LIST}; do [ -d "${d}" ] || continue if ! psql -w -c "select 1" >/dev/null; then From 6da7c556c2f3725d51ea2c626491a8011be97f04 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 20:33:22 +0000 Subject: [PATCH 16/78] pageserver: fix race cleaning up timeline files when shut down during bootstrap (#10532) ## Problem Timeline bootstrap starts a flush loop, but doesn't reliably shut down the timeline (incl. waiting for flush loop to exit) before destroying UninitializedTimeline, and that destructor tries to clean up local storage. If local storage is still being written to, then this is unsound. Currently the symptom is that we see a "Directory not empty" error log, e.g. 
https://neon-github-public-dev.s3.amazonaws.com/reports/main/12966756686/index.html#testresult/5523f7d15f46f7f7/retries ## Summary of changes - Move fallible IO part of bootstrap into a function (notably, this is fallible in the case of the tenant being shut down while creation is happening) - When that function returns an error, call shutdown() on the timeline --- pageserver/src/http/routes.rs | 8 +- pageserver/src/tenant.rs | 66 +++++------- pageserver/src/tenant/timeline/uninit.rs | 129 +++++++++++++++++------ 3 files changed, 130 insertions(+), 73 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0f3e9fdab601..2548a11b2ec0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3169,12 +3169,16 @@ async fn put_tenant_timeline_import_basebackup( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let span = info_span!("import_basebackup", + tenant_id=%tenant_id, timeline_id=%timeline_id, shard_id=%tenant_shard_id.shard_slug(), + base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); async move { let state = get_state(&request); let tenant = state .tenant_manager - .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + .get_attached_tenant_shard(tenant_shard_id)?; let broker_client = state.broker_client.clone(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 085f76c05d98..657cc78e2c26 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2426,7 +2426,7 @@ impl Tenant { // Make sure the freeze_and_flush reaches remote storage. tline.remote_client.wait_completion().await.unwrap(); - let tl = uninit_tl.finish_creation()?; + let tl = uninit_tl.finish_creation().await?; // The non-test code would call tl.activate() here. tl.set_state(TimelineState::Active); Ok(tl) @@ -4702,7 +4702,7 @@ impl Tenant { ) .await?; - let new_timeline = uninitialized_timeline.finish_creation()?; + let new_timeline = uninitialized_timeline.finish_creation().await?; // Root timeline gets its layers during creation and uploads them along with the metadata. // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created. @@ -4892,10 +4892,11 @@ impl Tenant { } // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it + let pgdata_path_deferred = pgdata_path.clone(); scopeguard::defer! 
{ - if let Err(e) = fs::remove_dir_all(&pgdata_path) { + if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred) { // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call - error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}"); + error!("Failed to remove temporary initdb directory '{pgdata_path_deferred}': {e}"); } } if let Some(existing_initdb_timeline_id) = load_existing_initdb { @@ -4962,7 +4963,7 @@ impl Tenant { pgdata_lsn, pg_version, ); - let raw_timeline = self + let mut raw_timeline = self .prepare_new_timeline( timeline_id, &new_metadata, @@ -4973,42 +4974,33 @@ impl Tenant { .await?; let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; - let unfinished_timeline = raw_timeline.raw_timeline()?; - - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - unfinished_timeline.maybe_spawn_flush_loop(); - - import_datadir::import_timeline_from_postgres_datadir( - unfinished_timeline, - &pgdata_path, - pgdata_lsn, - ctx, - ) - .await - .with_context(|| { - format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}") - })?; + raw_timeline + .write(|unfinished_timeline| async move { + import_datadir::import_timeline_from_postgres_datadir( + &unfinished_timeline, + &pgdata_path, + pgdata_lsn, + ctx, + ) + .await + .with_context(|| { + format!( + "Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}" + ) + })?; - fail::fail_point!("before-checkpoint-new-timeline", |_| { - Err(CreateTimelineError::Other(anyhow::anyhow!( - "failpoint before-checkpoint-new-timeline" - ))) - }); + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); - unfinished_timeline - .freeze_and_flush() - .await - .with_context(|| { - format!( - "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}" - ) - })?; + Ok(()) + }) + .await?; // All done! - let timeline = raw_timeline.finish_creation()?; + let timeline = raw_timeline.finish_creation().await?; // Callers are responsible to wait for uploads to complete and for activating the timeline. 
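For reviewers who want the new control flow in isolation: the hunks above route all creation-time I/O through the `write()` helper added in `uninit.rs` below, which owns the spawn-flush-loop / run / abort-on-error / flush sequence. The following is a minimal, self-contained sketch of that shape only — `FakeTimeline` and `FakeUninitializedTimeline` are hypothetical stand-ins rather than the real pageserver types, the error type is plain `String`, and the `needs_shutdown` bookkeeping checked in `Drop` is omitted. It assumes a `tokio` dependency with the `macros` and `rt-multi-thread` features.

```rust
use std::future::Future;
use std::sync::Arc;

/// Stand-in for the real `Timeline`: only the calls used by the sketch exist.
struct FakeTimeline;

impl FakeTimeline {
    fn maybe_spawn_flush_loop(&self) {}
    async fn freeze_and_flush(&self) -> Result<(), String> {
        Ok(())
    }
    async fn shutdown(&self) {}
}

/// Stand-in for `UninitializedTimeline`, holding the not-yet-published timeline.
struct FakeUninitializedTimeline {
    raw: Arc<FakeTimeline>,
}

impl FakeUninitializedTimeline {
    /// Run fallible creation-time I/O with the flush loop running, and shut the
    /// inner timeline down if either the closure or the final flush fails.
    async fn write<F, Fut>(&mut self, f: F) -> Result<(), String>
    where
        F: FnOnce(Arc<FakeTimeline>) -> Fut,
        Fut: Future<Output = Result<(), String>>,
    {
        // The flush loop must be running before any writes land.
        self.raw.maybe_spawn_flush_loop();

        if let Err(e) = f(self.raw.clone()).await {
            self.abort().await;
            return Err(e);
        }
        if let Err(e) = self.raw.freeze_and_flush().await {
            self.abort().await;
            return Err(e);
        }
        Ok(())
    }

    /// Tear down the inner timeline's tasks so Drop can safely clean up files.
    async fn abort(&self) {
        self.raw.shutdown().await;
    }
}

#[tokio::main]
async fn main() -> Result<(), String> {
    let mut uninit = FakeUninitializedTimeline {
        raw: Arc::new(FakeTimeline),
    };
    // In the real patch this closure is where the pgdatadir / basebackup import runs.
    uninit.write(|_timeline| async move { Ok(()) }).await
}
```

The point of the closure shape is that every creation-time writer hits the same abort path on failure, instead of each call site having to remember to stop the flush loop before the uninitialized timeline is dropped.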
diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 80a09b4840d0..3074463384fb 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -1,4 +1,4 @@ -use std::{collections::hash_map::Entry, fs, sync::Arc}; +use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; @@ -8,7 +8,8 @@ use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; use crate::{ context::RequestContext, import_datadir, - tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, + span::debug_assert_current_span_has_tenant_and_timeline_id, + tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, }; use super::Timeline; @@ -24,6 +25,9 @@ pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, raw_timeline: Option<(Arc, TimelineCreateGuard)>, + /// Whether we spawned the inner Timeline's tasks such that we must later shut it down + /// if aborting the timeline creation + needs_shutdown: bool, } impl<'t> UninitializedTimeline<'t> { @@ -36,6 +40,50 @@ impl<'t> UninitializedTimeline<'t> { owning_tenant, timeline_id, raw_timeline, + needs_shutdown: false, + } + } + + /// When writing data to this timeline during creation, use this wrapper: it will take care of + /// setup of Timeline tasks required for I/O (flush loop) and making sure they are torn down + /// later. + pub(crate) async fn write(&mut self, f: F) -> anyhow::Result<()> + where + F: FnOnce(Arc) -> Fut, + Fut: Future>, + { + debug_assert_current_span_has_tenant_and_timeline_id(); + + // Remember that we did I/O (spawned the flush loop), so that we can check we shut it down on drop + self.needs_shutdown = true; + + let timeline = self.raw_timeline()?; + + // Spawn flush loop so that the Timeline is ready to accept writes + timeline.maybe_spawn_flush_loop(); + + // Invoke the provided function, which will write some data into the new timeline + if let Err(e) = f(timeline.clone()).await { + self.abort().await; + return Err(e.into()); + } + + // Flush the underlying timeline's ephemeral layers to disk + if let Err(e) = timeline + .freeze_and_flush() + .await + .context("Failed to flush after timeline creation writes") + { + self.abort().await; + return Err(e); + } + + Ok(()) + } + + pub(crate) async fn abort(&self) { + if let Some((raw_timeline, _)) = self.raw_timeline.as_ref() { + raw_timeline.shutdown(super::ShutdownMode::Hard).await; } } @@ -44,11 +92,13 @@ impl<'t> UninitializedTimeline<'t> { /// This function launches the flush loop if not already done. /// /// The caller is responsible for activating the timeline (function `.activate()`). 
- pub(crate) fn finish_creation(mut self) -> anyhow::Result> { + pub(crate) async fn finish_creation(mut self) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_shard_id = self.owning_tenant.tenant_shard_id; if self.raw_timeline.is_none() { + self.abort().await; + return Err(anyhow::anyhow!( "No timeline for initialization found for {tenant_shard_id}/{timeline_id}" )); @@ -62,16 +112,25 @@ impl<'t> UninitializedTimeline<'t> { .0 .get_disk_consistent_lsn(); - anyhow::ensure!( - new_disk_consistent_lsn.is_valid(), - "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" - ); + if !new_disk_consistent_lsn.is_valid() { + self.abort().await; + + return Err(anyhow::anyhow!( + "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" + )); + } let mut timelines = self.owning_tenant.timelines.lock().unwrap(); match timelines.entry(timeline_id) { - Entry::Occupied(_) => anyhow::bail!( + Entry::Occupied(_) => { + // Unexpected, bug in the caller. Tenant is responsible for preventing concurrent creation of the same timeline. + // + // We do not call Self::abort here. Because we don't cleanly shut down our Timeline, [`Self::drop`] should + // skip trying to delete the timeline directory too. + anyhow::bail!( "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" - ), + ) + } Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not // cleanup after and would block for example the tenant deletion @@ -93,36 +152,31 @@ impl<'t> UninitializedTimeline<'t> { /// Prepares timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( - self, + mut self, tenant: Arc, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> anyhow::Result> { - let raw_timeline = self.raw_timeline()?; + self.write(|raw_timeline| async move { + import_datadir::import_basebackup_from_tar(&raw_timeline, copyin_read, base_lsn, ctx) + .await + .context("Failed to import basebackup") + .map_err(CreateTimelineError::Other)?; - import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx) - .await - .context("Failed to import basebackup")?; - - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - raw_timeline.maybe_spawn_flush_loop(); - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - anyhow::bail!("failpoint before-checkpoint-new-timeline"); - }); + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); - raw_timeline - .freeze_and_flush() - .await - .context("Failed to flush after basebackup import")?; + Ok(()) + }) + .await?; // All the data has been imported. 
Insert the Timeline into the tenant's timelines map - let tl = self.finish_creation()?; + let tl = self.finish_creation().await?; tl.activate(tenant, broker_client, None, ctx); Ok(tl) } @@ -143,12 +197,19 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { - if let Some((_, create_guard)) = self.raw_timeline.take() { + if let Some((timeline, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); - // This is unusual, but can happen harmlessly if the pageserver is stopped while - // creating a timeline. - info!("Timeline got dropped without initializing, cleaning its files"); - cleanup_timeline_directory(create_guard); + if self.needs_shutdown && !timeline.gate.close_complete() { + // This should not happen: caller should call [`Self::abort`] on failures + tracing::warn!( + "Timeline not shut down after initialization failure, cannot clean up files" + ); + } else { + // This is unusual, but can happen harmlessly if the pageserver is stopped while + // creating a timeline. + info!("Timeline got dropped without initializing, cleaning its files"); + cleanup_timeline_directory(create_guard); + } } } } From d18f6198e15d5201db3c3fa5e3d475ad9de334fb Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 22:17:07 +0000 Subject: [PATCH 17/78] storcon: fix AZ-driven tenant selection in chaos (#10443) ## Problem In https://github.com/neondatabase/neon/pull/10438 I had got the function for picking tenants backwards, and it was preferring to move things _away_ from their preferred AZ. ## Summary of changes - Fix condition in `is_attached_outside_preferred_az` --- storage_controller/src/tenant_shard.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index cbc2696b260d..d344e27e3180 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1806,7 +1806,7 @@ impl TenantShard { .get(&node_id) .expect("referenced node exists") .get_availability_zone_id(), - ) == self.intent.preferred_az_id.as_ref() + ) != self.intent.preferred_az_id.as_ref() }) .unwrap_or(false) } From e1273acdb1e018352d97ea11bf65adf90db69c78 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 22:43:36 +0000 Subject: [PATCH 18/78] pageserver: handle shutdown cleanly in layer download API (#10598) ## Problem This API is used in tests and occasionally for support. It cast all errors to 500. 
That can cause a failure on the log checks: https://neon-github-public-dev.s3.amazonaws.com/reports/main/13056992876/index.html#suites/ad9c266207b45eafe19909d1020dd987/683a7031d877f3db/ ## Summary of changes - Avoid using generic anyhow::Error for layer downloads - Map shutdown cases to 503 in http route --- pageserver/src/http/routes.rs | 8 +++++++- pageserver/src/tenant/storage_layer/layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 12 ++++++++++-- test_runner/regress/test_ondemand_download.py | 19 +++++++------------ 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2548a11b2ec0..eb9cb4da0c20 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1472,7 +1472,13 @@ async fn layer_download_handler( let downloaded = timeline .download_layer(&layer_name) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| match e { + tenant::storage_layer::layer::DownloadError::TimelineShutdown + | tenant::storage_layer::layer::DownloadError::DownloadCancelled => { + ApiError::ShuttingDown + } + other => ApiError::InternalServerError(other.into()), + })?; match downloaded { Some(true) => json_response(StatusCode::OK, ()), diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 99e0ff1aa5df..92313afba7b7 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -340,7 +340,7 @@ impl Layer { /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. - pub(crate) async fn download(&self) -> anyhow::Result<()> { + pub(crate) async fn download(&self) -> Result<(), DownloadError> { self.0.get_or_maybe_download(true, None).await?; Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f387c81c29bc..827601fa8b86 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2028,8 +2028,16 @@ impl Timeline { pub(crate) async fn download_layer( &self, layer_file_name: &LayerName, - ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await? else { + ) -> Result, super::storage_layer::layer::DownloadError> { + let Some(layer) = self + .find_layer(layer_file_name) + .await + .map_err(|e| match e { + layer_manager::Shutdown => { + super::storage_layer::layer::DownloadError::TimelineShutdown + } + })? 
+ else { return Ok(None); }; diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 028d1c2e49b8..c344f30f4d9c 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -27,6 +27,7 @@ ) from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import query_scalar, wait_until +from urllib3 import Retry if TYPE_CHECKING: from typing import Any @@ -676,16 +677,14 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu "compaction_period": "0s", } ) - client = env.pageserver.http_client() + + # Disable retries, because we'll hit code paths that can give us + # 503 and want to see that directly + client = env.pageserver.http_client(retries=Retry(status=0)) + failpoint = "before-downloading-layer-stream-pausable" client.configure_failpoints((failpoint, "pause")) - env.pageserver.allowed_errors.extend( - [ - ".*downloading failed, possibly for shutdown.*", - ] - ) - info = client.layer_map_info(env.initial_tenant, env.initial_timeline) assert len(info.delta_layers()) == 1 @@ -720,13 +719,9 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu client.configure_failpoints((failpoint, "off")) - with pytest.raises( - PageserverApiException, match="downloading failed, possibly for shutdown" - ): + with pytest.raises(PageserverApiException, match="Shutting down"): download.result() - env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*") - detach.result() client.configure_failpoints((failpoint, "pause")) From 5e0c40709f8a18477454beb0fe7cb6d9d522dbf7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 22:45:43 +0000 Subject: [PATCH 19/78] storcon: refine chaos selection logic (#10600) ## Problem In https://github.com/neondatabase/neon/pull/10438 it was pointed out that it would be good to avoid picking tenants in ID order, and also to avoid situations where we might double-select the same tenant. There was an initial swing at this in https://github.com/neondatabase/neon/pull/10443, where Chi suggested a simpler approach which is done in this PR ## Summary of changes - Split total set of tenants into in and out of home AZ - Consume out of home AZ first, and if necessary shuffle + consume from out of home AZ --- .../src/service/chaos_injector.rs | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 98034421d6ea..91d7183fde8d 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -96,28 +96,37 @@ impl ChaosInjector { let batch_size = 128; let mut inner = self.service.inner.write().unwrap(); let (nodes, tenants, scheduler) = inner.parts_mut(); - let tenant_ids = tenants.keys().cloned().collect::>(); // Prefer to migrate tenants that are currently outside their home AZ. This avoids the chaos injector // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some // random tenants to move, and then on next chaos iteration moving them back, then picking some new // random tenants on the next iteration. 
- let mut victims = Vec::with_capacity(batch_size); - for shard in tenants.values() { - if shard.is_attached_outside_preferred_az(nodes) { - victims.push(shard.tenant_shard_id); - } + let (out_of_home_az, in_home_az): (Vec<_>, Vec<_>) = tenants + .values() + .map(|shard| { + ( + shard.tenant_shard_id, + shard.is_attached_outside_preferred_az(nodes), + ) + }) + .partition(|(_id, is_outside)| *is_outside); + + let mut out_of_home_az: Vec<_> = out_of_home_az.into_iter().map(|(id, _)| id).collect(); + let mut in_home_az: Vec<_> = in_home_az.into_iter().map(|(id, _)| id).collect(); - if victims.len() >= batch_size { - break; - } - } + let mut victims = Vec::with_capacity(batch_size); + if out_of_home_az.len() >= batch_size { + tracing::info!("Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", out_of_home_az.len()); - let choose_random = batch_size.saturating_sub(victims.len()); - tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len()); + out_of_home_az.shuffle(&mut thread_rng()); + victims.extend(out_of_home_az.into_iter().take(batch_size)); + } else { + tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", out_of_home_az.len(), std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len())); - let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random); - victims.extend(random_victims); + victims.extend(out_of_home_az); + in_home_az.shuffle(&mut thread_rng()); + victims.extend(in_home_az.into_iter().take(batch_size - victims.len())); + } for victim in victims { self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler); From df87a55609d2c9824c7445ef59a1391157ff6b55 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 30 Jan 2025 23:55:17 +0100 Subject: [PATCH 20/78] tests: Speed up test_pgdata_import_smoke on Postgres v17 (#10567) The test runs this query: select count(*), sum(data::bigint)::bigint from t to validate the test results between each part of the test. It performs a simple sequential scan and aggregation, but was taking an order of magnitude longer on v17 than on previous Postgres versions, which sometimes caused the test to time out. There were two reasons for that: 1. On v17, the planner estimates the table to have only only one row. In reality it has 305790 rows, and older versions estimated it at 611580, which is not too bad given that the table has not been analyzed so the planner bases that estimate just on the number of pages and the widths of the datatypes. The new estimate of 1 row is much worse, and it leads the planner to disregard parallel plans, whereas on older versions you got a Parallel Seq Scan. I tracked this down to upstream commit 29cf61ade3, "Consider fillfactor when estimating relation size". With that commit, table_block_relation_estimate_size() function calculates that each page accommodates less than 1 row when the fillfactor is taken into account, which rounds down to 0. In reality, the executor will always place at least one row on a page regardless of fillfactor, but the new estimation formula doesn't take that into account. I reported this to pgsql-hackers (https://www.postgresql.org/message-id/2bf9d973-7789-4937-a7ca-0af9fb49c71e%40iki.fi), we don't need to do anything more about it in neon. It's OK to not use parallel scans here; once issue 2. below is addressed, the queries are fast enough without parallelism.. 2. 
On v17, prefetching was not happening for the sequential scan. That's because starting with v17, buffers are reserved in the shared buffer cache before prefetching is initiated, and we use a tiny shared_buffers=1MB setting in the tests. The prefetching is effectively disabled with such a small shared_buffers setting, to protect the system from completely starving out of buffers. To address that, simply bump up shared_buffers in the test. This patch addresses the second issue, which is enough to fix the problem. --- test_runner/regress/test_import_pgdata.py | 24 +++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 086d4b67c9e3..6b35f3c6bb37 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -70,6 +70,12 @@ def handler(request: Request) -> Response: env.pageserver.stop() env.pageserver.start() + # By default our tests run with a tiny shared_buffers=1MB setting. That + # doesn't allow any prefetching on v17 and above, where the new streaming + # read machinery keeps buffers pinned while prefetching them. Use a higher + # setting to enable prefetching and speed up the tests + ep_config = ["shared_buffers=64MB"] + # # Put data in vanilla pg # @@ -246,7 +252,11 @@ def validate_vanilla_equivalence(ep): # ro_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, endpoint_id="ro", tenant_id=tenant_id, lsn=last_record_lsn + branch_name=import_branch_name, + endpoint_id="ro", + tenant_id=tenant_id, + lsn=last_record_lsn, + config_lines=ep_config, ) validate_vanilla_equivalence(ro_endpoint) @@ -276,7 +286,10 @@ def validate_vanilla_equivalence(ep): # validate that we can write # rw_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, endpoint_id="rw", tenant_id=tenant_id + branch_name=import_branch_name, + endpoint_id="rw", + tenant_id=tenant_id, + config_lines=ep_config, ) rw_endpoint.safe_psql("create table othertable(values text)") rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) @@ -296,7 +309,7 @@ def validate_vanilla_equivalence(ep): ancestor_start_lsn=rw_lsn, ) br_tip_endpoint = env.endpoints.create_start( - branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config ) validate_vanilla_equivalence(br_tip_endpoint) br_tip_endpoint.safe_psql("select * from othertable") @@ -309,7 +322,10 @@ def validate_vanilla_equivalence(ep): ancestor_start_lsn=initdb_lsn, ) br_initdb_endpoint = env.endpoints.create_start( - branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + branch_name="br-initdb", + endpoint_id="br-initdb-ro", + tenant_id=tenant_id, + config_lines=ep_config, ) validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): From 423e2396174cc8e0c97661a6ceb58ad290a17982 Mon Sep 17 00:00:00 2001 From: Anna Stepanyan Date: Fri, 31 Jan 2025 07:29:06 +0100 Subject: [PATCH 21/78] [infra/notes] impr: add issue types to issue templates (#10018) refs #0000 --------- Co-authored-by: Fedor Dikarev --- .github/ISSUE_TEMPLATE/bug-template.md | 1 + .github/ISSUE_TEMPLATE/epic-template.md | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bug-template.md b/.github/ISSUE_TEMPLATE/bug-template.md index d33eec3cdea0..234d9b5a3782 100644 --- a/.github/ISSUE_TEMPLATE/bug-template.md +++ 
b/.github/ISSUE_TEMPLATE/bug-template.md @@ -3,6 +3,7 @@ name: Bug Template about: Used for describing bugs title: '' labels: t/bug +type: Bug assignees: '' --- diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index c442f50fde18..868fd084f151 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -4,6 +4,7 @@ about: A set of related tasks contributing towards specific outcome, comprising more than 1 week of work. title: 'Epic: ' labels: t/Epic +type: Epic assignees: '' --- From 738bf835836de94e8aa41b8575c6db78cb882c38 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 31 Jan 2025 09:53:43 +0000 Subject: [PATCH 22/78] chore: replace dashmap with clashmap (#10582) ## Problem Because dashmap 6 switched to hashbrown RawTable API, it required us to use unsafe code in the upgrade: https://github.com/neondatabase/neon/pull/8107 ## Summary of changes Switch to clashmap, a fork maintained by me which removes much of the unsafe and ultimately switches to HashTable instead of RawTable to remove much of the unsafe requirement on us. --- Cargo.lock | 53 +++++++++++++++++++++++++- Cargo.toml | 2 +- proxy/Cargo.toml | 2 +- proxy/src/auth/backend/jwt.rs | 6 +-- proxy/src/cache/endpoints.rs | 14 +++---- proxy/src/cache/project_info.rs | 12 +++--- proxy/src/cancellation.rs | 8 ++-- proxy/src/control_plane/client/mod.rs | 8 ++-- proxy/src/rate_limiter/leaky_bucket.rs | 8 ++-- proxy/src/rate_limiter/limiter.rs | 6 +-- proxy/src/redis/keys.rs | 3 +- proxy/src/redis/kv_ops.rs | 1 - proxy/src/serverless/conn_pool_lib.rs | 12 +++--- proxy/src/usage_metrics.rs | 10 ++--- 14 files changed, 97 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 359f989a7645..e9cbebcd0254 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1213,6 +1213,20 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +[[package]] +name = "clashmap" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93bd59c81e2bd87a775ae2de75f070f7e2bfe97363a6ad652f46824564c23e4d" +dependencies = [ + "crossbeam-utils", + "hashbrown 0.15.2", + "lock_api", + "parking_lot_core 0.9.8", + "polonius-the-crab", + "replace_with", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -2531,6 +2545,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "hashlink" version = "0.9.1" @@ -2581,6 +2601,15 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +[[package]] +name = "higher-kinded-types" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "561985554c8b8d4808605c90a5f1979cc6c31a5d20b78465cd59501233c6678e" +dependencies = [ + "never-say-never", +] + [[package]] name = "hmac" version = "0.12.1" @@ -3544,6 +3573,12 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "never-say-never" +version = "6.6.666" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6" + [[package]] name = "nix" version = "0.25.1" @@ -4421,6 +4456,16 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "polonius-the-crab" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e97ca2c89572ae41bbec1c99498251f87dd5a94e500c5ec19c382dd593dd5ce9" +dependencies = [ + "higher-kinded-types", + "never-say-never", +] + [[package]] name = "postgres" version = "0.19.6" @@ -4794,9 +4839,9 @@ dependencies = [ "camino-tempfile", "chrono", "clap", + "clashmap", "compute_api", "consumption_metrics", - "dashmap 5.5.0", "ecdsa 0.16.9", "ed25519-dalek", "env_logger 0.10.2", @@ -5215,6 +5260,12 @@ dependencies = [ "utils", ] +[[package]] +name = "replace_with" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a8614ee435691de62bcffcf4a66d91b3594bf1428a5722e79103249a095690" + [[package]] name = "reqwest" version = "0.12.4" diff --git a/Cargo.toml b/Cargo.toml index 9ccdb45f6d82..9d15b78a9332 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,10 +77,10 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive", "env"] } +clashmap = { version = "1.0", features = ["raw-api"] } comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" -dashmap = { version = "5.5.0", features = ["raw-api"] } diatomic-waker = { version = "0.2.3" } either = "1.8" enum-map = "2.4.2" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f362a4503537..35574e945cd9 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -24,9 +24,9 @@ bytes = { workspace = true, features = ["serde"] } camino.workspace = true chrono.workspace = true clap = { workspace = true, features = ["derive", "env"] } +clashmap.workspace = true compute_api.workspace = true consumption_metrics.workspace = true -dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true futures.workspace = true diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index df716f8455f0..e05a693cee27 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; -use dashmap::DashMap; +use clashmap::ClashMap; use jose_jwk::crypto::KeyInfo; use reqwest::{redirect, Client}; use reqwest_retry::policies::ExponentialBackoff; @@ -64,7 +64,7 @@ pub(crate) struct AuthRule { pub struct JwkCache { client: reqwest_middleware::ClientWithMiddleware, - map: DashMap<(EndpointId, RoleName), Arc>, + map: ClashMap<(EndpointId, RoleName), Arc>, } pub(crate) struct JwkCacheEntry { @@ -469,7 +469,7 @@ impl Default for JwkCache { JwkCache { client, - map: DashMap::default(), + map: ClashMap::default(), } } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 0136446d6dfb..b5c42cd23db7 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -3,7 +3,7 @@ use std::future::pending; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; -use dashmap::DashSet; +use clashmap::ClashSet; use redis::streams::{StreamReadOptions, StreamReadReply}; use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; @@ -55,9 +55,9 @@ impl TryFrom<&Value> for ControlPlaneEvent { pub struct EndpointsCache { config: EndpointCacheConfig, - endpoints: DashSet, - branches: DashSet, - projects: DashSet, + 
endpoints: ClashSet, + branches: ClashSet, + projects: ClashSet, ready: AtomicBool, limiter: Arc>, } @@ -69,9 +69,9 @@ impl EndpointsCache { config.limiter_info.clone(), ))), config, - endpoints: DashSet::new(), - branches: DashSet::new(), - projects: DashSet::new(), + endpoints: ClashSet::new(), + branches: ClashSet::new(), + projects: ClashSet::new(), ready: AtomicBool::new(false), } } diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index cab0b8b90594..a5e71f1a8744 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; -use dashmap::DashMap; +use clashmap::ClashMap; use rand::{thread_rng, Rng}; use smol_str::SmolStr; use tokio::sync::Mutex; @@ -108,9 +108,9 @@ impl EndpointInfo { /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. pub struct ProjectInfoCacheImpl { - cache: DashMap, + cache: ClashMap, - project2ep: DashMap>, + project2ep: ClashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -176,8 +176,8 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { impl ProjectInfoCacheImpl { pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self { Self { - cache: DashMap::new(), - project2ep: DashMap::new(), + cache: ClashMap::new(), + project2ep: ClashMap::new(), config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), @@ -302,7 +302,7 @@ impl ProjectInfoCacheImpl { let mut removed = 0; let shard = self.project2ep.shards()[shard].write(); for (_, endpoints) in shard.iter() { - for endpoint in endpoints.get() { + for endpoint in endpoints { self.cache.remove(endpoint); removed += 1; } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 34f708a36b90..9a0b954341bb 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,3 +1,4 @@ +use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; @@ -8,7 +9,7 @@ use pq_proto::CancelKeyData; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; -use tokio::sync::mpsc; +use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; @@ -17,14 +18,11 @@ use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::ext::LockExt; -use crate::metrics::CancelChannelSizeGuard; -use crate::metrics::{CancellationRequest, Metrics, RedisMsgKind}; +use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind}; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::keys::KeyPrefix; use crate::redis::kv_ops::RedisKVClient; use crate::tls::postgres_rustls::MakeRustlsConnect; -use std::convert::Infallible; -use tokio::sync::oneshot; type IpSubnetKey = IpNet; diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index d559d96bbc61..b879f3a59ff4 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -6,7 +6,7 @@ use std::hash::Hash; use std::sync::Arc; use std::time::Duration; -use dashmap::DashMap; +use clashmap::ClashMap; use tokio::time::Instant; use tracing::{debug, info}; @@ -148,7 +148,7 @@ impl ApiCaches { /// Various caches for 
[`control_plane`](super). pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: ClashMap>, config: RateLimiterConfig, timeout: Duration, epoch: std::time::Duration, @@ -180,7 +180,7 @@ impl ApiLocks { ) -> prometheus::Result { Ok(Self { name, - node_locks: DashMap::with_shard_amount(shards), + node_locks: ClashMap::with_shard_amount(shards), config, timeout, epoch, @@ -238,7 +238,7 @@ impl ApiLocks { let mut lock = shard.write(); let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock - .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) + .extract_if(|(_, semaphore)| Arc::strong_count(semaphore) == 1) .count(); drop(lock); self.metrics.semaphores_unregistered.inc_by(count as u64); diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index bff800f0a2f0..9645eaf725b1 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -2,7 +2,7 @@ use std::hash::Hash; use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; -use dashmap::DashMap; +use clashmap::ClashMap; use rand::{thread_rng, Rng}; use tokio::time::Instant; use tracing::info; @@ -14,7 +14,7 @@ use crate::intern::EndpointIdInt; pub type EndpointRateLimiter = LeakyBucketRateLimiter; pub struct LeakyBucketRateLimiter { - map: DashMap, + map: ClashMap, config: utils::leaky_bucket::LeakyBucketConfig, access_count: AtomicUsize, } @@ -27,7 +27,7 @@ impl LeakyBucketRateLimiter { pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { Self { - map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), + map: ClashMap::with_hasher_and_shard_amount(RandomState::new(), shards), config: config.into(), access_count: AtomicUsize::new(0), } @@ -58,7 +58,7 @@ impl LeakyBucketRateLimiter { let shard = thread_rng().gen_range(0..n); self.map.shards()[shard] .write() - .retain(|_, value| !value.get().bucket_is_empty(now)); + .retain(|(_, value)| !value.bucket_is_empty(now)); } } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index ec080f270b77..ef6c39f230f4 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -5,7 +5,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Mutex; use anyhow::bail; -use dashmap::DashMap; +use clashmap::ClashMap; use itertools::Itertools; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; @@ -62,7 +62,7 @@ impl GlobalRateLimiter { pub type WakeComputeRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { - map: DashMap, Hasher>, + map: ClashMap, Hasher>, info: Cow<'static, [RateBucketInfo]>, access_count: AtomicUsize, rand: Mutex, @@ -202,7 +202,7 @@ impl BucketRateLimiter { info!(buckets = ?info, "endpoint rate limiter"); Self { info, - map: DashMap::with_hasher_and_shard_amount(hasher, 64), + map: ClashMap::with_hasher_and_shard_amount(hasher, 64), access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request rand: Mutex::new(rand), } diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index dddc7e2054db..dcb9a59f873b 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -1,7 +1,8 @@ +use std::io::ErrorKind; + use anyhow::Ok; use pq_proto::{id_to_cancel_key, CancelKeyData}; use serde::{Deserialize, Serialize}; -use std::io::ErrorKind; pub mod keyspace { pub const CANCEL_PREFIX: &str = "cancel"; diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index 
dcc6aac51bb7..3689bf7ae29b 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -1,7 +1,6 @@ use redis::{AsyncCommands, ToRedisArgs}; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; - use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; pub struct RedisKVClient { diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 44eac77e8f94..a300198de449 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -5,7 +5,7 @@ use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use std::time::Duration; -use dashmap::DashMap; +use clashmap::ClashMap; use parking_lot::RwLock; use postgres_client::ReadyForQueryStatus; use rand::Rng; @@ -351,11 +351,11 @@ where // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - pub(crate) global_pool: DashMap>>, + pub(crate) global_pool: ClashMap>>, /// Number of endpoint-connection pools /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// [`ClashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. pub(crate) global_pool_size: AtomicUsize, @@ -396,7 +396,7 @@ where pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), + global_pool: ClashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), config, global_connections_count: Arc::new(AtomicUsize::new(0)), @@ -442,10 +442,10 @@ where .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; - shard.retain(|endpoint, x| { + shard.retain(|(endpoint, x)| { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. 
- if let Some(pool) = Arc::get_mut(x.get_mut()) { + if let Some(pool) = Arc::get_mut(x) { let endpoints = pool.get_mut(); clients_removed = endpoints.clear_closed(); diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index e1cc7e87b4a4..d369e3742f82 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -10,9 +10,9 @@ use anyhow::{bail, Context}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; +use clashmap::mapref::entry::Entry; +use clashmap::ClashMap; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::mapref::entry::Entry; -use dashmap::DashMap; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; @@ -137,7 +137,7 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub(crate) struct Metrics { - endpoints: DashMap, FastHasher>, + endpoints: ClashMap, FastHasher>, } impl Metrics { @@ -213,7 +213,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result( - endpoints: &DashMap, FastHasher>, + endpoints: &ClashMap, FastHasher>, ) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); @@ -271,7 +271,7 @@ fn create_event_chunks<'a>( #[expect(clippy::too_many_arguments)] #[instrument(skip_all)] async fn collect_metrics_iteration( - endpoints: &DashMap, FastHasher>, + endpoints: &ClashMap, FastHasher>, client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, storage: Option<&GenericRemoteStorage>, From 6041a935918f34f06ec7f0fb93cc6fc97c1dc1dd Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 31 Jan 2025 10:54:31 +0100 Subject: [PATCH 23/78] Update tokio base crates (#10556) Update `tokio` base crates and their deps. Pin `tokio` to at least 1.41 which stabilized task ID APIs. To dedup `mio` dep the `notify` crate is updated. It's used in `compute_tools`. 
https://github.com/neondatabase/neon/blob/9f81828429ad6475b4fbb1a814240213b74bec63/compute_tools/src/pg_helpers.rs#L258-L367 --- Cargo.lock | 84 +++++++++++++++++++++++++++++++----------------------- Cargo.toml | 4 +-- 2 files changed, 51 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e9cbebcd0254..cada9604ff53 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -966,7 +966,7 @@ version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "cexpr", "clang-sys", "itertools 0.12.1", @@ -994,9 +994,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "block-buffer" @@ -1563,7 +1563,7 @@ version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "crossterm_winapi", "libc", "parking_lot 0.12.1", @@ -1794,7 +1794,7 @@ version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "byteorder", "chrono", "diesel_derives", @@ -3088,11 +3088,11 @@ dependencies = [ [[package]] name = "inotify" -version = "0.9.6" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.8.0", "inotify-sys", "libc", ] @@ -3269,9 +3269,9 @@ dependencies = [ [[package]] name = "kqueue" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c8fc60ba15bf51257aa9807a48a61013db043fcf3a78cb0d916e8e396dcad98" +checksum = "7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c" dependencies = [ "kqueue-sys", "libc", @@ -3279,9 +3279,9 @@ dependencies = [ [[package]] name = "kqueue-sys" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" dependencies = [ "bitflags 1.3.2", "libc", @@ -3308,9 +3308,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.167" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libloading" @@ -3557,14 +3557,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.11" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", "log", "wasi 
0.11.0+wasi-snapshot-preview1", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -3610,7 +3610,7 @@ version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "cfg-if", "libc", "memoffset 0.9.0", @@ -3628,12 +3628,11 @@ dependencies = [ [[package]] name = "notify" -version = "6.1.1" +version = "8.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" +checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943" dependencies = [ - "bitflags 2.4.1", - "crossbeam-channel", + "bitflags 2.8.0", "filetime", "fsevent-sys", "inotify", @@ -3641,10 +3640,17 @@ dependencies = [ "libc", "log", "mio", + "notify-types", "walkdir", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] +[[package]] +name = "notify-types" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d" + [[package]] name = "ntapi" version = "0.4.1" @@ -4705,7 +4711,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "chrono", "flate2", "hex", @@ -4720,7 +4726,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "chrono", "hex", ] @@ -5545,7 +5551,7 @@ version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys 0.4.14", @@ -6819,21 +6825,20 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.1" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", "bytes", "libc", "mio", - "num_cpus", "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -6864,9 +6869,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", @@ -7151,7 +7156,7 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "bytes", "http 1.1.0", "http-body 1.0.0", @@ -7654,9 +7659,9 @@ dependencies = [ [[package]] name = "walkdir" -version = "2.3.3" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" +checksum = 
"29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" dependencies = [ "same-file", "winapi-util", @@ -7908,6 +7913,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.0" diff --git a/Cargo.toml b/Cargo.toml index 9d15b78a9332..267a91d773c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -123,7 +123,7 @@ measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } -notify = "6.0.0" +notify = "8.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" @@ -177,7 +177,7 @@ test-context = "0.3" thiserror = "1.0" tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } -tokio = { version = "1.17", features = ["macros"] } +tokio = { version = "1.41", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" From 765ba43438f91bf75773e9de358dbf58f34c5d87 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 31 Jan 2025 13:33:24 +0300 Subject: [PATCH 24/78] Allow pageserver unreachable errors in test_scrubber_tenant_snapshot (#10585) ## Problem test_scrubber_tenant_snapshot restarts pageservers, but log validation fails tests on any non white listed storcon warnings, making the test flaky. ## Summary of changes Allow warns like 2025-01-29T12:37:42.622179Z WARN reconciler{seq=1 tenant_id=2011077aea9b4e8a60e8e8a19407634c shard_id=0004}: Call to node 2 (localhost:15352) management API failed, will retry (attempt 1): receive body: error sending request for url (http://localhost:15352/v1/tenant/2011077aea9b4e8a60e8e8a19407634c-0004/location_config): client error (Connect) ref https://github.com/neondatabase/neon/issues/10462 --- test_runner/regress/test_storage_scrubber.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 1304d302b79c..7e92cc01cd2f 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -32,6 +32,12 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 env = neon_env_builder.init_start() + # We restart pageserver(s), which will cause storage storage controller + # requests to fail and warn. 
+ env.storage_controller.allowed_errors.append(".*management API still failed.*") + env.storage_controller.allowed_errors.append( + ".*Reconcile error.*error sending request for url.*" + ) tenant_id = env.initial_tenant timeline_id = env.initial_timeline branch = "main" From f09cfd11cb3029f9395c0ad0bccd61d2a848a6db Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 Jan 2025 10:54:14 +0000 Subject: [PATCH 25/78] pageserver: exclude archived timelines from freeze+flush on shutdown (#10594) ## Problem If offloading races with normal shutdown, we get a "failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited". This is harmless but points to it being quite strange to try and freeze and flush such a timeline. flushing on shutdown for an archived timeline isn't useful. Related: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - During Timeline::shutdown, ignore ShutdownMode::FreezeAndFlush if the timeline is archived --- pageserver/src/tenant/timeline.rs | 71 ++++++++++++++++++------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 827601fa8b86..d6a8eaa4d957 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1818,7 +1818,7 @@ impl Timeline { self.last_record_lsn.shutdown(); if let ShutdownMode::FreezeAndFlush = mode { - if let Some((open, frozen)) = self + let do_flush = if let Some((open, frozen)) = self .layers .read() .await @@ -1827,43 +1827,54 @@ impl Timeline { .ok() .filter(|(open, frozen)| *open || *frozen > 0) { - tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + if self.remote_client.is_archived() == Some(true) { + // No point flushing on shutdown for an archived timeline: it is not important + // to have it nice and fresh after our restart, and trying to flush here might + // race with trying to offload it (which also stops the flush loop) + false + } else { + tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + true + } } else { - // this is double-shutdown, ignore it - } + // this is double-shutdown, it'll be a no-op + true + }; // we shut down walreceiver above, so, we won't add anything more // to the InMemoryLayer; freeze it and wait for all frozen layers // to reach the disk & upload queue, then shut the upload queue and // wait for it to drain. - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - self.remote_client.shutdown().await; - } - Err(FlushLayerError::Cancelled) => { - // this is likely the second shutdown, ignore silently. - // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 - debug_assert!(self.cancel.is_cancelled()); - } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. 
- warn!("failed to freeze and flush: {e:#}"); + if do_flush { + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + self.remote_client.shutdown().await; + } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); + } } - } - // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but - // we also do a final check here to ensure that the queue is empty. - if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but + // we also do a final check here to ensure that the queue is empty. + if !self.remote_client.no_pending_work() { + warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + } } } From 7d5c70c717c5ad543fdbd6115e422e27e0e86da9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 31 Jan 2025 12:23:12 +0100 Subject: [PATCH 26/78] Update AWS SDK crates (#10588) We want to keep the AWS SDK up to date as that way we benefit from new developments and improvements. 
Prior update was in #10056 --- Cargo.lock | 75 ++++++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cada9604ff53..6b63c3c388ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -290,9 +290,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.5.10" +version = "1.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" +checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90" dependencies = [ "aws-credential-types", "aws-runtime", @@ -301,7 +301,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.60.7", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -332,9 +332,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.4.4" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" +checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -366,7 +366,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -389,7 +389,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -414,7 +414,7 @@ dependencies = [ "aws-smithy-checksums", "aws-smithy-eventstream", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -437,15 +437,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.50.0" +version = "1.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" +checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -459,15 +459,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.51.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" +checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -481,15 +481,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.51.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" +checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-query", 
"aws-smithy-runtime", "aws-smithy-runtime-api", @@ -504,9 +504,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.6" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" +checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -533,9 +533,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.1" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" +checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" dependencies = [ "futures-util", "pin-project-lite", @@ -565,9 +565,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.5" +version = "0.60.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" +checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a" dependencies = [ "aws-smithy-types", "bytes", @@ -576,9 +576,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.11" +version = "0.60.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" +checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -597,18 +597,9 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" -dependencies = [ - "aws-smithy-types", -] - -[[package]] -name = "aws-smithy-json" -version = "0.61.1" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" +checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" dependencies = [ "aws-smithy-types", ] @@ -625,9 +616,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.4" +version = "1.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" +checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -669,9 +660,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.9" +version = "1.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" +checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97" dependencies = [ "base64-simd", "bytes", @@ -704,9 +695,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.3" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" +checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2" dependencies = [ "aws-credential-types", "aws-smithy-async", From afbcebe7f761dd555a3433aa34802b601367a82f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 31 Jan 2025 12:31:58 +0100 
Subject: [PATCH 27/78] test_runner: force-compact in `test_sharding_autosplit` (#10605) ## Problem This test may not fully detect data corruption during splits, since we don't force-compact the entire keyspace. ## Summary of changes Force-compact all data in `test_sharding_autosplit`. --- test_runner/performance/test_sharding_autosplit.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index 76c3ad01a4a7..e5a9f17da8c2 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -247,7 +247,7 @@ def assert_all_split(): log.info(f"{shard_zero_id} timeline: {timeline_info}") # Run compaction for all tenants, restart endpoint so that on subsequent reads we will - # definitely hit pageserver for reads. This compaction passis expected to drop unwanted + # definitely hit pageserver for reads. This compaction pass is expected to drop unwanted # layers but not do any rewrites (we're still in the same generation) for tenant_id, tenant_state in tenants.items(): tenant_state.endpoint.stop() @@ -296,6 +296,16 @@ def assert_all_split(): for fut in pgbench_futs: fut.result() + # Run a full forced compaction, to detect any data corruption. + for tenant_id, tenant_state in tenants.items(): + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_compact( + shard_id, + tenant_state.timeline_id, + force_image_layer_creation=True, + force_l0_compaction=True, + ) + # Assert that some rewrites happened # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers) From 89cff08354f7c2f2bbb0a92df2eca6de828fa4fe Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Fri, 31 Jan 2025 12:46:33 +0100 Subject: [PATCH 28/78] unify pg-build-nonroot-with-cargo base layer and config retries in curl (#10575) Ref: https://github.com/neondatabase/cloud/issues/23461 ## Problem Just made changes around and see these 2 base layers could be optimised. 
and after review comment from @myrrc setting up timeouts and retries in `alpine/curl` image ## Summary of changes --- compute/compute-node.Dockerfile | 43 +++++++++++++++------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1ef449f0b06c..32226c56a52d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -825,11 +825,11 @@ RUN case "${PG_VERSION}" in "v17") \ ######################################################################################### # -# Layer "rust extensions" -# This layer is used to build `pgrx` deps +# Layer "pg build with nonroot user and cargo installed" +# This layer is base and common for layers with `pgrx` # ######################################################################################### -FROM pg-build AS rust-extensions-build +FROM pg-build AS pg-build-nonroot-with-cargo ARG PG_VERSION RUN apt update && \ @@ -847,8 +847,18 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ - rm rustup-init && \ - case "${PG_VERSION}" in \ + rm rustup-init + +######################################################################################### +# +# Layer "rust extensions" +# This layer is used to build `pgrx` deps +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build +ARG PG_VERSION + +RUN case "${PG_VERSION}" in \ 'v17') \ echo 'v17 is not supported yet by pgrx. 
Quit' && exit 0;; \ esac && \ @@ -867,26 +877,10 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM pg-build AS rust-extensions-build-pgrx12 +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx12 ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ - apt clean && rm -rf /var/lib/apt/lists/* && \ - useradd -ms /bin/bash nonroot -b /home - -ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:$PATH" -USER nonroot -WORKDIR /home/nonroot - -RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc - -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ - chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ - rm rustup-init && \ - cargo install --locked --version 0.12.9 cargo-pgrx && \ +RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -1283,7 +1277,8 @@ FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters ARG TARGETARCH # Keep sql_exporter version same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py -RUN if [ "$TARGETARCH" = "amd64" ]; then\ +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \ + if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ From 503bc72d31ce15620479346dfa1081771a7f0a95 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 31 Jan 2025 11:48:46 +0000 Subject: [PATCH 29/78] CI: add `diesel print-schema` check (#10527) ## Problem We want to check that `diesel print-schema` doesn't generate any changes (`storage_controller/src/schema.rs`) in comparison with the list of migration. ## Summary of changes - Add `diesel_cli` to `build-tools` image - Add `Check diesel schema` step to `build-neon` job, at this stage we have all required binaries, so don't need to compile anything additionally - Check runs only on x86 release builds to be sure we do it at least once per CI run. 
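For reference, the new CI step boils down to roughly the following; this is a sketch rather than the exact script, with the binary paths and `DATABASE_URL` taken from the workflow change below:

```bash
# Point diesel at the storage controller database started by neon_local,
# regenerate the schema, and fail if it differs from the committed file.
export DATABASE_URL=postgresql://localhost:1235/storage_controller
/tmp/neon/bin/neon_local init
/tmp/neon/bin/neon_local storage_controller start
diesel print-schema > storage_controller/src/schema.rs
git diff --exit-code storage_controller/src/schema.rs
/tmp/neon/bin/neon_local storage_controller stop
```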
--- .github/workflows/_build-and-test-locally.yml | 20 +++++++++++++++++++ .github/workflows/_check-codestyle-rust.yml | 3 +++ build-tools.Dockerfile | 3 +++ 3 files changed, 26 insertions(+) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 2daed9038688..e9483492c967 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -271,6 +271,26 @@ jobs: path: /tmp/neon aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Check diesel schema + if: inputs.build-type == 'release' && inputs.arch == 'x64' + env: + DATABASE_URL: postgresql://localhost:1235/storage_controller + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + run: | + /tmp/neon/bin/neon_local init + /tmp/neon/bin/neon_local storage_controller start + + diesel print-schema > storage_controller/src/schema.rs + + if [ -n "$(git diff storage_controller/src/schema.rs)" ]; then + echo >&2 "Uncommitted changes in diesel schema" + + git diff . + exit 1 + fi + + /tmp/neon/bin/neon_local storage_controller stop + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: inputs.build-type == 'debug' diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index cbc47c640640..f7518d650027 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -16,6 +16,9 @@ defaults: run: shell: bash -euxo pipefail {0} +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. +permissions: {} + jobs: check-codestyle-rust: strategy: diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 9c13e480c125..dfcc7d06b493 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -261,6 +261,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33 ARG CARGO_DENY_VERSION=0.16.2 ARG CARGO_HACK_VERSION=0.6.33 ARG CARGO_NEXTEST_VERSION=0.9.85 +ARG CARGO_DIESEL_CLI_VERSION=2.2.6 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -274,6 +275,8 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ + cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \ + --features postgres-bundled --no-default-features && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git From dce617fe070e8528aba5a5628b138e2fe3eb4c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 31 Jan 2025 13:40:20 +0100 Subject: [PATCH 30/78] Update to rebased rust-postgres (#10584) Update to a rebased version of our rust-postgres patches, rebased on [this](https://github.com/sfackler/rust-postgres/commit/98f5a11bc0a8e451552d8941ffa078c7eb6cd60c) commit this time. With #10280 reapplied, this means that the rust-postgres crates will be deduplicated, as the new crate versions are finally compatible with the requirements of diesel-async. 
Earlier update: #10561 rust-postgres PR: https://github.com/neondatabase/rust-postgres/pull/39 --- Cargo.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6b63c3c388ce..cdc620e48575 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4465,8 +4465,8 @@ dependencies = [ [[package]] name = "postgres" -version = "0.19.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +version = "0.19.7" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "bytes", "fallible-iterator", @@ -4479,9 +4479,9 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ - "base64 0.21.1", + "base64 0.22.1", "byteorder", "bytes", "fallible-iterator", @@ -4513,7 +4513,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "bytes", "chrono", @@ -6871,8 +6871,8 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.9" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +version = "0.7.10" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "async-trait", "byteorder", From 10cf5e7a38d45037f3f51b666c519b7e5c6c72a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Fri, 31 Jan 2025 14:42:59 +0100 Subject: [PATCH 31/78] Move cargo-deny into a separate workflow on a schedule (#10289) ## Problem There are two (related) problems with the previous handling of `cargo-deny`: - When a new advisory is added to rustsec that affects a dependency, unrelated pull requests will fail. - New advisories rely on pushes or PRs to be surfaced. Problems that already exist on main will only be found if we try to merge new things into main. ## Summary of changes We split out `cargo-deny` into a separate workflow that runs on all PRs that touch `Cargo.lock`, and on a schedule on `main`, `release`, `release-compute` and `release-proxy` to find new advisories. 
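For reference, the two modes amount to the following invocations, mirroring the workflow added below:

```bash
# Pull requests that touch Cargo.lock: run the full set of checks.
cargo deny check --hide-inclusion-graph all

# Nightly schedule on main and the release branches: look only for new advisories.
cargo deny check --hide-inclusion-graph advisories
```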
--- .github/actionlint.yml | 1 + .github/file-filters.yaml | 1 + .github/workflows/_check-codestyle-rust.yml | 5 -- .github/workflows/build_and_test.yml | 39 +++++++++++++- .github/workflows/cargo-deny.yml | 57 +++++++++++++++++++++ .github/workflows/pre-merge-checks.yml | 3 +- 6 files changed, 99 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/cargo-deny.yml diff --git a/.github/actionlint.yml b/.github/actionlint.yml index ecff0cc70b22..2b96ce95da32 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -27,3 +27,4 @@ config-variables: - SLACK_ON_CALL_QA_STAGING_STREAM - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN - SLACK_ON_CALL_STORAGE_STAGING_STREAM + - SLACK_CICD_CHANNEL_ID diff --git a/.github/file-filters.yaml b/.github/file-filters.yaml index 886cd3919ac2..02ee383d5ed3 100644 --- a/.github/file-filters.yaml +++ b/.github/file-filters.yaml @@ -1,4 +1,5 @@ rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock'] +rust_dependencies: ['**/Cargo.lock'] v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**'] v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**'] diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index f7518d650027..c4c76914aa64 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -87,8 +87,3 @@ jobs: run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} - run: cargo deny check --hide-inclusion-graph diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e588fc5a0e88..1274543429ca 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -45,6 +45,26 @@ jobs: run cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" + files-changed: + needs: [ check-permissions ] + runs-on: [ self-hosted, small ] + timeout-minutes: 3 + outputs: + check-rust-dependencies: ${{ steps.files-changed.outputs.rust_dependencies }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + + - name: Check for file changes + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + id: files-changed + with: + token: ${{ secrets.GITHUB_TOKEN }} + filters: .github/file-filters.yaml + tag: needs: [ check-permissions ] runs-on: [ self-hosted, small ] @@ -170,6 +190,14 @@ jobs: archs: '["x64", "arm64"]' secrets: inherit + check-dependencies-rust: + needs: [ files-changed, build-build-tools-image ] + if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' }} + uses: ./.github/workflows/cargo-deny.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit + build-and-test-locally: needs: [ tag, build-build-tools-image ] strategy: @@ -1332,6 +1360,8 @@ jobs: - build-and-test-locally - check-codestyle-python - check-codestyle-rust + - check-dependencies-rust + - files-changed - promote-images-dev - test-images - trigger-custom-extensions-build-and-wait @@ -1344,4 +1374,11 @@ jobs: if: | contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - || contains(needs.*.result, 'skipped') + || (needs.check-dependencies-rust.result == 'skipped' && 
needs.files-changed.outputs.check-rust-dependencies == 'true') + || needs.build-and-test-locally.result == 'skipped' + || needs.check-codestyle-python.result == 'skipped' + || needs.check-codestyle-rust.result == 'skipped' + || needs.files-changed.result == 'skipped' + || needs.promote-images-dev.result == 'skipped' + || needs.test-images.result == 'skipped' + || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' diff --git a/.github/workflows/cargo-deny.yml b/.github/workflows/cargo-deny.yml new file mode 100644 index 000000000000..433b377c327e --- /dev/null +++ b/.github/workflows/cargo-deny.yml @@ -0,0 +1,57 @@ +name: cargo deny checks + +on: + workflow_call: + inputs: + build-tools-image: + required: false + type: string + schedule: + - cron: '0 0 * * *' + +jobs: + cargo-deny: + strategy: + matrix: + ref: >- + ${{ + fromJSON( + github.event_name == 'schedule' + && '["main","release","release-proxy","release-compute"]' + || format('["{0}"]', github.sha) + ) + }} + + runs-on: [self-hosted, small] + + container: + image: ${{ inputs.build-tools-image || 'neondatabase/build-tools:pinned' }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ matrix.ref }} + + - name: Check rust licenses/bans/advisories/sources + env: + CARGO_DENY_TARGET: >- + ${{ github.event_name == 'schedule' && 'advisories' || 'all' }} + run: cargo deny check --hide-inclusion-graph $CARGO_DENY_TARGET + + - name: Post to a Slack channel + if: ${{ github.event_name == 'schedule' && failure() }} + uses: slackapi/slack-github-action@v2 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_CICD_CHANNEL_ID }} + text: | + Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + Pinging @oncall-devprod. diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index e6dfbaeed871..e92a153db90a 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -124,6 +124,7 @@ jobs: - name: Fail the job if any of the dependencies do not succeed or skipped run: exit 1 if: | - (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true') + (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') From a93e9f22fc0e35ad2863d1e57db9f3a01326b710 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 Jan 2025 17:43:31 +0000 Subject: [PATCH 32/78] pageserver: remove faulty debug assertion in compaction (#10610) ## Problem This assertion is incorrect: it is legal to see another shard's data at this point, after a shard split. 
Closes: https://github.com/neondatabase/neon/issues/10609 ## Summary of changes - Remove faulty assertion --- pageserver/src/tenant/timeline/compaction.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7242f73a82d8..7244e946cb79 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1503,11 +1503,9 @@ impl Timeline { .await .map_err(CompactionError::Other)?; } else { - let shard = self.shard_identity.shard_index(); let owner = self.shard_identity.get_shard_number(&key); - if cfg!(debug_assertions) { - panic!("key {key} does not belong on shard {shard}, owned by {owner}"); - } + + // This happens after a shard split, when we're compacting an L0 created by our parent shard debug!("dropping key {key} during compaction (it belongs on shard {owner})"); } From aedeb1c7c277703af861894455764a8c248df9b8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 Jan 2025 17:43:54 +0000 Subject: [PATCH 33/78] pageserver: revise logging of cancelled request results (#10604) ## Problem When a client dropped before a request completed, and a handler returned an ApiError, we would log that at error severity. That was excessive in the case of a request erroring on a shutdown, and could cause test flakes. example: https://neon-github-public-dev.s3.amazonaws.com/reports/main/13067651123/index.html#suites/ad9c266207b45eafe19909d1020dd987/6021ce86a0d72ae7/ ``` Cancelled request finished with an error: ShuttingDown ``` ## Summary of changes - Log a different info-level on ShuttingDown and ResourceUnavailable API errors from cancelled requests --- pageserver/src/http/routes.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index eb9cb4da0c20..94f7510a4abf 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3393,7 +3393,17 @@ where let status = response.status(); info!(%status, "Cancelled request finished successfully") } - Err(e) => error!("Cancelled request finished with an error: {e:?}"), + Err(e) => match e { + ApiError::ShuttingDown | ApiError::ResourceUnavailable(_) => { + // Don't log this at error severity: they are normal during lifecycle of tenants/process + info!("Cancelled request aborted for shutdown") + } + _ => { + // Log these in a highly visible way, because we have no client to send the response to, but + // would like to know that something went wrong. + error!("Cancelled request finished with an error: {e:?}") + } + }, } } // only logging for cancelled panicked request handlers is the tracing_panic_hook, From 48c87dc458a84fa9132ff22b1ae1fcc3d3094cda Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 31 Jan 2025 18:07:26 +0000 Subject: [PATCH 34/78] CI(pre-merge-checks): fix condition (#10617) ## Problem Merge Queue fails if changes include Rust code. 
## Summary of changes - Fix condition for `build-build-tools-image` - Add a couple of no-op `false ||` to make predicates look symmetric --- .github/workflows/pre-merge-checks.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index e92a153db90a..d39ccecac9e9 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -59,7 +59,10 @@ jobs: echo "${RUST_CHANGED_FILES}" build-build-tools-image: - if: needs.get-changed-files.outputs.python-changed == 'true' + if: | + false + || needs.get-changed-files.outputs.python-changed == 'true' + || needs.get-changed-files.outputs.rust-changed == 'true' needs: [ get-changed-files ] uses: ./.github/workflows/build-build-tools-image.yml with: @@ -124,7 +127,8 @@ jobs: - name: Fail the job if any of the dependencies do not succeed or skipped run: exit 1 if: | - (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') - || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') + false + || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') From bc7822d90c82046de709b211faa03d3f720a6931 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 31 Jan 2025 19:41:17 +0100 Subject: [PATCH 35/78] temporarily disable some steps and run more often to expose more pgbench --initialize in benchmarking workflow (#10616) ## Problem we want to disable some steps in benchmarking workflow that do not initialize new projects and instead run the test more frequently Test run https://github.com/neondatabase/neon/actions/runs/13077737888 --- .github/workflows/benchmarking.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 49f23e895b22..20a8a6e2c9dc 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,8 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 3 * * *' # run once a day, timezone is utc + # - cron: '0 3 * * *' # run once a day, timezone is utc + - cron: '0 */10 * * *' # Runs every 10 hours at minute 0 workflow_dispatch: # adds ability to run this manually inputs: region_id: @@ -550,6 +551,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + if: false permissions: contents: write statuses: write @@ -683,7 +685,8 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + if: false permissions: contents: write statuses: write @@ -810,7 +813,8 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. 
# # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + if: false permissions: contents: write statuses: write @@ -929,7 +933,8 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + if: false permissions: contents: write statuses: write From fcd195c2b63fdfbb5323258a5422469e1e850175 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 31 Jan 2025 13:04:26 -0600 Subject: [PATCH 36/78] Migrate compute_ctl arg parsing to clap derive (#10497) The primary benefit is that all the ad hoc get_matches() calls are no longer necessary. Now all it takes to get at the CLI arguments is referencing a struct member. It's also great the we can replace the ad hoc CLI struct we had with this more formal solution. Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 346 +++++++++------------------ 1 file changed, 111 insertions(+), 235 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b98cf706d343..47fc9cb7fe9d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -34,6 +34,7 @@ //! -r http://pg-ext-s3-gateway \ //! ``` use std::collections::HashMap; +use std::ffi::OsString; use std::fs::File; use std::path::Path; use std::process::exit; @@ -44,7 +45,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; -use clap::Arg; +use clap::Parser; use compute_tools::disk_quota::set_disk_quota; use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; @@ -73,10 +74,75 @@ use utils::failpoint_support; // in-case of not-set environment var const BUILD_TAG_DEFAULT: &str = "latest"; +// Compatibility hack: if the control plane specified any remote-ext-config +// use the default value for extension storage proxy gateway. 
+// Remove this once the control plane is updated to pass the gateway URL +fn parse_remote_ext_config(arg: &str) -> Result { + if arg.starts_with("http") { + Ok(arg.trim_end_matches('/').to_string()) + } else { + Ok("http://pg-ext-s3-gateway".to_string()) + } +} + +#[derive(Parser)] +#[command(rename_all = "kebab-case")] +struct Cli { + #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] + pub pgbin: String, + + #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] + pub remote_ext_config: Option, + + #[arg(long, default_value_t = 3080)] + pub http_port: u16, + + #[arg(short = 'D', long, value_name = "DATADIR")] + pub pgdata: String, + + #[arg(short = 'C', long, value_name = "DATABASE_URL")] + pub connstr: String, + + #[cfg(target_os = "linux")] + #[arg(long, default_value = "neon-postgres")] + pub cgroup: String, + + #[cfg(target_os = "linux")] + #[arg( + long, + default_value = "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor" + )] + pub filecache_connstr: String, + + #[cfg(target_os = "linux")] + #[arg(long, default_value = "0.0.0.0:10301")] + pub vm_monitor_addr: String, + + #[arg(long, action = clap::ArgAction::SetTrue)] + pub resize_swap_on_bind: bool, + + #[arg(long)] + pub set_disk_quota_for_fs: Option, + + #[arg(short = 's', long = "spec", group = "spec")] + pub spec_json: Option, + + #[arg(short = 'S', long, group = "spec-path")] + pub spec_path: Option, + + #[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])] + pub compute_id: Option, + + #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")] + pub control_plane_uri: Option, +} + fn main() -> Result<()> { - let scenario = failpoint_support::init(); + let cli = Cli::parse(); - let (build_tag, clap_args) = init()?; + let build_tag = init()?; + + let scenario = failpoint_support::init(); // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; @@ -85,13 +151,11 @@ fn main() -> Result<()> { // Enter startup tracing context let _startup_context_guard = startup_context_from_env(); - let cli_args = process_cli(&clap_args)?; - - let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; + let cli_spec = try_spec_from_cli(&cli)?; - let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; + let compute = wait_spec(build_tag, &cli, cli_spec)?; - start_postgres(&clap_args, wait_spec_result)? + start_postgres(&cli, compute)? // Startup is finished, exit the startup tracing span }; @@ -108,7 +172,7 @@ fn main() -> Result<()> { deinit_and_exit(wait_pg_result); } -fn init() -> Result<(String, clap::ArgMatches)> { +fn init() -> Result { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; @@ -123,66 +187,7 @@ fn init() -> Result<(String, clap::ArgMatches)> { .to_string(); info!("build_tag: {build_tag}"); - Ok((build_tag, cli().get_matches())) -} - -fn process_cli(matches: &clap::ArgMatches) -> Result { - let pgbin_default = "postgres"; - let pgbin = matches - .get_one::("pgbin") - .map(|s| s.as_str()) - .unwrap_or(pgbin_default); - - let ext_remote_storage = matches - .get_one::("remote-ext-config") - // Compatibility hack: if the control plane specified any remote-ext-config - // use the default value for extension storage proxy gateway. 
- // Remove this once the control plane is updated to pass the gateway URL - .map(|conf| { - if conf.starts_with("http") { - conf.trim_end_matches('/') - } else { - "http://pg-ext-s3-gateway" - } - }); - - let http_port = *matches - .get_one::("http-port") - .expect("http-port is required"); - let pgdata = matches - .get_one::("pgdata") - .expect("PGDATA path is required"); - let connstr = matches - .get_one::("connstr") - .expect("Postgres connection string is required"); - let spec_json = matches.get_one::("spec"); - let spec_path = matches.get_one::("spec-path"); - let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); - let set_disk_quota_for_fs = matches.get_one::("set-disk-quota-for-fs"); - - Ok(ProcessCliResult { - connstr, - pgdata, - pgbin, - ext_remote_storage, - http_port, - spec_json, - spec_path, - resize_swap_on_bind, - set_disk_quota_for_fs, - }) -} - -struct ProcessCliResult<'clap> { - connstr: &'clap str, - pgdata: &'clap str, - pgbin: &'clap str, - ext_remote_storage: Option<&'clap str>, - http_port: u16, - spec_json: Option<&'clap String>, - spec_path: Option<&'clap String>, - resize_swap_on_bind: bool, - set_disk_quota_for_fs: Option<&'clap String>, + Ok(build_tag) } fn startup_context_from_env() -> Option { @@ -235,19 +240,9 @@ fn startup_context_from_env() -> Option { } } -fn try_spec_from_cli( - matches: &clap::ArgMatches, - ProcessCliResult { - spec_json, - spec_path, - .. - }: &ProcessCliResult, -) -> Result { - let compute_id = matches.get_one::("compute-id"); - let control_plane_uri = matches.get_one::("control-plane-uri"); - +fn try_spec_from_cli(cli: &Cli) -> Result { // First, try to get cluster spec from the cli argument - if let Some(spec_json) = spec_json { + if let Some(ref spec_json) = cli.spec_json { info!("got spec from cli argument {}", spec_json); return Ok(CliSpecParams { spec: Some(serde_json::from_str(spec_json)?), @@ -256,7 +251,7 @@ fn try_spec_from_cli( } // Second, try to read it from the file if path is provided - if let Some(spec_path) = spec_path { + if let Some(ref spec_path) = cli.spec_path { let file = File::open(Path::new(spec_path))?; return Ok(CliSpecParams { spec: Some(serde_json::from_reader(file)?), @@ -264,17 +259,20 @@ fn try_spec_from_cli( }); } - let Some(compute_id) = compute_id else { + if cli.compute_id.is_none() { panic!( "compute spec should be provided by one of the following ways: \ --spec OR --spec-path OR --control-plane-uri and --compute-id" ); }; - let Some(control_plane_uri) = control_plane_uri else { + if cli.control_plane_uri.is_none() { panic!("must specify both --control-plane-uri and --compute-id or none"); }; - match get_spec_from_control_plane(control_plane_uri, compute_id) { + match get_spec_from_control_plane( + cli.control_plane_uri.as_ref().unwrap(), + cli.compute_id.as_ref().unwrap(), + ) { Ok(spec) => Ok(CliSpecParams { spec, live_config_allowed: true, @@ -298,21 +296,12 @@ struct CliSpecParams { fn wait_spec( build_tag: String, - ProcessCliResult { - connstr, - pgdata, - pgbin, - ext_remote_storage, - resize_swap_on_bind, - set_disk_quota_for_fs, - http_port, - .. 
- }: ProcessCliResult, + cli: &Cli, CliSpecParams { spec, live_config_allowed, }: CliSpecParams, -) -> Result { +) -> Result> { let mut new_state = ComputeState::new(); let spec_set; @@ -324,7 +313,7 @@ fn wait_spec( } else { spec_set = false; } - let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?; + let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; let conn_conf = postgres::config::Config::from_str(connstr.as_str()) .context("cannot build postgres config from connstr")?; let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) @@ -333,14 +322,14 @@ fn wait_spec( connstr, conn_conf, tokio_conn_conf, - pgdata: pgdata.to_string(), - pgbin: pgbin.to_string(), - pgversion: get_pg_version_string(pgbin), - http_port, + pgdata: cli.pgdata.clone(), + pgbin: cli.pgbin.clone(), + pgversion: get_pg_version_string(&cli.pgbin), + http_port: cli.http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), - ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), + ext_remote_storage: cli.remote_ext_config.clone(), ext_download_progress: RwLock::new(HashMap::new()), build_tag, }; @@ -357,7 +346,7 @@ fn wait_spec( // Launch http service first, so that we can serve control-plane requests // while configuration is still in progress. let _http_handle = - launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); + launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread"); if !spec_set { // No spec provided, hang waiting for it. @@ -389,27 +378,12 @@ fn wait_spec( launch_lsn_lease_bg_task_for_static(&compute); - Ok(WaitSpecResult { - compute, - resize_swap_on_bind, - set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), - }) -} - -struct WaitSpecResult { - compute: Arc, - resize_swap_on_bind: bool, - set_disk_quota_for_fs: Option, + Ok(compute) } fn start_postgres( - // need to allow unused because `matches` is only used if target_os = "linux" - #[allow(unused_variables)] matches: &clap::ArgMatches, - WaitSpecResult { - compute, - resize_swap_on_bind, - set_disk_quota_for_fs, - }: WaitSpecResult, + cli: &Cli, + compute: Arc, ) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); @@ -437,7 +411,7 @@ fn start_postgres( let mut delay_exit = false; // Resize swap to the desired size if the compute spec says so - if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) { // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion // *before* starting postgres. // @@ -464,9 +438,9 @@ fn start_postgres( // Set disk quota if the compute spec says so if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = - (disk_quota_bytes, set_disk_quota_for_fs) + (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref()) { - match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) { + match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) { Ok(()) => { let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. 
info!(%disk_quota_bytes, %size_mib, "set disk quota"); @@ -509,13 +483,7 @@ fn start_postgres( if #[cfg(target_os = "linux")] { use std::env; use tokio_util::sync::CancellationToken; - let vm_monitor_addr = matches - .get_one::("vm-monitor-addr") - .expect("--vm-monitor-addr should always be set because it has a default arg"); - let file_cache_connstr = matches.get_one::("filecache-connstr"); - let cgroup = matches.get_one::("cgroup"); - // Only make a runtime if we need to. // Note: it seems like you can make a runtime in an inner scope and // if you start a task in it it won't be dropped. However, make it // in the outermost scope just to be safe. @@ -538,15 +506,15 @@ fn start_postgres( let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { None } else { - file_cache_connstr.cloned() + Some(cli.filecache_connstr.clone()) }; let vm_monitor = rt.as_ref().map(|rt| { rt.spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { - cgroup: cgroup.cloned(), + cgroup: Some(cli.cgroup.clone()), pgconnstr, - addr: vm_monitor_addr.clone(), + addr: cli.vm_monitor_addr.clone(), })), token.clone(), )) @@ -702,105 +670,6 @@ fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! { exit(exit_code.unwrap_or(1)) } -fn cli() -> clap::Command { - // Env variable is set by `cargo` - let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); - clap::Command::new("compute_ctl") - .version(version) - .arg( - Arg::new("http-port") - .long("http-port") - .value_name("HTTP_PORT") - .default_value("3080") - .value_parser(clap::value_parser!(u16)) - .required(false), - ) - .arg( - Arg::new("connstr") - .short('C') - .long("connstr") - .value_name("DATABASE_URL") - .required(true), - ) - .arg( - Arg::new("pgdata") - .short('D') - .long("pgdata") - .value_name("DATADIR") - .required(true), - ) - .arg( - Arg::new("pgbin") - .short('b') - .long("pgbin") - .default_value("postgres") - .value_name("POSTGRES_PATH"), - ) - .arg( - Arg::new("spec") - .short('s') - .long("spec") - .value_name("SPEC_JSON"), - ) - .arg( - Arg::new("spec-path") - .short('S') - .long("spec-path") - .value_name("SPEC_PATH"), - ) - .arg( - Arg::new("compute-id") - .short('i') - .long("compute-id") - .value_name("COMPUTE_ID"), - ) - .arg( - Arg::new("control-plane-uri") - .short('p') - .long("control-plane-uri") - .value_name("CONTROL_PLANE_API_BASE_URI"), - ) - .arg( - Arg::new("remote-ext-config") - .short('r') - .long("remote-ext-config") - .value_name("REMOTE_EXT_CONFIG"), - ) - // TODO(fprasx): we currently have default arguments because the cloud PR - // to pass them in hasn't been merged yet. We should get rid of them once - // the PR is merged. - .arg( - Arg::new("vm-monitor-addr") - .long("vm-monitor-addr") - .default_value("0.0.0.0:10301") - .value_name("VM_MONITOR_ADDR"), - ) - .arg( - Arg::new("cgroup") - .long("cgroup") - .default_value("neon-postgres") - .value_name("CGROUP"), - ) - .arg( - Arg::new("filecache-connstr") - .long("filecache-connstr") - .default_value( - "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor", - ) - .value_name("FILECACHE_CONNSTR"), - ) - .arg( - Arg::new("resize-swap-on-bind") - .long("resize-swap-on-bind") - .action(clap::ArgAction::SetTrue), - ) - .arg( - Arg::new("set-disk-quota-for-fs") - .long("set-disk-quota-for-fs") - .value_name("SET_DISK_QUOTA_FOR_FS") - ) -} - /// When compute_ctl is killed, send also termination signal to sync-safekeepers /// to prevent leakage. 
TODO: it is better to convert compute_ctl to async and /// wait for termination which would be easy then. @@ -810,7 +679,14 @@ fn handle_exit_signal(sig: i32) { exit(1); } -#[test] -fn verify_cli() { - cli().debug_assert() +#[cfg(test)] +mod test { + use clap::CommandFactory; + + use super::Cli; + + #[test] + fn verify_cli() { + Cli::command().debug_assert() + } } From ad1a41157affa94c9e818239b7c2d9fd26bb3de6 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 31 Jan 2025 19:14:27 +0000 Subject: [PATCH 37/78] feat(proxy): optimizing the chances of large write in copy_bidirectional (#10608) We forked copy_bidirectional to solve some issues like fast-shutdown (disallowing half-open connections) and to introduce better error tracking (which side of the conn closed down). A change recently made its way upstream offering performance improvements: https://github.com/tokio-rs/tokio/pull/6532. These seem applicable to our fork, thus it makes sense to apply them here as well. --- proxy/src/proxy/copy_bidirectional.rs | 35 +++++++++++++++------------ 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 3336a9556a5b..861f1766e84c 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -201,25 +201,26 @@ impl CopyBuffer { W: AsyncWrite + ?Sized, { loop { - // If our buffer is empty, then we need to read some data to - // continue. - if self.pos == self.cap && !self.read_done { - self.pos = 0; - self.cap = 0; - + // If there is some space left in our buffer, then we try to read some + // data to continue, thus maximizing the chances of a large write. + if self.cap < self.buf.len() && !self.read_done { match self.poll_fill_buf(cx, reader.as_mut()) { Poll::Ready(Ok(())) => (), Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))), Poll::Pending => { - // Try flushing when the reader has no progress to avoid deadlock - // when the reader depends on buffered writer. - if self.need_flush { - ready!(writer.as_mut().poll_flush(cx)) - .map_err(ErrorDirection::Write)?; - self.need_flush = false; + // Ignore pending reads when our buffer is not empty, because + // we can try to write data immediately. + if self.pos == self.cap { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx)) + .map_err(ErrorDirection::Write)?; + self.need_flush = false; + } + + return Poll::Pending; } - - return Poll::Pending; } } } @@ -246,9 +247,13 @@ impl CopyBuffer { "writer returned length larger than input slice" ); + // All data has been written, the buffer can be considered empty again + self.pos = 0; + self.cap = 0; + // If we've written all the data and we've seen EOF, flush out the // data and finish the transfer. 
- if self.pos == self.cap && self.read_done { + if self.read_done { ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?; return Poll::Ready(Ok(self.amt)); } From 6dd48ba148af2eaf90c9d8b5505a760a9995f173 Mon Sep 17 00:00:00 2001 From: Stefan Radig Date: Fri, 31 Jan 2025 21:32:57 +0100 Subject: [PATCH 38/78] feat(proxy): Implement access control with VPC endpoint checks and block for public internet / VPC (#10143) - Wired up filtering on VPC endpoints - Wired up block access from public internet / VPC depending on per project flag - Added cache invalidation for VPC endpoints (partially based on PR from Raphael) - Removed BackendIpAllowlist trait --------- Co-authored-by: Ivan Efremov --- proxy/src/auth/backend/console_redirect.rs | 32 ++- proxy/src/auth/backend/mod.rs | 152 ++++++++---- proxy/src/auth/mod.rs | 25 ++ proxy/src/bin/local_proxy.rs | 1 + proxy/src/bin/proxy.rs | 1 + proxy/src/cache/project_info.rs | 224 +++++++++++++++++- proxy/src/cancellation.rs | 55 ++++- proxy/src/config.rs | 1 + proxy/src/console_redirect_proxy.rs | 3 +- proxy/src/context/mod.rs | 11 +- .../control_plane/client/cplane_proxy_v1.rs | 174 +++++++++++++- proxy/src/control_plane/client/mock.rs | 42 +++- proxy/src/control_plane/client/mod.rs | 51 +++- proxy/src/control_plane/messages.rs | 26 +- proxy/src/control_plane/mod.rs | 33 ++- proxy/src/intern.rs | 22 +- proxy/src/metrics.rs | 13 + proxy/src/proxy/mod.rs | 3 +- proxy/src/proxy/tests/mod.rs | 16 +- proxy/src/redis/notifications.rs | 48 +++- proxy/src/serverless/backend.rs | 42 +++- proxy/src/types.rs | 2 + 22 files changed, 844 insertions(+), 133 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 1cbf91d3ae73..9be29c38c938 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -7,8 +7,8 @@ use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; -use super::{ComputeCredentialKeys, ControlPlaneApi}; -use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; +use super::ComputeCredentialKeys; +use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; @@ -84,26 +84,15 @@ pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } -#[async_trait] -impl BackendIpAllowlist for ConsoleRedirectBackend { - async fn get_allowed_ips( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result> { - self.api - .get_allowed_ips_and_secret(ctx, user_info) - .await - .map(|(ips, _)| ips.as_ref().clone()) - .map_err(|e| e.into()) - } -} - impl ConsoleRedirectBackend { pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self { Self { console_uri, api } } + pub(crate) fn get_api(&self) -> &cplane_proxy_v1::NeonControlPlaneClient { + &self.api + } + pub(crate) async fn authenticate( &self, ctx: &RequestContext, @@ -191,6 +180,15 @@ async fn authenticate( } } + // Check if the access over the public internet is allowed, otherwise block. Note that + // the console redirect is not behind the VPC service endpoint, so we don't need to check + // the VPC endpoint ID. 
+ if let Some(public_access_allowed) = db_info.public_access_allowed { + if !public_access_allowed { + return Err(auth::AuthError::NetworkNotAllowed); + } + } + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; // This config should be self-contained, because we won't diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index d17d91a56d96..7ef096207aed 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -26,10 +26,12 @@ use crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::{ - self, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, + self, AccessBlockerFlags, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, }; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; +use crate::protocol2::ConnectionInfoExtra; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; @@ -99,6 +101,13 @@ impl Backend<'_, T> { Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } + + pub(crate) fn get_api(&self) -> &ControlPlaneClient { + match self { + Self::ControlPlane(api, _) => api, + Self::Local(_) => panic!("Local backend has no API"), + } + } } impl<'a, T> Backend<'a, T> { @@ -247,15 +256,6 @@ impl AuthenticationConfig { } } -#[async_trait::async_trait] -pub(crate) trait BackendIpAllowlist { - async fn get_allowed_ips( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result>; -} - /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. /// @@ -282,23 +282,51 @@ async fn auth_quirks( Ok(info) => (info, None), }; - debug!("fetching user's authentication info"); - let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; + debug!("fetching authentication info and allowlists"); // check allowed list - if config.ip_allowlist_check_enabled - && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) - { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + let allowed_ips = if config.ip_allowlist_check_enabled { + let allowed_ips = api.get_allowed_ips(ctx, &info).await?; + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + } + allowed_ips + } else { + Cached::new_uncached(Arc::new(vec![])) + }; + + // check if a VPC endpoint ID is coming in and if yes, if it's allowed + let access_blocks = api.get_block_public_or_vpc_access(ctx, &info).await?; + if config.is_vpc_acccess_proxy { + if access_blocks.vpc_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + + let incoming_vpc_endpoint_id = match ctx.extra() { + None => return Err(AuthError::MissingEndpointName), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + let allowed_vpc_endpoint_ids = api.get_allowed_vpc_endpoint_ids(ctx, &info).await?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. 
+ if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id) + { + return Err(AuthError::vpc_endpoint_id_not_allowed( + incoming_vpc_endpoint_id, + )); + } + } else if access_blocks.public_access_blocked { + return Err(AuthError::NetworkNotAllowed); } if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } - let cached_secret = match maybe_secret { - Some(secret) => secret, - None => api.get_role_secret(ctx, &info).await?, - }; + let cached_secret = api.get_role_secret(ctx, &info).await?; let (cached_entry, secret) = cached_secret.take_value(); let secret = if let Some(secret) = secret { @@ -440,34 +468,38 @@ impl Backend<'_, ComputeUserInfo> { } } - pub(crate) async fn get_allowed_ips_and_secret( + pub(crate) async fn get_allowed_ips( + &self, + ctx: &RequestContext, + ) -> Result { + match self { + Self::ControlPlane(api, user_info) => api.get_allowed_ips(ctx, user_info).await, + Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), + } + } + + pub(crate) async fn get_allowed_vpc_endpoint_ids( &self, ctx: &RequestContext, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + ) -> Result { match self { Self::ControlPlane(api, user_info) => { - api.get_allowed_ips_and_secret(ctx, user_info).await + api.get_allowed_vpc_endpoint_ids(ctx, user_info).await } - Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), } } -} -#[async_trait::async_trait] -impl BackendIpAllowlist for Backend<'_, ()> { - async fn get_allowed_ips( + pub(crate) async fn get_block_public_or_vpc_access( &self, ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result> { - let auth_data = match self { - Self::ControlPlane(api, ()) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), - }; - - auth_data - .map(|(ips, _)| ips.as_ref().clone()) - .map_err(|e| e.into()) + ) -> Result { + match self { + Self::ControlPlane(api, user_info) => { + api.get_block_public_or_vpc_access(ctx, user_info).await + } + Self::Local(_) => Ok(Cached::new_uncached(AccessBlockerFlags::default())), + } } } @@ -514,7 +546,10 @@ mod tests { use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; use crate::context::RequestContext; - use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret}; + use crate::control_plane::{ + self, AccessBlockerFlags, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, + }; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; use crate::scram::threadpool::ThreadPool; @@ -523,6 +558,8 @@ mod tests { struct Auth { ips: Vec, + vpc_endpoint_ids: Vec, + access_blocker_flags: AccessBlockerFlags, secret: AuthSecret, } @@ -535,17 +572,31 @@ mod tests { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( + &self, + _ctx: &RequestContext, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedAllowedIps::new_uncached(Arc::new(self.ips.clone()))) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + _ctx: &RequestContext, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedAllowedVpcEndpointIds::new_uncached(Arc::new( + self.vpc_endpoint_ids.clone(), + ))) + } + + 
async fn get_block_public_or_vpc_access( &self, _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, - ) -> Result< - (CachedAllowedIps, Option), - control_plane::errors::GetAuthInfoError, - > { - Ok(( - CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())), - Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))), + ) -> Result { + Ok(CachedAccessBlockerFlags::new_uncached( + self.access_blocker_flags.clone(), )) } @@ -575,6 +626,7 @@ mod tests { rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: false, console_redirect_confirmation_timeout: std::time::Duration::from_secs(5), @@ -642,6 +694,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -722,6 +776,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -774,6 +830,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0198cc306e08..6082695a6b1b 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -55,6 +55,12 @@ pub(crate) enum AuthError { )] MissingEndpointName, + #[error( + "VPC endpoint ID is not specified. \ + This endpoint requires a VPC endpoint ID to connect." + )] + MissingVPCEndpointId, + #[error("password authentication failed for user '{0}'")] PasswordFailed(Box), @@ -69,6 +75,15 @@ pub(crate) enum AuthError { )] IpAddressNotAllowed(IpAddr), + #[error("This connection is trying to access this endpoint from a blocked network.")] + NetworkNotAllowed, + + #[error( + "This VPC endpoint id {0} is not allowed to connect to this endpoint. \ + Please add it to the allowed list in the Neon console." + )] + VpcEndpointIdNotAllowed(String), + #[error("Too many connections to this endpoint. 
Please try again later.")] TooManyConnections, @@ -95,6 +110,10 @@ impl AuthError { AuthError::IpAddressNotAllowed(ip) } + pub(crate) fn vpc_endpoint_id_not_allowed(id: String) -> Self { + AuthError::VpcEndpointIdNotAllowed(id) + } + pub(crate) fn too_many_connections() -> Self { AuthError::TooManyConnections } @@ -122,8 +141,11 @@ impl UserFacingError for AuthError { Self::BadAuthMethod(_) => self.to_string(), Self::MalformedPassword(_) => self.to_string(), Self::MissingEndpointName => self.to_string(), + Self::MissingVPCEndpointId => self.to_string(), Self::Io(_) => "Internal error".to_string(), Self::IpAddressNotAllowed(_) => self.to_string(), + Self::NetworkNotAllowed => self.to_string(), + Self::VpcEndpointIdNotAllowed(_) => self.to_string(), Self::TooManyConnections => self.to_string(), Self::UserTimeout(_) => self.to_string(), Self::ConfirmationTimeout(_) => self.to_string(), @@ -142,8 +164,11 @@ impl ReportableError for AuthError { Self::BadAuthMethod(_) => crate::error::ErrorKind::User, Self::MalformedPassword(_) => crate::error::ErrorKind::User, Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::MissingVPCEndpointId => crate::error::ErrorKind::User, Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::NetworkNotAllowed => crate::error::ErrorKind::User, + Self::VpcEndpointIdNotAllowed(_) => crate::error::ErrorKind::User, Self::TooManyConnections => crate::error::ErrorKind::RateLimit, Self::UserTimeout(_) => crate::error::ErrorKind::User, Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index ee8b3d4ef579..7a855bf54b41 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -284,6 +284,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig rate_limiter: BucketRateLimiter::new(vec![]), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: true, console_redirect_confirmation_timeout: Duration::ZERO, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index e1affe8391a6..de685a82c627 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -630,6 +630,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_vpc_acccess_proxy: args.is_private_access_proxy, is_auth_broker: args.is_auth_broker, accept_jwts: args.is_auth_broker, console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index a5e71f1a8744..7651eb71a2e0 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -15,13 +15,16 @@ use tracing::{debug, info}; use super::{Cache, Cached}; use crate::auth::IpPattern; use crate::config::ProjectInfoCacheOptions; -use crate::control_plane::AuthSecret; -use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::control_plane::{AccessBlockerFlags, AuthSecret}; +use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::types::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); + fn 
invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec); + fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt); + fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); async fn decrement_active_listeners(&self); async fn increment_active_listeners(&self); @@ -51,6 +54,8 @@ impl From for Entry { struct EndpointInfo { secret: std::collections::HashMap>>, allowed_ips: Option>>>, + block_public_or_vpc_access: Option>, + allowed_vpc_endpoint_ids: Option>>>, } impl EndpointInfo { @@ -92,9 +97,52 @@ impl EndpointInfo { } None } + pub(crate) fn get_allowed_vpc_endpoint_ids( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(Arc>, bool)> { + if let Some(allowed_vpc_endpoint_ids) = &self.allowed_vpc_endpoint_ids { + if valid_since < allowed_vpc_endpoint_ids.created_at { + return Some(( + allowed_vpc_endpoint_ids.value.clone(), + Self::check_ignore_cache( + ignore_cache_since, + allowed_vpc_endpoint_ids.created_at, + ), + )); + } + } + None + } + pub(crate) fn get_block_public_or_vpc_access( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(AccessBlockerFlags, bool)> { + if let Some(block_public_or_vpc_access) = &self.block_public_or_vpc_access { + if valid_since < block_public_or_vpc_access.created_at { + return Some(( + block_public_or_vpc_access.value.clone(), + Self::check_ignore_cache( + ignore_cache_since, + block_public_or_vpc_access.created_at, + ), + )); + } + } + None + } + pub(crate) fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } + pub(crate) fn invalidate_allowed_vpc_endpoint_ids(&mut self) { + self.allowed_vpc_endpoint_ids = None; + } + pub(crate) fn invalidate_block_public_or_vpc_access(&mut self) { + self.block_public_or_vpc_access = None; + } pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { self.secret.remove(&role_name); } @@ -111,6 +159,8 @@ pub struct ProjectInfoCacheImpl { cache: ClashMap, project2ep: ClashMap>, + // FIXME(stefan): we need a way to GC the account2ep map. 
+ account2ep: ClashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -120,6 +170,63 @@ pub struct ProjectInfoCacheImpl { #[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { + fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec) { + info!( + "invalidating allowed vpc endpoint ids for projects `{}`", + project_ids + .iter() + .map(|id| id.to_string()) + .collect::>() + .join(", ") + ); + for project_id in project_ids { + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + } + } + + fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt) { + info!( + "invalidating allowed vpc endpoint ids for org `{}`", + account_id + ); + let endpoints = self + .account2ep + .get(&account_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + } + + fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt) { + info!( + "invalidating block public or vpc access for project `{}`", + project_id + ); + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_block_public_or_vpc_access(); + } + } + } + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); let endpoints = self @@ -178,6 +285,7 @@ impl ProjectInfoCacheImpl { Self { cache: ClashMap::new(), project2ep: ClashMap::new(), + account2ep: ClashMap::new(), config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), @@ -226,6 +334,49 @@ impl ProjectInfoCacheImpl { } Some(Cached::new_uncached(value)) } + pub(crate) fn get_allowed_vpc_endpoint_ids( + &self, + endpoint_id: &EndpointId, + ) -> Option>>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_allowed_vpc_endpoint_ids(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_allowed_vpc_endpoint_ids(endpoint_id), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub(crate) fn get_block_public_or_vpc_access( + &self, + endpoint_id: &EndpointId, + ) -> Option> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_block_public_or_vpc_access(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_block_public_or_vpc_access(endpoint_id), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub(crate) fn insert_role_secret( &self, project_id: ProjectIdInt, @@ -256,6 +407,43 @@ impl ProjectInfoCacheImpl { self.insert_project2endpoint(project_id, 
endpoint_id); self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into()); } + pub(crate) fn insert_allowed_vpc_endpoint_ids( + &self, + account_id: Option, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + allowed_vpc_endpoint_ids: Arc>, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + if let Some(account_id) = account_id { + self.insert_account2endpoint(account_id, endpoint_id); + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id) + .or_default() + .allowed_vpc_endpoint_ids = Some(allowed_vpc_endpoint_ids.into()); + } + pub(crate) fn insert_block_public_or_vpc_access( + &self, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + access_blockers: AccessBlockerFlags, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id) + .or_default() + .block_public_or_vpc_access = Some(access_blockers.into()); + } + fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) { if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) { endpoints.insert(endpoint_id); @@ -264,6 +452,14 @@ impl ProjectInfoCacheImpl { .insert(project_id, HashSet::from([endpoint_id])); } } + fn insert_account2endpoint(&self, account_id: AccountIdInt, endpoint_id: EndpointIdInt) { + if let Some(mut endpoints) = self.account2ep.get_mut(&account_id) { + endpoints.insert(endpoint_id); + } else { + self.account2ep + .insert(account_id, HashSet::from([endpoint_id])); + } + } fn get_cache_times(&self) -> (Instant, Option) { let mut valid_since = Instant::now() - self.config.ttl; // Only ignore cache if ttl is disabled. 
@@ -334,11 +530,25 @@ impl CachedLookupInfo { lookup_type: LookupType::AllowedIps, } } + pub(self) fn new_allowed_vpc_endpoint_ids(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::AllowedVpcEndpointIds, + } + } + pub(self) fn new_block_public_or_vpc_access(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::BlockPublicOrVpcAccess, + } + } } enum LookupType { RoleSecret(RoleNameInt), AllowedIps, + AllowedVpcEndpointIds, + BlockPublicOrVpcAccess, } impl Cache for ProjectInfoCacheImpl { @@ -360,6 +570,16 @@ impl Cache for ProjectInfoCacheImpl { endpoint_info.invalidate_allowed_ips(); } } + LookupType::AllowedVpcEndpointIds => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + LookupType::BlockPublicOrVpcAccess => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_block_public_or_vpc_access(); + } + } } } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 9a0b954341bb..4d919f374a2d 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -12,13 +12,15 @@ use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; -use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; +use crate::auth::backend::ComputeUserInfo; use crate::auth::{check_peer_addr_is_in_list, AuthError}; use crate::config::ComputeConfig; use crate::context::RequestContext; +use crate::control_plane::ControlPlaneApi; use crate::error::ReportableError; use crate::ext::LockExt; use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind}; +use crate::protocol2::ConnectionInfoExtra; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::keys::KeyPrefix; use crate::redis::kv_ops::RedisKVClient; @@ -133,6 +135,9 @@ pub(crate) enum CancelError { #[error("IP is not allowed")] IpNotAllowed, + #[error("VPC endpoint id is not allowed to connect")] + VpcEndpointIdNotAllowed, + #[error("Authentication backend error")] AuthError(#[from] AuthError), @@ -152,8 +157,9 @@ impl ReportableError for CancelError { } CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, - CancelError::IpNotAllowed => crate::error::ErrorKind::User, - CancelError::NotFound => crate::error::ErrorKind::User, + CancelError::IpNotAllowed + | CancelError::VpcEndpointIdNotAllowed + | CancelError::NotFound => crate::error::ErrorKind::User, CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane, CancelError::InternalError => crate::error::ErrorKind::Service, } @@ -265,11 +271,12 @@ impl CancellationHandler { /// Will fetch IP allowlist internally. 
/// /// return Result primarily for tests - pub(crate) async fn cancel_session( + pub(crate) async fn cancel_session( &self, key: CancelKeyData, ctx: RequestContext, - check_allowed: bool, + check_ip_allowed: bool, + check_vpc_allowed: bool, auth_backend: &T, ) -> Result<(), CancelError> { let subnet_key = match ctx.peer_addr() { @@ -304,11 +311,11 @@ impl CancellationHandler { return Err(CancelError::NotFound); }; - if check_allowed { + if check_ip_allowed { let ip_allowlist = auth_backend .get_allowed_ips(&ctx, &cancel_closure.user_info) .await - .map_err(CancelError::AuthError)?; + .map_err(|e| CancelError::AuthError(e.into()))?; if !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { // log it here since cancel_session could be spawned in a task @@ -320,6 +327,40 @@ impl CancellationHandler { } } + // check if a VPC endpoint ID is coming in and if yes, if it's allowed + let access_blocks = auth_backend + .get_block_public_or_vpc_access(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + + if check_vpc_allowed { + if access_blocks.vpc_access_blocked { + return Err(CancelError::AuthError(AuthError::NetworkNotAllowed)); + } + + let incoming_vpc_endpoint_id = match ctx.extra() { + None => return Err(CancelError::AuthError(AuthError::MissingVPCEndpointId)), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + + let allowed_vpc_endpoint_ids = auth_backend + .get_allowed_vpc_endpoint_ids(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. 
+ if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id) + { + return Err(CancelError::VpcEndpointIdNotAllowed); + } + } else if access_blocks.public_access_blocked { + return Err(CancelError::VpcEndpointIdNotAllowed); + } + Metrics::get() .proxy .cancellation_requests_total diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8502edcfab09..1dcd37712ea2 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -68,6 +68,7 @@ pub struct AuthenticationConfig { pub rate_limiter: AuthRateLimiter, pub rate_limit_ip_subnet: u8, pub ip_allowlist_check_enabled: bool, + pub is_vpc_acccess_proxy: bool, pub jwks_cache: JwkCache, pub is_auth_broker: bool, pub accept_jwts: bool, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 78bfb6deacc3..c4548a7ddd95 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -182,7 +182,8 @@ pub(crate) async fn handle_client( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, - backend, + config.authentication_config.is_vpc_acccess_proxy, + backend.get_api(), ) .await .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index a9fb513d3ceb..3236b2e1bfb0 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -19,7 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, }; -use crate::protocol2::ConnectionInfo; +use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra}; use crate::types::{DbName, EndpointId, RoleName}; pub mod parquet; @@ -312,6 +312,15 @@ impl RequestContext { .ip() } + pub(crate) fn extra(&self) -> Option { + self.0 + .try_lock() + .expect("should not deadlock") + .conn_info + .extra + .clone() + } + pub(crate) fn cold_start_info(&self) -> ColdStartInfo { self.0 .try_lock() diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index ece03156d1fa..ef6621fc598a 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -22,7 +22,8 @@ use crate::control_plane::errors::{ use crate::control_plane::locks::ApiLocks; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; use crate::control_plane::{ - AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, + AccessBlockerFlags, AuthInfo, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; use crate::metrics::{CacheOutcome, Metrics}; use crate::rate_limiter::WakeComputeRateLimiter; @@ -137,9 +138,6 @@ impl NeonControlPlaneClient { } }; - // Ivan: don't know where it will be used, so I leave it here - let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); - let secret = if body.role_secret.is_empty() { None } else { @@ -153,10 +151,23 @@ impl NeonControlPlaneClient { .proxy .allowed_ips_number .observe(allowed_ips.len() as f64); + let allowed_vpc_endpoint_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_vpc_endpoint_ids + .observe(allowed_vpc_endpoint_ids.len() as f64); + let block_public_connections = body.block_public_connections.unwrap_or_default(); + let block_vpc_connections = 
body.block_vpc_connections.unwrap_or_default(); Ok(AuthInfo { secret, allowed_ips, + allowed_vpc_endpoint_ids, project_id: body.project_id, + account_id: body.account_id, + access_blocker_flags: AccessBlockerFlags { + public_access_blocked: block_public_connections, + vpc_access_blocked: block_vpc_connections, + }, }) } .inspect_err(|e| tracing::debug!(error = ?e)) @@ -299,6 +310,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let account_id = auth_info.account_id; if let Some(project_id) = auth_info.project_id { let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( @@ -312,24 +324,35 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { normalized_ep_int, Arc::new(auth_info.allowed_ips), ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_vpc_endpoint_ids), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + auth_info.access_blocker_flags, + ); ctx.set_project_id(project_id); } // When we just got a secret, we don't need to invalidate it. Ok(Cached::new_uncached(auth_info.secret)) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { Metrics::get() .proxy - .allowed_ips_cache_misses + .allowed_ips_cache_misses // TODO SR: Should we rename this variable to something like allowed_ip_cache_stats? 
.inc(CacheOutcome::Hit); - return Ok((allowed_ips, None)); + return Ok(allowed_ips); } Metrics::get() .proxy @@ -337,7 +360,10 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; let user = &user_info.user; + let account_id = auth_info.account_id; if let Some(project_id) = auth_info.project_id { let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( @@ -351,12 +377,136 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { normalized_ep_int, allowed_ips.clone(), ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags, + ); + ctx.set_project_id(project_id); + } + Ok(Cached::new_uncached(allowed_ips)) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_vpc_endpoint_ids) = self + .caches + .project_info + .get_allowed_vpc_endpoint_ids(normalized_ep) + { + Metrics::get() + .proxy + .vpc_endpoint_id_cache_stats + .inc(CacheOutcome::Hit); + return Ok(allowed_vpc_endpoint_ids); + } + + Metrics::get() + .proxy + .vpc_endpoint_id_cache_stats + .inc(CacheOutcome::Miss); + + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; + let user = &user_info.user; + let account_id = auth_info.account_id; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags, + ); + ctx.set_project_id(project_id); + } + Ok(Cached::new_uncached(allowed_vpc_endpoint_ids)) + } + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(access_blocker_flags) = self + .caches + .project_info + .get_block_public_or_vpc_access(normalized_ep) + { + Metrics::get() + .proxy + .access_blocker_flags_cache_stats + .inc(CacheOutcome::Hit); + return Ok(access_blocker_flags); + } + + Metrics::get() + .proxy + .access_blocker_flags_cache_stats + .inc(CacheOutcome::Miss); + + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; + let user = &user_info.user; + let account_id = 
auth_info.account_id; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags.clone(), + ); ctx.set_project_id(project_id); } - Ok(( - Cached::new_uncached(allowed_ips), - Some(Cached::new_uncached(auth_info.secret)), - )) + Ok(Cached::new_uncached(access_blocker_flags)) } #[tracing::instrument(skip_all)] diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 5f8bda0f35ae..1e6cde8fb080 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -13,12 +13,14 @@ use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; use crate::context::RequestContext; -use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret}; +use crate::control_plane::client::{ + CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedRoleSecret, +}; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, }; use crate::control_plane::messages::MetricsAuxInfo; -use crate::control_plane::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::control_plane::{AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; use crate::error::io_error; use crate::intern::RoleNameInt; use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; @@ -121,7 +123,10 @@ impl MockControlPlane { Ok(AuthInfo { secret, allowed_ips, + allowed_vpc_endpoint_ids: vec![], project_id: None, + account_id: None, + access_blocker_flags: AccessBlockerFlags::default(), }) } @@ -214,16 +219,35 @@ impl super::ControlPlaneApi for MockControlPlane { )) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( + &self, + _ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info).await?.allowed_ips, + ))) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + _ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info) + .await? 
+ .allowed_vpc_endpoint_ids, + ))) + } + + async fn get_block_public_or_vpc_access( &self, _ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - Ok(( - Cached::new_uncached(Arc::new( - self.do_get_auth_info(user_info).await?.allowed_ips, - )), - None, + ) -> Result { + Ok(Cached::new_uncached( + self.do_get_auth_info(user_info).await?.access_blocker_flags, )) } diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index b879f3a59ff4..a06943726e50 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -17,7 +17,8 @@ use crate::cache::project_info::ProjectInfoCacheImpl; use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; use crate::context::RequestContext; use crate::control_plane::{ - errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, + errors, CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, + CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, }; use crate::error::ReportableError; use crate::metrics::ApiLockMetrics; @@ -55,17 +56,45 @@ impl ControlPlaneApi for ControlPlaneClient { } } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { + ) -> Result { match self { - Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::ProxyV1(api) => api.get_allowed_ips(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::PostgresMock(api) => api.get_allowed_ips(ctx, user_info).await, #[cfg(test)] - Self::Test(api) => api.get_allowed_ips_and_secret(), + Self::Test(api) => api.get_allowed_ips(), + } + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::ProxyV1(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.get_allowed_vpc_endpoint_ids(), + } + } + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::ProxyV1(api) => api.get_block_public_or_vpc_access(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_block_public_or_vpc_access(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.get_block_public_or_vpc_access(), } } @@ -102,9 +131,15 @@ impl ControlPlaneApi for ControlPlaneClient { pub(crate) trait TestControlPlaneClient: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips_and_secret( + fn get_allowed_ips(&self) -> Result; + + fn get_allowed_vpc_endpoint_ids( + &self, + ) -> Result; + + fn get_block_public_or_vpc_access( &self, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + ) -> Result; fn dyn_clone(&self) -> Box; } diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index d068614b24df..5883d02b92c7 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -4,7 +4,7 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use 
crate::auth::IpPattern; -use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. @@ -227,8 +227,11 @@ pub(crate) struct UserFacingMessage { pub(crate) struct GetEndpointAccessControl { pub(crate) role_secret: Box, pub(crate) allowed_ips: Option>, + pub(crate) allowed_vpc_endpoint_ids: Option>, pub(crate) project_id: Option, - pub(crate) allowed_vpc_endpoint_ids: Option>, + pub(crate) account_id: Option, + pub(crate) block_public_connections: Option, + pub(crate) block_vpc_connections: Option, } /// Response which holds compute node's `host:port` pair. @@ -282,6 +285,10 @@ pub(crate) struct DatabaseInfo { pub(crate) aux: MetricsAuxInfo, #[serde(default)] pub(crate) allowed_ips: Option>, + #[serde(default)] + pub(crate) allowed_vpc_endpoint_ids: Option>, + #[serde(default)] + pub(crate) public_access_allowed: Option, } // Manually implement debug to omit sensitive info. @@ -293,6 +300,7 @@ impl fmt::Debug for DatabaseInfo { .field("dbname", &self.dbname) .field("user", &self.user) .field("allowed_ips", &self.allowed_ips) + .field("allowed_vpc_endpoint_ids", &self.allowed_vpc_endpoint_ids) .finish_non_exhaustive() } } @@ -457,19 +465,31 @@ mod tests { #[test] fn parse_get_role_secret() -> anyhow::Result<()> { - // Empty `allowed_ips` field. + // Empty `allowed_ips` and `allowed_vpc_endpoint_ids` field. + let json = json!({ + "role_secret": "secret", + }); + serde_json::from_str::(&json.to_string())?; + let json = json!({ + "role_secret": "secret", + "allowed_ips": ["8.8.8.8"], + }); + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], }); serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], }); serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], "project_id": "project", }); serde_json::from_str::(&json.to_string())?; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 1dca26d6866c..f92e4f3f6055 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -19,6 +19,7 @@ use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; +use crate::intern::AccountIdInt; use crate::intern::ProjectIdInt; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, scram}; @@ -52,8 +53,14 @@ pub(crate) struct AuthInfo { pub(crate) secret: Option, /// List of IP addresses allowed for the autorization. pub(crate) allowed_ips: Vec, + /// List of VPC endpoints allowed for the autorization. + pub(crate) allowed_vpc_endpoint_ids: Vec, /// Project ID. This is used for cache invalidation. pub(crate) project_id: Option, + /// Account ID. This is used for cache invalidation. + pub(crate) account_id: Option, + /// Are public connections or VPC connections blocked? + pub(crate) access_blocker_flags: AccessBlockerFlags, } /// Info for establishing a connection to a compute node. 
@@ -95,11 +102,21 @@ impl NodeInfo { } } +#[derive(Clone, Default, Eq, PartialEq, Debug)] +pub(crate) struct AccessBlockerFlags { + pub public_access_blocked: bool, + pub vpc_access_blocked: bool, +} + pub(crate) type NodeInfoCache = TimedLru>>; pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type CachedAllowedVpcEndpointIds = + Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type CachedAccessBlockerFlags = + Cached<&'static ProjectInfoCacheImpl, AccessBlockerFlags>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. @@ -113,11 +130,23 @@ pub(crate) trait ControlPlaneApi { user_info: &ComputeUserInfo, ) -> Result; - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result; + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result; + + async fn get_block_public_or_vpc_access( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + ) -> Result; async fn get_endpoint_jwks( &self, diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 79c6020302af..0d1382679c81 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -7,7 +7,7 @@ use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; -use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; +use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName}; pub trait InternId: Sized + 'static { fn get_interner() -> &'static StringInterner; @@ -206,6 +206,26 @@ impl From for ProjectIdInt { } } +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct AccountIdTag; +impl InternId for AccountIdTag { + fn get_interner() -> &'static StringInterner { + static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type AccountIdInt = InternedString; +impl From<&AccountId> for AccountIdInt { + fn from(value: &AccountId) -> Self { + AccountIdTag::get_interner().get_or_intern(value) + } +} +impl From for AccountIdInt { + fn from(value: AccountId) -> Self { + AccountIdTag::get_interner().get_or_intern(&value) + } +} + #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f3d281a26b59..25bcc81108b6 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -96,6 +96,16 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_ips_number: Histogram<10>, + /// Number of cache hits/misses for VPC endpoint IDs. + pub vpc_endpoint_id_cache_stats: CounterVec>, + + /// Number of cache hits/misses for access blocker flags. + pub access_blocker_flags_cache_stats: CounterVec>, + + /// Number of allowed VPC endpoints IDs + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_vpc_endpoint_ids: Histogram<10>, + /// Number of connections (per sni). 
pub accepted_connections_by_sni: CounterVec>, @@ -570,6 +580,9 @@ pub enum RedisEventsCount { CancelSession, PasswordUpdate, AllowedIpsUpdate, + AllowedVpcEndpointIdsUpdateForProjects, + AllowedVpcEndpointIdsUpdateForAllProjectsInOrg, + BlockPublicOrVpcAccessUpdate, } pub struct ThreadPoolWorkers(usize); diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index ab173bd0d052..8a407c811971 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -283,7 +283,8 @@ pub(crate) async fn handle_client( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, - auth_backend, + config.authentication_config.is_vpc_acccess_proxy, + auth_backend.get_api(), ) .await .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 10db2bcb303f..d8c00a9b4177 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -26,7 +26,7 @@ use crate::config::{ComputeConfig, RetryConfig}; use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{ - self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, + self, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, NodeInfo, NodeInfoCache, }; use crate::error::ErrorKind; use crate::tls::client_config::compute_client_config_with_certs; @@ -526,9 +526,19 @@ impl TestControlPlaneClient for TestConnectMechanism { } } - fn get_allowed_ips_and_secret( + fn get_allowed_ips(&self) -> Result { + unimplemented!("not used in tests") + } + + fn get_allowed_vpc_endpoint_ids( + &self, + ) -> Result { + unimplemented!("not used in tests") + } + + fn get_block_public_or_vpc_access( &self, - ) -> Result<(CachedAllowedIps, Option), control_plane::errors::GetAuthInfoError> + ) -> Result { unimplemented!("not used in tests") } diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 19fdd3280dfc..1a7024588aa1 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; -use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -86,9 +86,7 @@ pub(crate) struct BlockPublicOrVpcAccessUpdated { #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct AllowedVpcEndpointsUpdatedForOrg { - // TODO: change type once the implementation is more fully fledged. - // See e.g. https://github.com/neondatabase/neon/pull/10073. - account_id: ProjectIdInt, + account_id: AccountIdInt, } #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] @@ -205,6 +203,24 @@ impl MessageHandler { .proxy .redis_events_count .inc(RedisEventsCount::PasswordUpdate); + } else if matches!( + msg, + Notification::AllowedVpcEndpointsUpdatedForProjects { .. } + ) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForProjects); + } else if matches!(msg, Notification::AllowedVpcEndpointsUpdatedForOrg { .. 
}) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg); + } else if matches!(msg, Notification::BlockPublicOrVpcAccessUpdated { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BlockPublicOrVpcAccessUpdate); } // TODO: add additional metrics for the other event types. @@ -230,20 +246,26 @@ fn invalidate_cache(cache: Arc, msg: Notification) { Notification::AllowedIpsUpdate { allowed_ips_update } => { cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id); } + Notification::BlockPublicOrVpcAccessUpdated { + block_public_or_vpc_access_updated, + } => cache.invalidate_block_public_or_vpc_access_for_project( + block_public_or_vpc_access_updated.project_id, + ), + Notification::AllowedVpcEndpointsUpdatedForOrg { + allowed_vpc_endpoints_updated_for_org, + } => cache.invalidate_allowed_vpc_endpoint_ids_for_org( + allowed_vpc_endpoints_updated_for_org.account_id, + ), + Notification::AllowedVpcEndpointsUpdatedForProjects { + allowed_vpc_endpoints_updated_for_projects, + } => cache.invalidate_allowed_vpc_endpoint_ids_for_projects( + allowed_vpc_endpoints_updated_for_projects.project_ids, + ), Notification::PasswordUpdate { password_update } => cache .invalidate_role_secret_for_project( password_update.project_id, password_update.role_name, ), - Notification::BlockPublicOrVpcAccessUpdated { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } - Notification::AllowedVpcEndpointsUpdatedForOrg { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } - Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } Notification::UnknownTopic => unreachable!(), } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6d5fb13681e9..0fb4a8a6cc70 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -30,6 +30,7 @@ use crate::control_plane::locks::ApiLocks; use crate::control_plane::CachedNodeInfo; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; +use crate::protocol2::ConnectionInfoExtra; use crate::proxy::connect_compute::ConnectMechanism; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; @@ -57,23 +58,52 @@ impl PoolingBackend { let user_info = user_info.clone(); let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); - let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; + let allowed_ips = backend.get_allowed_ips(ctx).await?; + if self.config.authentication_config.ip_allowlist_check_enabled && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } + + let access_blocker_flags = backend.get_block_public_or_vpc_access(ctx).await?; + if self.config.authentication_config.is_vpc_acccess_proxy { + if access_blocker_flags.vpc_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + + let extra = ctx.extra(); + let incoming_endpoint_id = match extra { + None => String::new(), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + + if incoming_endpoint_id.is_empty() { + return Err(AuthError::MissingVPCEndpointId); + } + + let allowed_vpc_endpoint_ids 
= backend.get_allowed_vpc_endpoint_ids(ctx).await?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. + if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_endpoint_id) + { + return Err(AuthError::vpc_endpoint_id_not_allowed(incoming_endpoint_id)); + } + } else if access_blocker_flags.public_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + if !self .endpoint_rate_limiter .check(user_info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } - let cached_secret = match maybe_secret { - Some(secret) => secret, - None => backend.get_role_secret(ctx).await?, - }; - + let cached_secret = backend.get_role_secret(ctx).await?; let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, diff --git a/proxy/src/types.rs b/proxy/src/types.rs index 6e0bd61c9442..d5952d1d8b0a 100644 --- a/proxy/src/types.rs +++ b/proxy/src/types.rs @@ -97,6 +97,8 @@ smol_str_wrapper!(EndpointId); smol_str_wrapper!(BranchId); // 90% of project strings are 23 characters or less. smol_str_wrapper!(ProjectId); +// 90% of account strings are 23 characters or less. +smol_str_wrapper!(AccountId); // will usually equal endpoint ID smol_str_wrapper!(EndpointCacheKey); From 6318828c639ba85e66da321f7b32828ee945de12 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 31 Jan 2025 21:52:17 +0100 Subject: [PATCH 39/78] Update rust to 1.84.1 (#10618) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://releases.rs/docs/1.84.1/). Prior update was in https://github.com/neondatabase/neon/pull/10328. Co-authored-by: Arpad Müller --- build-tools.Dockerfile | 2 +- rust-toolchain.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index dfcc7d06b493..f744b44808c6 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -253,7 +253,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.84.0 +ENV RUSTC_VERSION=1.84.1 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 06746d3e1dd5..38a7f202ba0a 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.84.0" +channel = "1.84.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From b9e1a6724628aad5cf62737f6acab60ec23ff09b Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Sat, 1 Feb 2025 12:09:45 +0100 Subject: [PATCH 40/78] fix generate matrix for olap for saturdays (#10622) ## Problem when introducing pg17 for job step `Generate matrix for OLAP benchmarks` I introduced a syntax error that only hits on Saturdays. 
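The failure is easy to reproduce outside CI: strict JSON parsers reject trailing commas, so the matrix string only blows up on the path that actually parses it, which here is the Saturday-only branch visible in the diff below. A minimal stand-alone sketch, using `serde_json` as a stand-in for whatever tooling consumes the matrix (illustrative only, not part of the workflow):

```rust
// Stand-alone illustration: the matrix string with the trailing comma is not
// valid JSON, while the corrected one parses fine.
fn main() {
    let with_trailing_comma = r#"{ "pg_version": [ 16, 17 ], }"#;
    let fixed = r#"{ "pg_version": [ 16, 17 ] }"#;

    // serde_json (like any strict JSON parser) rejects the trailing comma...
    assert!(serde_json::from_str::<serde_json::Value>(with_trailing_comma).is_err());
    // ...and accepts the string once the comma is removed.
    assert!(serde_json::from_str::<serde_json::Value>(fixed).is_ok());
}
```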
## Summary of changes Remove trailing comma ## successful test run https://github.com/neondatabase/neon/actions/runs/13086363907 --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 20a8a6e2c9dc..413af90deca8 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -341,7 +341,7 @@ jobs: ], "pg_version" : [ 16,17 - ], + ] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then From 8ae6f656a694a2d6892ce6ebd1475d1b831ba917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 3 Feb 2025 05:11:06 +0100 Subject: [PATCH 41/78] Don't require partial backup semaphore capacity for deletions (#10628) In the safekeeper, we block deletions on the timeline's gate closing, and any `WalResidentTimeline` keeps the gate open (because it owns a gate lock object). Thus, unless the `main_task` function of a partial backup doesn't return, we can't delete the associated timeline. In order to make these tasks exit early, we call the cancellation token of the timeline upon its shutdown. However, the partial backup task wasn't looking for the cancellation while waiting to acquire a partial backup permit. On a staging safekeeper we have been in a situation in the past where the semaphore was already empty for a duration of many hours, rendering all attempted deletions unable to proceed until a restart where the semaphore was reset: https://neondb.slack.com/archives/C03H1K0PGKH/p1738416586442029 --- safekeeper/src/wal_backup_partial.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 4e5b34a9bf65..5ecb23e8e04b 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -535,6 +535,10 @@ pub async fn main_task( // limit concurrent uploads let _upload_permit = tokio::select! 
{ acq = limiter.acquire_partial_backup() => acq, + _ = backup.tli.cancel.cancelled() => { + info!("timeline canceled"); + return None; + } _ = cancel.cancelled() => { info!("task canceled"); return None; From 4dfe60e2ad08ad0e99bfd938bcbe18271deb4f84 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 3 Feb 2025 10:00:23 +0100 Subject: [PATCH 42/78] revert https://github.com/neondatabase/neon/pull/10616 (#10631) ## Problem https://github.com/neondatabase/neon/pull/10616 was only intended temparily during the weekend, want to reset to prior state ## Summary of changes revert https://github.com/neondatabase/neon/pull/10616 but keep fixes in https://github.com/neondatabase/neon/pull/10622 --- .github/workflows/benchmarking.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 413af90deca8..b36ac46f3525 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,8 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - # - cron: '0 3 * * *' # run once a day, timezone is utc - - cron: '0 */10 * * *' # Runs every 10 hours at minute 0 + - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: region_id: @@ -551,7 +550,6 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: - if: false permissions: contents: write statuses: write @@ -685,8 +683,7 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - if: false + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} permissions: contents: write statuses: write @@ -814,7 +811,6 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - if: false permissions: contents: write statuses: write @@ -934,7 +930,6 @@ jobs: user-examples-compare: # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - if: false permissions: contents: write statuses: write From f071800979fba434ea0708f22e454c513efe47b2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Feb 2025 09:02:21 +0000 Subject: [PATCH 43/78] tests: stabilize shard locations earlier in test_scrubber_tenant_snapshot (#10606) ## Problem This test would sometimes emit unexpected logs from the storage controller's requests to do migrations, which overlap with the test's restarts of pageservers, where those migrations are happening some time after a shard split as the controller moves load around. 
Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10602/13067323736/index.html#testresult/f66f1329557a1fc5/retries ## Summary of changes - Do a reconcile_until_idle after shard split, so that the rest of the test doesn't run concurrently with migrations --- test_runner/regress/test_storage_scrubber.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 7e92cc01cd2f..0f4e5688a957 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -71,6 +71,10 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: else: tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + # Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable + # is it won't overlap with migrations + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) From 89b9f7407706353bf256ac6eeea0edd31c3829d1 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 3 Feb 2025 09:40:12 +0000 Subject: [PATCH 44/78] CI(pre-merge-checks): do not run `conclusion` job for PRs (#10619) ## Problem While working on https://github.com/neondatabase/neon/pull/10617 I (unintentionally) merged the PR before the main CI pipeline has finished. I suspect this happens because we have received all the required job results from the pre-merge-checks workflow, which runs on PRs that include changes to relevant files. ## Summary of changes - Skip the `conclusion` job in `pre-merge-checks` workflows for PRs --- .github/workflows/pre-merge-checks.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index d39ccecac9e9..c47b3fe0debb 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -95,7 +95,8 @@ jobs: # - conclusion # - neon-cloud-e2e conclusion: - if: always() + # Do not run job on Pull Requests as it interferes with the `conclusion` job from the `build_and_test` workflow + if: always() && github.event_name == 'merge_group' permissions: statuses: write # for `github.repos.createCommitStatus(...)` contents: write From 87ad50c92547891f6c1161c3e3c255067e6c0276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 3 Feb 2025 12:53:51 +0100 Subject: [PATCH 45/78] storcon: use diesel-async again, now with tls support (#10614) Successor of #10280 after it was reverted in #10592. Re-introduce the usage of diesel-async again, but now also add TLS support so that we connect to the storcon database using TLS. By default, diesel-async doesn't support TLS, so add some code to make us explicitly request TLS. 
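Condensed, the new wiring gives diesel-async's connection manager a custom setup callback that opens the `tokio-postgres` connection through rustls instead of the default plaintext path, and builds the bb8 pool on top of that. A rough sketch of the pattern (the real version lives in `storage_controller/src/persistence.rs` below; `build_pool` here is just an illustrative stand-in for what `Persistence::new` does, with pool tuning and retry handling omitted):

```rust
use std::sync::Arc;

use diesel_async::pooled_connection::bb8::Pool;
use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig};
use diesel_async::AsyncPgConnection;
use futures::future::BoxFuture;
use futures::FutureExt;

// Build a rustls client config from the platform's native root certificates.
fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
    let der_certs = rustls_native_certs::load_native_certs();
    if !der_certs.errors.is_empty() {
        anyhow::bail!("could not parse certificates: {:?}", der_certs.errors);
    }
    let mut store = rustls::RootCertStore::empty();
    store.add_parsable_certificates(der_certs.certs);
    Ok(
        rustls::ClientConfig::builder_with_provider(Arc::new(
            rustls::crypto::ring::default_provider(),
        ))
        .with_safe_default_protocol_versions()?
        .with_root_certificates(store)
        .with_no_client_auth(),
    )
}

// Custom connector: establish the connection with tokio-postgres + rustls, then
// hand the client/connection pair over to diesel-async.
fn establish_connection_rustls(
    config: &str,
) -> BoxFuture<diesel::ConnectionResult<AsyncPgConnection>> {
    let fut = async {
        let rustls_config = client_config_with_root_certs()
            .map_err(|e| diesel::ConnectionError::BadConnection(format!("{e:?}")))?;
        let tls = tokio_postgres_rustls::MakeRustlsConnect::new(rustls_config);
        let (client, conn) = tokio_postgres::connect(config, tls)
            .await
            .map_err(|e| diesel::ConnectionError::BadConnection(e.to_string()))?;
        AsyncPgConnection::try_from_client_and_connection(client, conn).await
    };
    fut.boxed()
}

// Plug the callback in via ManagerConfig::custom_setup and build the bb8 pool.
async fn build_pool(database_url: &str) -> Pool<AsyncPgConnection> {
    let mut mgr_config = ManagerConfig::default();
    mgr_config.custom_setup = Box::new(establish_connection_rustls);
    let manager = AsyncDieselConnectionManager::<AsyncPgConnection>::new_with_config(
        database_url,
        mgr_config,
    );
    Pool::builder()
        .build(manager)
        .await
        .expect("could not build connection pool")
}
```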
cc https://github.com/neondatabase/cloud/issues/23583 --- .github/workflows/_build-and-test-locally.yml | 4 - .github/workflows/build-macos.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- Cargo.lock | 72 +- Dockerfile | 2 +- Makefile | 2 - storage_controller/Cargo.toml | 9 +- storage_controller/src/main.rs | 2 +- storage_controller/src/persistence.rs | 849 +++++++++++------- 9 files changed, 557 insertions(+), 387 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index e9483492c967..1dec8106b484 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -158,8 +158,6 @@ jobs: - name: Run cargo build run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -217,8 +215,6 @@ jobs: env: NEXTEST_RETRIES: 3 run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 01d82a1ed2e7..347a511e980f 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -235,7 +235,7 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Run cargo build (only for v17) - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) + run: cargo build --all --release -j$(sysctl -n hw.ncpu) - name: Check that no warnings are produced (only for v17) run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5b5910badf37..f077e04d1c7b 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -114,7 +114,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) + run: cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/Cargo.lock b/Cargo.lock index cdc620e48575..0133c8356466 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -932,6 +932,18 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bb8" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8" +dependencies = [ + "async-trait", + "futures-util", + "parking_lot 0.12.1", + "tokio", +] + [[package]] name = "bcder" version = "0.7.4" @@ -1790,11 +1802,24 @@ dependencies = [ "chrono", "diesel_derives", "itoa", - "pq-sys", - "r2d2", "serde_json", ] +[[package]] +name = "diesel-async" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb" +dependencies = [ + "async-trait", + "bb8", + "diesel", + "futures-util", + "scoped-futures", + "tokio", + "tokio-postgres", +] + [[package]] name = "diesel_derives" version = "2.2.1" @@ -4645,15 +4670,6 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" -[[package]] -name = "pq-sys" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793" -dependencies = [ - "vcpkg", -] - [[package]] name = "pq_proto" version = "0.1.0" @@ -4966,17 +4982,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "r2d2" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" -dependencies = [ - "log", - "parking_lot 0.12.1", - "scheduled-thread-pool", -] - [[package]] name = "rand" version = "0.7.3" @@ -5797,12 +5802,12 @@ dependencies = [ ] [[package]] -name = "scheduled-thread-pool" -version = "0.2.7" +name = "scoped-futures" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091" dependencies = [ - "parking_lot 0.12.1", + "pin-project-lite", ] [[package]] @@ -6337,6 +6342,7 @@ dependencies = [ "clap", "control_plane", "diesel", + "diesel-async", "diesel_migrations", "fail", "futures", @@ -6351,10 +6357,12 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", - "r2d2", "rand 0.8.5", "reqwest", "routerify", + "rustls 0.23.18", + "rustls-native-certs 0.8.0", + "scoped-futures", "scopeguard", "serde", "serde_json", @@ -6362,6 +6370,8 @@ dependencies = [ "strum_macros", "thiserror 1.0.69", "tokio", + "tokio-postgres", + "tokio-postgres-rustls", "tokio-util", "tracing", "utils", @@ -6604,7 +6614,7 @@ dependencies = [ "fastrand 2.2.0", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -7562,12 +7572,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" diff --git a/Dockerfile b/Dockerfile index f80666529bb5..7ba54c8ca5fa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY --chown=nonroot . . 
ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/Makefile b/Makefile index 22ebfea7d571..d1238caebf5a 100644 --- a/Makefile +++ b/Makefile @@ -64,8 +64,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 -# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) -CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index caaa22d0a5d7..63f43cdf62e0 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -32,6 +32,7 @@ postgres_connection.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true +rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true @@ -39,18 +40,20 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +rustls.workspace = true scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true +tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", - "postgres", - "r2d2", "chrono", ] } +diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } diesel_migrations = { version = "2.2.0" } -r2d2 = { version = "0.8.10" } +scoped-futures = "0.1.4" utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 801409d61292..659c088d5143 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> { // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url)); + let persistence = Arc::new(Persistence::new(secrets.database_url).await); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 37bfaf1139a5..880f20306407 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1,13 +1,20 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use diesel::pg::PgConnection; use diesel::prelude::*; -use diesel::Connection; +use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; +use diesel_async::pooled_connection::bb8::Pool; +use diesel_async::pooled_connection::AsyncDieselConnectionManager; +use diesel_async::pooled_connection::ManagerConfig; +use diesel_async::AsyncPgConnection; +use 
diesel_async::RunQueryDsl; +use futures::future::BoxFuture; +use futures::FutureExt; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; @@ -20,6 +27,8 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use rustls::crypto::ring; +use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; @@ -60,7 +69,7 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// updated, and reads of nodes are always from memory, not the database. We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - connection_pool: diesel::r2d2::Pool>, + connection_pool: Pool, } /// Legacy format, for use in JSON compat objects in test environment @@ -76,7 +85,7 @@ pub(crate) enum DatabaseError { #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), #[error(transparent)] - ConnectionPool(#[from] r2d2::Error), + ConnectionPool(#[from] diesel_async::pooled_connection::bb8::RunError), #[error("Logical error: {0}")] Logical(String), #[error("Migration error: {0}")] @@ -124,6 +133,7 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; /// Some methods can operate on either a whole tenant or a single shard +#[derive(Clone)] pub(crate) enum TenantFilter { Tenant(TenantId), Shard(TenantShardId), @@ -136,6 +146,11 @@ pub(crate) struct ShardGenerationState { pub(crate) generation_pageserver: Option, } +// A generous allowance for how many times we may retry serializable transactions +// before giving up. This is not expected to be hit: it is a defensive measure in case we +// somehow engineer a situation where duelling transactions might otherwise live-lock. +const MAX_RETRIES: usize = 128; + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -145,12 +160,18 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String) -> Self { - let manager = diesel::r2d2::ConnectionManager::::new(database_url); + pub async fn new(database_url: String) -> Self { + let mut mgr_config = ManagerConfig::default(); + mgr_config.custom_setup = Box::new(establish_connection_rustls); + + let manager = AsyncDieselConnectionManager::::new_with_config( + database_url, + mgr_config, + ); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). 
- let connection_pool = diesel::r2d2::Pool::builder() + let connection_pool = Pool::builder() .max_size(Self::MAX_CONNECTIONS) .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) @@ -158,6 +179,7 @@ impl Persistence { .min_idle(Some(1)) .test_on_check_out(true) .build(manager) + .await .expect("Could not build connection pool"); Self { connection_pool } @@ -171,7 +193,7 @@ impl Persistence { ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); loop { - match PgConnection::establish(database_url) { + match establish_connection_rustls(database_url).await { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -192,20 +214,66 @@ impl Persistence { pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - self.with_conn(move |conn| -> DatabaseResult<()> { - HarnessWithOutput::write_to_stdout(conn) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| DatabaseError::Migration(e.to_string())) + // Can't use self.with_conn here as we do spawn_blocking which requires static. + let conn = self + .connection_pool + .dedicated_connection() + .await + .map_err(|e| DatabaseError::Migration(e.to_string()))?; + let mut async_wrapper: AsyncConnectionWrapper = + AsyncConnectionWrapper::from(conn); + tokio::task::spawn_blocking(move || { + let mut retry_count = 0; + loop { + let result = HarnessWithOutput::write_to_stdout(&mut async_wrapper) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())); + match result { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. increment_generation for unrelated shards can collide) + tracing::debug!( + "Retrying transaction on serialization failure {err:?}" + ); + continue; + } + } + Err(e) => break Err(e), + } + } }) .await + .map_err(|e| DatabaseError::Migration(e.to_string()))??; + Ok(()) } /// Wraps `with_conn` in order to collect latency and error metrics - async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult + async fn with_measured_conn<'a, 'b, F, R>( + &self, + op: DatabaseOperation, + func: F, + ) -> DatabaseResult where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, { let latency = &METRICS_REGISTRY .metrics_group @@ -227,77 +295,75 @@ impl Persistence { res } - /// Call the provided function in a tokio blocking thread, with a Diesel database connection. 
- async fn with_conn(&self, func: F) -> DatabaseResult + /// Call the provided function with a Diesel database connection in a retry loop + async fn with_conn<'a, 'b, F, R>(&self, func: F) -> DatabaseResult where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, { - // A generous allowance for how many times we may retry serializable transactions - // before giving up. This is not expected to be hit: it is a defensive measure in case we - // somehow engineer a situation where duelling transactions might otherwise live-lock. - const MAX_RETRIES: usize = 128; - - let mut conn = self.connection_pool.get()?; - tokio::task::spawn_blocking(move || -> DatabaseResult { - let mut retry_count = 0; - loop { - match conn.build_transaction().serializable().run(|c| func(c)) { - Ok(r) => break Ok(r), - Err( - err @ DatabaseError::Query(diesel::result::Error::DatabaseError( - diesel::result::DatabaseErrorKind::SerializationFailure, - _, - )), - ) => { - retry_count += 1; - if retry_count > MAX_RETRIES { - tracing::error!( - "Exceeded max retries on SerializationFailure errors: {err:?}" - ); - break Err(err); - } else { - // Retry on serialization errors: these are expected, because even though our - // transactions don't fight for the same rows, they will occasionally collide - // on index pages (e.g. increment_generation for unrelated shards can collide) - tracing::debug!( - "Retrying transaction on serialization failure {err:?}" - ); - continue; - } + let mut retry_count = 0; + loop { + let mut conn = self.connection_pool.get().await?; + match conn + .build_transaction() + .serializable() + .run(|c| func(c)) + .await + { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. increment_generation for unrelated shards can collide) + tracing::debug!("Retrying transaction on serialization failure {err:?}"); + continue; } - Err(e) => break Err(e), } + Err(e) => break Err(e), } - }) - .await - .expect("Task panic") + } } /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { - let np = node.to_persistent(); - self.with_measured_conn( - DatabaseOperation::InsertNode, - move |conn| -> DatabaseResult<()> { + let np = &node.to_persistent(); + self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { + Box::pin(async move { diesel::insert_into(crate::schema::nodes::table) - .values(&np) - .execute(conn)?; + .values(np) + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table.load::(conn)?) 
- }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::nodes::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -313,11 +379,14 @@ impl Persistence { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) - .execute(conn)?; - Ok(updated) + Box::pin(async move { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn) + .await?; + Ok(updated) + }) }) .await?; @@ -339,17 +408,16 @@ impl Persistence { &self, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListTenantShards, move |conn| { + Box::pin(async move { let query = tenant_shards.filter( placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), ); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -359,15 +427,14 @@ impl Persistence { filter_tenant_id: TenantId, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::LoadTenant, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::LoadTenant, move |conn| { + Box::pin(async move { let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -393,19 +460,22 @@ impl Persistence { }) .collect::>(); - self.with_measured_conn( - DatabaseOperation::InsertTenantShards, - move |conn| -> DatabaseResult<()> { + let shards = &shards; + let metadata_health_records = &metadata_health_records; + self.with_measured_conn(DatabaseOperation::InsertTenantShards, move |conn| { + Box::pin(async move { diesel::insert_into(tenant_shards::table) - .values(&shards) - .execute(conn)?; + .values(shards) + .execute(conn) + .await?; diesel::insert_into(metadata_health::table) - .values(&metadata_health_records) - .execute(conn)?; + .values(metadata_health_records) + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -413,31 +483,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteTenant, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteTenant, move |conn| { + Box::pin(async move { // `metadata_health` status (if exists) is also deleted based on the cascade behavior. 
diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteNode, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { + Box::pin(async move { diesel::delete(nodes) .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -454,34 +524,41 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { - let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .set(generation.eq(generation + 1)) - .execute(conn)?; - - tracing::info!("Incremented {} tenants' generations", rows_updated); - - // TODO: UPDATE+SELECT in one query - - let updated = tenant_shards - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .select(TenantShardPersistence::as_select()) - .load(conn)?; - - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn)?; + Box::pin(async move { + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn) + .await?; + + tracing::info!("Incremented {} tenants' generations", rows_updated); + + // TODO: UPDATE+SELECT in one query + + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn) + .await?; + + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -518,19 +595,22 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation.eq(generation + 1), - generation_pageserver.eq(node_id.0 as i64), - )) - // TODO: only returning() the generation column - .returning(TenantShardPersistence::as_returning()) - .get_result(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -562,12 +642,15 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { - let result = tenant_shards - .filter(tenant_id.eq(filter_tenant_id.to_string())) - .select(TenantShardPersistence::as_select()) - .order(shard_number) - .load(conn)?; - Ok(result) + Box::pin(async move { + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn) + .await?; + Ok(result) + }) }) .await?; @@ -615,15 +698,18 @@ impl Persistence { break; } + let in_clause = &in_clause; let chunk_rows = self .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { - // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because - // the inputs are strongly typed and cannot carry any user-supplied raw string content. - let result : Vec = diesel::sql_query( - format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() - ).load(conn)?; - - Ok(result) + Box::pin(async move { + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. 
+ let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn).await?; + + Ok(result) + }) }) .await?; rows.extend(chunk_rows.into_iter()) @@ -657,51 +743,58 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; + let tenant = &tenant; + let input_placement_policy = &input_placement_policy; + let input_config = &input_config; + let input_generation = &input_generation; + let input_scheduling_policy = &input_scheduling_policy; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - let query = match tenant { - TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .into_boxed(), - TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .into_boxed(), - }; + Box::pin(async move { + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - // Clear generation_pageserver if we are moving into a state where we won't have - // any attached pageservers. - let input_generation_pageserver = match input_placement_policy { - None | Some(PlacementPolicy::Attached(_)) => None, - Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), - }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. 
+ let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; - #[derive(AsChangeset)] - #[diesel(table_name = crate::schema::tenant_shards)] - struct ShardUpdate { - generation: Option, - placement_policy: Option, - config: Option, - scheduling_policy: Option, - generation_pageserver: Option>, - } + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + generation_pageserver: Option>, + } - let update = ShardUpdate { - generation: input_generation.map(|g| g.into().unwrap() as i32), - placement_policy: input_placement_policy - .as_ref() - .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config - .as_ref() - .map(|c| serde_json::to_string(&c).unwrap()), - scheduling_policy: input_scheduling_policy - .map(|p| serde_json::to_string(&p).unwrap()), - generation_pageserver: input_generation_pageserver, - }; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, + }; - query.set(update).execute(conn)?; + query.set(update).execute(conn).await?; - Ok(()) + Ok(()) + }) }) .await?; @@ -715,23 +808,27 @@ impl Persistence { ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; + let preferred_azs = preferred_azs.as_slice(); self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { - let mut shards_updated = Vec::default(); - - for (tenant_shard_id, preferred_az) in preferred_azs.iter() { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) - .execute(conn)?; - - if updated == 1 { - shards_updated.push((*tenant_shard_id, preferred_az.clone())); + Box::pin(async move { + let mut shards_updated = Vec::default(); + + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) + .execute(conn) + .await?; + + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } } - } - Ok(shards_updated) + Ok(shards_updated) + }) }) .await } @@ -739,17 +836,21 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( 
- generation_pageserver.eq(Option::::None), - placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), - )) - .execute(conn)?; - - Ok(updated) + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy + .eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn) + .await?; + + Ok(updated) + }) }) .await?; @@ -768,14 +869,16 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { + let parent_to_children = parent_to_children.as_slice(); + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| { + Box::pin(async move { // Mark parent shards as splitting let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) - .execute(conn)?; + .execute(conn).await?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) @@ -788,7 +891,7 @@ impl Persistence { } // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.clone(); + let parent_to_children = parent_to_children.to_vec(); // Insert child shards for (parent_shard_id, children) in parent_to_children { @@ -796,7 +899,7 @@ impl Persistence { .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn)?; + .load::(conn).await?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( "Parent shard {parent_shard_id} not found" @@ -811,12 +914,13 @@ impl Persistence { debug_assert!(shard.splitting == SplitState::Splitting); diesel::insert_into(tenant_shards) .values(shard) - .execute(conn)?; + .execute(conn).await?; } } Ok(()) }) + }) .await } @@ -828,25 +932,26 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::CompleteShardSplit, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { + Box::pin(async move { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; // Clear sharding flag let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; debug_assert!(updated > 0); Ok(()) - }, - ) + }) + }) .await } @@ -858,15 +963,15 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::AbortShardSplit, - move |conn| -> DatabaseResult { + self.with_measured_conn(DatabaseOperation::AbortShardSplit, move |conn| { + Box::pin(async move { // Clear the splitting state on parent shards let updated = 
diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.ne(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; // Parent shards are already gone: we cannot abort. if updated == 0 { @@ -886,11 +991,12 @@ impl Persistence { diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; Ok(AbortShardSplitStatus::Aborted) - }, - ) + }) + }) .await } @@ -906,25 +1012,28 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::UpdateMetadataHealth, - move |conn| -> DatabaseResult<_> { + let healthy_records = healthy_records.as_slice(); + let unhealthy_records = unhealthy_records.as_slice(); + self.with_measured_conn(DatabaseOperation::UpdateMetadataHealth, move |conn| { + Box::pin(async move { diesel::insert_into(metadata_health) - .values(&healthy_records) + .values(healthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(true), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; diesel::insert_into(metadata_health) - .values(&unhealthy_records) + .values(unhealthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(false), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -933,15 +1042,13 @@ impl Persistence { pub(crate) async fn list_metadata_health_records( &self, ) -> DatabaseResult> { - self.with_measured_conn( - DatabaseOperation::ListMetadataHealth, - move |conn| -> DatabaseResult<_> { - Ok( - crate::schema::metadata_health::table - .load::(conn)?, - ) - }, - ) + self.with_measured_conn(DatabaseOperation::ListMetadataHealth, move |conn| { + Box::pin(async { + Ok(crate::schema::metadata_health::table + .load::(conn) + .await?) + }) + }) .await } @@ -953,10 +1060,15 @@ impl Persistence { use crate::schema::metadata_health::dsl::*; self.with_measured_conn( DatabaseOperation::ListMetadataHealthUnhealthy, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::metadata_health::table - .filter(healthy.eq(false)) - .load::(conn)?) + move |conn| { + Box::pin(async { + DatabaseResult::Ok( + crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn) + .await?, + ) + }) }, ) .await @@ -970,15 +1082,14 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListMetadataHealthOutdated, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListMetadataHealthOutdated, move |conn| { + Box::pin(async move { let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); - let res = query.load::(conn)?; + let res = query.load::(conn).await?; Ok(res) - }, - ) + }) + }) .await } @@ -986,12 +1097,13 @@ impl Persistence { /// It is an error for the table to contain more than one entry. pub(crate) async fn get_leader(&self) -> DatabaseResult> { let mut leader: Vec = self - .with_measured_conn( - DatabaseOperation::GetLeader, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::controllers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::GetLeader, move |conn| { + Box::pin(async move { + Ok(crate::schema::controllers::table + .load::(conn) + .await?) 
+ }) + }) .await?; if leader.len() > 1 { @@ -1014,26 +1126,33 @@ impl Persistence { use crate::schema::controllers::dsl::*; let updated = self - .with_measured_conn( - DatabaseOperation::UpdateLeader, - move |conn| -> DatabaseResult { + .with_measured_conn(DatabaseOperation::UpdateLeader, move |conn| { + let prev = prev.clone(); + let new = new.clone(); + Box::pin(async move { let updated = match &prev { - Some(prev) => diesel::update(controllers) - .filter(address.eq(prev.address.clone())) - .filter(started_at.eq(prev.started_at)) - .set(( - address.eq(new.address.clone()), - started_at.eq(new.started_at), - )) - .execute(conn)?, - None => diesel::insert_into(controllers) - .values(new.clone()) - .execute(conn)?, + Some(prev) => { + diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn) + .await? + } + None => { + diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn) + .await? + } }; Ok(updated) - }, - ) + }) + }) .await?; if updated == 0 { @@ -1048,12 +1167,13 @@ impl Persistence { /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { let safekeepers: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::safekeepers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::safekeepers::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); @@ -1066,11 +1186,14 @@ impl Persistence { id: i64, ) -> Result { use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| -> DatabaseResult { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn)?) + self.with_conn(move |conn| { + Box::pin(async move { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn) + .await?) 
+ }) }) .await } @@ -1081,26 +1204,30 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - let bind = record - .as_insert_or_update() - .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - - let inserted_updated = diesel::insert_into(safekeepers) - .values(&bind) - .on_conflict(id) - .do_update() - .set(&bind) - .execute(conn)?; - - if inserted_updated != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated - ))); - } + self.with_conn(move |conn| { + let record = record.clone(); + Box::pin(async move { + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - Ok(()) + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn) + .await?; + + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } + + Ok(()) + }) }) .await } @@ -1112,31 +1239,73 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - #[derive(Insertable, AsChangeset)] - #[diesel(table_name = crate::schema::safekeepers)] - struct UpdateSkSchedulingPolicy<'a> { - id: i64, - scheduling_policy: &'a str, - } - let scheduling_policy_ = String::from(scheduling_policy_); + self.with_conn(move |conn| { + Box::pin(async move { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); - let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) - .set(scheduling_policy.eq(scheduling_policy_)) - .execute(conn)?; + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn) + .await?; - if rows_affected != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({rows_affected})", - ))); - } + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } - Ok(()) + Ok(()) + }) }) .await } } +pub(crate) fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + anyhow::bail!("could not parse certificates: {:?}", der_certs.errors); + } + + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs.certs); + Ok(Arc::new(store)) +} + +/// Loads the root certificates and constructs a client config suitable for connecting. +/// This function is blocking. +pub fn client_config_with_root_certs() -> anyhow::Result { + Ok( + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(load_certs()?) + .with_no_client_auth(), + ) +} + +fn establish_connection_rustls(config: &str) -> BoxFuture> { + let fut = async { + // We first set up the way we want rustls to work. 
+ let rustls_config = client_config_with_root_certs() + .map_err(|err| ConnectionError::BadConnection(format!("{err:?}")))?; + let tls = tokio_postgres_rustls::MakeRustlsConnect::new(rustls_config); + let (client, conn) = tokio_postgres::connect(config, tls) + .await + .map_err(|e| ConnectionError::BadConnection(e.to_string()))?; + + AsyncPgConnection::try_from_client_and_connection(client, conn).await + }; + fut.boxed() +} + /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive( QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq, From b1e451091ad45d3e274dc4119e94f0d0307650b5 Mon Sep 17 00:00:00 2001 From: OBBO67 <35974943+OBBO67@users.noreply.github.com> Date: Mon, 3 Feb 2025 11:54:07 +0000 Subject: [PATCH 46/78] pageserver: clean up references to timeline delete marker, uninit marker (#5718) (#10627) ## Problem Since [#5580](https://github.com/neondatabase/neon/pull/5580) the delete and uninit file markers are no longer needed. ## Summary of changes Remove the remaining code for the delete and uninit markers. Additionally removes the `ends_with_suffix` function as it is no longer required. Closes [#5718](https://github.com/neondatabase/neon/issues/5718). --- pageserver/src/lib.rs | 27 --------------------------- pageserver/src/tenant.rs | 7 +------ 2 files changed, 1 insertion(+), 33 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ff6af3566c82..f43cd08cf7ba 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -263,14 +263,6 @@ pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; /// data directory at pageserver startup can be automatically removed. pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp"; -/// A marker file to mark that a timeline directory was not fully initialized. -/// If a timeline directory with this marker is encountered at pageserver startup, -/// the timeline directory and the marker file are both removed. -/// Full path: `tenants//timelines/___uninit`. -pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; - -pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; - pub fn is_temporary(path: &Utf8Path) -> bool { match path.file_name() { Some(name) => name.ends_with(TEMP_FILE_SUFFIX), @@ -278,25 +270,6 @@ pub fn is_temporary(path: &Utf8Path) -> bool { } } -fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool { - match path.file_name() { - Some(name) => name.ends_with(suffix), - None => false, - } -} - -// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid -// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once -// from the name. - -pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool { - ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) -} - -pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool { - ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) -} - /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. 
/// diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 657cc78e2c26..1914a955625b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -95,7 +95,6 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; -use crate::is_uninit_mark; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::CONCURRENT_INITDBS; use crate::metrics::INITDB_RUN_TIME; @@ -1793,11 +1792,7 @@ impl Tenant { let entry = entry.context("read timeline dir entry")?; let entry_path = entry.path(); - let purge = if crate::is_temporary(entry_path) - // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718) - || is_uninit_mark(entry_path) - || crate::is_delete_mark(entry_path) - { + let purge = if crate::is_temporary(entry_path) { true } else { match TimelineId::try_from(entry_path.file_name()) { From b1bc33eb4d23c77ce0b4a2c9c8d8453e1f7cd5ee Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 3 Feb 2025 12:44:47 +0000 Subject: [PATCH 47/78] Fix logical_replication_sync test fixture (#10531) Fixes flaky test_lr_with_slow_safekeeper test #10242 Fix query to `pg_catalog.pg_stat_subscription` catalog to handle table synchronization and parallel LR correctly. --- test_runner/fixtures/neon_fixtures.py | 54 +++++++++++++------ .../performance/test_logical_replication.py | 4 +- test_runner/regress/test_compute_catalog.py | 7 ++- test_runner/regress/test_layer_bloating.py | 2 +- .../regress/test_logical_replication.py | 28 +++++----- .../test_physical_and_logical_replicaiton.py | 4 +- .../regress/test_subscriber_branching.py | 3 +- 7 files changed, 64 insertions(+), 38 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7e3cc1982960..8909f7f2493c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4996,13 +4996,35 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) +# wait for subscriber to catch up with publisher def logical_replication_sync( subscriber: PgProtocol, publisher: PgProtocol, + # pass subname explicitly to avoid confusion + # when multiple subscriptions are present + subname: str, sub_dbname: str | None = None, pub_dbname: str | None = None, -) -> Lsn: +): """Wait logical replication subscriber to sync with publisher.""" + + def initial_sync(): + # first check if the subscription is active `s`=`synchronized`, `r` = `ready` + query = f"""SELECT 1 FROM pg_subscription_rel join pg_catalog.pg_subscription + on pg_subscription_rel.srsubid = pg_subscription.oid + WHERE srsubstate NOT IN ('r', 's') and subname='{subname}'""" + + if sub_dbname is not None: + res = subscriber.safe_psql(query, dbname=sub_dbname) + else: + res = subscriber.safe_psql(query) + + assert (res is None) or (len(res) == 0) + + wait_until(initial_sync) + + # wait for the subscription to catch up with current state of publisher + # caller is responsible to call checkpoint before calling this function if pub_dbname is not None: publisher_lsn = Lsn( publisher.safe_psql("SELECT pg_current_wal_flush_lsn()", dbname=pub_dbname)[0][0] @@ -5010,23 +5032,23 @@ def logical_replication_sync( else: publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - while True: + def subscriber_catch_up(): + query = f"select latest_end_lsn from pg_catalog.pg_stat_subscription where latest_end_lsn is NOT NULL and 
subname='{subname}'" + if sub_dbname is not None: - res = subscriber.safe_psql( - "select latest_end_lsn from pg_catalog.pg_stat_subscription", dbname=sub_dbname - )[0][0] + res = subscriber.safe_psql(query, dbname=sub_dbname) else: - res = subscriber.safe_psql( - "select latest_end_lsn from pg_catalog.pg_stat_subscription" - )[0][0] - - if res: - log.info(f"subscriber_lsn={res}") - subscriber_lsn = Lsn(res) - log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") - if subscriber_lsn >= publisher_lsn: - return subscriber_lsn - time.sleep(0.5) + res = subscriber.safe_psql(query) + + assert res is not None + + res_lsn = res[0][0] + log.info(f"subscriber_lsn={res_lsn}") + subscriber_lsn = Lsn(res_lsn) + log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") + assert subscriber_lsn >= publisher_lsn + + wait_until(subscriber_catch_up) def tenant_get_shards( diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 9d653d1a1ef8..fdc56cc496e5 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -44,13 +44,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") pg_bin.run_capture(["pgbench", "-c10", "-T100", "-Mprepared", endpoint.connstr()]) # Wait logical replication to sync start = time.time() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") log.info(f"Sync with master took {time.time() - start} seconds") sum_master = cast("int", endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]) diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index f0878b2631d1..50a922a616b0 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -183,6 +183,7 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');") cursor.execute("CREATE TABLE t(a int)") cursor.execute("INSERT INTO t VALUES (1)") + cursor.execute("CHECKPOINT") # connect to the subscriber_db and create a subscription # Note that we need to create subscription with @@ -195,7 +196,11 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # wait for the subscription to be active logical_replication_sync( - endpoint, endpoint, sub_dbname="subscriber_db", pub_dbname="publisher_db" + endpoint, + endpoint, + "mysub", + sub_dbname="subscriber_db", + pub_dbname="publisher_db", ) # Check that replication is working diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index d9043fef7fb3..0260704ebf47 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -63,7 +63,7 @@ def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): cur.execute("set statement_timeout=0") cur.execute("select create_snapshots(10000)") # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline) 
env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, timeline, compact=False) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 89087631092d..3a92f0d1d1f1 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -55,13 +55,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") # insert some data cur.execute("insert into t values (generate_series(1,1000), 0)") # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 1000 # now stop subscriber... @@ -78,7 +78,7 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr vanilla_pg.start() # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") # Check that subscribers receives all data assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 2000 @@ -148,7 +148,7 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr endpoint.start() vanilla_pg.start() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") eq_q = "select testcolumn1, testcolumn2, testcolumn3 from replication_example order by 1, 2, 3" assert vanilla_pg.safe_psql(eq_q) == endpoint.safe_psql(eq_q) log.info("rewriteheap synced") @@ -285,7 +285,7 @@ def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg: V vanilla_pg.safe_psql("create table t(a int)") connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") vanilla_pg.stop() @@ -321,13 +321,13 @@ def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg: V sk_http = sk.http_client() sk_http.configure_failpoints([("sk-pause-send", "off")]) - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2] # Check that local reads also work with endpoint.connect().cursor() as cur: cur.execute("insert into t values (3)") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3] log_path = vanilla_pg.pgdatadir / "pg.log" @@ -365,7 +365,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres) log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") vanilla_pg.stop() wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) @@ -375,7 +375,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: 
VanillaPostgres) # this should flush current wal page cur.execute("insert into replication_example values (3, 4)") vanilla_pg.start() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql( "select sum(somedata) from replication_example" ) == endpoint.safe_psql("select sum(somedata) from replication_example") @@ -409,18 +409,18 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): # Test simple insert, update, delete. But with very large values value = random_string(10_000_000) cur.execute(f"INSERT INTO reptbl VALUES (1, '{value}')") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(1, value)] # Test delete, and reinsert another value cur.execute("DELETE FROM reptbl WHERE id = 1") cur.execute(f"INSERT INTO reptbl VALUES (2, '{value}')") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] value = random_string(10_000_000) cur.execute(f"UPDATE reptbl SET largeval='{value}'") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] endpoint.stop() @@ -428,7 +428,7 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): cur = endpoint.connect().cursor() value = random_string(10_000_000) cur.execute(f"UPDATE reptbl SET largeval='{value}'") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] @@ -608,7 +608,7 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van for i in range(0, 1000): pcur.execute("INSERT into t values (%s, random()*100000)", (i,)) # wait until sub receives all data - logical_replication_sync(sub, vanilla_pg) + logical_replication_sync(sub, vanilla_pg, "sub") # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data # as flushed we'll now lose it if subscriber restars. 
That's why # logical_replication_wait_flush_lsn_sync is expected to hang while diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py index 3f9824ee677a..229439106b82 100644 --- a/test_runner/regress/test_physical_and_logical_replicaiton.py +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -43,7 +43,7 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE s_cur.execute("select count(*) from t") assert s_cur.fetchall()[0][0] == n_records - logical_replication_sync(vanilla_pg, primary) + logical_replication_sync(vanilla_pg, primary, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records # Check that LR slot is not copied to replica @@ -87,7 +87,7 @@ def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg): s_con = secondary.connect() s_cur = s_con.cursor() - logical_replication_sync(vanilla_pg, primary) + logical_replication_sync(vanilla_pg, primary, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records s_cur.execute("select count(*) from t") diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 645572da8e79..849d4f024d65 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -3,7 +3,7 @@ import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar, wait_until @@ -208,7 +208,6 @@ def insert_data(pub, start): # wake the sub and ensure that it catches up with the new data sub.start(create_test_user=True) with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: - logical_replication_sync(sub, pub) wait_until(check_that_changes_propagated) scur.execute("SELECT count(*) FROM t") res = scur.fetchall() From 23ca8b061ba4c3a53b38d3f49d3ae7be1ed696f8 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Mon, 3 Feb 2025 13:55:48 +0100 Subject: [PATCH 48/78] Use actions/checkout for checkout (#10630) ## Problem 1. First of all it's more correct 2. Current usage allows ` Time-of-Check-Time-of-Use (TOCTOU) 'Pwn Request' vulnerabilities`. Please check security slack channel or reach me for more details. I will update PR description after merge. ## Summary of changes 1. Use `actions/checkout` with `ref: ${{ github.event.pull_request.head.sha }}` Discovered by and Co-author: @varunsh-coder --- .github/workflows/approved-for-ci-run.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 0a0898d30c1c..fc2f36c74b89 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -94,7 +94,9 @@ jobs: echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - - run: gh pr checkout "${PR_NUMBER}" + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} - run: git checkout -b "${BRANCH}" From e617a3a075f4debfa2b50a280a50475f3abed32c Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Mon, 3 Feb 2025 05:34:11 -0800 Subject: [PATCH 49/78] vm-monitor: Improve error display (#10542) Logging errors with the debug format specifier causes multi-line errors, which are sometimes a pain to deal with. 
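For illustration only (a minimal sketch with a made-up error chain, not part of this patch), the two renderings differ like this:

```rust
use anyhow::anyhow;

fn main() {
    // Hypothetical two-level error chain, similar to what the monitor logs.
    let err = anyhow!("connection refused").context("failed to talk to agent");

    // `{:?}` (debug) pretty-prints the chain across several lines:
    //   failed to talk to agent
    //
    //   Caused by:
    //       connection refused
    println!("{err:?}");

    // `{:#}` (alternate display) keeps all causes on a single line:
    //   failed to talk to agent: connection refused
    println!("{err:#}");
}
```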
Instead, we should use anyhow's alternate display format, which shows the same information on a single line. Also adjusted a couple of error messages that were stale. Fixes neondatabase/cloud#14710. --- libs/vm_monitor/src/filecache.rs | 6 +++--- libs/vm_monitor/src/lib.rs | 14 +++++++------- libs/vm_monitor/src/runner.rs | 12 ++++++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/libs/vm_monitor/src/filecache.rs b/libs/vm_monitor/src/filecache.rs index fe71e11197de..4f5bf1c1e327 100644 --- a/libs/vm_monitor/src/filecache.rs +++ b/libs/vm_monitor/src/filecache.rs @@ -177,8 +177,8 @@ impl FileCacheState { crate::spawn_with_cancel( token, |res| { - if let Err(error) = res { - error!(%error, "postgres error") + if let Err(e) = res { + error!(error = format_args!("{e:#}"), "postgres error"); } }, conn, @@ -205,7 +205,7 @@ impl FileCacheState { { Ok(rows) => Ok(rows), Err(e) => { - error!(error = ?e, "postgres error: {e} -> retrying"); + error!(error = format_args!("{e:#}"), "postgres error -> retrying"); let client = FileCacheState::connect(&self.conn_str, self.token.clone()) .await diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index 1b13c8e0b23d..0cd97d4ca122 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -191,15 +191,12 @@ async fn start_monitor( .await; let mut monitor = match monitor { Ok(Ok(monitor)) => monitor, - Ok(Err(error)) => { - error!(?error, "failed to create monitor"); + Ok(Err(e)) => { + error!(error = format_args!("{e:#}"), "failed to create monitor"); return; } Err(_) => { - error!( - ?timeout, - "creating monitor timed out (probably waiting to receive protocol range)" - ); + error!(?timeout, "creating monitor timed out"); return; } }; @@ -207,6 +204,9 @@ async fn start_monitor( match monitor.run().await { Ok(()) => info!("monitor was killed due to new connection"), - Err(e) => error!(error = ?e, "monitor terminated unexpectedly"), + Err(e) => error!( + error = format_args!("{e:#}"), + "monitor terminated unexpectedly" + ), } } diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 8605314ba9e5..8839f5803f81 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -370,12 +370,16 @@ impl Runner { }), InboundMsgKind::InvalidMessage { error } => { warn!( - %error, id, "received notification of an invalid message we sent" + error = format_args!("{error:#}"), + id, "received notification of an invalid message we sent" ); Ok(None) } InboundMsgKind::InternalError { error } => { - warn!(error, id, "agent experienced an internal error"); + warn!( + error = format_args!("{error:#}"), + id, "agent experienced an internal error" + ); Ok(None) } InboundMsgKind::HealthCheck {} => { @@ -476,7 +480,7 @@ impl Runner { // gives the outermost cause, and the debug impl // pretty-prints the error, whereas {:#} contains all the // causes, but is compact (no newlines). 
- warn!(error = format!("{e:#}"), "error handling message"); + warn!(error = format_args!("{e:#}"), "error handling message"); OutboundMsg::new( OutboundMsgKind::InternalError { error: e.to_string(), @@ -492,7 +496,7 @@ impl Runner { .context("failed to send message")?; } Err(e) => warn!( - error = format!("{e}"), + error = format_args!("{e:#}"), msg = ?msg, "received error message" ), From 43682624b5548136f321254e88cd9b7f7453ae61 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 3 Feb 2025 13:41:41 +0000 Subject: [PATCH 50/78] CI(pg-clients): fix logical replication tests (#10623) ## Problem Tests for logical replication (on Staging) have been failing for some time because logical replication is not enabled for them. This issue occurred after switching to an org API key with a different default setting, where logical replication was not enabled by default. ## Summary of changes - Add `enable_logical_replication` input to `actions/neon-project-create` - Enable logical replication in `test-logical-replication` job --- .../actions/neon-project-create/action.yml | 12 +++++++---- .github/workflows/pg-clients.yml | 6 ++++-- test_runner/logical_repl/README.md | 20 +++++++++++-------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 11f46bce8ef1..c9f6b0832eeb 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -41,7 +41,10 @@ inputs: description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library' required: false default: '/tmp/neon/pg_install/v16/lib' - + project_settings: + description: 'A JSON object with project settings' + required: false + default: '{}' outputs: dsn: @@ -73,7 +76,7 @@ runs: \"provisioner\": \"k8s-neonvm\", \"autoscaling_limit_min_cu\": ${MIN_CU}, \"autoscaling_limit_max_cu\": ${MAX_CU}, - \"settings\": { } + \"settings\": ${PROJECT_SETTINGS} } }") @@ -92,12 +95,12 @@ runs: if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then # determine tenant ID TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` - + echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))" echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}" - + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) curl -X PUT \ "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \ @@ -118,3 +121,4 @@ runs: STRIPE_SIZE: ${{ inputs.stripe_size }} PSQL: ${{ inputs.psql_path }} LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} + PROJECT_SETTINGS: ${{ inputs.project_settings }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 4947907eb068..abc90c7fe1a7 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -12,8 +12,8 @@ on: pull_request: paths: - '.github/workflows/pg-clients.yml' - - 'test_runner/pg_clients/**' - - 'test_runner/logical_repl/**' + - 'test_runner/pg_clients/**/*.py' + - 'test_runner/logical_repl/**/*.py' - 'poetry.lock' workflow_dispatch: @@ -104,6 +104,8 @@ jobs: with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} 
postgres_version: ${{ env.DEFAULT_PG_VERSION }} + project_settings: >- + {"enable_logical_replication": true} - name: Run tests uses: ./.github/actions/run-python-test-set diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md index 8eca056dda23..449e56e21df6 100644 --- a/test_runner/logical_repl/README.md +++ b/test_runner/logical_repl/README.md @@ -1,13 +1,18 @@ # Logical replication tests +> [!NOTE] +> Neon project should have logical replication enabled: +> +> https://neon.tech/docs/guides/logical-replication-postgres#enable-logical-replication-in-the-source-neon-project + ## Clickhouse ```bash export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb -docker compose -f clickhouse/docker-compose.yml up -d -pytest -m remote_cluster -k test_clickhouse -docker compose -f clickhouse/docker-compose.yml down +docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml up -d +./scripts/pytest -m remote_cluster -k test_clickhouse +docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml down ``` ## Debezium @@ -15,8 +20,7 @@ docker compose -f clickhouse/docker-compose.yml down ```bash export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb -docker compose -f debezium/docker-compose.yml up -d -pytest -m remote_cluster -k test_debezium -docker compose -f debezium/docker-compose.yml down - -``` \ No newline at end of file +docker compose -f test_runner/logical_repl/debezium/docker-compose.yml up -d +./scripts/pytest -m remote_cluster -k test_debezium +docker compose -f test_runner/logical_repl/debezium/docker-compose.yml down +``` From 628a9616c4f0836a8d06dce34f8b4a525a5d7985 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 3 Feb 2025 15:12:41 +0100 Subject: [PATCH 51/78] fix(proxy): Don't use --is-private-access-proxy to disable IP check (#10633) ## Problem * The behavior of this flag changed. Plus, it's not necessary to disable the IP check as long as there are no IPs listed in the local postgres. ## Summary of changes * Drop the flag from the command in the README.md section. * Change the postgres URL passed to proxy to not use the endpoint hostname. * Also swap postgres creation and proxy startup, so the DB is running when proxy comes up. --- proxy/README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/proxy/README.md b/proxy/README.md index 4b98342d7275..ecd54fbbd864 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -106,17 +106,7 @@ cases where it is hard to use rows represented as objects (e.g. when several fie Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`. -Let's create self-signed certificate by running: -```sh -openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" -``` - -Then we need to build proxy with 'testing' feature and run, e.g.: -```sh -RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://proxy:password@endpoint.localtest.me:5432/postgres' --is-private-access-proxy true -c server.crt -k server.key -``` - -We will also need to have a postgres instance. 
Assuming that we have setted up docker we can set it up as follows: +We will need to have a postgres instance. Assuming that we have set up docker we can set it up as follows: ```sh docker run \ --detach \ @@ -133,8 +123,18 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';" ``` +Let's create self-signed certificate by running: +```sh +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" +``` + +Then we need to build proxy with 'testing' feature and run, e.g.: +```sh +RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +``` + Now from client you can start a new session: ```sh PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full" -``` \ No newline at end of file +``` From c774f0a14710846c65f6b82b3838c1496d03af8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 3 Feb 2025 19:21:01 +0100 Subject: [PATCH 52/78] storcon db: allow accepting any TLS certificate (#10640) We encountered some TLS validation errors for the storcon since applying #10614. Add an option to downgrade them to logged errors instead to allow us to debug with more peace. cc issue https://github.com/neondatabase/cloud/issues/23583 --- storage_controller/src/persistence.rs | 93 +++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 880f20306407..45f3108d6b2b 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -27,6 +27,8 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use rustls::client::danger::ServerCertVerifier; +use rustls::client::WebPkiServerVerifier; use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; @@ -1281,14 +1283,95 @@ pub(crate) fn load_certs() -> anyhow::Result> { /// Loads the root certificates and constructs a client config suitable for connecting. /// This function is blocking. -pub fn client_config_with_root_certs() -> anyhow::Result { - Ok( +fn client_config_with_root_certs() -> anyhow::Result { + let client_config = rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") + .expect("ring should support the default protocol versions"); + static DO_CERT_CHECKS: std::sync::OnceLock = std::sync::OnceLock::new(); + let do_cert_checks = + DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_CERT_CHECKS").is_ok()); + Ok(if *do_cert_checks { + client_config .with_root_certificates(load_certs()?) 
- .with_no_client_auth(), - ) + .with_no_client_auth() + } else { + use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified}; + #[derive(Debug)] + struct AcceptAll(Arc); + impl ServerCertVerifier for AcceptAll { + fn verify_server_cert( + &self, + end_entity: &rustls::pki_types::CertificateDer<'_>, + intermediates: &[rustls::pki_types::CertificateDer<'_>], + server_name: &rustls::pki_types::ServerName<'_>, + ocsp_response: &[u8], + now: rustls::pki_types::UnixTime, + ) -> Result { + let r = self.0.verify_server_cert( + end_entity, + intermediates, + server_name, + ocsp_response, + now, + ); + if let Err(err) = r { + tracing::info!( + ?server_name, + "ignoring db connection TLS validation error: {err:?}" + ); + return Ok(ServerCertVerified::assertion()); + } + r + } + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result + { + let r = self.0.verify_tls12_signature(message, cert, dss); + if let Err(err) = r { + tracing::info!( + "ignoring db connection 1.2 signature TLS validation error: {err:?}" + ); + return Ok(HandshakeSignatureValid::assertion()); + } + r + } + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result + { + let r = self.0.verify_tls13_signature(message, cert, dss); + if let Err(err) = r { + tracing::info!( + "ignoring db connection 1.3 signature TLS validation error: {err:?}" + ); + return Ok(HandshakeSignatureValid::assertion()); + } + r + } + fn supported_verify_schemes(&self) -> Vec { + self.0.supported_verify_schemes() + } + } + let verifier = AcceptAll( + WebPkiServerVerifier::builder_with_provider( + load_certs()?, + Arc::new(ring::default_provider()), + ) + .build()?, + ); + client_config + .dangerous() + .with_custom_certificate_verifier(Arc::new(verifier)) + .with_no_client_auth() + }) } fn establish_connection_rustls(config: &str) -> BoxFuture> { From 715e20343a64a6194feffba0e67d4f6a47d4c83a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Feb 2025 19:01:16 +0000 Subject: [PATCH 53/78] storage controller: improve scheduling of tenants created in PlacementPolicy::Secondary (#10590) ## Problem I noticed when onboarding lots of tenants that the AZ scheduling violation stat was climbing, before falling later as optimisations happened. This was happening because we first add the tenant with PlacementPolicy::Secondary, and then later go to PlacementPolicy::Attached, and the scheduler's behavior led to a bad AZ choice: 1. Create a secondary location in the non-preferred AZ 2. Upgrade to Attached where we promote that non-preferred-AZ location to attached and then create another secondary 3. Optimiser later realises we're in the wrong AZ and moves us ## Summary of changes - Extend some logging to give more information about AZs - When scheduling secondary location in PlacementPolicy::Secondary, select it as if we were attached: in this mode, our business goal is to have a warm pageserver location that we can make available as attached quickly if needed, therefore we want it to be in the preferred AZ. 
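  A condensed sketch of that change (taken from the diff below, with the generic type parameter written out; simplified excerpt, not standalone code):

  ```rust
  // In PlacementPolicy::Secondary, the lone secondary is scheduled with the
  // attached-shard heuristics, so it lands in the preferred AZ and stays warm
  // for a later transition back to Attached.
  let node_id = scheduler.schedule_shard::<AttachedShardTag>(
      &[],
      &self.intent.preferred_az_id,
      context,
  )?;
  ```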
- Make optimize_secondary logic the same, so that it will consider a secondary location in the preferred AZ to be optimal when in PlacementPolicy::Secondary - When transitioning to from PlacementPolicy::Attached(N) to PlacementPolicy::Secondary, instead of arbitrarily picking a location to keep, prefer to keep the location in the preferred AZ --- storage_controller/src/scheduler.rs | 5 +- storage_controller/src/tenant_shard.rs | 155 ++++++++++++++++-- .../regress/test_storage_controller.py | 7 + 3 files changed, 155 insertions(+), 12 deletions(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index f5cab9dd5746..f9e72862ae3d 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -774,8 +774,9 @@ impl Scheduler { if !matches!(context.mode, ScheduleMode::Speculative) { tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", - scores.iter().map(|i| i.node_id().0).collect::>() + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", + scores.iter().map(|i| i.node_id().0).collect::>(), + preferred_az, ); } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index d344e27e3180..302104dc9773 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -710,7 +710,11 @@ impl TenantShard { modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard::( + // + // We use [`AttachedShardTag`] because when a secondary location is the only one + // a shard has, we expect that its next use will be as an attached location: we want + // the tenant to be ready to warm up and run fast in their preferred AZ. + let node_id = scheduler.schedule_shard::( &[], &self.intent.preferred_az_id, context, @@ -719,9 +723,17 @@ impl TenantShard { modified = true; } while self.intent.secondary.len() > 1 { - // We have no particular preference for one secondary location over another: just - // arbitrarily drop from the end - self.intent.pop_secondary(scheduler); + // If we have multiple secondaries (e.g. when transitioning from Attached to Secondary and + // having just demoted our attached location), then we should prefer to keep the location + // in our preferred AZ. Tenants in Secondary mode want to be in the preferred AZ so that + // they have a warm location to become attached when transitioning back into Attached. + + let mut candidates = self.intent.get_secondary().clone(); + // Sort to get secondaries outside preferred AZ last + candidates + .sort_by_key(|n| scheduler.get_node_az(n).as_ref() != self.preferred_az()); + let secondary_to_remove = candidates.pop().unwrap(); + self.intent.remove_secondary(scheduler, secondary_to_remove); modified = true; } } @@ -1079,12 +1091,31 @@ impl TenantShard { None => vec![], }; - let replacement = self.find_better_location::( - scheduler, - &schedule_context, - *secondary, - &exclude, - ); + let replacement = match &self.policy { + PlacementPolicy::Attached(_) => { + // Secondaries for an attached shard should be scheduled using `SecondaryShardTag` + // to avoid placing them in the preferred AZ. 
+ self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ) + } + PlacementPolicy::Secondary => { + // In secondary-only mode, we want our secondary locations in the preferred AZ, + // so that they're ready to take over as an attached location when we transition + // into PlacementPolicy::Attached. + self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ) + } + PlacementPolicy::Detached => None, + }; + assert!(replacement != Some(*secondary)); if let Some(replacement) = replacement { // We have found a candidate and confirmed that its score is preferable @@ -2687,4 +2718,108 @@ pub(crate) mod tests { } Ok(()) } + + /// Check how the shard's scheduling behaves when in PlacementPolicy::Secondary mode. + #[test] + fn tenant_secondary_scheduling() -> anyhow::Result<()> { + let az_a = AvailabilityZone("az-a".to_string()); + let nodes = make_test_nodes( + 3, + &[ + az_a.clone(), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); + + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Secondary); + tenant_shard.intent.preferred_az_id = Some(az_a.clone()); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_none()); + + // Should have scheduled into the preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + // Switch to PlacementPolicy::Attached + tenant_shard.policy = PlacementPolicy::Attached(1); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_some()); + // Secondary should now be in non-preferred AZ + assert_ne!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + // Attached should be in preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.attached.unwrap()) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + // Switch back to PlacementPolicy::Secondary + tenant_shard.policy = PlacementPolicy::Secondary; + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_none()); + // When we picked a location to keep, we should have kept the one in the preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + tenant_shard.intent.clear(&mut scheduler); + + Ok(()) 
+ } } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 350fe31099fb..11a4d0920210 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -373,6 +373,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up but imports the generation number. """ + neon_env_builder.num_azs = 3 env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -409,6 +410,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up "node_secondary" ][0] + # Check that the secondary's scheduling is stable + assert env.storage_controller.reconcile_all() == 0 + # Call into storage controller to onboard the tenant generation += 1 r = virtual_ps_http.tenant_location_conf( @@ -460,6 +464,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) assert len(r["shards"]) == 1 + # Check that onboarding did not result in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + # We should see the tenant is now attached to the pageserver managed # by the sharding service origin_tenants = origin_ps.http_client().tenant_list() From 06b45fd0fd38515cd431aab0d9baae7e13a52058 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 3 Feb 2025 20:23:12 +0100 Subject: [PATCH 54/78] utils/logging: add `critical!` macro and metric (#10641) ## Problem We don't currently have good alerts for critical errors, e.g. data loss/corruption. Touches #10094. ## Summary of changes Add a `critical!` macro and corresponding `libmetrics_tracing_event_count{level="critical"}` metric. This will: * Emit an `ERROR` log message with prefix `"CRITICAL:"` and a backtrace. * Increment `libmetrics_tracing_event_count{level="critical"}`, and indirectly `level="error"`. * Trigger a pageable alert (via the metric above). * In debug builds, panic the process. I'll add uses of the macro separately. --- libs/utils/src/logging.rs | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index e205d60d747d..753f05b6fd1c 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -5,6 +5,24 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +/// Logs a critical error, similarly to `tracing::error!`. This will: +/// +/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. +/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". +/// * Trigger a pageable alert (via the metric above). +/// * In debug builds, panic the process. +#[macro_export] +macro_rules! critical { + ($($arg:tt)*) => { + if cfg!(debug_assertions) { + panic!($($arg)*); + } + $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); + let backtrace = std::backtrace::Backtrace::capture(); + tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); + }; +} + #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { @@ -25,7 +43,10 @@ impl LogFormat { } } -struct TracingEventCountMetric { +pub struct TracingEventCountMetric { + /// CRITICAL is not a `tracing` log level. 
Instead, we increment it in the `critical!` macro, + /// and also emit it as a regular error. These are thus double-counted, but that seems fine. + critical: IntCounter, error: IntCounter, warn: IntCounter, info: IntCounter, @@ -33,7 +54,7 @@ struct TracingEventCountMetric { trace: IntCounter, } -static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { +pub static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { let vec = metrics::register_int_counter_vec!( "libmetrics_tracing_event_count", "Number of tracing events, by level", @@ -46,6 +67,7 @@ static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| impl TracingEventCountMetric { fn new(vec: IntCounterVec) -> Self { Self { + critical: vec.with_label_values(&["critical"]), error: vec.with_label_values(&["error"]), warn: vec.with_label_values(&["warn"]), info: vec.with_label_values(&["info"]), @@ -54,6 +76,11 @@ impl TracingEventCountMetric { } } + // Allow public access from `critical!` macro. + pub fn inc_critical(&self) { + self.critical.inc(); + } + fn inc_for_level(&self, level: tracing::Level) { let counter = match level { tracing::Level::ERROR => &self.error, From d80cbb244312aacfacbc5b4b8b26efbe63752a56 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 19:42:40 +0000 Subject: [PATCH 55/78] build(deps): bump openssl from 0.10.66 to 0.10.70 in /test_runner/pg_clients/rust/tokio-postgres in the cargo group across 1 directory (#10642) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- test_runner/pg_clients/rust/tokio-postgres/Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 354fc1574510..0b138bf1677a 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.66" +version = "0.10.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" +checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.103" +version = "0.9.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" +checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" dependencies = [ "cc", "libc", From c1be84197eb6f0f8c793bd251428501097b7c580 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 3 Feb 2025 15:55:47 -0500 Subject: [PATCH 56/78] feat(pageserver): preempt image layer generation if L0 piles up (#10572) ## Problem Image layer generation could block L0 compactions for a long time. ## Summary of changes * Refactored the return value of `create_image_layers_for_*` functions to make it self-explainable. * Preempt image layer generation in `Try` mode if L0 piles up. Note that we might potentially run into a state that only the beginning part of the keyspace gets image coverage. 
In that case, we either need to implement something to prioritize some keyspaces with image coverage, or tune the image_creation_threshold to ensure that the frequency of image creation could keep up with L0 compaction. --------- Signed-off-by: Alex Chi Z Co-authored-by: Erik Grinaker --- control_plane/src/pageserver.rs | 5 + libs/pageserver_api/src/config.rs | 9 + libs/pageserver_api/src/models.rs | 8 + pageserver/src/tenant.rs | 3 + pageserver/src/tenant/config.rs | 12 ++ .../storage_layer/batch_split_writer.rs | 4 + pageserver/src/tenant/timeline.rs | 197 +++++++++++++----- pageserver/src/tenant/timeline/compaction.rs | 30 ++- .../regress/test_attach_tenant_config.py | 1 + 9 files changed, 209 insertions(+), 60 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 383c1746846b..dd37bfc4071f 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -388,6 +388,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_check_threshold' as integer")?, + image_creation_preempt_threshold: settings + .remove("image_creation_preempt_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_preempt_threshold' as integer")?, pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 422da0dc95c9..a0b5feea948e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -323,6 +323,10 @@ pub struct TenantConfigToml { // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, + // How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction. + // Set to 0 to disable preemption. + pub image_creation_preempt_threshold: usize, + /// The length for an explicit LSN lease request. /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. #[serde(with = "humantime_serde")] @@ -547,6 +551,10 @@ pub mod tenant_conf_defaults { // Relevant: https://github.com/neondatabase/neon/issues/3394 pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image + // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we + // want to enable this feature. 
+ pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; @@ -605,6 +613,7 @@ impl Default for TenantConfigToml { lazy_slru_download: false, timeline_get_throttle: crate::models::ThrottleConfig::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: false, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 43447c67bd0e..19beb37ab34c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -498,6 +498,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub image_layer_creation_check_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_creation_preempt_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub lsn_lease_length: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub lsn_lease_length_for_ts: FieldPatch, @@ -544,6 +546,7 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, + pub image_creation_preempt_threshold: Option, pub lsn_lease_length: Option, pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, @@ -581,6 +584,7 @@ impl TenantConfig { mut lazy_slru_download, mut timeline_get_throttle, mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, @@ -635,6 +639,9 @@ impl TenantConfig { patch .image_layer_creation_check_threshold .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); patch.lsn_lease_length.apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts @@ -679,6 +686,7 @@ impl TenantConfig { lazy_slru_download, timeline_get_throttle, image_layer_creation_check_threshold, + image_creation_preempt_threshold, lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1914a955625b..80a61eba924b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5486,6 +5486,9 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), + image_creation_preempt_threshold: Some( + tenant_conf.image_creation_preempt_threshold, + ), lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 139ed27bd200..972837dc442c 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -357,6 +357,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_creation_preempt_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = 
"humantime_serde")] #[serde(default)] @@ -453,6 +456,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), + image_creation_preempt_threshold: self + .image_creation_preempt_threshold + .unwrap_or(global_conf.image_creation_preempt_threshold), lsn_lease_length: self .lsn_lease_length .unwrap_or(global_conf.lsn_lease_length), @@ -504,6 +510,7 @@ impl TenantConfOpt { mut lazy_slru_download, mut timeline_get_throttle, mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, @@ -578,6 +585,9 @@ impl TenantConfOpt { patch .image_layer_creation_check_threshold .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); patch .lsn_lease_length .map(|v| humantime::parse_duration(&v))? @@ -626,6 +636,7 @@ impl TenantConfOpt { lazy_slru_download, timeline_get_throttle, image_layer_creation_check_threshold, + image_creation_preempt_threshold, lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, @@ -689,6 +700,7 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + image_creation_preempt_threshold: value.image_creation_preempt_threshold, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), timeline_offloading: value.timeline_offloading, diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 22d8b81bccfd..7da51c27df99 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -166,6 +166,10 @@ impl BatchLayerWriter { // END: catch every error and do the recovery in the above section Ok(generated_layers) } + + pub fn pending_layer_num(&self) -> usize { + self.generated_layer_writers.len() + } } /// An image writer that takes images and produces multiple image layers. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6a8eaa4d957..d65b382e50ab 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -189,6 +189,14 @@ pub enum ImageLayerCreationMode { Initial, } +#[derive(Clone, Debug, Default)] +pub enum LastImageLayerCreationStatus { + Incomplete, // TODO: record the last key being processed + Complete, + #[default] + Initial, +} + impl std::fmt::Display for ImageLayerCreationMode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self) @@ -347,6 +355,8 @@ pub struct Timeline { // garbage collecting data that is still needed by the child timelines. pub(crate) gc_info: std::sync::RwLock, + pub(crate) last_image_layer_creation_status: ArcSwap, + // It may change across major versions so for simplicity // keep it after running initdb for a timeline. 
// It is needed in checks when we want to error on some operations @@ -936,9 +946,16 @@ pub(crate) enum ShutdownMode { Hard, } -struct ImageLayerCreationOutcome { - unfinished_image_layer: Option, - next_start_key: Key, +enum ImageLayerCreationOutcome { + /// We generated an image layer + Generated { + unfinished_image_layer: ImageLayerWriter, + }, + /// The key range is empty + Empty, + /// (Only used in metadata image layer creation), after reading the metadata keys, we decide to skip + /// the image layer creation. + Skip, } /// Public interface functions @@ -2349,6 +2366,18 @@ impl Timeline { ) } + fn get_image_creation_preempt_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_creation_preempt_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_creation_preempt_threshold, + ) + } + /// Resolve the effective WAL receiver protocol to use for this tenant. /// /// Priority order is: @@ -2499,6 +2528,10 @@ impl Timeline { gc_info: std::sync::RwLock::new(GcInfo::default()), + last_image_layer_creation_status: ArcSwap::new(Arc::new( + LastImageLayerCreationStatus::default(), + )), + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -4042,15 +4075,20 @@ impl Timeline { } let mut layers_to_upload = Vec::new(); - layers_to_upload.extend( - self.create_image_layers( + let (generated_image_layers, is_complete) = self + .create_image_layers( &partitions, self.initdb_lsn, ImageLayerCreationMode::Initial, ctx, + LastImageLayerCreationStatus::Initial, ) - .await?, + .await?; + debug_assert!( + matches!(is_complete, LastImageLayerCreationStatus::Complete), + "init image generation mode must fully cover the keyspace" ); + layers_to_upload.extend(generated_image_layers); (layers_to_upload, None) } else { @@ -4370,7 +4408,6 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, img_range: Range, - start: Key, io_concurrency: IoConcurrency, ) -> Result { let mut wrote_keys = false; @@ -4458,26 +4495,23 @@ impl Timeline { lsn }, ); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: Some(image_layer_writer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } /// Create an image layer for metadata keys. This function produces one image layer for all metadata /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it /// would not be too large to fit in a single image layer. + /// + /// Creating image layers for metadata keys are different from relational keys. Firstly, instead of + /// iterating each key and get an image for each of them, we do a `vectored_get` scan over the sparse + /// keyspace to get all images in one run. Secondly, we use a different image layer generation metrics + /// for metadata keys than relational keys, which is the number of delta files visited during the scan. 
#[allow(clippy::too_many_arguments)] async fn create_image_layer_for_metadata_keys( self: &Arc, @@ -4487,12 +4521,13 @@ impl Timeline { ctx: &RequestContext, img_range: Range, mode: ImageLayerCreationMode, - start: Key, io_concurrency: IoConcurrency, ) -> Result { // Metadata keys image layer creation. let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let begin = Instant::now(); + // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should + // not contain too many keys, otherwise this takes a lot of memory. let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -4517,10 +4552,7 @@ impl Timeline { ); if !trigger_generation && mode == ImageLayerCreationMode::Try { - return Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: img_range.end, - }); + return Ok(ImageLayerCreationOutcome::Skip); } if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4551,20 +4583,12 @@ impl Timeline { lsn } ); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: Some(image_layer_writer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } @@ -4620,6 +4644,8 @@ impl Timeline { decision } + /// Returns the image layers generated and an enum indicating whether the process is fully completed. + /// true = we have generate all image layers, false = we preempt the process for L0 compaction. #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4627,7 +4653,8 @@ impl Timeline { lsn: Lsn, mode: ImageLayerCreationMode, ctx: &RequestContext, - ) -> Result, CreateImageLayersError> { + last_status: LastImageLayerCreationStatus, + ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); // We need to avoid holes between generated image layers. @@ -4641,10 +4668,23 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. 
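A minimal sketch of the preemption rule applied further down in this function. The helper name and the parameter `pending_image_layers` are illustrative assumptions, and the concrete numbers are assumed too (image_creation_preempt_threshold = 5 as in the regression test in this series, compaction_threshold = 10); only the shape of the check comes from the patch:

    // Hypothetical helper, not part of the patch: mirrors the check performed in the
    // ImageLayerCreationMode::Try branch below. With the assumed values (5 * 10 = 50),
    // image layer creation yields to L0 compaction once at least one image layer is
    // pending and 50 or more L0 layers have accumulated. A threshold of 0 keeps
    // preemption disabled.
    fn should_preempt_image_creation(
        image_creation_preempt_threshold: usize, // assumed 5
        compaction_threshold: usize,             // assumed 10
        pending_image_layers: usize,             // assumed stand-in for pending_layer_num()
        num_of_l0_layers: usize,
    ) -> bool {
        let preempt_at = image_creation_preempt_threshold * compaction_threshold;
        preempt_at != 0 && pending_image_layers >= 1 && num_of_l0_layers >= preempt_at
    }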
let mut start = Key::MIN; - let check_for_image_layers = self.should_check_if_image_layers_required(lsn); + let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status { + info!( + "resuming image layer creation: last_status={:?}", + last_status + ); + true + } else { + self.should_check_if_image_layers_required(lsn) + }; let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + let mut all_generated = true; + + let mut partition_processed = 0; + let total_partitions = partitioning.parts.len(); + for partition in partitioning.parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4717,17 +4757,13 @@ impl Timeline { .map_err(|_| CreateImageLayersError::Cancelled)?, ); - let ImageLayerCreationOutcome { - unfinished_image_layer, - next_start_key, - } = if !compact_metadata { + let outcome = if !compact_metadata { self.create_image_layer_for_rel_blocks( partition, image_layer_writer, lsn, ctx, img_range.clone(), - start, io_concurrency, ) .await? @@ -4739,18 +4775,58 @@ impl Timeline { ctx, img_range.clone(), mode, - start, io_concurrency, ) .await? }; - start = next_start_key; - if let Some(unfinished_image_layer) = unfinished_image_layer { - batch_image_writer.add_unfinished_image_writer( + match outcome { + ImageLayerCreationOutcome::Empty => { + // No data in this partition, so we don't need to create an image layer (for now). + // The next image layer should cover this key range, so we don't advance the `start` + // key. + } + ImageLayerCreationOutcome::Generated { unfinished_image_layer, - img_range, - lsn, - ); + } => { + batch_image_writer.add_unfinished_image_writer( + unfinished_image_layer, + img_range.clone(), + lsn, + ); + // The next image layer should be generated right after this one. + start = img_range.end; + } + ImageLayerCreationOutcome::Skip => { + // We don't need to create an image layer for this partition. + // The next image layer should NOT cover this range, otherwise + // the keyspace becomes empty (reads don't go past image layers). + start = img_range.end; + } + } + + partition_processed += 1; + + if let ImageLayerCreationMode::Try = mode { + // We have at least made some progress + if batch_image_writer.pending_layer_num() >= 1 { + // The `Try` mode is currently only used on the compaction path. We want to avoid + // image layer generation taking too long time and blocking L0 compaction. So in this + // mode, we also inspect the current number of L0 layers and skip image layer generation + // if there are too many of them. + let num_of_l0_layers = { + let layers = self.layers.read().await; + layers.layer_map()?.level0_deltas().len() + }; + let image_preempt_threshold = self.get_image_creation_preempt_threshold() + * self.get_compaction_threshold(); + if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { + tracing::info!( + "preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}", + ); + all_generated = false; + break; + } + } } } @@ -4765,14 +4841,35 @@ impl Timeline { .open_mut()? 
.track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); - timer.stop_and_record(); + let duration = timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered if !image_layers.is_empty() { self.update_layer_visibility().await?; } - Ok(image_layers) + let total_layer_size = image_layers + .iter() + .map(|l| l.metadata().file_size) + .sum::(); + + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + + Ok(( + image_layers, + if all_generated { + LastImageLayerCreationStatus::Complete + } else { + LastImageLayerCreationStatus::Incomplete + }, + )) } /// Wait until the background initial logical size calculation is complete, or diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7244e946cb79..9bd61bbac5a6 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,7 +11,7 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - RecordedDuration, Timeline, + LastImageLayerCreationStatus, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -709,7 +709,7 @@ impl Timeline { .extend(sparse_partitioning.into_dense().parts); // 3. Create new image layers for partitions that have been modified "enough". - let image_layers = self + let (image_layers, outcome) = self .create_image_layers( &partitioning, lsn, @@ -722,10 +722,22 @@ impl Timeline { ImageLayerCreationMode::Try }, &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), ) .await?; + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete = outcome { + // Yield and do not do any other kind of compaction. + info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + return Ok(true); + } partitioning.parts.len() } Err(err) => { @@ -3232,11 +3244,7 @@ impl TimelineAdaptor { ranges: self.get_keyspace(key_range, lsn, ctx).await?, }; // TODO set proper (stateful) start. 
The create_image_layer_for_rel_blocks function mostly - let start = Key::MIN; - let ImageLayerCreationOutcome { - unfinished_image_layer, - next_start_key: _, - } = self + let outcome = self .timeline .create_image_layer_for_rel_blocks( &keyspace, @@ -3244,13 +3252,15 @@ impl TimelineAdaptor { lsn, ctx, key_range.clone(), - start, IoConcurrency::sequential(), ) .await?; - if let Some(image_layer_writer) = unfinished_image_layer { - let (desc, path) = image_layer_writer.finish(ctx).await?; + if let ImageLayerCreationOutcome::Generated { + unfinished_image_layer, + } = outcome + { + let (desc, path) = unfinished_image_layer.finish(ctx).await?; let image_layer = Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_images.push(image_layer); diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index e88d245c8ff4..a4b9eabf8e63 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -184,6 +184,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "gc_compaction_enabled": True, "gc_compaction_initial_threshold_kb": 1024000, "gc_compaction_ratio_percent": 200, + "image_creation_preempt_threshold": 5, } vps_http = env.storage_controller.pageserver_api() From e219d48bfe9991f40eb0d6c4f8713c9f8dc9eb05 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 3 Feb 2025 16:56:55 -0500 Subject: [PATCH 57/78] refactor(pageserver): clearify compaction return value (#10643) ## Problem ## Summary of changes Make the return value of the set of compaction functions less confusing. Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 35 +++++++------ pageserver/src/tenant/tasks.rs | 5 +- pageserver/src/tenant/timeline.rs | 13 ++--- pageserver/src/tenant/timeline/compaction.rs | 55 +++++++++++++------- 4 files changed, 67 insertions(+), 41 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 80a61eba924b..c1b408ed72a4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -46,6 +46,7 @@ use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::compaction::CompactionOutcome; use timeline::compaction::GcCompactionQueue; use timeline::import_pgdata; use timeline::offload::offload_timeline; @@ -2907,10 +2908,10 @@ impl Tenant { self: &Arc, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { + ) -> Result { // Don't start doing work during shutdown, or when broken, we do not need those in the logs if !self.is_active() { - return Ok(false); + return Ok(CompactionOutcome::Done); } { @@ -2924,7 +2925,7 @@ impl Tenant { // to AttachedSingle state. 
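A small, self-contained sketch of how the background compaction loop in pageserver/src/tenant/tasks.rs, changed below in this same patch, consumes the outcome: a Pending result schedules the next compaction immediately, while Done waits out the configured period. The function name `next_sleep` and the local enum copy are assumptions for illustration only:

    use std::time::Duration;

    // Stand-in for the enum introduced in pageserver/src/tenant/timeline/compaction.rs.
    enum CompactionOutcome {
        Done,
        Pending,
    }

    // Mirrors the sleep_duration decision in compaction_loop: re-run at once if work remains.
    fn next_sleep(outcome: CompactionOutcome, period: Duration) -> Duration {
        match outcome {
            CompactionOutcome::Pending => Duration::ZERO,
            CompactionOutcome::Done => period,
        }
    }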
if !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); - return Ok(false); + return Ok(CompactionOutcome::Done); } } @@ -2967,7 +2968,7 @@ impl Tenant { // Before doing any I/O work, check our circuit breaker if self.compaction_circuit_breaker.lock().unwrap().is_broken() { info!("Skipping compaction due to previous failures"); - return Ok(false); + return Ok(CompactionOutcome::Done); } let mut has_pending_task = false; @@ -2975,10 +2976,10 @@ impl Tenant { for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload { // pending_task_left == None: cannot compact, maybe still pending tasks - // pending_task_left == Some(true): compaction task left - // pending_task_left == Some(false): no compaction task left + // pending_task_left == Some(Pending): compaction task left + // pending_task_left == Some(Done): no compaction task left let pending_task_left = if *can_compact { - let has_pending_l0_compaction_task = timeline + let compaction_outcome = timeline .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) .await @@ -2996,27 +2997,27 @@ impl Tenant { .fail(&CIRCUIT_BREAKERS_BROKEN, e); } })?; - if has_pending_l0_compaction_task { - Some(true) + if let CompactionOutcome::Pending = compaction_outcome { + Some(CompactionOutcome::Pending) } else { let queue = { let guard = self.scheduled_compaction_tasks.lock().unwrap(); guard.get(timeline_id).cloned() }; if let Some(queue) = queue { - let has_pending_tasks = queue + let outcome = queue .iteration(cancel, ctx, &self.gc_block, timeline) .await?; - Some(has_pending_tasks) + Some(outcome) } else { - Some(false) + Some(CompactionOutcome::Done) } } } else { None }; - has_pending_task |= pending_task_left.unwrap_or(false); - if pending_task_left == Some(false) && *can_offload { + has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending); + if pending_task_left == Some(CompactionOutcome::Done) && *can_offload { pausable_failpoint!("before-timeline-auto-offload"); match offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) @@ -3036,7 +3037,11 @@ impl Tenant { .unwrap() .success(&CIRCUIT_BREAKERS_UNBROKEN); - Ok(has_pending_task) + Ok(if has_pending_task { + CompactionOutcome::Pending + } else { + CompactionOutcome::Done + }) } /// Cancel scheduled compaction tasks diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 3725e2f7fcbc..b6b64d02dd9f 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -11,6 +11,7 @@ use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::throttle::Stats; +use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; use rand::Rng; @@ -206,10 +207,10 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { .run(tenant.compaction_iteration(&cancel, &ctx)) .await; match output { - Ok(has_pending_task) => { + Ok(outcome) => { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if has_pending_task { + sleep_duration = if let CompactionOutcome::Pending = outcome { Duration::ZERO } else { period diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d65b382e50ab..11c0bbdfe5e1 100644 --- 
a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -18,6 +18,7 @@ use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; use camino::Utf8Path; use chrono::{DateTime, Utc}; +use compaction::CompactionOutcome; use enumset::EnumSet; use fail::fail_point; use futures::{stream::FuturesUnordered, StreamExt}; @@ -1679,7 +1680,7 @@ impl Timeline { cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result { + ) -> Result { self.compact_with_options( cancel, CompactOptions { @@ -1701,7 +1702,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result { + ) -> Result { // most likely the cancellation token is from background task, but in tests it could be the // request task as well. @@ -1721,8 +1722,8 @@ impl Timeline { // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(false), - _ = cancel.cancelled() => return Ok(false), + _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done), + _ = cancel.cancelled() => return Ok(CompactionOutcome::Done), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1730,13 +1731,13 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(false); + return Ok(CompactionOutcome::Done); } let result = match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => { self.compact_tiered(cancel, ctx).await?; - Ok(false) + Ok(CompactionOutcome::Done) } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await, }; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9bd61bbac5a6..7dd37d72328e 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -262,13 +262,13 @@ impl GcCompactionQueue { ctx: &RequestContext, gc_block: &GcBlock, timeline: &Arc, - ) -> Result { + ) -> Result { let _one_op_at_a_time_guard = self.consumer_lock.lock().await; let has_pending_tasks; let (id, item) = { let mut guard = self.inner.lock().unwrap(); let Some((id, item)) = guard.queued.pop_front() else { - return Ok(false); + return Ok(CompactionOutcome::Done); }; guard.running = Some((id, item.clone())); has_pending_tasks = !guard.queued.is_empty(); @@ -323,7 +323,11 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.running = None; } - Ok(has_pending_tasks) + Ok(if has_pending_tasks { + CompactionOutcome::Pending + } else { + CompactionOutcome::Done + }) } #[allow(clippy::type_complexity)] @@ -589,6 +593,17 @@ impl CompactionStatistics { } } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] +pub enum CompactionOutcome { + #[default] + /// No layers need to be compacted after this round. Compaction doesn't need + /// to be immediately scheduled. + Done, + /// Still has pending layers to be compacted after this round. Ideally, the scheduler + /// should immediately schedule another compaction. 
+ Pending, +} + impl Timeline { /// TODO: cancellation /// @@ -598,7 +613,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result { + ) -> Result { if options .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) @@ -606,7 +621,7 @@ impl Timeline { self.compact_with_gc(cancel, options, ctx) .await .map_err(CompactionError::Other)?; - return Ok(false); + return Ok(CompactionOutcome::Done); } if options.flags.contains(CompactFlags::DryRun) { @@ -666,9 +681,9 @@ impl Timeline { // Define partitioning schema if needed // 1. L0 Compact - let fully_compacted = { + let l0_compaction_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); - let fully_compacted = self + let l0_compaction_outcome = self .compact_level0( target_file_size, options.flags.contains(CompactFlags::ForceL0Compaction), @@ -676,15 +691,15 @@ impl Timeline { ) .await?; timer.stop_and_record(); - fully_compacted + l0_compaction_outcome }; - if !fully_compacted { + if let CompactionOutcome::Pending = l0_compaction_outcome { // Yield and do not do any other kind of compaction. True means // that we have pending L0 compaction tasks and the compaction scheduler // will prioritize compacting this tenant/timeline again. info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); - return Ok(true); + return Ok(CompactionOutcome::Pending); } // 2. Repartition and create image layers if necessary @@ -736,7 +751,7 @@ impl Timeline { if let LastImageLayerCreationStatus::Incomplete = outcome { // Yield and do not do any other kind of compaction. info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); - return Ok(true); + return Ok(CompactionOutcome::Pending); } partitioning.parts.len() } @@ -765,7 +780,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(false) + Ok(CompactionOutcome::Done) } /// Check for layers that are elegible to be rewritten: @@ -1022,11 +1037,11 @@ impl Timeline { target_file_size: u64, force_compaction_ignore_threshold: bool, ctx: &RequestContext, - ) -> Result { + ) -> Result { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - fully_compacted, + outcome, } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); @@ -1055,12 +1070,12 @@ impl Timeline { if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do - return Ok(true); + return Ok(CompactionOutcome::Done); } self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) .await?; - Ok(fully_compacted) + Ok(outcome) } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. @@ -1602,7 +1617,11 @@ impl Timeline { .into_iter() .map(|x| x.drop_eviction_guard()) .collect::>(), - fully_compacted, + outcome: if fully_compacted { + CompactionOutcome::Done + } else { + CompactionOutcome::Pending + }, }) } } @@ -1613,7 +1632,7 @@ struct CompactLevel0Phase1Result { deltas_to_compact: Vec, // Whether we have included all L0 layers, or selected only part of them due to the // L0 compaction size limit. - fully_compacted: bool, + outcome: CompactionOutcome, } #[derive(Default)] From 8107140f7f8145148f8b9ffd007046c81c724c25 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 4 Feb 2025 11:35:43 +0100 Subject: [PATCH 58/78] Refactor compute dockerfile (#10371) Refactor how extensions are built in compute Dockerfile 1. 
Rename some of the extension layers, so that names correspond more precisely to the upstream repository name and the source directory name. For example, instead of "pg-jsonschema-pg-build", spell it "pg_jsonschema-build". Some of the layer names had the extra "pg-" part, and some didn't; harmonize on not having it. And use an underscore if the upstream project name uses an underscore. 2. Each extension now consists of two dockerfile targets: [extension]-src and [extension]-build. By convention, the -src target downloads the sources and applies any neon-specific patches if necessary. The source tarball is downloaded and extracted under /ext-src. For example, the 'pgvector' extension creates the following files and directory: /ext-src/pgvector.tar.gz # original tarball /ext-src/pgvector.patch # neon-specific patch, copied from patches/ dir /ext-src/pgvector-src/ # extracted tarball, with patch applied This separation avoids re-downloading the sources every time the extension is recompiled. The 'extension-tests' target also uses the [extension]-src layers, by copying the /ext-src/ dirs from all the extensions together into one image. This refactoring came about when I was experimenting with different ways of splitting up the Dockerfile so that each extension would be in a separate file. That's not part of this PR yet, but this is a good step in modularizing the extensions. --- .github/workflows/build_and_test.yml | 2 +- compute/compute-node.Dockerfile | 884 ++++++++++++++++++--------- 2 files changed, 593 insertions(+), 293 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1274543429ca..5a4bdecb994e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -682,7 +682,7 @@ jobs: push: true pull: true file: compute/compute-node.Dockerfile - target: neon-pg-ext-test + target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 32226c56a52d..ea29630001a4 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1,3 +1,81 @@ +# +# This Dockerfile builds the compute image. It is built multiple times to produce +# different images for each PostgreSQL major version. +# +# We use Debian as the base for all the steps. The production images use Debian bookworm +# for v17, and Debian bullseye for older PostgreSQL versions. +# +# ## Intermediary layers +# +# build-tools: This contains Rust compiler toolchain and other tools needed at compile +# time. This is also used for the storage builds. This image is defined in +# build-tools.Dockerfile. +# +# build-deps: Contains C compiler, other build tools, and compile-time dependencies +# needed to compile PostgreSQL and most extensions. (Some extensions need +# extra tools and libraries that are not included in this image. They are +# installed in the extension-specific build stages.) +# +# pg-build: Result of compiling PostgreSQL. The PostgreSQL binaries are copied from +# this to the final image. This is also used as the base for compiling all +# the extensions. +# +# compute-tools: This contains compute_ctl, the launcher program that starts Postgres +# in Neon. 
It also contains a few other tools that are built from the +# sources from this repository and used in compute VMs: 'fast_import' and +# 'local_proxy' +# +# ## Extensions +# +# By convention, the build of each extension consists of two layers: +# +# {extension}-src: Contains the source tarball, possible neon-specific patches, and +# the extracted tarball with the patches applied. All of these are +# under the /ext-src/ directory. +# +# {extension}-build: Contains the installed extension files, under /usr/local/pgsql +# (in addition to the PostgreSQL binaries inherited from the pg-build +# image). A few extensions need extra libraries or other files +# installed elsewhere in the filesystem. They are installed by ONBUILD +# directives. +# +# These are merged together into two layers: +# +# all-extensions: All the extension -build layers merged together +# +# extension-tests: All the extension -src layers merged together. This is used by the +# extension tests. The tests are executed against the compiled image, +# but the tests need test scripts, expected result files etc. from the +# original sources, which are not included in the binary image. +# +# ## Extra components +# +# These are extra included in the compute image, but are not directly used by PostgreSQL +# itself. +# +# pgbouncer: pgbouncer and its configuration +# +# sql_exporter: Metrics exporter daemon. +# +# postgres_exporter: Another metrics exporter daemon, for different sets of metrics. +# +# The configuration files for the metrics exporters are under etc/ directory. We use +# a templating system to handle variations between different PostgreSQL versions, +# building slightly different config files for each PostgreSQL version. +# +# +# ## Final image +# +# The final image puts together the PostgreSQL binaries (pg-build), the compute tools +# (compute-tools), all the extensions (all-extensions) and the extra components into +# one image. +# +# VM image: The final image built by this dockerfile isn't actually the final image that +# we use in computes VMs. There's an extra step that adds some files and makes other +# small adjustments, and builds the QCOV2 filesystem image suitable for using in a VM. +# That step is done by the 'vm-builder' tool. See the vm-compute-node-image job in the +# build_and_test.yml github workflow for how that's done. + ARG PG_VERSION ARG REPOSITORY=neondatabase ARG IMAGE=build-tools @@ -122,17 +200,9 @@ ENV PATH="/usr/local/pgsql/bin:$PATH" # Build PostGIS from the upstream PostGIS mirror. # ######################################################################################### -FROM pg-build AS postgis-build +FROM build-deps AS postgis-src ARG DEBIAN_VERSION ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ - libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ - libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ - protobuf-c-compiler xsltproc \ - && apt clean && rm -rf /var/lib/apt/lists/* - # Postgis 3.5.0 requires SFCGAL 1.4+ # @@ -141,6 +211,7 @@ RUN apt update && \ # and also we must check backward compatibility with older versions of PostGIS. 
# # Use new version only for v17 +WORKDIR /ext-src RUN case "${DEBIAN_VERSION}" in \ "bookworm") \ export SFCGAL_VERSION=1.4.1 \ @@ -154,15 +225,12 @@ RUN case "${DEBIAN_VERSION}" in \ echo "unexpected PostgreSQL version" && exit 1 \ ;; \ esac && \ - mkdir -p /sfcgal && \ wget https://gitlab.com/sfcgal/SFCGAL/-/archive/v${SFCGAL_VERSION}/SFCGAL-v${SFCGAL_VERSION}.tar.gz -O SFCGAL.tar.gz && \ echo "${SFCGAL_CHECKSUM} SFCGAL.tar.gz" | sha256sum --check && \ - mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ - cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ - ninja clean && cp -R /sfcgal/* / + mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . # Postgis 3.5.0 supports v17 +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export POSTGIS_VERSION=3.5.0 \ @@ -178,8 +246,27 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ - mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ - ./autogen.sh && \ + mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . + +# This is reused for pgrouting +FROM pg-build AS postgis-build-deps +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ + libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ + protobuf-c-compiler xsltproc \ + && apt clean && rm -rf /var/lib/apt/lists/* + +FROM postgis-build-deps AS postgis-build +COPY --from=postgis-src /ext-src/ /ext-src/ +WORKDIR /ext-src/sfcgal-src +RUN cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ + ninja clean && cp -R /sfcgal/* / + +WORKDIR /ext-src/postgis-src +RUN ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -202,12 +289,23 @@ RUN case "${PG_VERSION}" in \ cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis +######################################################################################### +# +# Layer "pgrouting-build" +# Build pgrouting. Note: This depends on the postgis-build-deps layer built above +# +######################################################################################### + # Uses versioned libraries, i.e. libpgrouting-3.4 # and may introduce function signature changes between releases # i.e. 
release 3.5.0 has new signature for pg_dijkstra function # # Use new version only for v17 # last release v3.6.2 - Mar 30, 2024 +FROM build-deps AS pgrouting-src +ARG DEBIAN_VERSION +ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export PGROUTING_VERSION=3.6.2 \ @@ -223,8 +321,12 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/pgRouting/pgrouting/archive/v${PGROUTING_VERSION}.tar.gz -O pgrouting.tar.gz && \ echo "${PGROUTING_CHECKSUM} pgrouting.tar.gz" | sha256sum --check && \ - mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ - mkdir build && cd build && \ + mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . + +FROM postgis-build-deps AS pgrouting-build +COPY --from=pgrouting-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgrouting-src +RUN mkdir build && cd build && \ cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \ ninja -j $(getconf _NPROCESSORS_ONLN) && \ ninja -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -236,15 +338,11 @@ RUN case "${PG_VERSION}" in \ # Build plv8 # ######################################################################################### -FROM pg-build AS plv8-build +FROM build-deps AS plv8-src ARG PG_VERSION +WORKDIR /ext-src -COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch - -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - ninja-build python3-dev libncurses5 binutils clang \ - && apt clean && rm -rf /var/lib/apt/lists/* +COPY compute/patches/plv8-3.1.10.patch . # plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -268,9 +366,20 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /plv8-3.1.10.patch; fi && \ + if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + +FROM pg-build AS plv8-build +ARG PG_VERSION +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build python3-dev libncurses5 binutils clang \ + && apt clean && rm -rf /var/lib/apt/lists/* + +COPY --from=plv8-src /ext-src/ /ext-src/ +WORKDIR /ext-src/plv8-src +RUN \ # generate and copy upgrade scripts - mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ + make generate_upgrades && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ @@ -298,16 +407,28 @@ RUN case "${PG_VERSION}" in \ # Build h3_pg # ######################################################################################### -FROM pg-build AS h3-pg-build +FROM build-deps AS h3-pg-src ARG PG_VERSION +WORKDIR /ext-src # not version-specific # last release v4.1.0 - Jan 18, 2023 RUN mkdir -p /h3/usr/ && \ wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ - mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ - mkdir build && cd build && \ + mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . 
+ +# not version-specific +# last release v4.1.3 - Jul 26, 2023 +WORKDIR /ext-src +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ + echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ + mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . + +FROM pg-build AS h3-pg-build +COPY --from=h3-pg-src /ext-src/ /ext-src/ +WORKDIR /ext-src/h3-src +RUN mkdir build && cd build && \ cmake .. -GNinja -DBUILD_BENCHMARKS=0 -DCMAKE_BUILD_TYPE=Release \ -DBUILD_FUZZERS=0 -DBUILD_FILTERS=0 -DBUILD_GENERATORS=0 -DBUILD_TESTING=0 \ && ninja -j $(getconf _NPROCESSORS_ONLN) && \ @@ -315,11 +436,8 @@ RUN mkdir -p /h3/usr/ && \ cp -R /h3/usr / && \ rm -rf build -# not version-specific -# last release v4.1.3 - Jul 26, 2023 -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ - echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ - mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ +WORKDIR /ext-src/h3-pg-src +RUN ls -l && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ @@ -327,19 +445,24 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 ######################################################################################### # -# Layer "unit-pg-build" +# Layer "postgresql-unit-build" # compile unit extension # ######################################################################################### -FROM pg-build AS unit-pg-build +FROM build-deps AS postgresql-unit-src ARG PG_VERSION # not version-specific # last release 7.9 - Sep 15, 2024 +WORKDIR /ext-src RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \ echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \ - mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . + +FROM pg-build AS postgresql-unit-build +COPY --from=postgresql-unit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/postgresql-unit-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. @@ -350,14 +473,15 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz - ######################################################################################### # -# Layer "vector-pg-build" +# Layer "pgvector-build" # compile pgvector extension # ######################################################################################### -FROM pg-build AS vector-pg-build +FROM build-deps AS pgvector-src ARG PG_VERSION -COPY compute/patches/pgvector.patch /pgvector.patch +WORKDIR /ext-src +COPY compute/patches/pgvector.patch . # By default, pgvector Makefile uses `-march=native`. 
We don't want that, # because we build the images on different machines than where we run them. @@ -370,74 +494,94 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \ echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \ - patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ + patch -p1 < /ext-src/pgvector.patch + +FROM pg-build AS pgvector-build +COPY --from=pgvector-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgvector-src +RUN make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### # -# Layer "pgjwt-pg-build" +# Layer "pgjwt-build" # compile pgjwt extension # ######################################################################################### -FROM pg-build AS pgjwt-pg-build +FROM build-deps AS pgjwt-src ARG PG_VERSION # not version-specific # doesn't use releases, last commit f3d82fd - Mar 2, 2023 +WORKDIR /ext-src RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ - mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgjwt-build +COPY --from=pgjwt-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgjwt-src +RUN make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control ######################################################################################### # -# Layer "hypopg-pg-build" +# Layer "hypopg-build" # compile hypopg extension # ######################################################################################### -FROM pg-build AS hypopg-pg-build +FROM build-deps AS hypopg-src ARG PG_VERSION # HypoPG 1.4.1 supports v17 # last release 1.4.1 - Apr 28, 2024 +WORKDIR /ext-src RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ - mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS hypopg-build +COPY --from=hypopg-src /ext-src/ /ext-src/ +WORKDIR /ext-src/hypopg-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control ######################################################################################### # -# Layer "pg-hashids-pg-build" +# Layer "pg_hashids-build" # compile pg_hashids extension # ######################################################################################### -FROM pg-build AS pg-hashids-pg-build +FROM build-deps AS pg_hashids-src ARG PG_VERSION # not version-specific # last release v1.2.1 -Jan 12, 2018 +WORKDIR /ext-src RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ - mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_hashids-build +COPY --from=pg_hashids-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_hashids-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control ######################################################################################### # -# Layer "rum-pg-build" +# Layer "rum-build" # compile rum extension # ######################################################################################### -FROM pg-build AS rum-pg-build +FROM build-deps AS rum-src ARG PG_VERSION -COPY compute/patches/rum.patch /rum.patch +WORKDIR /ext-src +COPY compute/patches/rum.patch . # supports v17 since https://github.com/postgrespro/rum/commit/cb1edffc57736cd2a4455f8d0feab0d69928da25 # doesn't use releases since 1.3.13 - Sep 19, 2022 @@ -445,110 +589,140 @@ COPY compute/patches/rum.patch /rum.patch RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0feab0d69928da25.tar.gz -O rum.tar.gz && \ echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ - patch -p1 < /rum.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + patch -p1 < /ext-src/rum.patch + +FROM pg-build AS rum-build +COPY --from=rum-src /ext-src/ /ext-src/ +WORKDIR /ext-src/rum-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control ######################################################################################### # -# Layer "pgtap-pg-build" +# Layer "pgtap-build" # compile pgTAP extension # ######################################################################################### -FROM pg-build AS pgtap-pg-build +FROM build-deps AS pgtap-src ARG PG_VERSION # pgtap 1.3.3 supports v17 # last release v1.3.3 - Apr 8, 2024 +WORKDIR /ext-src RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ - mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgtap-build +COPY --from=pgtap-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgtap-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control ######################################################################################### # -# Layer "ip4r-pg-build" +# Layer "ip4r-build" # compile ip4r extension # ######################################################################################### -FROM pg-build AS ip4r-pg-build +FROM build-deps AS ip4r-src ARG PG_VERSION # not version-specific # last release v2.4.2 - Jul 29, 2023 +WORKDIR /ext-src RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ - mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . + +FROM pg-build AS ip4r-build +COPY --from=ip4r-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ip4r-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control ######################################################################################### # -# Layer "prefix-pg-build" +# Layer "prefix-build" # compile Prefix extension # ######################################################################################### -FROM pg-build AS prefix-pg-build +FROM build-deps AS prefix-src ARG PG_VERSION # not version-specific # last release v1.2.10 - Jul 5, 2023 +WORKDIR /ext-src RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ - mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS prefix-build +COPY --from=prefix-src /ext-src/ /ext-src/ +WORKDIR /ext-src/prefix-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control ######################################################################################### # -# Layer "hll-pg-build" +# Layer "hll-build" # compile hll extension # ######################################################################################### -FROM pg-build AS hll-pg-build +FROM build-deps AS hll-src ARG PG_VERSION # not version-specific # last release v2.18 - Aug 29, 2023 +WORKDIR /ext-src RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ - mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . + +FROM pg-build AS hll-build +COPY --from=hll-src /ext-src/ /ext-src/ +WORKDIR /ext-src/hll-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control ######################################################################################### # -# Layer "plpgsql-check-pg-build" +# Layer "plpgsql_check-build" # compile plpgsql_check extension # ######################################################################################### -FROM pg-build AS plpgsql-check-pg-build +FROM build-deps AS plpgsql_check-src ARG PG_VERSION # plpgsql_check v2.7.11 supports v17 # last release v2.7.11 - Sep 16, 2024 +WORKDIR /ext-src RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ - mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . + +FROM pg-build AS plpgsql_check-build +COPY --from=plpgsql_check-src /ext-src/ /ext-src/ +WORKDIR /ext-src/plpgsql_check-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control ######################################################################################### # -# Layer "timescaledb-pg-build" +# Layer "timescaledb-build" # compile timescaledb extension # ######################################################################################### -FROM pg-build AS timescaledb-pg-build +FROM build-deps AS timescaledb-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ @@ -565,8 +739,12 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ - mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ - ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ + mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . + +FROM pg-build AS timescaledb-build +COPY --from=timescaledb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/timescaledb-src +RUN ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ @@ -574,14 +752,15 @@ RUN case "${PG_VERSION}" in \ ######################################################################################### # -# Layer "pg-hint-plan-pg-build" +# Layer "pg_hint_plan-build" # compile pg_hint_plan extension # ######################################################################################### -FROM pg-build AS pg-hint-plan-pg-build +FROM build-deps AS pg_hint_plan-src ARG PG_VERSION # version-specific, has separate releases for each version +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ @@ -605,50 +784,51 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ - mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_hint_plan-build +COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_hint_plan-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control ######################################################################################### # -# Layer "pg-cron-pg-build" +# Layer "pg_cron-build" # compile pg_cron extension # ######################################################################################### -FROM pg-build AS pg-cron-pg-build +FROM build-deps AS pg_cron-src ARG PG_VERSION # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. +WORKDIR /ext-src +COPY compute/patches/pg_cron.patch . RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + patch < /ext-src/pg_cron.patch + +FROM pg-build AS pg_cron-build +COPY --from=pg_cron-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_cron-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control ######################################################################################### # -# Layer "rdkit-pg-build" +# Layer "rdkit-build" # compile rdkit extension # ######################################################################################### -FROM pg-build AS rdkit-pg-build +FROM build-deps AS rdkit-src ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - libboost-iostreams1.74-dev \ - libboost-regex1.74-dev \ - libboost-serialization1.74-dev \ - libboost-system1.74-dev \ - libeigen3-dev \ - libboost-all-dev \ - && apt clean && rm -rf /var/lib/apt/lists/* - # rdkit Release_2024_09_1 supports v17 # last release Release_2024_09_1 - Sep 27, 2024 # @@ -656,12 +836,7 @@ RUN apt update && \ # because Release_2024_09_1 has some backward incompatible changes # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 -# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find -# pg_config. For some reason the rdkit cmake script doesn't work with just that, -# however. By also adding /usr/local/pgsql, it works, which is weird because there -# are no executables in that directory. -ENV PATH="/usr/local/pgsql:$PATH" - +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ @@ -677,8 +852,28 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ - mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ - cmake \ + mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . + +FROM pg-build AS rdkit-build +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + libboost-iostreams1.74-dev \ + libboost-regex1.74-dev \ + libboost-serialization1.74-dev \ + libboost-system1.74-dev \ + libeigen3-dev \ + libboost-all-dev \ + && apt clean && rm -rf /var/lib/apt/lists/* + +COPY --from=rdkit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/rdkit-src + +# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find +# pg_config. For some reason the rdkit cmake script doesn't work with just that, +# however. By also adding /usr/local/pgsql, it works, which is weird because there +# are no executables in that directory. 
+ENV PATH="/usr/local/pgsql:$PATH" +RUN cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ -D RDK_BUILD_INCHI_SUPPORT=ON \ -D RDK_BUILD_AVALON_SUPPORT=ON \ @@ -710,47 +905,57 @@ RUN case "${PG_VERSION}" in \ ######################################################################################### # -# Layer "pg-uuidv7-pg-build" +# Layer "pg_uuidv7-build" # compile pg_uuidv7 extension # ######################################################################################### -FROM pg-build AS pg-uuidv7-pg-build +FROM build-deps AS pg_uuidv7-src ARG PG_VERSION # not version-specific # last release v1.6.0 - Oct 9, 2024 +WORKDIR /ext-src RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ - mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_uuidv7-build +COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_uuidv7-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control ######################################################################################### # -# Layer "pg-roaringbitmap-pg-build" +# Layer "pg_roaringbitmap-build" # compile pg_roaringbitmap extension # ######################################################################################### -FROM pg-build AS pg-roaringbitmap-pg-build +FROM build-deps AS pg_roaringbitmap-src ARG PG_VERSION # not version-specific # last release v0.5.4 - Jun 28, 2022 +WORKDIR /ext-src RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ - mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS pg_roaringbitmap-build +COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_roaringbitmap-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control ######################################################################################### # -# Layer "pg-semver-pg-build" +# Layer "pg_semver-build" # compile pg_semver extension # ######################################################################################### -FROM pg-build AS pg-semver-pg-build +FROM build-deps AS pg_semver-src ARG PG_VERSION # Release 0.40.0 breaks backward compatibility with previous versions @@ -758,6 +963,7 @@ ARG PG_VERSION # Use new version only for v17 # # last release v0.40.0 - Jul 22, 2024 +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ @@ -773,22 +979,27 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/theory/pg-semver/archive/refs/tags/v${SEMVER_VERSION}.tar.gz -O pg_semver.tar.gz && \ echo "${SEMVER_CHECKSUM} pg_semver.tar.gz" | sha256sum --check && \ - mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_semver-build +COPY --from=pg_semver-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_semver-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control ######################################################################################### # -# Layer "pg-embedding-pg-build" +# Layer "pg_embedding-build" # compile pg_embedding extension # ######################################################################################### -FROM pg-build AS pg-embedding-pg-build +FROM build-deps AS pg_embedding-src +ARG PG_VERSION # This is our extension, support stopped in favor of pgvector # TODO: deprecate it -ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -799,29 +1010,44 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install + mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_embedding-build +COPY --from=pg_embedding-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d pg_embedding-src ]; then \ + cd pg_embedding-src && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install; \ + fi ######################################################################################### # -# Layer "pg-anon-pg-build" +# Layer "pg_anon-build" # compile anon extension # ######################################################################################### -FROM pg-build AS pg-anon-pg-build +FROM build-deps AS pg_anon-src ARG PG_VERSION # This is an experimental extension, never got to real production. 
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. +WORKDIR /ext-src RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_anon-build +COPY --from=pg_anon-src /ext-src/ /ext-src/ +WORKDIR /ext-src +RUN if [ -d pg_anon-src ]; then \ + cd pg_anon-src && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \ + fi ######################################################################################### # @@ -887,50 +1113,57 @@ USER root ######################################################################################### # -# Layers "pg-onnx-build" and "pgrag-pg-build" +# Layers "pg-onnx-build" and "pgrag-build" # Compile "pgrag" extensions # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-onnx-build +FROM build-deps AS pgrag-src +ARG PG_VERSION + +WORKDIR /ext-src +RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ + mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + echo "#nothing to test here" > neon-test.sh + +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ + echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ + mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build-pgrx12 AS pgrag-build +COPY --from=pgrag-src /ext-src/ /ext-src/ +# Install build-time dependencies # cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise +WORKDIR /ext-src/onnxruntime-src RUN apt update && apt install --no-install-recommends --no-install-suggests -y \ - python3 python3-pip python3-venv && \ + python3 python3-pip python3-venv protobuf-compiler && \ apt clean && rm -rf /var/lib/apt/lists/* && \ python3 -m venv venv && \ . venv/bin/activate && \ - python3 -m pip install cmake==3.30.5 && \ - wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ - mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + python3 -m pip install cmake==3.30.5 + +RUN . 
venv/bin/activate && \ ./build.sh --config Release --parallel --cmake_generator Ninja \ --skip_submodule_sync --skip_tests --allow_running_as_root - -FROM pg-onnx-build AS pgrag-pg-build - -RUN apt update && apt install --no-install-recommends --no-install-suggests -y protobuf-compiler \ - && apt clean && rm -rf /var/lib/apt/lists/* && \ - wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ - echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ - mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \ - \ - cd exts/rag && \ +WORKDIR /ext-src/pgrag-src +RUN cd exts/rag && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ - \ - cd ../rag_bge_small_en_v15 && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control + +RUN cd exts/rag_bge_small_en_v15 && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ - \ - cd ../rag_jina_reranker_v1_tiny_en && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control + +RUN cd exts/rag_jina_reranker_v1_tiny_en && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control @@ -938,17 +1171,23 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p ######################################################################################### # -# Layer "pg-jsonschema-pg-build" +# Layer "pg_jsonschema-build" # Compile "pg_jsonschema" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build +FROM build-deps AS pg_jsonschema-src ARG PG_VERSION # last release v0.3.3 - Oct 16, 2024 +WORKDIR /ext-src RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \ echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \ - mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ + mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build-pgrx12 AS pg_jsonschema-build +COPY --from=pg_jsonschema-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_jsonschema-src +RUN \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions # against postgres forks that decided to change their ABI name (like us). 
@@ -961,55 +1200,69 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar. ######################################################################################### # -# Layer "pg-graphql-pg-build" +# Layer "pg_graphql-build" # Compile "pg_graphql" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build +FROM build-deps AS pg_graphql-src ARG PG_VERSION # last release v1.5.9 - Oct 16, 2024 +WORKDIR /ext-src +COPY compute/patches/pg_graphql.patch . RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \ - cargo pgrx install --release && \ + patch -p1 < /ext-src/pg_graphql.patch + + +FROM rust-extensions-build-pgrx12 AS pg_graphql-build +COPY --from=pg_graphql-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_graphql-src +RUN cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control ######################################################################################### # -# Layer "pg-tiktoken-build" +# Layer "pg_tiktoken-build" # Compile "pg_tiktoken" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-tiktoken-pg-build +FROM build-deps AS pg_tiktoken-src ARG PG_VERSION # doesn't use releases # 9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7 - on Oct 29, 2024 +WORKDIR /ext-src RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \ echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \ - sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ - cargo pgrx install --release && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pg_tiktoken-build +COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_tiktoken-src +RUN cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control ######################################################################################### # -# Layer "pg-pgx-ulid-build" +# Layer "pgx_ulid-build" # Compile "pgx_ulid" extension for v16 and below # ######################################################################################### -FROM rust-extensions-build AS pg-pgx-ulid-build +FROM build-deps AS pgx_ulid-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15" | "v16") \ ;; \ @@ -1020,20 +1273,28 @@ RUN case "${PG_VERSION}" in \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml + +FROM rust-extensions-build AS pgx_ulid-build +COPY --from=pgx_ulid-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d pgx_ulid-src ]; then \ + cd pgx_ulid-src && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control; \ + fi ######################################################################################### # -# Layer "pg-pgx-ulid-pgrx12-build" +# Layer "pgx_ulid-pgrx12-build" # Compile "pgx_ulid" extension for v17 and up # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build +FROM build-deps AS pgx_ulid-pgrx12-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ ;; \ @@ -1044,23 +1305,32 @@ RUN case "${PG_VERSION}" in \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \ echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control + sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pgx_ulid-pgrx12-build +ARG PG_VERSION +WORKDIR /ext-src +COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ +RUN if [ -d pgx_ulid-src ]; then \ + cd pgx_ulid-src && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control; \ + fi ######################################################################################### # -# Layer "pg-session-jwt-build" +# Layer "pg_session_jwt-build" # Compile "pg_session_jwt" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-session-jwt-build +FROM build-deps AS pg_session_jwt-src ARG PG_VERSION # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs +WORKDIR /ext-src RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ @@ -1068,8 +1338,12 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0 sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \ sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ - sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ - cargo pgrx install --release + sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pg_session_jwt-build +COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_session_jwt-src +RUN cargo pgrx install --release ######################################################################################### # @@ -1078,15 +1352,20 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0 # ######################################################################################### -FROM pg-build AS wal2json-pg-build +FROM build-deps AS wal2json-src ARG PG_VERSION # wal2json wal2json_2_6 supports v17 # last release wal2json_2_6 - Apr 25, 2024 +WORKDIR /ext-src RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS wal2json-build +COPY --from=wal2json-src /ext-src/ /ext-src/ +WORKDIR /ext-src/wal2json-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### @@ -1095,15 +1374,20 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar. # compile pg_ivm extension # ######################################################################################### -FROM pg-build AS pg-ivm-build +FROM build-deps AS pg_ivm-src ARG PG_VERSION # pg_ivm v1.9 supports v17 # last release v1.9 - Jul 31 +WORKDIR /ext-src RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ - mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_ivm-build +COPY --from=pg_ivm-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_ivm-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control @@ -1113,15 +1397,20 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv # compile pg_partman extension # ######################################################################################### -FROM pg-build AS pg-partman-build +FROM build-deps AS pg_partman-src ARG PG_VERSION # should support v17 https://github.com/pgpartman/pg_partman/discussions/693 # last release 5.1.0 Apr 2, 2024 +WORKDIR /ext-src RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ - mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_partman-build +COPY --from=pg_partman-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_partman-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control @@ -1131,13 +1420,19 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz # compile pg_mooncake extension # ######################################################################################### -FROM rust-extensions-build AS pg-mooncake-build +FROM build-deps AS pg_mooncake-src ARG PG_VERSION - +WORKDIR /ext-src RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \ echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . 
&& \ - make release -j $(getconf _NPROCESSORS_ONLN) && \ + echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ + chmod a+x neon-test.sh + +FROM rust-extensions-build AS pg_mooncake-build +COPY --from=pg_mooncake-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_mooncake-src +RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control @@ -1148,81 +1443,93 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/p # ######################################################################################### -FROM pg-build AS pg-repack-build +FROM build-deps AS pg_repack-src ARG PG_VERSION - +WORKDIR /ext-src RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ - mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build AS pg_repack-build +COPY --from=pg_repack-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_repack-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### # -# Layer "neon-pg-ext-build" +# Layer "neon-ext-build" # compile neon extensions # ######################################################################################### -FROM build-deps AS neon-pg-ext-build +FROM pg-build AS neon-ext-build ARG PG_VERSION -# Public extensions -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=postgis-build /sfcgal/* / -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /h3/usr / -COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=ip4r-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY 
--from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql -COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ - RUN make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_test_utils \ -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ -s install +######################################################################################### +# +# Layer "all-extensions" +# Bundle together all the extensions +# +######################################################################################### +FROM build-deps AS all-extensions +ARG PG_VERSION + +# Public extensions +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=postgis-build /sfcgal/* / +COPY --from=pgrouting-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / +COPY --from=postgresql-unit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgvector-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgjwt-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=ip4r-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=prefix-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hll-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plpgsql_check-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_hint_plan-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgx_ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgx_ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_session_jwt-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rdkit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_embedding-build /usr/local/pgsql/ 
/usr/local/pgsql/ +COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ + +COPY --from=neon-ext-build /usr/local/pgsql/ /usr/local/pgsql/ + ######################################################################################### # # Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries @@ -1302,8 +1609,8 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 # Clean up postgres folder before inclusion # ######################################################################################### -FROM neon-pg-ext-build AS postgres-cleanup-layer -COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql +FROM neon-ext-build AS postgres-cleanup-layer +COPY --from=all-extensions /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp @@ -1332,66 +1639,59 @@ RUN make PG_VERSION="${PG_VERSION}" -C compute ######################################################################################### # -# Layer neon-pg-ext-test +# Layer extension-tests # ######################################################################################### -FROM neon-pg-ext-build AS neon-pg-ext-test +FROM pg-build AS extension-tests ARG PG_VERSION RUN mkdir /ext-src COPY --from=pg-build /postgres /postgres -#COPY --from=postgis-build /postgis.tar.gz /ext-src/ -#COPY --from=postgis-build /sfcgal/* /usr -COPY --from=plv8-build /plv8.tar.gz /ext-src/ -#COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ -COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ -COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ -COPY --from=vector-pg-build /pgvector.patch /ext-src/ -COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src -#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src -COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src -COPY compute/patches/pg_graphql.patch /ext-src -#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src -COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src -COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src -COPY --from=rum-pg-build /rum.tar.gz /ext-src -COPY compute/patches/rum.patch /ext-src -#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src -COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src -COPY --from=prefix-pg-build /prefix.tar.gz /ext-src -COPY --from=hll-pg-build /hll.tar.gz /ext-src -COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src -#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src -COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src +#COPY --from=postgis-src /ext-src/ /ext-src/ +COPY --from=plv8-src /ext-src/ /ext-src/ +#COPY --from=h3-pg-src /ext-src/ /ext-src/ +COPY --from=postgresql-unit-src /ext-src/ /ext-src/ +COPY --from=pgvector-src /ext-src/ /ext-src/ +COPY --from=pgjwt-src /ext-src/ /ext-src/ +#COPY --from=pgrag-src /ext-src/ /ext-src/ +#COPY --from=pg_jsonschema-src /ext-src/ /ext-src/ +COPY --from=pg_graphql-src /ext-src/ /ext-src/ +#COPY 
--from=pg_tiktoken-src /ext-src/ /ext-src/ +COPY --from=hypopg-src /ext-src/ /ext-src/ +COPY --from=pg_hashids-src /ext-src/ /ext-src/ +COPY --from=rum-src /ext-src/ /ext-src/ +#COPY --from=pgtap-src /ext-src/ /ext-src/ +COPY --from=ip4r-src /ext-src/ /ext-src/ +COPY --from=prefix-src /ext-src/ /ext-src/ +COPY --from=hll-src /ext-src/ /ext-src/ +COPY --from=plpgsql_check-src /ext-src/ /ext-src/ +#COPY --from=timescaledb-src /ext-src/ /ext-src/ +COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src -COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src -COPY compute/patches/pg_cron.patch /ext-src -#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src -#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src -COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src -COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src -COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src -#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src -#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src -COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src -COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN cd /ext-src/ && for f in *.tar.gz; \ - do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ - rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ - || exit 1; rm -f $f; done -RUN cd /ext-src/rum-src && patch -p1 <../rum.patch -RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch +COPY --from=pg_cron-src /ext-src/ /ext-src/ +#COPY --from=pgx_ulid-src /ext-src/ /ext-src/ +#COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ +#COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ +#COPY --from=rdkit-src /ext-src/ /ext-src/ +COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ +COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ +COPY --from=pg_semver-src /ext-src/ /ext-src/ +#COPY --from=pg_embedding-src /ext-src/ /ext-src/ +#COPY --from=wal2json-src /ext-src/ /ext-src/ +COPY --from=pg_ivm-src /ext-src/ /ext-src/ +COPY --from=pg_partman-src /ext-src/ /ext-src/ +#COPY --from=pg_mooncake-src /ext-src/ /ext-src/ +#COPY --from=pg_repack-src /ext-src/ /ext-src/ + COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN patch -p1 Date: Tue, 4 Feb 2025 14:49:44 +0100 Subject: [PATCH 59/78] Add support for pgjwt test (#10611) ## Problem We don't currently test pgjwt, while it is based on pg_prove and can be easily added ## Summary of changes The test for pgjwt was added. 
--- docker-compose/docker_compose_test.sh | 1 + docker-compose/ext-src/pgjwt-src/neon-test.sh | 4 ++++ .../ext-src/pgjwt-src/test-upgrade.patch | 15 +++++++++++++++ docker-compose/ext-src/pgjwt-src/test-upgrade.sh | 5 +++++ docker-compose/test_extensions_upgrade.sh | 5 +++-- 5 files changed, 28 insertions(+), 2 deletions(-) create mode 100755 docker-compose/ext-src/pgjwt-src/neon-test.sh create mode 100644 docker-compose/ext-src/pgjwt-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pgjwt-src/test-upgrade.sh diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index e0c537edf37b..c4ff86ab663e 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -52,6 +52,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do if [ $pg_version -ge 16 ]; then docker cp ext-src $TEST_CONTAINER_NAME:/ + docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config diff --git a/docker-compose/ext-src/pgjwt-src/neon-test.sh b/docker-compose/ext-src/pgjwt-src/neon-test.sh new file mode 100755 index 000000000000..95af0be77b1f --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/neon-test.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -ex +cd "$(dirname "${0}")" +pg_prove test.sql \ No newline at end of file diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.patch b/docker-compose/ext-src/pgjwt-src/test-upgrade.patch new file mode 100644 index 000000000000..85b35654800f --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.patch @@ -0,0 +1,15 @@ +diff --git a/test.sql b/test.sql +index d7a0ca8..f15bc76 100644 +--- a/test.sql ++++ b/test.sql +@@ -9,9 +9,7 @@ + \set ON_ERROR_STOP true + \set QUIET 1 + +-CREATE EXTENSION pgcrypto; +-CREATE EXTENSION pgtap; +-CREATE EXTENSION pgjwt; ++CREATE EXTENSION IF NOT EXISTS pgtap; + + BEGIN; + SELECT plan(23); diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh new file mode 100755 index 000000000000..b7158d234000 --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +patch -p1 Date: Tue, 4 Feb 2025 15:01:57 +0100 Subject: [PATCH 60/78] storcon: only allow errrors of the server cert verification (#10644) This PR does a bunch of things: * only allow errors of the server cert verification, not of the TLS handshake. The TLS handshake doesn't cause any errors for us so we can just always require it to be valid. This simplifies the code a little. * As the solution is more permanent than originally anticipated, I think it makes sense to move the `AcceptAll` verifier outside. * log the connstr information. this helps with figuring out which domain names are configured in the connstr, etc. I think it is generally useful to print it. make extra sure that the password is not leaked. 
Follow-up of #10640 --- storage_controller/src/persistence.rs | 138 ++++++++++++++------------ 1 file changed, 72 insertions(+), 66 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 45f3108d6b2b..c4e5b3958912 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -27,7 +27,7 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use rustls::client::danger::ServerCertVerifier; +use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; use rustls::client::WebPkiServerVerifier; use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; @@ -194,6 +194,8 @@ impl Persistence { timeout: Duration, ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); + log_postgres_connstr_info(database_url) + .map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?; loop { match establish_connection_rustls(database_url).await { Ok(_) => { @@ -1281,6 +1283,51 @@ pub(crate) fn load_certs() -> anyhow::Result> { Ok(Arc::new(store)) } +#[derive(Debug)] +/// A verifier that accepts all certificates (but logs an error still) +struct AcceptAll(Arc); +impl ServerCertVerifier for AcceptAll { + fn verify_server_cert( + &self, + end_entity: &rustls::pki_types::CertificateDer<'_>, + intermediates: &[rustls::pki_types::CertificateDer<'_>], + server_name: &rustls::pki_types::ServerName<'_>, + ocsp_response: &[u8], + now: rustls::pki_types::UnixTime, + ) -> Result { + let r = + self.0 + .verify_server_cert(end_entity, intermediates, server_name, ocsp_response, now); + if let Err(err) = r { + tracing::info!( + ?server_name, + "ignoring db connection TLS validation error: {err:?}" + ); + return Ok(ServerCertVerified::assertion()); + } + r + } + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result { + self.0.verify_tls12_signature(message, cert, dss) + } + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result { + self.0.verify_tls13_signature(message, cert, dss) + } + fn supported_verify_schemes(&self) -> Vec { + self.0.supported_verify_schemes() + } +} + /// Loads the root certificates and constructs a client config suitable for connecting. /// This function is blocking. fn client_config_with_root_certs() -> anyhow::Result { @@ -1290,76 +1337,12 @@ fn client_config_with_root_certs() -> anyhow::Result { .expect("ring should support the default protocol versions"); static DO_CERT_CHECKS: std::sync::OnceLock = std::sync::OnceLock::new(); let do_cert_checks = - DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_CERT_CHECKS").is_ok()); + DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_DB_CERT_CHECKS").is_ok()); Ok(if *do_cert_checks { client_config .with_root_certificates(load_certs()?) 
.with_no_client_auth() } else { - use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified}; - #[derive(Debug)] - struct AcceptAll(Arc); - impl ServerCertVerifier for AcceptAll { - fn verify_server_cert( - &self, - end_entity: &rustls::pki_types::CertificateDer<'_>, - intermediates: &[rustls::pki_types::CertificateDer<'_>], - server_name: &rustls::pki_types::ServerName<'_>, - ocsp_response: &[u8], - now: rustls::pki_types::UnixTime, - ) -> Result { - let r = self.0.verify_server_cert( - end_entity, - intermediates, - server_name, - ocsp_response, - now, - ); - if let Err(err) = r { - tracing::info!( - ?server_name, - "ignoring db connection TLS validation error: {err:?}" - ); - return Ok(ServerCertVerified::assertion()); - } - r - } - fn verify_tls12_signature( - &self, - message: &[u8], - cert: &rustls::pki_types::CertificateDer<'_>, - dss: &rustls::DigitallySignedStruct, - ) -> Result - { - let r = self.0.verify_tls12_signature(message, cert, dss); - if let Err(err) = r { - tracing::info!( - "ignoring db connection 1.2 signature TLS validation error: {err:?}" - ); - return Ok(HandshakeSignatureValid::assertion()); - } - r - } - fn verify_tls13_signature( - &self, - message: &[u8], - cert: &rustls::pki_types::CertificateDer<'_>, - dss: &rustls::DigitallySignedStruct, - ) -> Result - { - let r = self.0.verify_tls13_signature(message, cert, dss); - if let Err(err) = r { - tracing::info!( - "ignoring db connection 1.3 signature TLS validation error: {err:?}" - ); - return Ok(HandshakeSignatureValid::assertion()); - } - r - } - fn supported_verify_schemes(&self) -> Vec { - self.0.supported_verify_schemes() - } - } let verifier = AcceptAll( WebPkiServerVerifier::builder_with_provider( load_certs()?, @@ -1389,6 +1372,29 @@ fn establish_connection_rustls(config: &str) -> BoxFuture().unwrap(); + assert!(format!("{has_pw_cfg:?}").contains("specialuser")); + // Ensure that the password is not leaked by the debug impl + assert!(!format!("{has_pw_cfg:?}").contains("NOT ALLOWED TAG")); +} + +fn log_postgres_connstr_info(config_str: &str) -> anyhow::Result<()> { + let config = config_str + .parse::() + .map_err(|_e| anyhow::anyhow!("Couldn't parse config str"))?; + // We use debug formatting here, and use a unit test to ensure that we don't leak the password. + // To make extra sure the test gets ran, run it every time the function is called + // (this is rather cold code, we can afford it). + #[cfg(not(test))] + test_config_debug_censors_password(); + tracing::info!("database connection config: {config:?}"); + Ok(()) +} + /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive( QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq, From dcf335a25195bdd2d8fa8dd5080ca661ea4396e3 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 4 Feb 2025 15:50:53 +0100 Subject: [PATCH 61/78] proxy: Switch proxy to JSON logging (#9857) ## Problem We want to switch proxy and ideally all Rust services to structured JSON logging to support better filtering and cross-referencing with tracing. ## Summary of changes * Introduce a custom tracing-subscriber to write the JSON. In a first attempt a customized tracing::fmt::FmtSubscriber was used, but it's very inefficient and can still generate invalid JSON. It's also doesn't allow us to add important fields to the root object. * Make this opt in: the `LOGFMT` env var can be set to `"json"` to enable to new logger at startup. 
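For illustration, here is a minimal sketch of the technique this patch relies on: a custom `tracing_subscriber::Layer` whose `on_event` serializes each event into one JSON line, installed only when `LOGFMT=json`. This is not the proxy's `JsonLoggingLayer` (which also records span fields, deduplicates field names, reuses thread-local buffers and hooks into OpenTelemetry); the `JsonLines`/`JsonVisitor` names and the exact output keys are illustrative assumptions, and the sketch assumes the `tracing`, `tracing-subscriber`, `serde_json` and `chrono` crates.

```rust
use std::fmt;
use std::io::Write;

use tracing::field::{Field, Visit};
use tracing::{Event, Subscriber};
use tracing_subscriber::layer::{Context, Layer};
use tracing_subscriber::prelude::*;

/// Collects the fields of one event into a JSON map.
#[derive(Default)]
struct JsonVisitor(serde_json::Map<String, serde_json::Value>);

impl Visit for JsonVisitor {
    fn record_str(&mut self, field: &Field, value: &str) {
        self.0.insert(field.name().to_owned(), value.into());
    }
    fn record_i64(&mut self, field: &Field, value: i64) {
        self.0.insert(field.name().to_owned(), value.into());
    }
    fn record_bool(&mut self, field: &Field, value: bool) {
        self.0.insert(field.name().to_owned(), value.into());
    }
    // Everything else falls back to its Debug representation.
    fn record_debug(&mut self, field: &Field, value: &dyn fmt::Debug) {
        self.0
            .insert(field.name().to_owned(), format!("{value:?}").into());
    }
}

/// Emits every event as a single JSON object on its own stderr line.
struct JsonLines;

impl<S: Subscriber> Layer<S> for JsonLines {
    fn on_event(&self, event: &Event<'_>, _ctx: Context<'_, S>) {
        let mut visitor = JsonVisitor::default();
        event.record(&mut visitor);

        let line = serde_json::json!({
            "timestamp": chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
            "level": event.metadata().level().to_string(),
            "target": event.metadata().target(),
            "fields": serde_json::Value::Object(visitor.0),
        });

        // Serialize the full line first so concurrent events never interleave mid-object.
        let mut buf = serde_json::to_vec(&line).unwrap_or_default();
        buf.push(b'\n');
        let _ = std::io::stderr().lock().write_all(&buf);
    }
}

fn main() {
    // Opt in as described above: JSON only when LOGFMT=json,
    // otherwise keep the plain text formatter.
    if std::env::var("LOGFMT").as_deref() == Ok("json") {
        tracing_subscriber::registry().with(JsonLines).init();
    } else {
        tracing_subscriber::registry()
            .with(tracing_subscriber::fmt::layer().with_ansi(false).with_target(false))
            .init();
    }

    tracing::info!(user = "alice", attempts = 2, "connection accepted");
}
```

Serializing the whole object into a buffer and writing it with a single `write_all` is what keeps concurrent events from interleaving mid-line; the real layer takes the same idea further by reusing a per-thread buffer instead of allocating one per event.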
--- Cargo.lock | 55 +++ Cargo.toml | 3 + deny.toml | 1 + proxy/Cargo.toml | 8 + proxy/src/logging.rs | 904 +++++++++++++++++++++++++++++++++++++- workspace_hack/Cargo.toml | 1 + 6 files changed, 964 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0133c8356466..de1b1218cada 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -206,6 +206,16 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "assert-json-diff" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "async-channel" version = "1.9.0" @@ -1010,6 +1020,12 @@ dependencies = [ "generic-array", ] +[[package]] +name = "boxcar" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42" + [[package]] name = "bstr" version = "1.5.0" @@ -2433,6 +2449,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "gettid" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "gimli" version = "0.31.1" @@ -4212,6 +4238,16 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "papaya" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c" +dependencies = [ + "equivalent", + "seize", +] + [[package]] name = "parking" version = "2.1.1" @@ -4839,6 +4875,7 @@ dependencies = [ "ahash", "anyhow", "arc-swap", + "assert-json-diff", "async-compression", "async-trait", "atomic-take", @@ -4846,6 +4883,7 @@ dependencies = [ "aws-sdk-iam", "aws-sigv4", "base64 0.13.1", + "boxcar", "bstr", "bytes", "camino", @@ -4862,6 +4900,7 @@ dependencies = [ "flate2", "framed-websockets", "futures", + "gettid", "hashbrown 0.14.5", "hashlink", "hex", @@ -4884,7 +4923,9 @@ dependencies = [ "measured", "metrics", "once_cell", + "opentelemetry", "p256 0.13.2", + "papaya", "parking_lot 0.12.1", "parquet", "parquet_derive", @@ -4931,6 +4972,9 @@ dependencies = [ "tokio-tungstenite 0.21.0", "tokio-util", "tracing", + "tracing-log", + "tracing-opentelemetry", + "tracing-serde", "tracing-subscriber", "tracing-utils", "try-lock", @@ -5884,6 +5928,16 @@ dependencies = [ "libc", ] +[[package]] +name = "seize" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "semver" version = "1.0.17" @@ -8145,6 +8199,7 @@ dependencies = [ "tower 0.4.13", "tracing", "tracing-core", + "tracing-log", "url", "zerocopy", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 267a91d773c4..76b54ae1d877 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,7 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" backtrace = "0.3.74" flate2 = "1.0.26" +assert-json-diff = "2" async-stream = "0.3" async-trait = "0.1" aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] } @@ -193,7 +194,9 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] } tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" +tracing-log = "0.2" 
tracing-opentelemetry = "0.28" +tracing-serde = "0.2.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } diff --git a/deny.toml b/deny.toml index df00a34c60d6..b55140556801 100644 --- a/deny.toml +++ b/deny.toml @@ -32,6 +32,7 @@ reason = "the marvin attack only affects private key decryption, not public key # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] allow = [ + "0BSD", "Apache-2.0", "BSD-2-Clause", "BSD-3-Clause", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 35574e945cd9..d7880ea7b964 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,6 +19,7 @@ aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true +boxcar = "0.2.8" bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true @@ -42,6 +43,7 @@ hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } +gettid = "0.1.3" indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true @@ -50,6 +52,8 @@ lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true +opentelemetry = { workspace = true, features = ["trace"] } +papaya = "0.1.8" parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true @@ -89,6 +93,9 @@ tokio = { workspace = true, features = ["signal"] } tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +tracing-log.workspace = true +tracing-serde.workspace = true +tracing-opentelemetry.workspace = true try-lock.workspace = true typed-json.workspace = true url.workspace = true @@ -112,6 +119,7 @@ rsa = "0.9" workspace_hack.workspace = true [dev-dependencies] +assert-json-diff.workspace = true camino-tempfile.workspace = true fallible-iterator.workspace = true flate2.workspace = true diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 41f10f052ffa..97c9f5a59c27 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,10 +1,23 @@ -use tracing::Subscriber; +use std::cell::{Cell, RefCell}; +use std::collections::HashMap; +use std::hash::BuildHasher; +use std::{env, io}; + +use chrono::{DateTime, Utc}; +use opentelemetry::trace::TraceContextExt; +use scopeguard::defer; +use serde::ser::{SerializeMap, Serializer}; +use tracing::span; +use tracing::subscriber::Interest; +use tracing::{callsite, Event, Metadata, Span, Subscriber}; +use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; use tracing_subscriber::fmt::time::SystemTime; use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; -use tracing_subscriber::registry::LookupSpan; +use tracing_subscriber::registry::{LookupSpan, SpanRef}; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -15,6 +28,8 @@ use tracing_subscriber::registry::LookupSpan; /// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. 
/// See pub async fn init() -> anyhow::Result { + let logfmt = LogFormat::from_env()?; + let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() @@ -29,17 +44,36 @@ pub async fn init() -> anyhow::Result { .expect("this should be a valid filter directive"), ); - let fmt_layer = tracing_subscriber::fmt::layer() - .with_ansi(false) - .with_writer(std::io::stderr) - .with_target(false); - let otlp_layer = tracing_utils::init_tracing("proxy").await; + let json_log_layer = if logfmt == LogFormat::Json { + Some(JsonLoggingLayer { + clock: RealClock, + skipped_field_indices: papaya::HashMap::default(), + writer: StderrWriter { + stderr: std::io::stderr(), + }, + }) + } else { + None + }; + + let text_log_layer = if logfmt == LogFormat::Text { + Some( + tracing_subscriber::fmt::layer() + .with_ansi(false) + .with_writer(std::io::stderr) + .with_target(false), + ) + } else { + None + }; + tracing_subscriber::registry() .with(env_filter) .with(otlp_layer) - .with(fmt_layer) + .with(json_log_layer) + .with(text_log_layer) .try_init()?; Ok(LoggingGuard) @@ -94,3 +128,857 @@ impl Drop for LoggingGuard { tracing_utils::shutdown_tracing(); } } + +// TODO: make JSON the default +#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)] +enum LogFormat { + #[default] + Text = 1, + Json, +} + +impl LogFormat { + fn from_env() -> anyhow::Result { + let logfmt = env::var("LOGFMT"); + Ok(match logfmt.as_deref() { + Err(_) => LogFormat::default(), + Ok("text") => LogFormat::Text, + Ok("json") => LogFormat::Json, + Ok(logfmt) => anyhow::bail!("unknown log format: {logfmt}"), + }) + } +} + +trait MakeWriter { + fn make_writer(&self) -> impl io::Write; +} + +struct StderrWriter { + stderr: io::Stderr, +} + +impl MakeWriter for StderrWriter { + #[inline] + fn make_writer(&self) -> impl io::Write { + self.stderr.lock() + } +} + +// TODO: move into separate module or even separate crate. +trait Clock { + fn now(&self) -> DateTime; +} + +struct RealClock; + +impl Clock for RealClock { + #[inline] + fn now(&self) -> DateTime { + Utc::now() + } +} + +/// Name of the field used by tracing crate to store the event message. +const MESSAGE_FIELD: &str = "message"; + +thread_local! { + /// Protects against deadlocks and double panics during log writing. + /// The current panic handler will use tracing to log panic information. + static REENTRANCY_GUARD: Cell = const { Cell::new(false) }; + /// Thread-local instance with per-thread buffer for log writing. + static EVENT_FORMATTER: RefCell = RefCell::new(EventFormatter::new()); + /// Cached OS thread ID. + static THREAD_ID: u64 = gettid::gettid(); +} + +/// Implements tracing layer to handle events specific to logging. +struct JsonLoggingLayer { + clock: C, + skipped_field_indices: papaya::HashMap, + writer: W, +} + +impl Layer for JsonLoggingLayer +where + S: Subscriber + for<'a> LookupSpan<'a>, +{ + fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) { + use std::io::Write; + + // TODO: consider special tracing subscriber to grab timestamp very + // early, before OTel machinery, and add as event extension. 
+ let now = self.clock.now(); + + let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { + if entered.get() { + let mut formatter = EventFormatter::new(); + formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + self.writer.make_writer().write_all(formatter.buffer()) + } else { + entered.set(true); + defer!(entered.set(false);); + + EVENT_FORMATTER.with_borrow_mut(move |formatter| { + formatter.reset(); + formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + self.writer.make_writer().write_all(formatter.buffer()) + }) + } + }); + + // In case logging fails we generate a simpler JSON object. + if let Err(err) = res { + if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( { + "timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + "level": "ERROR", + "message": format_args!("cannot log event: {err:?}"), + "fields": { + "event": format_args!("{event:?}"), + }, + })) { + line.push(b'\n'); + self.writer.make_writer().write_all(&line).ok(); + } + } + } + + /// Registers a SpanFields instance as span extension. + fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("span must exist"); + let fields = SpanFields::default(); + fields.record_fields(attrs); + // This could deadlock when there's a panic somewhere in the tracing + // event handling and a read or write guard is still held. This includes + // the OTel subscriber. + span.extensions_mut().insert(fields); + } + + fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("span must exist"); + let ext = span.extensions(); + if let Some(data) = ext.get::() { + data.record_fields(values); + } + } + + /// Called (lazily) whenever a new log call is executed. We quickly check + /// for duplicate field names and record duplicates as skippable. Last one + /// wins. + fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { + if !metadata.is_event() { + // Must not be never because we wouldn't get trace and span data. + return Interest::always(); + } + + let mut field_indices = SkippedFieldIndices::default(); + let mut seen_fields = HashMap::<&'static str, usize>::new(); + for field in metadata.fields() { + use std::collections::hash_map::Entry; + match seen_fields.entry(field.name()) { + Entry::Vacant(entry) => { + // field not seen yet + entry.insert(field.index()); + } + Entry::Occupied(mut entry) => { + // replace currently stored index + let old_index = entry.insert(field.index()); + // ... and append it to list of skippable indices + field_indices.push(old_index); + } + } + } + + if !field_indices.is_empty() { + self.skipped_field_indices + .pin() + .insert(metadata.callsite(), field_indices); + } + + Interest::always() + } +} + +/// Stores span field values recorded during the spans lifetime. +#[derive(Default)] +struct SpanFields { + // TODO: Switch to custom enum with lasso::Spur for Strings? + fields: papaya::HashMap<&'static str, serde_json::Value>, +} + +impl SpanFields { + #[inline] + fn record_fields(&self, fields: R) { + fields.record(&mut SpanFieldsRecorder { + fields: self.fields.pin(), + }); + } +} + +/// Implements a tracing field visitor to convert and store values. 
+struct SpanFieldsRecorder<'m, S, G> { + fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>, +} + +impl tracing::field::Visit for SpanFieldsRecorder<'_, S, G> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if let Ok(value) = i64::try_from(value) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } else { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if let Ok(value) = u64::try_from(value) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } else { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value:?}"))); + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } +} + +/// List of field indices skipped during logging. Can list duplicate fields or +/// metafields not meant to be logged. +#[derive(Clone, Default)] +struct SkippedFieldIndices { + bits: u64, +} + +impl SkippedFieldIndices { + #[inline] + fn is_empty(&self) -> bool { + self.bits == 0 + } + + #[inline] + fn push(&mut self, index: usize) { + self.bits |= 1u64 + .checked_shl(index as u32) + .expect("field index too large"); + } + + #[inline] + fn contains(&self, index: usize) -> bool { + self.bits + & 1u64 + .checked_shl(index as u32) + .expect("field index too large") + != 0 + } +} + +/// Formats a tracing event and writes JSON to its internal buffer including a newline. 
+// TODO: buffer capacity management, truncate if too large +struct EventFormatter { + logline_buffer: Vec, +} + +impl EventFormatter { + #[inline] + fn new() -> Self { + EventFormatter { + logline_buffer: Vec::new(), + } + } + + #[inline] + fn buffer(&self) -> &[u8] { + &self.logline_buffer + } + + #[inline] + fn reset(&mut self) { + self.logline_buffer.clear(); + } + + fn format( + &mut self, + now: DateTime, + event: &Event<'_>, + ctx: &Context<'_, S>, + skipped_field_indices: &papaya::HashMap, + ) -> io::Result<()> + where + S: Subscriber + for<'a> LookupSpan<'a>, + { + let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true); + + use tracing_log::NormalizeEvent; + let normalized_meta = event.normalized_metadata(); + let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata()); + + let skipped_field_indices = skipped_field_indices.pin(); + let skipped_field_indices = skipped_field_indices.get(&meta.callsite()); + + let mut serialize = || { + let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer); + + let mut serializer = serializer.serialize_map(None)?; + + // Timestamp comes first, so raw lines can be sorted by timestamp. + serializer.serialize_entry("timestamp", ×tamp)?; + + // Level next. + serializer.serialize_entry("level", &meta.level().as_str())?; + + // Message next. + serializer.serialize_key("message")?; + let mut message_extractor = + MessageFieldExtractor::new(serializer, skipped_field_indices); + event.record(&mut message_extractor); + let mut serializer = message_extractor.into_serializer()?; + + let mut fields_present = FieldsPresent(false, skipped_field_indices); + event.record(&mut fields_present); + if fields_present.0 { + serializer.serialize_entry( + "fields", + &SerializableEventFields(event, skipped_field_indices), + )?; + } + + let pid = std::process::id(); + if pid != 1 { + serializer.serialize_entry("process_id", &pid)?; + } + + THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?; + + // TODO: tls cache? name could change + if let Some(thread_name) = std::thread::current().name() { + if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" { + serializer.serialize_entry("thread_name", thread_name)?; + } + } + + if let Some(task_id) = tokio::task::try_id() { + serializer.serialize_entry("task_id", &format_args!("{task_id}"))?; + } + + serializer.serialize_entry("target", meta.target())?; + + if let Some(module) = meta.module_path() { + if module != meta.target() { + serializer.serialize_entry("module", module)?; + } + } + + if let Some(file) = meta.file() { + if let Some(line) = meta.line() { + serializer.serialize_entry("src", &format_args!("{file}:{line}"))?; + } else { + serializer.serialize_entry("src", file)?; + } + } + + { + let otel_context = Span::current().context(); + let otel_spanref = otel_context.span(); + let span_context = otel_spanref.span_context(); + if span_context.is_valid() { + serializer.serialize_entry( + "trace_id", + &format_args!("{}", span_context.trace_id()), + )?; + } + } + + serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?; + + serializer.end() + }; + + serialize().map_err(io::Error::other)?; + self.logline_buffer.push(b'\n'); + Ok(()) + } +} + +/// Extracts the message field that's mixed will other fields. 
+struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> { + serializer: S, + skipped_field_indices: Option<&'a SkippedFieldIndices>, + state: Option>, +} + +impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> { + #[inline] + fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + Self { + serializer, + skipped_field_indices, + state: None, + } + } + + #[inline] + fn into_serializer(mut self) -> Result { + match self.state { + Some(Ok(())) => {} + Some(Err(err)) => return Err(err), + None => self.serializer.serialize_value("")?, + } + Ok(self.serializer) + } + + #[inline] + fn accept_field(&self, field: &tracing::field::Field) -> bool { + self.state.is_none() + && field.name() == MESSAGE_FIELD + && !self + .skipped_field_indices + .is_some_and(|i| i.contains(field.index())) + } +} + +impl tracing::field::Visit for MessageFieldExtractor<'_, S> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}"))); + } + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}"))); + } + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value}"))); + } + } +} + +/// Checks if there's any fields and field values present. If not, the JSON subobject +/// can be skipped. +// This is entirely optional and only cosmetic, though maybe helps a +// bit during log parsing in dashboards when there's no field with empty object. +struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>); + +// Even though some methods have an overhead (error, bytes) it is assumed the +// compiler won't include this since we ignore the value entirely. 
+impl tracing::field::Visit for FieldsPresent<'_> { + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) { + if !self.1.is_some_and(|i| i.contains(field.index())) + && field.name() != MESSAGE_FIELD + && !field.name().starts_with("log.") + { + self.0 |= true; + } + } +} + +/// Serializes the fields directly supplied with a log event. +struct SerializableEventFields<'a, 'event>( + &'a tracing::Event<'event>, + Option<&'a SkippedFieldIndices>, +); + +impl serde::ser::Serialize for SerializableEventFields<'_, '_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + use serde::ser::SerializeMap; + let serializer = serializer.serialize_map(None)?; + let mut message_skipper = MessageFieldSkipper::new(serializer, self.1); + self.0.record(&mut message_skipper); + let serializer = message_skipper.into_serializer()?; + serializer.end() + } +} + +/// A tracing field visitor that skips the message field. +struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> { + serializer: S, + skipped_field_indices: Option<&'a SkippedFieldIndices>, + state: Result<(), S::Error>, +} + +impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> { + #[inline] + fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + Self { + serializer, + skipped_field_indices, + state: Ok(()), + } + } + + #[inline] + fn accept_field(&self, field: &tracing::field::Field) -> bool { + self.state.is_ok() + && field.name() != MESSAGE_FIELD + && !field.name().starts_with("log.") + && !self + .skipped_field_indices + .is_some_and(|i| i.contains(field.index())) + } + + #[inline] + fn into_serializer(self) -> Result { + self.state?; + Ok(self.serializer) + } +} + +impl tracing::field::Visit for MessageFieldSkipper<'_, S> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + if self.accept_field(field) { + self.state = self + .serializer + .serialize_entry(field.name(), &format_args!("{value:x?}")); + } + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + if self.accept_field(field) { 
+ self.state = self + .serializer + .serialize_entry(field.name(), &format_args!("{value:?}")); + } + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + if self.accept_field(field) { + self.state = self.serializer.serialize_value(&format_args!("{value}")); + } + } +} + +/// Serializes the span stack from root to leaf (parent of event) enumerated +/// inside an object where the keys are just the number padded with zeroes +/// to retain sorting order. +// The object is necessary because Loki cannot flatten arrays. +struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>) +where + Span: Subscriber + for<'lookup> LookupSpan<'lookup>; + +impl serde::ser::Serialize for SerializableSpanStack<'_, '_, Span> +where + Span: Subscriber + for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: Ser) -> Result + where + Ser: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + + if let Some(leaf_span) = self.0.lookup_current() { + for (i, span) in leaf_span.scope().from_root().enumerate() { + serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?; + } + } + + serializer.end() + } +} + +/// Serializes a single span. Include the span ID, name and its fields as +/// recorded up to this point. +struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>) +where + Span: for<'lookup> LookupSpan<'lookup>; + +impl serde::ser::Serialize for SerializableSpan<'_, '_, Span> +where + Span: for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: Ser) -> Result + where + Ser: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + // TODO: the span ID is probably only useful for debugging tracing. 
+ serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?; + serializer.serialize_entry("span_name", self.0.metadata().name())?; + + let ext = self.0.extensions(); + if let Some(data) = ext.get::() { + for (key, value) in &data.fields.pin() { + serializer.serialize_entry(key, value)?; + } + } + + serializer.end() + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use std::sync::{Arc, Mutex, MutexGuard}; + + use assert_json_diff::assert_json_eq; + use tracing::info_span; + + use super::*; + + struct TestClock { + current_time: Mutex>, + } + + impl Clock for Arc { + fn now(&self) -> DateTime { + *self.current_time.lock().expect("poisoned") + } + } + + struct VecWriter<'a> { + buffer: MutexGuard<'a, Vec>, + } + + impl MakeWriter for Arc>> { + fn make_writer(&self) -> impl io::Write { + VecWriter { + buffer: self.lock().expect("poisoned"), + } + } + } + + impl io::Write for VecWriter<'_> { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.buffer.write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } + } + + #[test] + fn test_field_collection() { + let clock = Arc::new(TestClock { + current_time: Mutex::new(Utc::now()), + }); + let buffer = Arc::new(Mutex::new(Vec::new())); + let log_layer = JsonLoggingLayer { + clock: clock.clone(), + skipped_field_indices: papaya::HashMap::default(), + writer: buffer.clone(), + }; + + let registry = tracing_subscriber::Registry::default().with(log_layer); + + tracing::subscriber::with_default(registry, || { + info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| { + info_span!("span2").in_scope(|| { + tracing::error!( + a = 1, + a = 2, + a = 3, + message = "explicit message field", + "implicit message field" + ); + }); + }); + }); + + let buffer = Arc::try_unwrap(buffer) + .expect("no other reference") + .into_inner() + .expect("poisoned"); + let actual: serde_json::Value = serde_json::from_slice(&buffer).expect("valid JSON"); + let expected: serde_json::Value = serde_json::json!( + { + "timestamp": clock.now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + "level": "ERROR", + "message": "explicit message field", + "fields": { + "a": 3, + }, + "spans": { + "00":{ + "span_id": "0000000000000001", + "span_name": "span1", + "x": 42, + }, + "01": { + "span_id": "0000000000000002", + "span_name": "span2", + } + }, + "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(), + "target": "proxy::logging::tests", + "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(), + "thread_id": actual.as_object().unwrap().get("thread_id").unwrap().as_number().unwrap(), + "thread_name": "logging::tests::test_field_collection", + } + ); + + assert_json_eq!(actual, expected); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index a3dffa8f195a..2c65401154a8 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -92,6 +92,7 @@ tonic = { version = "0.12", default-features = false, features = ["codegen", "pr tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-log = { version = "0.2" } url = { version = "2", features = ["serde"] } zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["derive", "serde"] } From 06090bbccdf0d2b89f6a355afd42b352fadd40d6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 4 Feb 2025 
15:55:11 +0100 Subject: [PATCH 62/78] pageserver: log critical error on `ClearVmBits` for unknown pages (#10634) ## Problem In #9895, we fixed some issues where `ClearVmBits` were broadcast to all shards, even those not owning the VM relation. As part of that, we found some ancient code from #1417, which discarded spurious incorrect `ClearVmBits` records for pages outside of the VM relation. We added observability in #9911 to see how often this actually happens in the wild. After two months, we have not seen this happen once in production or staging. However, out of caution, we don't want a hard error and break WAL ingestion. Resolves #10067. ## Summary of changes Log a critical error when ingesting `ClearVmBits` for unknown VM relations or pages. --- pageserver/src/metrics.rs | 7 --- pageserver/src/walingest.rs | 117 +++++++++++++++--------------------- 2 files changed, 49 insertions(+), 75 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index f9edf8855333..48aed7082634 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2379,7 +2379,6 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, - pub(crate) clear_vm_bits_unknown: IntCounterVec, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { @@ -2414,12 +2413,6 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { "Total number of zero gap blocks written on relation extends" ) .expect("failed to define a metric"), - clear_vm_bits_unknown: register_int_counter_vec!( - "pageserver_wal_ingest_clear_vm_bits_unknown", - "Number of ignored ClearVmBits operations due to unknown pages/relations", - &["entity"], - ) - .expect("failed to define a metric"), } }); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e0283d99e0fb..04edb3e3f47b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -28,17 +28,9 @@ use std::time::Duration; use std::time::Instant; use std::time::SystemTime; -use pageserver_api::shard::ShardIdentity; -use postgres_ffi::fsm_logical_to_physical; -use postgres_ffi::walrecord::*; -use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; -use wal_decoder::models::*; - use anyhow::{bail, Result}; use bytes::{Buf, Bytes}; use tracing::*; -use utils::failpoint_support; -use utils::rate_limit::RateLimit; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; @@ -50,11 +42,18 @@ use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use postgres_ffi::fsm_logical_to_physical; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::walrecord::*; use postgres_ffi::TransactionId; +use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; +use utils::rate_limit::RateLimit; +use utils::{critical, failpoint_support}; +use wal_decoder::models::*; enum_pgversion! 
{CheckPoint, pgv::CheckPoint} @@ -327,93 +326,75 @@ impl WalIngest { let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - // Sometimes, Postgres seems to create heap WAL records with the - // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is - // not set. In fact, it's possible that the VM page does not exist at all. - // In that case, we don't want to store a record to clear the VM bit; - // replaying it would fail to find the previous image of the page, because - // it doesn't exist. So check if the VM page(s) exist, and skip the WAL - // record if it doesn't. - // - // TODO: analyze the metrics and tighten this up accordingly. This logic - // implicitly assumes that VM pages see explicit WAL writes before - // implicit ClearVmBits, and will otherwise silently drop updates. + // VM bits can only be cleared on the shard(s) owning the VM relation, and must be within + // its view of the VM relation size. Out of caution, error instead of failing WAL ingestion, + // as there has historically been cases where PostgreSQL has cleared spurious VM pages. See: + // https://github.com/neondatabase/neon/pull/10634. let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["relation"]) - .inc(); + critical!("clear_vm_bits for unknown VM relation {vm_rel}"); return Ok(()); }; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["new_page"]) - .inc(); + critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); new_vm_blk = None; } } if let Some(blknum) = old_vm_blk { if blknum >= vm_size { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["old_page"]) - .inc(); + critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); old_vm_blk = None; } } - if new_vm_blk.is_some() || old_vm_blk.is_some() { - if new_vm_blk == old_vm_blk { - // An UPDATE record that needs to clear the bits for both old and the - // new page, both of which reside on the same VM page. + if new_vm_blk.is_none() && old_vm_blk.is_none() { + return Ok(()); + } else if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the new page, both of + // which reside on the same VM page. + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk.unwrap(), + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + }, + ctx, + ) + .await?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on different VM pages. + if let Some(new_vm_blk) = new_vm_blk { self.put_rel_wal_record( modification, vm_rel, - new_vm_blk.unwrap(), + new_vm_blk, NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, + old_heap_blkno: None, + flags, + }, + ctx, + ) + .await?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + old_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, old_heap_blkno, flags, }, ctx, ) .await?; - } else { - // Clear VM bits for one heap page, or for two pages that reside on - // different VM pages. 
- if let Some(new_vm_blk) = new_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - new_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno: None, - flags, - }, - ctx, - ) - .await?; - } - if let Some(old_vm_blk) = old_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - old_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno: None, - old_heap_blkno, - flags, - }, - ctx, - ) - .await?; - } } } - Ok(()) } From cab60b6d9f97983ac8cbd904982cecd67edd2d92 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 4 Feb 2025 11:11:31 -0500 Subject: [PATCH 63/78] fix(pagesever): stablize gc-compaction tests (#10621) ## Problem Hopefully this can resolve https://github.com/neondatabase/neon/issues/10517. The reason why the test is flaky is that after restart the compute node might write some data so that the pageserver flush some layers, and in the end, causing L0 compaction to run, and we cannot get the test scenario as we want. ## Summary of changes Ensure all L0 layers are compacted before starting the test. Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 763a63c2e56e..c031d66dfbc2 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -250,6 +250,9 @@ def compaction_finished(): workload.churn_rows(row_count, env.pageserver.id) # compact 3 times if mode is before_restart n_compactions = 3 if compaction_mode == "before_restart" else 1 + ps_http.timeline_compact( + tenant_id, timeline_id, force_l0_compaction=True, wait_until_uploaded=True + ) for _ in range(n_compactions): # Force refresh gc info to have gc_cutoff generated ps_http.timeline_gc(tenant_id, timeline_id, None) From f9009d6b80f5351454eabdc3df5cbd137f95d5e2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 4 Feb 2025 17:52:54 +0000 Subject: [PATCH 64/78] pageserver: write heatmap to disk after uploading it (#10650) ## Problem We wish to make heatmap generation additive in https://github.com/neondatabase/neon/pull/10597. However, if the pageserver restarts and has a heatmap on disk from when it was a secondary long ago, we can end up keeping extra layers on the secondary's disk. ## Summary of changes Persist the heatmap after a successful upload. 
--- .../src/tenant/secondary/heatmap_uploader.rs | 22 ++++++++++++++++--- test_runner/fixtures/neon_fixtures.py | 5 +++++ .../regress/test_pageserver_secondary.py | 10 ++++++--- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index c5e5e0494571..d72c33736954 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -9,13 +9,14 @@ use crate::{ metrics::SECONDARY_MODE, tenant::{ config::AttachmentMode, - mgr::GetTenantError, - mgr::TenantManager, + mgr::{GetTenantError, TenantManager}, remote_timeline_client::remote_heatmap_path, span::debug_assert_current_span_has_tenant_id, tasks::{warn_when_period_overrun, BackgroundLoopKind}, Tenant, }, + virtual_file::VirtualFile, + TEMP_FILE_SUFFIX, }; use futures::Future; @@ -32,7 +33,10 @@ use super::{ }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; -use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; +use utils::{ + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, + yielding_loop::yielding_loop, +}; pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, @@ -461,6 +465,18 @@ async fn upload_tenant_heatmap( } } + // After a successful upload persist the fresh heatmap to disk. + // When restarting, the tenant will read the heatmap from disk + // and additively generate a new heatmap (see [`Timeline::generate_heatmap`]). + // If the heatmap is stale, the additive generation can lead to keeping previously + // evicted timelines on the secondarie's disk. + let tenant_shard_id = tenant.get_tenant_shard_id(); + let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id); + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); + if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await { + tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}"); + } + tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8909f7f2493c..7c4991ffabda 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2766,6 +2766,11 @@ def read_tenant_location_conf( log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}") raise + def heatmap_content(self, tenant_shard_id: TenantId | TenantShardId) -> Any: + path = self.tenant_dir(tenant_shard_id) / "heatmap-v1.json" + with open(path) as f: + return json.load(f) + def tenant_create( self, tenant_id: TenantId, diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 1292682f9e3d..590093d23c37 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -443,7 +443,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): workload.write_rows(256, env.pageservers[0].id) env.pageserver.http_client().tenant_heatmap_upload(tenant_id) - def validate_heatmap(heatmap): + def validate_heatmap(heatmap, on_disk_heatmap): assert len(heatmap["timelines"]) == 1 assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id) assert len(heatmap["timelines"][0]["layers"]) > 0 @@ -452,10 +452,13 @@ def validate_heatmap(heatmap): # 
Each layer appears at most once assert len(set(layer["name"] for layer in layers)) == len(layers) + assert heatmap == on_disk_heatmap + # Download and inspect the heatmap that the pageserver uploaded heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_first_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_first}") - validate_heatmap(heatmap_first) + validate_heatmap(heatmap_first, heatmap_first_on_disk) # Do some more I/O to generate more layers workload.churn_rows(64, env.pageservers[0].id) @@ -463,9 +466,10 @@ def validate_heatmap(heatmap): # Ensure that another heatmap upload includes the new layers heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_second_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_second}") assert heatmap_second != heatmap_first - validate_heatmap(heatmap_second) + validate_heatmap(heatmap_second, heatmap_second_on_disk) def list_elegible_layers( From 472007dd7ce5c5b861b29870a852ad7c99e1ea01 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Tue, 4 Feb 2025 19:58:02 +0100 Subject: [PATCH 65/78] ci: unify Dockerfiles, set bash as SHELL for debian layers, make cpan step as separate RUN (#10645) ## Problem Ref: https://github.com/neondatabase/cloud/issues/23461 and follow-up after: https://github.com/neondatabase/neon/pull/10553 we used `echo` to set-up `.wgetrc` and `.curlrc`, and there we used `\n` to make these multiline configs with one echo command. The problem is that Debian `/bin/sh`'s built-in echo command behaves differently from the `/bin/echo` executable and from the `echo` built-in in `bash`. Namely, it does not support the`-e` option, and while it does treat `\n` as a newline, passing `-e` here will add that `-e` to the output. At the same time, when we use different base images, for example `alpine/curl`, their `/bin/sh` supports and requires `-e` for treating escape sequences like `\n`. But having different `echo` and remembering difference in their behaviour isn't best experience for the developer and makes bad experience maintaining Dockerfiles. Work-arounds: - Explicitly use `/bin/bash` (like in this PR) - Use `/bin/echo` instead of the shell's built-in echo function - Use printf "foo\n" instead of echo -e "foo\n" ## Summary of changes 1. To fix that, we process with the option setting `/bin/bash` as a SHELL for the debian-baysed layers 2. With no changes for `alpine/curl` based layers. 3. And one more change here: in `extensions` layer split to the 2 steps: installing dependencies from `CPAN` and installing `lcov` from github, so upgrading `lcov` could reuse previous layer with installed cpan modules. --- build-tools.Dockerfile | 22 +++++++++++++++++----- compute/compute-node.Dockerfile | 10 +++++++++- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index f744b44808c6..3ade57b175b3 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -3,8 +3,13 @@ ARG DEBIAN_VERSION=bookworm FROM debian:bookworm-slim AS pgcopydb_builder ARG DEBIAN_VERSION +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + +# By default, /bin/sh used in debian images will treat '\n' as eol, +# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. 
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ @@ -55,7 +60,8 @@ ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home -SHELL ["/bin/bash", "-c"] +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] RUN mkdir -p /pgcopydb/bin && \ mkdir -p /pgcopydb/lib && \ @@ -66,7 +72,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc # System deps @@ -190,8 +196,14 @@ RUN set -e \ # It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) # And patches from us: # - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) -RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ - && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ +RUN set +o pipefail && \ + for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \ + yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\ + done && \ + set -o pipefail +# Split into separate step to debug flaky failures here +RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \ && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ && cd lcov \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index ea29630001a4..9379856eabff 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -96,8 +96,10 @@ ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] +# By default, /bin/sh used in debian images will treat '\n' as eol, +# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. 
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN case $DEBIAN_VERSION in \ @@ -1068,6 +1070,7 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot +# See comment on the top of the file regading `echo` and `\n` RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ @@ -1584,6 +1587,7 @@ FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters ARG TARGETARCH # Keep sql_exporter version same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py +# See comment on the top of the file regading `echo`, `-e` and `\n` RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \ if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ @@ -1700,6 +1704,10 @@ ENV PGDATABASE=postgres ######################################################################################### FROM debian:$DEBIAN_FLAVOR ARG DEBIAN_VERSION + +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ From 47975d06d98bde792b61cf1d1d397567f16b0b49 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 5 Feb 2025 12:41:09 +0000 Subject: [PATCH 66/78] storcon: silence cplane 404s on tenant creation (#10665) ## Problem We get WARN log noise on tenant creations. Cplane creates tenants via /location_config. That returns the attached locations in the response and spawns a reconciliation which will also attempt to notify cplane. If the notification is attempted before cplane persists the shards to its database, storcon gets back a 404. The situation is harmless, but annoying. ## Summary of Changes * Add a tenant creation hint to the reconciler config * If the hint is true and we get back a 404 on the notification from cplane, ignore the error, but still queue the reconcile up for a retry. 
Closes https://github.com/neondatabase/cloud/issues/20732 --- storage_controller/src/compute_hook.rs | 2 +- storage_controller/src/reconciler.rs | 48 ++++++++++++++++++++++---- storage_controller/src/service.rs | 7 +++- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 3884a6df4601..5bc3c81f02c8 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -225,7 +225,7 @@ pub(crate) enum NotifyError { // We shutdown while sending #[error("Shutting down")] ShuttingDown, - // A response indicates we will never succeed, such as 400 or 404 + // A response indicates we will never succeed, such as 400 or 403 #[error("Non-retryable error {0}")] Fatal(StatusCode), diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 03db94726315..58bc0ba1cd86 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -115,6 +115,15 @@ impl ReconcilerConfigBuilder { } } + pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self { + Self { + config: ReconcilerConfig { + tenant_creation_hint: hint, + ..self.config + }, + } + } + pub(crate) fn build(self) -> ReconcilerConfig { self.config } @@ -129,6 +138,10 @@ pub(crate) struct ReconcilerConfig { // During live migrations this is the amount of time that // the pagserver will hold our poll. secondary_download_request_timeout: Option, + + // A hint indicating whether this reconciliation is done on the + // creation of a new tenant. This only informs logging behaviour. + tenant_creation_hint: bool, } impl ReconcilerConfig { @@ -143,6 +156,10 @@ impl ReconcilerConfig { self.secondary_download_request_timeout .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT) } + + pub(crate) fn tenant_creation_hint(&self) -> bool { + self.tenant_creation_hint + } } /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O @@ -934,16 +951,35 @@ impl Reconciler { ) .await; if let Err(e) = &result { + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it + // needs to retry at some point. + self.compute_notify_failure = true; + // It is up to the caller whether they want to drop out on this error, but they don't have to: // in general we should avoid letting unavailability of the cloud control plane stop us from // making progress. - if !matches!(e, NotifyError::ShuttingDown) { - tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + match e { + // 404s from cplane during tenant creation are expected. + // Cplane only persists the shards to the database after + // creating the tenant and the timeline. If we notify before + // that, we'll get a 404. + // + // This is fine because tenant creations happen via /location_config + // and that returns the list of locations in the response. Hence, we + // silence the error and return Ok(()) here. Reconciliation will still + // be retried because we set [`Reconciler::compute_notify_failure`] above. + NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND) + if self.reconciler_config.tenant_creation_hint() => + { + return Ok(()); + } + NotifyError::ShuttingDown => {} + _ => { + tracing::warn!( + "Failed to notify compute of attached pageserver {node}: {e}" + ); + } } - - // Set this flag so that in our ReconcileResult we will set the flag on the shard that it - // needs to retry at some point. 
- self.compute_notify_failure = true; } result } else { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 9ac9ee17cad4..4028cd702343 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2238,9 +2238,14 @@ impl Service { let waiters = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); + let config = ReconcilerConfigBuilder::new() + .tenant_creation_hint(true) + .build(); tenants .range_mut(TenantShardId::tenant_range(tenant_id)) - .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) + .filter_map(|(_shard_id, shard)| { + self.maybe_configured_reconcile_shard(shard, nodes, config) + }) .collect::>() }; From f07119cca798e24745b4eda21bde8c2376a22952 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 5 Feb 2025 15:33:04 +0100 Subject: [PATCH 67/78] pageserver: add `pageserver_wal_ingest_values_committed` metric (#10653) ## Problem We don't have visibility into the ratio of image vs. delta pages ingested in Pageservers. This might be useful to determine whether we should compress WAL records before storing them, which in turn might make compaction more efficient. ## Summary of changes Add `pageserver_wal_ingest_values_committed` metric with dimensions `class=metadata|data` and `kind=image|delta`. --- pageserver/src/metrics.rs | 35 +++++++++++++++++++ pageserver/src/pgdatadir_mapping.rs | 31 +++++++++++++++- .../walreceiver/walreceiver_connection.rs | 25 +++++++++---- 3 files changed, 84 insertions(+), 7 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 48aed7082634..6ab1178a7bd0 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -32,6 +32,7 @@ use utils::id::TimelineId; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; +use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; @@ -2378,10 +2379,40 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_observed: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, + pub(crate) values_committed_metadata_images: IntCounter, + pub(crate) values_committed_metadata_deltas: IntCounter, + pub(crate) values_committed_data_images: IntCounter, + pub(crate) values_committed_data_deltas: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, } +impl WalIngestMetrics { + pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) { + if stats.metadata_images > 0 { + self.values_committed_metadata_images + .inc_by(stats.metadata_images); + } + if stats.metadata_deltas > 0 { + self.values_committed_metadata_deltas + .inc_by(stats.metadata_deltas); + } + if stats.data_images > 0 { + self.values_committed_data_images.inc_by(stats.data_images); + } + if stats.data_deltas > 0 { + self.values_committed_data_deltas.inc_by(stats.data_deltas); + } + } +} + pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { + let values_committed = register_int_counter_vec!( + "pageserver_wal_ingest_values_committed", + "Number of values committed to pageserver storage from WAL records", + &["class", "kind"], + ) + .expect("failed to define a metric"); + WalIngestMetrics { bytes_received: register_int_counter!( "pageserver_wal_ingest_bytes_received", @@ -2408,6 +2439,10 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { "Number of WAL records filtered out 
due to sharding" ) .expect("failed to define a metric"), + values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]), + values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]), + values_committed_data_images: values_committed.with_label_values(&["data", "image"]), + values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]), gap_blocks_zeroed_on_rel_extend: register_int_counter!( "pageserver_gap_blocks_zeroed_on_rel_extend", "Total number of zero gap blocks written on relation extends" diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 40c657524dff..dcbf62b56c6c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -48,7 +48,7 @@ use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; -use wal_decoder::serialized_batch::SerializedValueBatch; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -1297,6 +1297,26 @@ impl DatadirModification<'_> { .is_some_and(|b| b.has_data()) } + /// Returns statistics about the currently pending modifications. + pub(crate) fn stats(&self) -> DatadirModificationStats { + let mut stats = DatadirModificationStats::default(); + for (_, _, value) in self.pending_metadata_pages.values().flatten() { + match value { + Value::Image(_) => stats.metadata_images += 1, + Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1, + Value::WalRecord(_) => stats.metadata_deltas += 1, + } + } + for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) { + match valuemeta { + ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1, + ValueMeta::Serialized(_) => stats.data_deltas += 1, + ValueMeta::Observed(_) => {} + } + } + stats + } + /// Set the current lsn pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { ensure!( @@ -2317,6 +2337,15 @@ impl DatadirModification<'_> { } } +/// Statistics for a DatadirModification. +#[derive(Default)] +pub struct DatadirModificationStats { + pub metadata_images: u64, + pub metadata_deltas: u64, + pub data_images: u64, + pub data_deltas: u64, +} + /// This struct facilitates accessing either a committed key from the timeline at a /// specific LSN, or the latest uncommitted key from a pending modification. /// diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d69e7dbd3245..de917377cb54 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -355,6 +355,19 @@ pub(super) async fn handle_walreceiver_connection( // advances it to its end LSN. 0 is just an initialization placeholder. 
let mut modification = timeline.begin_modification(Lsn(0)); + async fn commit( + modification: &mut DatadirModification<'_>, + ctx: &RequestContext, + uncommitted: &mut u64, + ) -> anyhow::Result<()> { + let stats = modification.stats(); + modification.commit(ctx).await?; + WAL_INGEST.records_committed.inc_by(*uncommitted); + WAL_INGEST.inc_values_committed(&stats); + *uncommitted = 0; + Ok(()) + } + if !records.is_empty() { timeline .metrics @@ -366,8 +379,7 @@ pub(super) async fn handle_walreceiver_connection( if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 { - modification.commit(&ctx).await?; - uncommitted_records = 0; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } let local_next_record_lsn = interpreted.next_record_lsn; @@ -396,8 +408,7 @@ pub(super) async fn handle_walreceiver_connection( || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { - modification.commit(&ctx).await?; - uncommitted_records = 0; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } } @@ -415,7 +426,7 @@ pub(super) async fn handle_walreceiver_connection( if uncommitted_records > 0 || needs_last_record_lsn_advance { // Commit any uncommitted records - modification.commit(&ctx).await?; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } if !caught_up && streaming_lsn >= end_of_wal { @@ -442,10 +453,12 @@ pub(super) async fn handle_walreceiver_connection( filtered: &mut u64, ctx: &RequestContext, ) -> anyhow::Result<()> { + let stats = modification.stats(); + modification.commit(ctx).await?; WAL_INGEST .records_committed .inc_by(*uncommitted - *filtered); - modification.commit(ctx).await?; + WAL_INGEST.inc_values_committed(&stats); *uncommitted = 0; *filtered = 0; Ok(()) From ebc55e6ae87723a95303e62e9e7b16dae218676c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 5 Feb 2025 08:58:33 -0600 Subject: [PATCH 68/78] Fix logic for checking if a compute can install a remote extension (#10656) Given a remote extensions manifest of the following: ```json { "public_extensions": [], "custom_extensions": null, "library_index": { "pg_search": "pg_search" }, "extension_data": { "pg_search": { "control_data": { "pg_search.control": "comment = 'pg_search: Full text search for PostgreSQL using BM25'\ndefault_version = '0.14.1'\nmodule_pathname = '$libdir/pg_search'\nrelocatable = false\nsuperuser = true\nschema = paradedb\ntrusted = true\n" }, "archive_path": "13117844657/v14/extensions/pg_search.tar.zst" } } } ``` We were allowing a compute to install a remote extension that wasn't listed in either public_extensions or custom_extensions. Signed-off-by: Tristan Partin --- libs/compute_api/src/spec.rs | 108 ++++++++++++++++-- .../regress/test_download_extensions.py | 2 + 2 files changed, 102 insertions(+), 8 deletions(-) diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index b3f18dc6da2c..2fc95c47c616 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -204,14 +204,16 @@ impl RemoteExtSpec { // Check if extension is present in public or custom. // If not, then it is not allowed to be used by this compute. 
- if let Some(public_extensions) = &self.public_extensions { - if !public_extensions.contains(&real_ext_name.to_string()) { - if let Some(custom_extensions) = &self.custom_extensions { - if !custom_extensions.contains(&real_ext_name.to_string()) { - return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); - } - } - } + if !self + .public_extensions + .as_ref() + .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + && !self + .custom_extensions + .as_ref() + .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + { + return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); } match self.extension_data.get(real_ext_name) { @@ -340,6 +342,96 @@ mod tests { use super::*; use std::fs::File; + #[test] + fn allow_installing_remote_extensions() { + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": null, + "custom_extensions": null, + "library_index": {}, + "extension_data": {}, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": null, + "library_index": {}, + "extension_data": {}, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": [], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": ["ext"], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": ["ext"], + "custom_extensions": [], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + } + #[test] fn parse_spec_file() { let file = File::open("tests/cluster_spec.json").unwrap(); diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index d7e6e9de5642..7f12c140731c 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -95,6 +95,8 @@ def endpoint_handler_build_tag(request: Request) -> Response: # mock remote_extensions spec spec: dict[str, Any] = { + "public_extensions": ["anon"], + "custom_extensions": None, "library_index": { "anon": "anon", }, From 14e05276a3e0882777c4ee38252bed25324c93e8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 5 Feb 2025 16:05:12 +0000 Subject: [PATCH 69/78] storcon: fix a case where optimise could get stuck on unschedulable node (#10648) ## Problem When a shard has two secondary locations, but one of them is on a node with MaySchedule::No, the optimiser would get stuck, because it couldn't decide which 
secondary to remove. This is generally okay if a node is offline, but if a node is in Pause mode for a long period of time, it's a problem. Closes: https://github.com/neondatabase/neon/issues/10646 ## Summary of changes - Instead of insisting on finding a node in the wrong AZ to remove, find an available node in the _right_ AZ, and remove all the others. This ensures that if there is one live suitable node, then other offline/paused nodes cannot hold things up. --- storage_controller/src/tenant_shard.rs | 150 +++++++++++++++++++++++-- 1 file changed, 141 insertions(+), 9 deletions(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 302104dc9773..219c0dffe7c6 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -707,6 +707,7 @@ impl TenantShard { if let Some(node_id) = self.intent.get_attached() { // Populate secondary by demoting the attached node self.intent.demote_attached(scheduler, *node_id); + modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node @@ -979,24 +980,51 @@ impl TenantShard { ), ) }) - .collect::>(); + .collect::>(); if secondary_scores.iter().any(|score| score.1.is_none()) { - // Don't have full list of scores, so can't make a good decision about which to drop unless - // there is an obvious one in the wrong AZ - for secondary in self.intent.get_secondary() { - if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + // Trivial case: if we only have one secondary, drop that one + if self.intent.get_secondary().len() == 1 { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary( + *self.intent.get_secondary().first().unwrap(), + ), + }); + } + + // Try to find a "good" secondary to keep, without relying on scores (one or more nodes is in a state + // where its score can't be calculated), and drop the others. This enables us to make progress in + // most cases, even if some nodes are offline or have scheduling=pause set. + + debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this + // logic presumes we are in a mode where we want secondaries to be in non-home AZ + if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| { + let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id; + let is_available = secondary_scores + .get(n) + .expect("Built from same list of nodes") + .is_some(); + is_available && !in_home_az + }) { + // Great, we found one to retain. Pick some other to drop. + if let Some(victim) = self + .intent + .get_secondary() + .iter() + .find(|n| n != &retain_secondary) + { return Some(ScheduleOptimization { sequence: self.sequence, - action: ScheduleOptimizationAction::RemoveSecondary(*secondary), + action: ScheduleOptimizationAction::RemoveSecondary(*victim), }); } } // Fall through: we didn't identify one to remove. This ought to be rare. 
tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", - self.intent.get_secondary() - ); + self.intent.get_secondary() + ); } else { let victim = secondary_scores .iter() @@ -1005,7 +1033,7 @@ impl TenantShard { .0; return Some(ScheduleOptimization { sequence: self.sequence, - action: ScheduleOptimizationAction::RemoveSecondary(victim), + action: ScheduleOptimizationAction::RemoveSecondary(*victim), }); } } @@ -2379,6 +2407,110 @@ pub(crate) mod tests { Ok(()) } + /// Test how the optimisation code behaves with an extra secondary + #[test] + fn optimize_removes_secondary() -> anyhow::Result<()> { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let mut nodes = make_test_nodes( + 4, + &[ + az_a_tag.clone(), + az_b_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut schedule_context = ScheduleContext::default(); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(az_a_tag.clone()); + shard_a + .schedule(&mut scheduler, &mut schedule_context) + .unwrap(); + + // Attached on node 1, secondary on node 2 + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(2)]); + + // Initially optimiser is idle + assert_eq!( + shard_a.optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shard_a.optimize_secondary(&mut scheduler, &schedule_context), + None + ); + + // A spare secondary in the home AZ: it should be removed -- this is the situation when we're midway through a graceful migration, after cutting over + // to our new location + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + + // A spare secondary in the non-home AZ, and one of them is offline + shard_a.intent.push_secondary(&mut scheduler, NodeId(4)); + nodes + .get_mut(&NodeId(4)) + .unwrap() + .set_availability(NodeAvailability::Offline); + scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(4)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + + // A spare secondary when should have none + shard_a.policy = PlacementPolicy::Attached(0); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![]); + + // Check that in secondary mode, we preserve the secondary in the preferred AZ + let mut schedule_context = ScheduleContext::default(); // Fresh context, we're about to call schedule() + shard_a.policy = PlacementPolicy::Secondary; + shard_a + 
.schedule(&mut scheduler, &mut schedule_context) + .unwrap(); + assert_eq!(shard_a.intent.get_attached(), &None); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); + assert_eq!( + shard_a.optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shard_a.optimize_secondary(&mut scheduler, &schedule_context), + None + ); + + shard_a.intent.clear(&mut scheduler); + + Ok(()) + } + // Optimize til quiescent: this emulates what Service::optimize_all does, when // called repeatedly in the background. // Returns the applied optimizations From fba22a7123a444970dd053c09e942a9a2e237a10 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 5 Feb 2025 20:00:26 +0300 Subject: [PATCH 70/78] Record more timings in test_layer_map (#10670) ## Problem It it is not very clear how much time take different operations. ## Summary of changes Record more timings. ref https://github.com/neondatabase/neon/issues/10409 --- test_runner/performance/test_layer_map.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index efc7fa59dbea..6c009440058c 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -34,16 +34,20 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): cur.execute("set log_statement = 'all'") cur.execute("create table t(x integer)") for _ in range(n_iters): - cur.execute(f"insert into t values (generate_series(1,{n_records}))") + with zenbenchmark.record_duration(f"insert into t values (generate_series(1,{n_records}))"): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") time.sleep(1) - cur.execute("vacuum t") + with zenbenchmark.record_duration("vacuum t"): + cur.execute("vacuum t") - with zenbenchmark.record_duration("test_query"): + with zenbenchmark.record_duration("SELECT count(*) from t"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) - flush_ep_to_pageserver(env, endpoint, tenant, timeline) - env.pageserver.http_client().timeline_checkpoint( - tenant, timeline, compact=False, wait_until_uploaded=True - ) + with zenbenchmark.record_duration("flush_ep_to_pageserver"): + flush_ep_to_pageserver(env, endpoint, tenant, timeline) + with zenbenchmark.record_duration("timeline_checkpoint"): + env.pageserver.http_client().timeline_checkpoint( + tenant, timeline, compact=False, wait_until_uploaded=True + ) From 133b89a83db9d49a802efc5a66a7fde65b7c8d5e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Feb 2025 12:35:39 -0500 Subject: [PATCH 71/78] feat(pageserver): continue from last incomplete image layer creation (#10660) ## Problem close https://github.com/neondatabase/neon/issues/10651 ## Summary of changes * Image layer creation starts from the next partition of the last processed partition if the previous attempt was not complete. * Add tests. 
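For illustration only, here is a minimal standalone sketch (not the actual pageserver code) of the resume rule described above: given the key partitions in order and the last key reached by a previous incomplete attempt, skip every partition up to and including the one containing that key, and resume from the following one. The names `resume_index` and `partition_ends` are hypothetical; the real implementation works over the timeline's `KeySpace` partitions, as the diff below shows.

```rust
/// Hypothetical simplification: partitions are represented only by their end keys,
/// sorted ascending. Returns the index of the partition to resume from, or None if
/// the previous attempt already reached the final partition (treat as complete).
fn resume_index(partition_ends: &[u64], last_key: u64) -> Option<usize> {
    for (i, end) in partition_ends.iter().enumerate() {
        if last_key <= *end {
            // last_key falls inside partition `i`; it may have grown since the last
            // attempt, so skip it too and continue from the next partition.
            return (i + 1 < partition_ends.len()).then_some(i + 1);
        }
    }
    // last_key lies beyond every partition: nothing left to do.
    None
}

fn main() {
    let ends = [100, 200, 300, 400];
    assert_eq!(resume_index(&ends, 150), Some(2)); // resume after the partition holding 150
    assert_eq!(resume_index(&ends, 400), None);    // previous attempt ended on the last partition
}
```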
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 92 ++++++++++++++++---- pageserver/src/tenant/timeline/compaction.rs | 2 +- test_runner/regress/test_compaction.py | 56 +++++++++++- 3 files changed, 129 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 11c0bbdfe5e1..b6a349a20913 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -192,7 +192,12 @@ pub enum ImageLayerCreationMode { #[derive(Clone, Debug, Default)] pub enum LastImageLayerCreationStatus { - Incomplete, // TODO: record the last key being processed + Incomplete { + /// The last key of the partition (exclusive) that was processed in the last + /// image layer creation attempt. We will continue from this key in the next + /// attempt. + last_key: Key, + }, Complete, #[default] Initial, @@ -4346,7 +4351,7 @@ impl Timeline { Ok(result) } - // Is it time to create a new image layer for the given partition? + // Is it time to create a new image layer for the given partition? True if we want to generate. async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { let threshold = self.get_image_creation_threshold(); @@ -4658,6 +4663,11 @@ impl Timeline { ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); + if partitioning.parts.is_empty() { + warn!("no partitions to create image layers for"); + return Ok((vec![], LastImageLayerCreationStatus::Complete)); + } + // We need to avoid holes between generated image layers. // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one // image layer with hole between them. In this case such layer can not be utilized by GC. @@ -4669,28 +4679,65 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; - let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status { - info!( - "resuming image layer creation: last_status={:?}", - last_status - ); - true - } else { - self.should_check_if_image_layers_required(lsn) - }; + let check_for_image_layers = + if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status { + info!( + "resuming image layer creation: last_status=incomplete, continue from {}", + last_key + ); + true + } else { + self.should_check_if_image_layers_required(lsn) + }; let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; let mut all_generated = true; let mut partition_processed = 0; - let total_partitions = partitioning.parts.len(); + let mut total_partitions = partitioning.parts.len(); + let mut last_partition_processed = None; + let mut partition_parts = partitioning.parts.clone(); + + if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status { + // We need to skip the partitions that have already been processed. + let mut found = false; + for (i, partition) in partition_parts.iter().enumerate() { + if last_key <= partition.end().unwrap() { + // ```plain + // |------|--------|----------|------| + // ^last_key + // ^start from this partition + // ``` + // Why `i+1` instead of `i`? + // It is possible that the user did some writes after the previous image layer creation attempt so that + // a relation grows in size, and the last_key is now in the middle of the partition. 
In this case, we + // still want to skip this partition, so that we can make progress and avoid generating image layers over + // the same partition. Doing a mod to ensure we don't end up with an empty vec. + if i + 1 >= total_partitions { + // In general, this case should not happen -- if last_key is on the last partition, the previous + // iteration of image layer creation should return a complete status. + break; // with found=false + } + partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements + total_partitions = partition_parts.len(); + // Update the start key to the partition start. + start = partition_parts[0].start().unwrap(); + found = true; + break; + } + } + if !found { + // Last key is within the last partition, or larger than all partitions. + return Ok((vec![], LastImageLayerCreationStatus::Complete)); + } + } - for partition in partitioning.parts.iter() { + for partition in partition_parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); } - + partition_processed += 1; let img_range = start..partition.ranges.last().unwrap().end; let compact_metadata = partition.overlaps(&Key::metadata_key_range()); if compact_metadata { @@ -4725,6 +4772,8 @@ impl Timeline { lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), is_delta: false, }) { + // TODO: this can be processed with the BatchLayerWriter::finish_with_discard + // in the future. tracing::info!( "Skipping image layer at {lsn} {}..{}, already exists", img_range.start, @@ -4805,8 +4854,6 @@ impl Timeline { } } - partition_processed += 1; - if let ImageLayerCreationMode::Try = mode { // We have at least made some progress if batch_image_writer.pending_layer_num() >= 1 { @@ -4822,8 +4869,10 @@ impl Timeline { * self.get_compaction_threshold(); if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { tracing::info!( - "preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}", + "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}", + partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers ); + last_partition_processed = Some(partition.clone()); all_generated = false; break; } @@ -4868,7 +4917,14 @@ impl Timeline { if all_generated { LastImageLayerCreationStatus::Complete } else { - LastImageLayerCreationStatus::Incomplete + LastImageLayerCreationStatus::Incomplete { + last_key: if let Some(last_partition_processed) = last_partition_processed { + last_partition_processed.end().unwrap_or(Key::MIN) + } else { + // This branch should be unreachable, but in case it happens, we can just return the start key. + Key::MIN + }, + } }, )) } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7dd37d72328e..466ebea78330 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -748,7 +748,7 @@ impl Timeline { .store(Arc::new(outcome.clone())); self.upload_new_image_layers(image_layers)?; - if let LastImageLayerCreationStatus::Incomplete = outcome { + if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { // Yield and do not do any other kind of compaction. 
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); return Ok(CompactionOutcome::Pending); diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index c031d66dfbc2..f3347b594e32 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -29,6 +29,21 @@ # "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later } +PREEMPT_COMPACTION_TENANT_CONF = { + "gc_period": "5s", + "compaction_period": "5s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 1, + "image_creation_preempt_threshold": 1, + # compact more frequently + "compaction_threshold": 3, + "compaction_upper_limit": 6, + "lsn_lease_length": "0s", +} + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( @@ -36,7 +51,8 @@ [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], ) def test_pageserver_compaction_smoke( - neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol + neon_env_builder: NeonEnvBuilder, + wal_receiver_protocol: PageserverWalReceiverProtocol, ): """ This is a smoke test that compaction kicks in. The workload repeatedly churns @@ -54,7 +70,8 @@ def test_pageserver_compaction_smoke( page_cache_size=10 """ - env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF) + conf = AGGRESSIVE_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -113,6 +130,41 @@ def test_pageserver_compaction_smoke( assert vectored_average < 8 +@skip_in_debug_build("only run with release build") +def test_pageserver_compaction_preempt( + neon_env_builder: NeonEnvBuilder, +): + # Ideally we should be able to do unit tests for this, but we need real Postgres + # WALs in order to do unit testing... + + conf = PREEMPT_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 200000 + churn_rounds = 10 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=False) + workload.validate(env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True) + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + # ensure image layer creation gets preempted and then resumed + env.pageserver.assert_log_contains("resuming image layer creation") + + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( "with_branches", From 6699a30a4948309703a90ed167980ef9d467bfad Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 5 Feb 2025 20:07:51 +0200 Subject: [PATCH 72/78] Make it easy to build only a subset of extensions into compute image (#10655) The full build of all extensions takes a long time. 
When working locally on parts that don't need extensions, you can iterate more quickly by skipping the unnecessary extensions. This adds a build argument to the dockerfile to specify extensions to build. There are three options: - EXTENSIONS=all (default) - EXTENSIONS=minimal: Build only a few extensions that are listed in shared_preload_libraries in the default neon config. - EXTENSIONS=none: Build no extensions (except for the mandatory 'neon' extension). --- compute/compute-node.Dockerfile | 44 +++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 9379856eabff..43910f2622b4 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -85,6 +85,10 @@ ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ARG ALPINE_CURL_VERSION=8.11.1 +# By default, build all PostgreSQL extensions. For quick local testing when you don't +# care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal +ARG EXTENSIONS=all + ######################################################################################### # # Layer "build-deps" @@ -1484,12 +1488,35 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ ######################################################################################### # -# Layer "all-extensions" +# Layer "extensions-none" +# +######################################################################################### +FROM build-deps AS extensions-none + +RUN mkdir /usr/local/pgsql + +######################################################################################### +# +# Layer "extensions-minimal" +# +# This subset of extensions includes the extensions that we have in +# shared_preload_libraries by default. +# +######################################################################################### +FROM build-deps AS extensions-minimal + +COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ + +######################################################################################### +# +# Layer "extensions-all" # Bundle together all the extensions # ######################################################################################### -FROM build-deps AS all-extensions -ARG PG_VERSION +FROM build-deps AS extensions-all # Public extensions COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1531,7 +1558,13 @@ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=neon-ext-build /usr/local/pgsql/ /usr/local/pgsql/ +######################################################################################### +# +# Layer "neon-pg-ext-build" +# Includes Postgres and all the extensions chosen by EXTENSIONS arg. 
+# +######################################################################################### +FROM extensions-${EXTENSIONS} AS neon-pg-ext-build ######################################################################################### # @@ -1614,7 +1647,8 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 # ######################################################################################### FROM neon-ext-build AS postgres-cleanup-layer -COPY --from=all-extensions /usr/local/pgsql /usr/local/pgsql + +COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp From 733a57247bb04893b62855d68ca54fbc85a58aa7 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Feb 2025 15:44:28 -0500 Subject: [PATCH 73/78] fix(pageserver): disallow gc-compaction produce l0 layer (#10679) ## Problem Any compaction should never produce l0 layers. This never happened in my experiments, but would be good to guard it early. ## Summary of changes Disallow gc-compaction to produce l0 layers. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 466ebea78330..cfde0704424f 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -33,6 +33,7 @@ use crate::page_cache; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; +use crate::tenant::layer_map::LayerMap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -438,6 +439,11 @@ impl KeyHistoryRetention { if dry_run { return true; } + if LayerMap::is_l0(&key.key_range, key.is_delta) { + // gc-compaction should not produce L0 deltas, otherwise it will break the layer order. + // We should ignore such layers. + return true; + } let layer_generation; { let guard = tline.layers.read().await; From 0ceeec9be3acd07cda24d849d7cd71dc4c0ac3cc Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Feb 2025 17:11:50 -0500 Subject: [PATCH 74/78] fix(pageserver): schedule compaction immediately if pending (#10684) ## Problem The code is intended to reschedule compaction immediately if there are pending tasks. We set the duration to 0 before if there are pending tasks, but this will go through the `if period == Duration::ZERO {` branch and sleep for another 10 seconds. ## Summary of changes Set duration to 1 so that it doesn't sleep for too long. 
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/tasks.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b6b64d02dd9f..d65f0991820d 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -211,7 +211,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task sleep_duration = if let CompactionOutcome::Pending = outcome { - Duration::ZERO + Duration::from_secs(1) } else { period }; From 77f9e74d86d62bca524571aa50416680afb1f5ec Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 6 Feb 2025 02:14:29 +0100 Subject: [PATCH 75/78] pgxn: include socket send & recv queue size in slow response logs (#10673) # Problem When we see an apparent slow request, one possible cause is that the client is failing to consume responses, but we don't have a clear way to see that. # Solution - Log the socket queue depths on slow/stuck connections, so that we have an indication of whether the compute is keeping up with processing the connection's responses. refs - slack https://neondb.slack.com/archives/C036U0GRMRB/p1738652644396329 - refs https://github.com/neondatabase/cloud/issues/23515 - refs https://github.com/neondatabase/cloud/issues/23486 --- pgxn/neon/libpagestore.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 4460e3b40c11..22aeb2e2d658 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -36,6 +36,11 @@ #include "pagestore_client.h" #include "walproposer.h" +#ifdef __linux__ +#include +#include +#endif + #define PageStoreTrace DEBUG5 #define MIN_RECONNECT_INTERVAL_USEC 1000 @@ -728,11 +733,36 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer) INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS) { + int sndbuf = -1; + int recvbuf = -1; +#ifdef __linux__ + int socketfd; +#endif + since_start = now; INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)", + +#ifdef __linux__ + /* + * get kernel's send and recv queue size via ioctl + * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 + */ + socketfd = PQsocket(pageserver_conn); + if (socketfd != -1) { + int ioctl_err; + ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf); + if (ioctl_err!= 0) { + sndbuf = -errno; + } + ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf); + if (ioctl_err != 0) { + recvbuf = -errno; + } + } +#endif + neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received); + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); last_log_ts = now; logged = true; } From 7fc6953da41d536f8d785ea3a41640c80bb7c6bb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 6 Feb 2025 07:42:14 +0200 Subject: [PATCH 76/78] Is neon superuser (#10625) ## Problem is_neon_superuser() fiunction is public in pg14/pg15 but statically defined in publicationcmd.c in pg16/pg17 ## Summary of 
changes Make this function public for all Postgres version. It is intended to be used not only in publicationcmd.c See https://github.com/neondatabase/postgres/pull/573 https://github.com/neondatabase/postgres/pull/576 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3cf7ce1afab7..86d9ea96ebb9 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3cf7ce1afab75027716d14223f95ddb300754162 +Subproject commit 86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index f0ffc8279dbc..8dfd5a7030d3 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit f0ffc8279dbcbbc439981a4fd001a9687e5d665d +Subproject commit 8dfd5a7030d3e8a98b60265ebe045788892ac7f3 diff --git a/vendor/revisions.json b/vendor/revisions.json index c3eaeac927ce..efddaef46a4f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,11 +1,11 @@ { "v17": [ "17.2", - "f0ffc8279dbcbbc439981a4fd001a9687e5d665d" + "8dfd5a7030d3e8a98b60265ebe045788892ac7f3" ], "v16": [ "16.6", - "3cf7ce1afab75027716d14223f95ddb300754162" + "86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c" ], "v15": [ "15.10", From 81cd30e4d6e2f9e43a92e59c6ffce4f0890980ca Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 6 Feb 2025 07:47:56 +0200 Subject: [PATCH 77/78] Use #ifdef instead of #if USE_ASSERT_CHECKING (#10683) ## Problem USE_ASSERT _CHECKING is defined as empty entity. but it is checked using #if ## Summary of changes Replace `#if USE_ASSERT _CHECKING` with `#ifdef USE_ASSERT _CHECKING` as done in other places in Postgres Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 6 +++--- pgxn/neon/pagestore_smgr.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 08b76521756c..01da61f84b56 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -563,8 +563,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, LWLockRelease(lfc_lock); -#if USE_ASSERT_CHECKING - do { +#ifdef USE_ASSERT_CHECKING + { int count = 0; for (int j = 0; j < nblocks; j++) @@ -574,7 +574,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } Assert(count == found); - } while (false); + } #endif return found; diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 54cacea98448..012bd479bcd6 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -916,7 +916,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, { uint64 min_ring_index; PrefetchRequest hashkey; -#if USE_ASSERT_CHECKING +#ifdef USE_ASSERT_CHECKING bool any_hits = false; #endif /* We will never read further ahead than our buffer can store. */ @@ -955,7 +955,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, else lsns = NULL; -#if USE_ASSERT_CHECKING +#ifdef USE_ASSERT_CHECKING any_hits = true; #endif From fedf4f169c5a7641e1d5233289af4e91aaba8274 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 6 Feb 2025 06:02:11 +0000 Subject: [PATCH 78/78] Proxy release 2025-02-06