diff --git a/.github/ISSUE_TEMPLATE/bug-template.md b/.github/ISSUE_TEMPLATE/bug-template.md index d33eec3cdea0..234d9b5a3782 100644 --- a/.github/ISSUE_TEMPLATE/bug-template.md +++ b/.github/ISSUE_TEMPLATE/bug-template.md @@ -3,6 +3,7 @@ name: Bug Template about: Used for describing bugs title: '' labels: t/bug +type: Bug assignees: '' --- diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index c442f50fde18..868fd084f151 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -4,6 +4,7 @@ about: A set of related tasks contributing towards specific outcome, comprising more than 1 week of work. title: 'Epic: ' labels: t/Epic +type: Epic assignees: '' --- diff --git a/.github/actionlint.yml b/.github/actionlint.yml index ecff0cc70b22..2b96ce95da32 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -27,3 +27,4 @@ config-variables: - SLACK_ON_CALL_QA_STAGING_STREAM - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN - SLACK_ON_CALL_STORAGE_STAGING_STREAM + - SLACK_CICD_CHANNEL_ID diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 11f46bce8ef1..c9f6b0832eeb 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -41,7 +41,10 @@ inputs: description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library' required: false default: '/tmp/neon/pg_install/v16/lib' - + project_settings: + description: 'A JSON object with project settings' + required: false + default: '{}' outputs: dsn: @@ -73,7 +76,7 @@ runs: \"provisioner\": \"k8s-neonvm\", \"autoscaling_limit_min_cu\": ${MIN_CU}, \"autoscaling_limit_max_cu\": ${MAX_CU}, - \"settings\": { } + \"settings\": ${PROJECT_SETTINGS} } }") @@ -92,12 +95,12 @@ runs: if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then # determine tenant ID TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` - + echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))" echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}" - + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) curl -X PUT \ "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \ @@ -118,3 +121,4 @@ runs: STRIPE_SIZE: ${{ inputs.stripe_size }} PSQL: ${{ inputs.psql_path }} LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} + PROJECT_SETTINGS: ${{ inputs.project_settings }} diff --git a/.github/file-filters.yaml b/.github/file-filters.yaml index 886cd3919ac2..02ee383d5ed3 100644 --- a/.github/file-filters.yaml +++ b/.github/file-filters.yaml @@ -1,4 +1,5 @@ rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock'] +rust_dependencies: ['**/Cargo.lock'] v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**'] v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**'] diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index f97402a90b92..1dec8106b484 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -267,6 +267,26 @@ jobs: path: /tmp/neon 
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Check diesel schema + if: inputs.build-type == 'release' && inputs.arch == 'x64' + env: + DATABASE_URL: postgresql://localhost:1235/storage_controller + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + run: | + /tmp/neon/bin/neon_local init + /tmp/neon/bin/neon_local storage_controller start + + diesel print-schema > storage_controller/src/schema.rs + + if [ -n "$(git diff storage_controller/src/schema.rs)" ]; then + echo >&2 "Uncommitted changes in diesel schema" + + git diff . + exit 1 + fi + + /tmp/neon/bin/neon_local storage_controller stop + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: inputs.build-type == 'debug' diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index cbc47c640640..c4c76914aa64 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -16,6 +16,9 @@ defaults: run: shell: bash -euxo pipefail {0} +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. +permissions: {} + jobs: check-codestyle-rust: strategy: @@ -84,8 +87,3 @@ jobs: run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} - run: cargo deny check --hide-inclusion-graph diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 0a0898d30c1c..fc2f36c74b89 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -94,7 +94,9 @@ jobs: echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - - run: gh pr checkout "${PR_NUMBER}" + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} - run: git checkout -b "${BRANCH}" diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 32747d825caa..b36ac46f3525 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -319,7 +319,7 @@ jobs: { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", 
"platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' @@ -340,7 +340,7 @@ jobs: ], "pg_version" : [ 16,17 - ], + ] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then @@ -458,7 +458,7 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB + # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB # without (neonvm-captest-new) # and with (neonvm-captest-new-many-tables) many relations in the database - name: Create many relations before the run @@ -590,36 +590,20 @@ jobs: steps: - uses: actions/checkout@v4 - # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16 - # instead of using Neon artifacts containing pgbench - - name: Install postgresql-16 where pytest expects it - run: | - # Just to make it easier to test things locally on macOS (with arm64) - arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g') - - cd /home/nonroot - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb" - dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg - dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg - dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg - - mkdir -p /tmp/neon/pg_install/v16/bin - mkdir -p /tmp/neon/pg_install/v17/bin - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql - ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v17/bin/pgbench - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v17/bin/psql - ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v17/lib - - LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}" - export LD_LIBRARY_PATH - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV} - - /tmp/neon/pg_install/v16/bin/pgbench --version - /tmp/neon/pg_install/v16/bin/psql --version + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -642,13 +626,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours - - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set with: @@ -764,10 +741,10 @@ jobs: neonvm-captest-reuse) case "${PG_VERSION}" in 16) - CONNSTR=${{ 
secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR_V16 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; 17) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_CONNSTR_PG17 }} ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" @@ -833,7 +810,7 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} permissions: contents: write statuses: write @@ -887,7 +864,7 @@ jobs: CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR" ;; 17) - CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_CONNSTR_PG17" + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_CONNSTR_PG17" ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" @@ -906,7 +883,7 @@ jobs: exit 1 ;; esac - + echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV - name: Set up Connection String @@ -952,7 +929,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} permissions: contents: write statuses: write @@ -1007,7 +984,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; 17) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_USER_EXAMPLE_CONNSTR_PG17 }} ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e588fc5a0e88..5a4bdecb994e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -45,6 +45,26 @@ jobs: run cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" + files-changed: + needs: [ check-permissions ] + runs-on: [ self-hosted, small ] + timeout-minutes: 3 + outputs: + check-rust-dependencies: ${{ steps.files-changed.outputs.rust_dependencies }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + + - name: Check for file changes + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + id: files-changed + with: + token: ${{ secrets.GITHUB_TOKEN }} + filters: .github/file-filters.yaml + tag: needs: [ check-permissions ] runs-on: [ self-hosted, small ] @@ -170,6 +190,14 @@ jobs: archs: '["x64", "arm64"]' secrets: inherit + check-dependencies-rust: + needs: [ files-changed, build-build-tools-image ] + if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' }} + uses: ./.github/workflows/cargo-deny.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit + build-and-test-locally: needs: [ tag, build-build-tools-image ] strategy: @@ -654,7 +682,7 @@ jobs: push: true pull: true file: compute/compute-node.Dockerfile - target: neon-pg-ext-test + target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ 
matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} @@ -1332,6 +1360,8 @@ jobs: - build-and-test-locally - check-codestyle-python - check-codestyle-rust + - check-dependencies-rust + - files-changed - promote-images-dev - test-images - trigger-custom-extensions-build-and-wait @@ -1344,4 +1374,11 @@ jobs: if: | contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - || contains(needs.*.result, 'skipped') + || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true') + || needs.build-and-test-locally.result == 'skipped' + || needs.check-codestyle-python.result == 'skipped' + || needs.check-codestyle-rust.result == 'skipped' + || needs.files-changed.result == 'skipped' + || needs.promote-images-dev.result == 'skipped' + || needs.test-images.result == 'skipped' + || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' diff --git a/.github/workflows/cargo-deny.yml b/.github/workflows/cargo-deny.yml new file mode 100644 index 000000000000..433b377c327e --- /dev/null +++ b/.github/workflows/cargo-deny.yml @@ -0,0 +1,57 @@ +name: cargo deny checks + +on: + workflow_call: + inputs: + build-tools-image: + required: false + type: string + schedule: + - cron: '0 0 * * *' + +jobs: + cargo-deny: + strategy: + matrix: + ref: >- + ${{ + fromJSON( + github.event_name == 'schedule' + && '["main","release","release-proxy","release-compute"]' + || format('["{0}"]', github.sha) + ) + }} + + runs-on: [self-hosted, small] + + container: + image: ${{ inputs.build-tools-image || 'neondatabase/build-tools:pinned' }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ matrix.ref }} + + - name: Check rust licenses/bans/advisories/sources + env: + CARGO_DENY_TARGET: >- + ${{ github.event_name == 'schedule' && 'advisories' || 'all' }} + run: cargo deny check --hide-inclusion-graph $CARGO_DENY_TARGET + + - name: Post to a Slack channel + if: ${{ github.event_name == 'schedule' && failure() }} + uses: slackapi/slack-github-action@v2 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_CICD_CHANNEL_ID }} + text: | + Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + Pinging @oncall-devprod. 
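For reference, the new reusable cargo-deny workflow above runs the same check that was removed from _check-codestyle-rust.yml earlier in this diff. A minimal local equivalent — assuming cargo-deny is installed at the version pinned in build-tools.Dockerfile and that the repository's existing cargo-deny configuration is picked up from the working directory — would be roughly:

```bash
# Pin matches CARGO_DENY_VERSION in build-tools.Dockerfile.
cargo install cargo-deny --locked --version 0.16.2

# What the PR-triggered job runs (licenses/bans/advisories/sources):
cargo deny check --hide-inclusion-graph all

# What the nightly scheduled job runs against main and the release branches (advisories only):
cargo deny check --hide-inclusion-graph advisories
```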
diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 4947907eb068..abc90c7fe1a7 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -12,8 +12,8 @@ on: pull_request: paths: - '.github/workflows/pg-clients.yml' - - 'test_runner/pg_clients/**' - - 'test_runner/logical_repl/**' + - 'test_runner/pg_clients/**/*.py' + - 'test_runner/logical_repl/**/*.py' - 'poetry.lock' workflow_dispatch: @@ -104,6 +104,8 @@ jobs: with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} + project_settings: >- + {"enable_logical_replication": true} - name: Run tests uses: ./.github/actions/run-python-test-set diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index e6dfbaeed871..c47b3fe0debb 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -59,7 +59,10 @@ jobs: echo "${RUST_CHANGED_FILES}" build-build-tools-image: - if: needs.get-changed-files.outputs.python-changed == 'true' + if: | + false + || needs.get-changed-files.outputs.python-changed == 'true' + || needs.get-changed-files.outputs.rust-changed == 'true' needs: [ get-changed-files ] uses: ./.github/workflows/build-build-tools-image.yml with: @@ -92,7 +95,8 @@ jobs: # - conclusion # - neon-cloud-e2e conclusion: - if: always() + # Do not run job on Pull Requests as it interferes with the `conclusion` job from the `build_and_test` workflow + if: always() && github.event_name == 'merge_group' permissions: statuses: write # for `github.repos.createCommitStatus(...)` contents: write @@ -124,6 +128,8 @@ jobs: - name: Fail the job if any of the dependencies do not succeed or skipped run: exit 1 if: | - (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true') + false + || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') diff --git a/Cargo.lock b/Cargo.lock index 9ba90355df19..de1b1218cada 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -206,6 +206,16 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "assert-json-diff" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "async-channel" version = "1.9.0" @@ -290,9 +300,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.5.10" +version = "1.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" +checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90" dependencies = [ "aws-credential-types", "aws-runtime", @@ -301,7 +311,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.60.7", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -332,9 +342,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.4.4" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" +checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -366,7 +376,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -389,7 +399,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -414,7 +424,7 @@ dependencies = [ "aws-smithy-checksums", "aws-smithy-eventstream", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -437,15 +447,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.50.0" +version = "1.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" +checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -459,15 +469,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.51.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" +checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -481,15 +491,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.51.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" +checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -504,9 +514,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.6" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" +checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -533,9 +543,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.1" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" +checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" dependencies = [ "futures-util", "pin-project-lite", @@ -565,9 +575,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.5" +version = "0.60.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" +checksum = 
"8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a" dependencies = [ "aws-smithy-types", "bytes", @@ -576,9 +586,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.11" +version = "0.60.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" +checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -597,18 +607,9 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.7" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" -dependencies = [ - "aws-smithy-types", -] - -[[package]] -name = "aws-smithy-json" -version = "0.61.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" +checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" dependencies = [ "aws-smithy-types", ] @@ -625,9 +626,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.4" +version = "1.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" +checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -669,9 +670,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.9" +version = "1.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" +checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97" dependencies = [ "base64-simd", "bytes", @@ -704,9 +705,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.3" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" +checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -978,7 +979,7 @@ version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "cexpr", "clang-sys", "itertools 0.12.1", @@ -1006,9 +1007,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "block-buffer" @@ -1019,6 +1020,12 @@ dependencies = [ "generic-array", ] +[[package]] +name = "boxcar" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42" + [[package]] name = "bstr" version = "1.5.0" @@ -1225,6 +1232,20 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +[[package]] +name = "clashmap" +version = "1.0.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93bd59c81e2bd87a775ae2de75f070f7e2bfe97363a6ad652f46824564c23e4d" +dependencies = [ + "crossbeam-utils", + "hashbrown 0.15.2", + "lock_api", + "parking_lot_core 0.9.8", + "polonius-the-crab", + "replace_with", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -1312,7 +1333,7 @@ dependencies = [ "tar", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-util", "tower 0.5.2", @@ -1421,7 +1442,7 @@ dependencies = [ "storage_broker", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-util", "toml", "toml_edit", @@ -1561,7 +1582,7 @@ version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "crossterm_winapi", "libc", "parking_lot 0.12.1", @@ -1792,7 +1813,7 @@ version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "byteorder", "chrono", "diesel_derives", @@ -1812,7 +1833,7 @@ dependencies = [ "futures-util", "scoped-futures", "tokio", - "tokio-postgres 0.7.12", + "tokio-postgres", ] [[package]] @@ -2428,6 +2449,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "gettid" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "gimli" version = "0.31.1" @@ -2556,6 +2587,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "hashlink" version = "0.9.1" @@ -2606,6 +2643,15 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +[[package]] +name = "higher-kinded-types" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "561985554c8b8d4808605c90a5f1979cc6c31a5d20b78465cd59501233c6678e" +dependencies = [ + "never-say-never", +] + [[package]] name = "hmac" version = "0.12.1" @@ -3084,11 +3130,11 @@ dependencies = [ [[package]] name = "inotify" -version = "0.9.6" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.8.0", "inotify-sys", "libc", ] @@ -3265,9 +3311,9 @@ dependencies = [ [[package]] name = "kqueue" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c8fc60ba15bf51257aa9807a48a61013db043fcf3a78cb0d916e8e396dcad98" +checksum = "7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c" dependencies = [ "kqueue-sys", "libc", @@ -3275,9 +3321,9 @@ dependencies = [ [[package]] name = "kqueue-sys" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" dependencies = [ "bitflags 1.3.2", "libc", @@ -3304,9 +3350,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.167" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libloading" @@ -3553,14 +3599,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.11" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", "log", "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -3569,6 +3615,12 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "never-say-never" +version = "6.6.666" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6" + [[package]] name = "nix" version = "0.25.1" @@ -3600,7 +3652,7 @@ version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "cfg-if", "libc", "memoffset 0.9.0", @@ -3618,12 +3670,11 @@ dependencies = [ [[package]] name = "notify" -version = "6.1.1" +version = "8.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" +checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943" dependencies = [ - "bitflags 2.4.1", - "crossbeam-channel", + "bitflags 2.8.0", "filetime", "fsevent-sys", "inotify", @@ -3631,10 +3682,17 @@ dependencies = [ "libc", "log", "mio", + "notify-types", "walkdir", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] +[[package]] +name = "notify-types" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d" + [[package]] name = "ntapi" version = "0.4.1" @@ -4060,8 +4118,8 @@ dependencies = [ "pageserver_compaction", "pin-project-lite", "postgres", - "postgres-protocol 0.6.6", - "postgres-types 0.2.6", + "postgres-protocol", + "postgres-types", "postgres_backend", "postgres_connection", "postgres_ffi", @@ -4092,7 +4150,7 @@ dependencies = [ "tokio", "tokio-epoll-uring", "tokio-io-timeout", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-tar", "tokio-util", @@ -4150,7 +4208,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-util", "utils", @@ -4180,6 +4238,16 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "papaya" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c" +dependencies = [ + "equivalent", + "seize", +] + [[package]] name = "parking" version = "2.1.1" @@ -4446,48 +4514,40 @@ dependencies = [ 
"plotters-backend", ] +[[package]] +name = "polonius-the-crab" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e97ca2c89572ae41bbec1c99498251f87dd5a94e500c5ec19c382dd593dd5ce9" +dependencies = [ + "higher-kinded-types", + "never-say-never", +] + [[package]] name = "postgres" -version = "0.19.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +version = "0.19.7" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "bytes", "fallible-iterator", "futures-util", "log", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", ] [[package]] name = "postgres-protocol" version = "0.6.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" -dependencies = [ - "base64 0.21.1", - "byteorder", - "bytes", - "fallible-iterator", - "hmac", - "lazy_static", - "md-5", - "memchr", - "rand 0.8.5", - "sha2", - "stringprep", -] - -[[package]] -name = "postgres-protocol" -version = "0.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "base64 0.22.1", "byteorder", "bytes", "fallible-iterator", "hmac", + "lazy_static", "md-5", "memchr", "rand 0.8.5", @@ -4514,23 +4574,12 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "bytes", "chrono", "fallible-iterator", - "postgres-protocol 0.6.6", -] - -[[package]] -name = "postgres-types" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f" -dependencies = [ - "bytes", - "fallible-iterator", - "postgres-protocol 0.6.7", + "postgres-protocol", ] [[package]] @@ -4555,7 +4604,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.26.0", "tokio-util", @@ -4570,7 +4619,7 @@ dependencies = [ "itertools 0.10.5", "once_cell", "postgres", - "tokio-postgres 0.7.9", + "tokio-postgres", "url", ] @@ -4664,7 +4713,7 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "postgres-protocol 0.6.6", + "postgres-protocol", "rand 0.8.5", "serde", "thiserror 1.0.69", @@ -4705,7 +4754,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "chrono", "flate2", "hex", @@ -4720,7 +4769,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "chrono", "hex", ] @@ -4826,6 +4875,7 @@ dependencies = [ "ahash", "anyhow", "arc-swap", + "assert-json-diff", "async-compression", "async-trait", "atomic-take", @@ -4833,15 +4883,16 @@ dependencies = [ "aws-sdk-iam", "aws-sigv4", "base64 0.13.1", + 
"boxcar", "bstr", "bytes", "camino", "camino-tempfile", "chrono", "clap", + "clashmap", "compute_api", "consumption_metrics", - "dashmap 5.5.0", "ecdsa 0.16.9", "ed25519-dalek", "env_logger 0.10.2", @@ -4849,6 +4900,7 @@ dependencies = [ "flate2", "framed-websockets", "futures", + "gettid", "hashbrown 0.14.5", "hashlink", "hex", @@ -4871,7 +4923,9 @@ dependencies = [ "measured", "metrics", "once_cell", + "opentelemetry", "p256 0.13.2", + "papaya", "parking_lot 0.12.1", "parquet", "parquet_derive", @@ -4912,12 +4966,15 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite 0.21.0", "tokio-util", "tracing", + "tracing-log", + "tracing-opentelemetry", + "tracing-serde", "tracing-subscriber", "tracing-utils", "try-lock", @@ -5249,6 +5306,12 @@ dependencies = [ "utils", ] +[[package]] +name = "replace_with" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a8614ee435691de62bcffcf4a66d91b3594bf1428a5722e79103249a095690" + [[package]] name = "reqwest" version = "0.12.4" @@ -5528,7 +5591,7 @@ version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys 0.4.14", @@ -5700,7 +5763,7 @@ dependencies = [ "pageserver_api", "parking_lot 0.12.1", "postgres", - "postgres-protocol 0.6.6", + "postgres-protocol", "postgres_backend", "postgres_ffi", "pprof", @@ -5724,7 +5787,7 @@ dependencies = [ "tikv-jemallocator", "tokio", "tokio-io-timeout", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-tar", "tokio-util", @@ -5865,6 +5928,16 @@ dependencies = [ "libc", ] +[[package]] +name = "seize" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "semver" version = "1.0.17" @@ -6341,6 +6414,8 @@ dependencies = [ "rand 0.8.5", "reqwest", "routerify", + "rustls 0.23.18", + "rustls-native-certs 0.8.0", "scoped-futures", "scopeguard", "serde", @@ -6349,6 +6424,8 @@ dependencies = [ "strum_macros", "thiserror 1.0.69", "tokio", + "tokio-postgres", + "tokio-postgres-rustls", "tokio-util", "tracing", "utils", @@ -6394,7 +6471,7 @@ dependencies = [ "serde_json", "storage_controller_client", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres-rustls", "tokio-stream", "tokio-util", @@ -6591,7 +6668,7 @@ dependencies = [ "fastrand 2.2.0", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -6803,21 +6880,20 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.1" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", "bytes", "libc", "mio", - "num_cpus", "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -6848,9 +6924,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.5.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", @@ -6859,34 +6935,8 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.9" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" -dependencies = [ - "async-trait", - "byteorder", - "bytes", - "fallible-iterator", - "futures-channel", - "futures-util", - "log", - "parking_lot 0.12.1", - "percent-encoding", - "phf", - "pin-project-lite", - "postgres-protocol 0.6.6", - "postgres-types 0.2.6", - "rand 0.8.5", - "socket2", - "tokio", - "tokio-util", - "whoami", -] - -[[package]] -name = "tokio-postgres" -version = "0.7.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb" +version = "0.7.10" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "async-trait", "byteorder", @@ -6899,8 +6949,8 @@ dependencies = [ "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol 0.6.7", - "postgres-types 0.2.8", + "postgres-protocol", + "postgres-types", "rand 0.8.5", "socket2", "tokio", @@ -6917,7 +6967,7 @@ dependencies = [ "ring", "rustls 0.23.18", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-rustls 0.26.0", "x509-certificate", ] @@ -7161,7 +7211,7 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "bytes", "http 1.1.0", "http-body 1.0.0", @@ -7595,7 +7645,7 @@ dependencies = [ "serde_json", "sysinfo", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-util", "tracing", "tracing-subscriber", @@ -7658,9 +7708,9 @@ dependencies = [ [[package]] name = "walkdir" -version = "2.3.3" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" dependencies = [ "same-file", "winapi-util", @@ -7912,6 +7962,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.0" @@ -8140,6 +8199,7 @@ dependencies = [ "tower 0.4.13", "tracing", "tracing-core", + "tracing-log", "url", "zerocopy", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 9ccdb45f6d82..76b54ae1d877 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,7 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" backtrace = "0.3.74" flate2 = "1.0.26" +assert-json-diff = "2" async-stream = "0.3" async-trait = "0.1" aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] } @@ -77,10 +78,10 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive", "env"] } +clashmap = { version = "1.0", features = 
["raw-api"] } comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" -dashmap = { version = "5.5.0", features = ["raw-api"] } diatomic-waker = { version = "0.2.3" } either = "1.8" enum-map = "2.4.2" @@ -123,7 +124,7 @@ measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } -notify = "6.0.0" +notify = "8.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" @@ -177,7 +178,7 @@ test-context = "0.3" thiserror = "1.0" tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } -tokio = { version = "1.17", features = ["macros"] } +tokio = { version = "1.41", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" @@ -193,7 +194,9 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] } tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" +tracing-log = "0.2" tracing-opentelemetry = "0.28" +tracing-serde = "0.2.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 9c13e480c125..3ade57b175b3 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -3,8 +3,13 @@ ARG DEBIAN_VERSION=bookworm FROM debian:bookworm-slim AS pgcopydb_builder ARG DEBIAN_VERSION +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + +# By default, /bin/sh used in debian images will treat '\n' as eol, +# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. 
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ @@ -55,7 +60,8 @@ ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home -SHELL ["/bin/bash", "-c"] +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] RUN mkdir -p /pgcopydb/bin && \ mkdir -p /pgcopydb/lib && \ @@ -66,7 +72,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc # System deps @@ -190,8 +196,14 @@ RUN set -e \ # It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) # And patches from us: # - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) -RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ - && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ +RUN set +o pipefail && \ + for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \ + yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\ + done && \ + set -o pipefail +# Split into separate step to debug flaky failures here +RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \ && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ && cd lcov \ @@ -253,7 +265,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.84.0 +ENV RUSTC_VERSION=1.84.1 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 @@ -261,6 +273,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33 ARG CARGO_DENY_VERSION=0.16.2 ARG CARGO_HACK_VERSION=0.6.33 ARG CARGO_NEXTEST_VERSION=0.9.85 +ARG CARGO_DIESEL_CLI_VERSION=2.2.6 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -274,6 +287,8 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ + cargo install diesel_cli --version 
${CARGO_DIESEL_CLI_VERSION} \ + --features postgres-bundled --no-default-features && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index a428c61f3458..43910f2622b4 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1,3 +1,81 @@ +# +# This Dockerfile builds the compute image. It is built multiple times to produce +# different images for each PostgreSQL major version. +# +# We use Debian as the base for all the steps. The production images use Debian bookworm +# for v17, and Debian bullseye for older PostgreSQL versions. +# +# ## Intermediary layers +# +# build-tools: This contains Rust compiler toolchain and other tools needed at compile +# time. This is also used for the storage builds. This image is defined in +# build-tools.Dockerfile. +# +# build-deps: Contains C compiler, other build tools, and compile-time dependencies +# needed to compile PostgreSQL and most extensions. (Some extensions need +# extra tools and libraries that are not included in this image. They are +# installed in the extension-specific build stages.) +# +# pg-build: Result of compiling PostgreSQL. The PostgreSQL binaries are copied from +# this to the final image. This is also used as the base for compiling all +# the extensions. +# +# compute-tools: This contains compute_ctl, the launcher program that starts Postgres +# in Neon. It also contains a few other tools that are built from the +# sources from this repository and used in compute VMs: 'fast_import' and +# 'local_proxy' +# +# ## Extensions +# +# By convention, the build of each extension consists of two layers: +# +# {extension}-src: Contains the source tarball, possible neon-specific patches, and +# the extracted tarball with the patches applied. All of these are +# under the /ext-src/ directory. +# +# {extension}-build: Contains the installed extension files, under /usr/local/pgsql +# (in addition to the PostgreSQL binaries inherited from the pg-build +# image). A few extensions need extra libraries or other files +# installed elsewhere in the filesystem. They are installed by ONBUILD +# directives. +# +# These are merged together into two layers: +# +# all-extensions: All the extension -build layers merged together +# +# extension-tests: All the extension -src layers merged together. This is used by the +# extension tests. The tests are executed against the compiled image, +# but the tests need test scripts, expected result files etc. from the +# original sources, which are not included in the binary image. +# +# ## Extra components +# +# These are extra included in the compute image, but are not directly used by PostgreSQL +# itself. +# +# pgbouncer: pgbouncer and its configuration +# +# sql_exporter: Metrics exporter daemon. +# +# postgres_exporter: Another metrics exporter daemon, for different sets of metrics. +# +# The configuration files for the metrics exporters are under etc/ directory. We use +# a templating system to handle variations between different PostgreSQL versions, +# building slightly different config files for each PostgreSQL version. +# +# +# ## Final image +# +# The final image puts together the PostgreSQL binaries (pg-build), the compute tools +# (compute-tools), all the extensions (all-extensions) and the extra components into +# one image. +# +# VM image: The final image built by this dockerfile isn't actually the final image that +# we use in computes VMs. 
There's an extra step that adds some files and makes other +# small adjustments, and builds the QCOV2 filesystem image suitable for using in a VM. +# That step is done by the 'vm-builder' tool. See the vm-compute-node-image job in the +# build_and_test.yml github workflow for how that's done. + ARG PG_VERSION ARG REPOSITORY=neondatabase ARG IMAGE=build-tools @@ -7,6 +85,10 @@ ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ARG ALPINE_CURL_VERSION=8.11.1 +# By default, build all PostgreSQL extensions. For quick local testing when you don't +# care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal +ARG EXTENSIONS=all + ######################################################################################### # # Layer "build-deps" @@ -18,8 +100,10 @@ ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] +# By default, /bin/sh used in debian images will treat '\n' as eol, +# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN case $DEBIAN_VERSION in \ @@ -122,17 +206,9 @@ ENV PATH="/usr/local/pgsql/bin:$PATH" # Build PostGIS from the upstream PostGIS mirror. # ######################################################################################### -FROM pg-build AS postgis-build +FROM build-deps AS postgis-src ARG DEBIAN_VERSION ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ - libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ - libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ - protobuf-c-compiler xsltproc \ - && apt clean && rm -rf /var/lib/apt/lists/* - # Postgis 3.5.0 requires SFCGAL 1.4+ # @@ -141,6 +217,7 @@ RUN apt update && \ # and also we must check backward compatibility with older versions of PostGIS. # # Use new version only for v17 +WORKDIR /ext-src RUN case "${DEBIAN_VERSION}" in \ "bookworm") \ export SFCGAL_VERSION=1.4.1 \ @@ -154,15 +231,12 @@ RUN case "${DEBIAN_VERSION}" in \ echo "unexpected PostgreSQL version" && exit 1 \ ;; \ esac && \ - mkdir -p /sfcgal && \ wget https://gitlab.com/sfcgal/SFCGAL/-/archive/v${SFCGAL_VERSION}/SFCGAL-v${SFCGAL_VERSION}.tar.gz -O SFCGAL.tar.gz && \ echo "${SFCGAL_CHECKSUM} SFCGAL.tar.gz" | sha256sum --check && \ - mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ - cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ - ninja clean && cp -R /sfcgal/* / + mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . 
# Postgis 3.5.0 supports v17 +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export POSTGIS_VERSION=3.5.0 \ @@ -178,8 +252,27 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ - mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ - ./autogen.sh && \ + mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . + +# This is reused for pgrouting +FROM pg-build AS postgis-build-deps +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ + libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ + protobuf-c-compiler xsltproc \ + && apt clean && rm -rf /var/lib/apt/lists/* + +FROM postgis-build-deps AS postgis-build +COPY --from=postgis-src /ext-src/ /ext-src/ +WORKDIR /ext-src/sfcgal-src +RUN cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ + ninja clean && cp -R /sfcgal/* / + +WORKDIR /ext-src/postgis-src +RUN ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -202,12 +295,23 @@ RUN case "${PG_VERSION}" in \ cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis +######################################################################################### +# +# Layer "pgrouting-build" +# Build pgrouting. Note: This depends on the postgis-build-deps layer built above +# +######################################################################################### + # Uses versioned libraries, i.e. libpgrouting-3.4 # and may introduce function signature changes between releases # i.e. release 3.5.0 has new signature for pg_dijkstra function # # Use new version only for v17 # last release v3.6.2 - Mar 30, 2024 +FROM build-deps AS pgrouting-src +ARG DEBIAN_VERSION +ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export PGROUTING_VERSION=3.6.2 \ @@ -223,8 +327,12 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/pgRouting/pgrouting/archive/v${PGROUTING_VERSION}.tar.gz -O pgrouting.tar.gz && \ echo "${PGROUTING_CHECKSUM} pgrouting.tar.gz" | sha256sum --check && \ - mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ - mkdir build && cd build && \ + mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . + +FROM postgis-build-deps AS pgrouting-build +COPY --from=pgrouting-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgrouting-src +RUN mkdir build && cd build && \ cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. 
&& \ ninja -j $(getconf _NPROCESSORS_ONLN) && \ ninja -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -236,15 +344,11 @@ RUN case "${PG_VERSION}" in \ # Build plv8 # ######################################################################################### -FROM pg-build AS plv8-build +FROM build-deps AS plv8-src ARG PG_VERSION +WORKDIR /ext-src -COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch - -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - ninja-build python3-dev libncurses5 binutils clang \ - && apt clean && rm -rf /var/lib/apt/lists/* +COPY compute/patches/plv8-3.1.10.patch . # plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -268,9 +372,20 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /plv8-3.1.10.patch; fi && \ + if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + +FROM pg-build AS plv8-build +ARG PG_VERSION +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build python3-dev libncurses5 binutils clang \ + && apt clean && rm -rf /var/lib/apt/lists/* + +COPY --from=plv8-src /ext-src/ /ext-src/ +WORKDIR /ext-src/plv8-src +RUN \ # generate and copy upgrade scripts - mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ + make generate_upgrades && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ @@ -298,16 +413,28 @@ RUN case "${PG_VERSION}" in \ # Build h3_pg # ######################################################################################### -FROM pg-build AS h3-pg-build +FROM build-deps AS h3-pg-src ARG PG_VERSION +WORKDIR /ext-src # not version-specific # last release v4.1.0 - Jan 18, 2023 RUN mkdir -p /h3/usr/ && \ wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ - mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ - mkdir build && cd build && \ + mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . + +# not version-specific +# last release v4.1.3 - Jul 26, 2023 +WORKDIR /ext-src +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ + echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ + mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . + +FROM pg-build AS h3-pg-build +COPY --from=h3-pg-src /ext-src/ /ext-src/ +WORKDIR /ext-src/h3-src +RUN mkdir build && cd build && \ cmake .. -GNinja -DBUILD_BENCHMARKS=0 -DCMAKE_BUILD_TYPE=Release \ -DBUILD_FUZZERS=0 -DBUILD_FILTERS=0 -DBUILD_GENERATORS=0 -DBUILD_TESTING=0 \ && ninja -j $(getconf _NPROCESSORS_ONLN) && \ @@ -315,11 +442,8 @@ RUN mkdir -p /h3/usr/ && \ cp -R /h3/usr / && \ rm -rf build -# not version-specific -# last release v4.1.3 - Jul 26, 2023 -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ - echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ - mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ +WORKDIR /ext-src/h3-pg-src +RUN ls -l && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ @@ -327,19 +451,24 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 ######################################################################################### # -# Layer "unit-pg-build" +# Layer "postgresql-unit-build" # compile unit extension # ######################################################################################### -FROM pg-build AS unit-pg-build +FROM build-deps AS postgresql-unit-src ARG PG_VERSION # not version-specific # last release 7.9 - Sep 15, 2024 +WORKDIR /ext-src RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \ echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \ - mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . + +FROM pg-build AS postgresql-unit-build +COPY --from=postgresql-unit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/postgresql-unit-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. @@ -350,14 +479,15 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz - ######################################################################################### # -# Layer "vector-pg-build" +# Layer "pgvector-build" # compile pgvector extension # ######################################################################################### -FROM pg-build AS vector-pg-build +FROM build-deps AS pgvector-src ARG PG_VERSION -COPY compute/patches/pgvector.patch /pgvector.patch +WORKDIR /ext-src +COPY compute/patches/pgvector.patch . # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. @@ -370,74 +500,94 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \ echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \ - patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ + patch -p1 < /ext-src/pgvector.patch + +FROM pg-build AS pgvector-build +COPY --from=pgvector-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgvector-src +RUN make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### # -# Layer "pgjwt-pg-build" +# Layer "pgjwt-build" # compile pgjwt extension # ######################################################################################### -FROM pg-build AS pgjwt-pg-build +FROM build-deps AS pgjwt-src ARG PG_VERSION # not version-specific # doesn't use releases, last commit f3d82fd - Mar 2, 2023 +WORKDIR /ext-src RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ - mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgjwt-build +COPY --from=pgjwt-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgjwt-src +RUN make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control ######################################################################################### # -# Layer "hypopg-pg-build" +# Layer "hypopg-build" # compile hypopg extension # ######################################################################################### -FROM pg-build AS hypopg-pg-build +FROM build-deps AS hypopg-src ARG PG_VERSION # HypoPG 1.4.1 supports v17 # last release 1.4.1 - Apr 28, 2024 +WORKDIR /ext-src RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ - mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . 
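The pgvector stage above builds with OPTFLAGS="" because pgvector's Makefile otherwise adds -march=native, tying the shared library to the CPU of the build machine rather than the machines the image actually runs on. A rough portability spot-check is sketched below; it assumes objdump is available in the build container and uses an illustrative AVX-512 pattern list, neither of which is part of this Dockerfile.

```bash
# Hedged sanity check: AVX-512 mnemonics or zmm registers in the disassembly
# would suggest a host-tuned build that could SIGILL on older runner CPUs.
if objdump -d /usr/local/pgsql/lib/vector.so | grep -Eq 'zmm|vpternlog|vpermi2'; then
    echo "warning: vector.so appears to use AVX-512 instructions"
else
    echo "vector.so looks portable (no AVX-512 instructions found)"
fi
```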
+ +FROM pg-build AS hypopg-build +COPY --from=hypopg-src /ext-src/ /ext-src/ +WORKDIR /ext-src/hypopg-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control ######################################################################################### # -# Layer "pg-hashids-pg-build" +# Layer "pg_hashids-build" # compile pg_hashids extension # ######################################################################################### -FROM pg-build AS pg-hashids-pg-build +FROM build-deps AS pg_hashids-src ARG PG_VERSION # not version-specific # last release v1.2.1 -Jan 12, 2018 +WORKDIR /ext-src RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ - mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_hashids-build +COPY --from=pg_hashids-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_hashids-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control ######################################################################################### # -# Layer "rum-pg-build" +# Layer "rum-build" # compile rum extension # ######################################################################################### -FROM pg-build AS rum-pg-build +FROM build-deps AS rum-src ARG PG_VERSION -COPY compute/patches/rum.patch /rum.patch +WORKDIR /ext-src +COPY compute/patches/rum.patch . # supports v17 since https://github.com/postgrespro/rum/commit/cb1edffc57736cd2a4455f8d0feab0d69928da25 # doesn't use releases since 1.3.13 - Sep 19, 2022 @@ -445,110 +595,140 @@ COPY compute/patches/rum.patch /rum.patch RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0feab0d69928da25.tar.gz -O rum.tar.gz && \ echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ - patch -p1 < /rum.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + patch -p1 < /ext-src/rum.patch + +FROM pg-build AS rum-build +COPY --from=rum-src /ext-src/ /ext-src/ +WORKDIR /ext-src/rum-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control ######################################################################################### # -# Layer "pgtap-pg-build" +# Layer "pgtap-build" # compile pgTAP extension # ######################################################################################### -FROM pg-build AS pgtap-pg-build +FROM build-deps AS pgtap-src ARG PG_VERSION # pgtap 1.3.3 supports v17 # last release v1.3.3 - Apr 8, 2024 +WORKDIR /ext-src RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ - mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgtap-build +COPY --from=pgtap-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgtap-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control ######################################################################################### # -# Layer "ip4r-pg-build" +# Layer "ip4r-build" # compile ip4r extension # ######################################################################################### -FROM pg-build AS ip4r-pg-build +FROM build-deps AS ip4r-src ARG PG_VERSION # not version-specific # last release v2.4.2 - Jul 29, 2023 +WORKDIR /ext-src RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ - mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . + +FROM pg-build AS ip4r-build +COPY --from=ip4r-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ip4r-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control ######################################################################################### # -# Layer "prefix-pg-build" +# Layer "prefix-build" # compile Prefix extension # ######################################################################################### -FROM pg-build AS prefix-pg-build +FROM build-deps AS prefix-src ARG PG_VERSION # not version-specific # last release v1.2.10 - Jul 5, 2023 +WORKDIR /ext-src RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ - mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . 
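The hypopg, pg_hashids, rum, pgtap, ip4r, and prefix stages around this point all repeat the {extension}-src / {extension}-build split described in the header comment. For reference, wiring in a new PGXS-style extension would typically look like the sketch below; "myext", its URL, and its checksum are placeholders, not parts of this file.

```dockerfile
# Illustrative only: "myext" is a hypothetical extension following the
# {extension}-src / {extension}-build convention used throughout this Dockerfile.
FROM build-deps AS myext-src
ARG PG_VERSION
WORKDIR /ext-src
RUN wget https://example.com/myext-1.0.tar.gz -O myext.tar.gz && \
    echo "<sha256-of-tarball> myext.tar.gz" | sha256sum --check && \
    mkdir myext-src && cd myext-src && tar xzf ../myext.tar.gz --strip-components=1 -C .

FROM pg-build AS myext-build
COPY --from=myext-src /ext-src/ /ext-src/
WORKDIR /ext-src/myext-src
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/myext.control
```

A matching COPY --from=myext-build line in the extensions-all stage further down would then pull the installed files into the bundle.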
+ +FROM pg-build AS prefix-build +COPY --from=prefix-src /ext-src/ /ext-src/ +WORKDIR /ext-src/prefix-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control ######################################################################################### # -# Layer "hll-pg-build" +# Layer "hll-build" # compile hll extension # ######################################################################################### -FROM pg-build AS hll-pg-build +FROM build-deps AS hll-src ARG PG_VERSION # not version-specific # last release v2.18 - Aug 29, 2023 +WORKDIR /ext-src RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ - mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . + +FROM pg-build AS hll-build +COPY --from=hll-src /ext-src/ /ext-src/ +WORKDIR /ext-src/hll-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control ######################################################################################### # -# Layer "plpgsql-check-pg-build" +# Layer "plpgsql_check-build" # compile plpgsql_check extension # ######################################################################################### -FROM pg-build AS plpgsql-check-pg-build +FROM build-deps AS plpgsql_check-src ARG PG_VERSION # plpgsql_check v2.7.11 supports v17 # last release v2.7.11 - Sep 16, 2024 +WORKDIR /ext-src RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ - mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . + +FROM pg-build AS plpgsql_check-build +COPY --from=plpgsql_check-src /ext-src/ /ext-src/ +WORKDIR /ext-src/plpgsql_check-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control ######################################################################################### # -# Layer "timescaledb-pg-build" +# Layer "timescaledb-build" # compile timescaledb extension # ######################################################################################### -FROM pg-build AS timescaledb-pg-build +FROM build-deps AS timescaledb-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ @@ -565,8 +745,12 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ - mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ - ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ + mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . + +FROM pg-build AS timescaledb-build +COPY --from=timescaledb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/timescaledb-src +RUN ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ @@ -574,14 +758,15 @@ RUN case "${PG_VERSION}" in \ ######################################################################################### # -# Layer "pg-hint-plan-pg-build" +# Layer "pg_hint_plan-build" # compile pg_hint_plan extension # ######################################################################################### -FROM pg-build AS pg-hint-plan-pg-build +FROM build-deps AS pg_hint_plan-src ARG PG_VERSION # version-specific, has separate releases for each version +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ @@ -605,50 +790,51 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ - mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_hint_plan-build +COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_hint_plan-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control ######################################################################################### # -# Layer "pg-cron-pg-build" +# Layer "pg_cron-build" # compile pg_cron extension # ######################################################################################### -FROM pg-build AS pg-cron-pg-build +FROM build-deps AS pg_cron-src ARG PG_VERSION # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. +WORKDIR /ext-src +COPY compute/patches/pg_cron.patch . RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + patch < /ext-src/pg_cron.patch + +FROM pg-build AS pg_cron-build +COPY --from=pg_cron-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_cron-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control ######################################################################################### # -# Layer "rdkit-pg-build" +# Layer "rdkit-build" # compile rdkit extension # ######################################################################################### -FROM pg-build AS rdkit-pg-build +FROM build-deps AS rdkit-src ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - libboost-iostreams1.74-dev \ - libboost-regex1.74-dev \ - libboost-serialization1.74-dev \ - libboost-system1.74-dev \ - libeigen3-dev \ - libboost-all-dev \ - && apt clean && rm -rf /var/lib/apt/lists/* - # rdkit Release_2024_09_1 supports v17 # last release Release_2024_09_1 - Sep 27, 2024 # @@ -656,12 +842,7 @@ RUN apt update && \ # because Release_2024_09_1 has some backward incompatible changes # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 -# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find -# pg_config. For some reason the rdkit cmake script doesn't work with just that, -# however. By also adding /usr/local/pgsql, it works, which is weird because there -# are no executables in that directory. -ENV PATH="/usr/local/pgsql:$PATH" - +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ @@ -677,8 +858,28 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ - mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ - cmake \ + mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . + +FROM pg-build AS rdkit-build +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + libboost-iostreams1.74-dev \ + libboost-regex1.74-dev \ + libboost-serialization1.74-dev \ + libboost-system1.74-dev \ + libeigen3-dev \ + libboost-all-dev \ + && apt clean && rm -rf /var/lib/apt/lists/* + +COPY --from=rdkit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/rdkit-src + +# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find +# pg_config. For some reason the rdkit cmake script doesn't work with just that, +# however. By also adding /usr/local/pgsql, it works, which is weird because there +# are no executables in that directory. 
+ENV PATH="/usr/local/pgsql:$PATH" +RUN cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ -D RDK_BUILD_INCHI_SUPPORT=ON \ -D RDK_BUILD_AVALON_SUPPORT=ON \ @@ -710,47 +911,57 @@ RUN case "${PG_VERSION}" in \ ######################################################################################### # -# Layer "pg-uuidv7-pg-build" +# Layer "pg_uuidv7-build" # compile pg_uuidv7 extension # ######################################################################################### -FROM pg-build AS pg-uuidv7-pg-build +FROM build-deps AS pg_uuidv7-src ARG PG_VERSION # not version-specific # last release v1.6.0 - Oct 9, 2024 +WORKDIR /ext-src RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ - mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_uuidv7-build +COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_uuidv7-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control ######################################################################################### # -# Layer "pg-roaringbitmap-pg-build" +# Layer "pg_roaringbitmap-build" # compile pg_roaringbitmap extension # ######################################################################################### -FROM pg-build AS pg-roaringbitmap-pg-build +FROM build-deps AS pg_roaringbitmap-src ARG PG_VERSION # not version-specific # last release v0.5.4 - Jun 28, 2022 +WORKDIR /ext-src RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ - mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . 
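Most of the stages here append trusted = true to the extension's control file so that, once the files land in the final image, the extension can be created by a non-superuser database owner. A quick way to verify and exercise the flag against a built image is sketched below; the connection string is a placeholder.

```bash
# Confirm the build stage appended the flag (path matches the install prefix
# used throughout this Dockerfile).
grep 'trusted' /usr/local/pgsql/share/extension/roaringbitmap.control

# With trusted = true, a non-superuser owner of the database can install it;
# "$DSN" is a placeholder connection string.
psql "$DSN" -c 'CREATE EXTENSION IF NOT EXISTS roaringbitmap;'
```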
+ +FROM pg-build AS pg_roaringbitmap-build +COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_roaringbitmap-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control ######################################################################################### # -# Layer "pg-semver-pg-build" +# Layer "pg_semver-build" # compile pg_semver extension # ######################################################################################### -FROM pg-build AS pg-semver-pg-build +FROM build-deps AS pg_semver-src ARG PG_VERSION # Release 0.40.0 breaks backward compatibility with previous versions @@ -758,6 +969,7 @@ ARG PG_VERSION # Use new version only for v17 # # last release v0.40.0 - Jul 22, 2024 +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ @@ -773,22 +985,27 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/theory/pg-semver/archive/refs/tags/v${SEMVER_VERSION}.tar.gz -O pg_semver.tar.gz && \ echo "${SEMVER_CHECKSUM} pg_semver.tar.gz" | sha256sum --check && \ - mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_semver-build +COPY --from=pg_semver-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_semver-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control ######################################################################################### # -# Layer "pg-embedding-pg-build" +# Layer "pg_embedding-build" # compile pg_embedding extension # ######################################################################################### -FROM pg-build AS pg-embedding-pg-build +FROM build-deps AS pg_embedding-src +ARG PG_VERSION # This is our extension, support stopped in favor of pgvector # TODO: deprecate it -ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -799,37 +1016,52 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install + mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_embedding-build +COPY --from=pg_embedding-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d pg_embedding-src ]; then \ + cd pg_embedding-src && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install; \ + fi ######################################################################################### # -# Layer "pg-anon-pg-build" +# Layer "pg_anon-build" # compile anon extension # ######################################################################################### -FROM pg-build AS pg-anon-pg-build +FROM build-deps AS pg_anon-src ARG PG_VERSION # This is an experimental extension, never got to real production. 
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. +WORKDIR /ext-src RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_anon-build +COPY --from=pg_anon-src /ext-src/ /ext-src/ +WORKDIR /ext-src +RUN if [ -d pg_anon-src ]; then \ + cd pg_anon-src && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \ + fi ######################################################################################### # -# Layer "rust extensions" -# This layer is used to build `pgrx` deps +# Layer "pg build with nonroot user and cargo installed" +# This layer is base and common for layers with `pgrx` # ######################################################################################### -FROM pg-build AS rust-extensions-build +FROM pg-build AS pg-build-nonroot-with-cargo ARG PG_VERSION RUN apt update && \ @@ -842,13 +1074,24 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot +# See comment on the top of the file regading `echo` and `\n` RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ - rm rustup-init && \ - case "${PG_VERSION}" in \ + rm rustup-init + +######################################################################################### +# +# Layer "rust extensions" +# This layer is used to build `pgrx` deps +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build +ARG PG_VERSION + +RUN case "${PG_VERSION}" in \ 'v17') \ echo 'v17 is not supported yet by pgrx. 
Quit' && exit 0;; \ esac && \ @@ -867,76 +1110,67 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM pg-build AS rust-extensions-build-pgrx12 +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx12 ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ - apt clean && rm -rf /var/lib/apt/lists/* && \ - useradd -ms /bin/bash nonroot -b /home - -ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:$PATH" -USER nonroot -WORKDIR /home/nonroot - -RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc - -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ - chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ - rm rustup-init && \ - cargo install --locked --version 0.12.9 cargo-pgrx && \ +RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root ######################################################################################### # -# Layers "pg-onnx-build" and "pgrag-pg-build" +# Layers "pg-onnx-build" and "pgrag-build" # Compile "pgrag" extensions # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-onnx-build +FROM build-deps AS pgrag-src +ARG PG_VERSION + +WORKDIR /ext-src +RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ + mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + echo "#nothing to test here" > neon-test.sh + +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ + echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ + mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build-pgrx12 AS pgrag-build +COPY --from=pgrag-src /ext-src/ /ext-src/ +# Install build-time dependencies # cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise +WORKDIR /ext-src/onnxruntime-src RUN apt update && apt install --no-install-recommends --no-install-suggests -y \ - python3 python3-pip python3-venv && \ + python3 python3-pip python3-venv protobuf-compiler && \ apt clean && rm -rf /var/lib/apt/lists/* && \ python3 -m venv venv && \ . venv/bin/activate && \ - python3 -m pip install cmake==3.30.5 && \ - wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ - mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + python3 -m pip install cmake==3.30.5 + +RUN . 
venv/bin/activate && \ ./build.sh --config Release --parallel --cmake_generator Ninja \ --skip_submodule_sync --skip_tests --allow_running_as_root - -FROM pg-onnx-build AS pgrag-pg-build - -RUN apt update && apt install --no-install-recommends --no-install-suggests -y protobuf-compiler \ - && apt clean && rm -rf /var/lib/apt/lists/* && \ - wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ - echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ - mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \ - \ - cd exts/rag && \ +WORKDIR /ext-src/pgrag-src +RUN cd exts/rag && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ - \ - cd ../rag_bge_small_en_v15 && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control + +RUN cd exts/rag_bge_small_en_v15 && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ - \ - cd ../rag_jina_reranker_v1_tiny_en && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control + +RUN cd exts/rag_jina_reranker_v1_tiny_en && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control @@ -944,17 +1178,23 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p ######################################################################################### # -# Layer "pg-jsonschema-pg-build" +# Layer "pg_jsonschema-build" # Compile "pg_jsonschema" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build +FROM build-deps AS pg_jsonschema-src ARG PG_VERSION # last release v0.3.3 - Oct 16, 2024 +WORKDIR /ext-src RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \ echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \ - mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ + mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build-pgrx12 AS pg_jsonschema-build +COPY --from=pg_jsonschema-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_jsonschema-src +RUN \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions # against postgres forks that decided to change their ABI name (like us). 
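The sed edits in these pgrx-based stages all do the same thing: pin the pgrx crates to the version matching the cargo-pgrx installed in rust-extensions-build-pgrx12, and enable the unsafe-postgres feature mentioned in the comment above so the crate accepts Neon's patched Postgres. A small pre-build check along those lines is sketched below; it is a suggested convention, not something this Dockerfile runs.

```bash
# Hedged sketch: verify the toolchain and the rewritten Cargo.toml agree before
# running `cargo pgrx install --release` in a new extension stage.
cargo install --list | grep -A1 cargo-pgrx   # expect cargo-pgrx v0.12.9 here
grep -n 'pgrx' Cargo.toml                    # expect version "=0.12.9" with the
                                             # "unsafe-postgres" feature enabled
```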
@@ -967,55 +1207,69 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar. ######################################################################################### # -# Layer "pg-graphql-pg-build" +# Layer "pg_graphql-build" # Compile "pg_graphql" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build +FROM build-deps AS pg_graphql-src ARG PG_VERSION # last release v1.5.9 - Oct 16, 2024 +WORKDIR /ext-src +COPY compute/patches/pg_graphql.patch . RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \ - cargo pgrx install --release && \ + patch -p1 < /ext-src/pg_graphql.patch + + +FROM rust-extensions-build-pgrx12 AS pg_graphql-build +COPY --from=pg_graphql-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_graphql-src +RUN cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control ######################################################################################### # -# Layer "pg-tiktoken-build" +# Layer "pg_tiktoken-build" # Compile "pg_tiktoken" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-tiktoken-pg-build +FROM build-deps AS pg_tiktoken-src ARG PG_VERSION # doesn't use releases # 9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7 - on Oct 29, 2024 +WORKDIR /ext-src RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \ echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \ - sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ - cargo pgrx install --release && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pg_tiktoken-build +COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_tiktoken-src +RUN cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control ######################################################################################### # -# Layer "pg-pgx-ulid-build" +# Layer "pgx_ulid-build" # Compile "pgx_ulid" extension for v16 and below # ######################################################################################### -FROM rust-extensions-build AS pg-pgx-ulid-build +FROM build-deps AS pgx_ulid-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15" | "v16") \ ;; \ @@ -1026,20 +1280,28 @@ RUN case "${PG_VERSION}" in \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml + +FROM rust-extensions-build AS pgx_ulid-build +COPY --from=pgx_ulid-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d pgx_ulid-src ]; then \ + cd pgx_ulid-src && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control; \ + fi ######################################################################################### # -# Layer "pg-pgx-ulid-pgrx12-build" +# Layer "pgx_ulid-pgrx12-build" # Compile "pgx_ulid" extension for v17 and up # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build +FROM build-deps AS pgx_ulid-pgrx12-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ ;; \ @@ -1050,23 +1312,32 @@ RUN case "${PG_VERSION}" in \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \ echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control + sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pgx_ulid-pgrx12-build +ARG PG_VERSION +WORKDIR /ext-src +COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ +RUN if [ -d pgx_ulid-src ]; then \ + cd pgx_ulid-src && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control; \ + fi ######################################################################################### # -# Layer "pg-session-jwt-build" +# Layer "pg_session_jwt-build" # Compile "pg_session_jwt" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-session-jwt-build +FROM build-deps AS pg_session_jwt-src ARG PG_VERSION # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs +WORKDIR /ext-src RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ @@ -1074,8 +1345,12 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0 sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \ sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ - sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ - cargo pgrx install --release + sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pg_session_jwt-build +COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_session_jwt-src +RUN cargo pgrx install --release ######################################################################################### # @@ -1084,15 +1359,20 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0 # ######################################################################################### -FROM pg-build AS wal2json-pg-build +FROM build-deps AS wal2json-src ARG PG_VERSION # wal2json wal2json_2_6 supports v17 # last release wal2json_2_6 - Apr 25, 2024 +WORKDIR /ext-src RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . 
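Unlike most stages in this file, wal2json installs a logical decoding output plugin rather than a CREATE EXTENSION-style extension, which is why the build stage that follows runs a plain make install and never touches a control file. A minimal usage sketch against a running compute (assuming wal_level=logical; "$DSN" is a placeholder connection string) is:

```bash
# Create a logical replication slot that emits JSON via the wal2json plugin,
# peek at pending changes, then clean up the slot.
psql "$DSN" -c "SELECT pg_create_logical_replication_slot('wal2json_demo', 'wal2json');"
psql "$DSN" -c "SELECT data FROM pg_logical_slot_peek_changes('wal2json_demo', NULL, NULL);"
psql "$DSN" -c "SELECT pg_drop_replication_slot('wal2json_demo');"
```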
+ +FROM pg-build AS wal2json-build +COPY --from=wal2json-src /ext-src/ /ext-src/ +WORKDIR /ext-src/wal2json-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### @@ -1101,15 +1381,20 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar. # compile pg_ivm extension # ######################################################################################### -FROM pg-build AS pg-ivm-build +FROM build-deps AS pg_ivm-src ARG PG_VERSION # pg_ivm v1.9 supports v17 # last release v1.9 - Jul 31 +WORKDIR /ext-src RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ - mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_ivm-build +COPY --from=pg_ivm-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_ivm-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control @@ -1119,15 +1404,20 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv # compile pg_partman extension # ######################################################################################### -FROM pg-build AS pg-partman-build +FROM build-deps AS pg_partman-src ARG PG_VERSION # should support v17 https://github.com/pgpartman/pg_partman/discussions/693 # last release 5.1.0 Apr 2, 2024 +WORKDIR /ext-src RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ - mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_partman-build +COPY --from=pg_partman-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_partman-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control @@ -1137,13 +1427,19 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz # compile pg_mooncake extension # ######################################################################################### -FROM rust-extensions-build AS pg-mooncake-build +FROM build-deps AS pg_mooncake-src ARG PG_VERSION - -RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \ - echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \ +WORKDIR /ext-src +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \ + echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . 
&& \ - make release -j $(getconf _NPROCESSORS_ONLN) && \ + echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ + chmod a+x neon-test.sh + +FROM rust-extensions-build AS pg_mooncake-build +COPY --from=pg_mooncake-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_mooncake-src +RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control @@ -1154,81 +1450,122 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/p # ######################################################################################### -FROM pg-build AS pg-repack-build +FROM build-deps AS pg_repack-src ARG PG_VERSION - +WORKDIR /ext-src RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ - mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build AS pg_repack-build +COPY --from=pg_repack-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_repack-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### # -# Layer "neon-pg-ext-build" +# Layer "neon-ext-build" # compile neon extensions # ######################################################################################### -FROM build-deps AS neon-pg-ext-build +FROM pg-build AS neon-ext-build ARG PG_VERSION -# Public extensions -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=postgis-build /sfcgal/* / -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /h3/usr / -COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=ip4r-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY 
--from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql -COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ - RUN make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_test_utils \ -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ -s install +######################################################################################### +# +# Layer "extensions-none" +# +######################################################################################### +FROM build-deps AS extensions-none + +RUN mkdir /usr/local/pgsql + +######################################################################################### +# +# Layer "extensions-minimal" +# +# This subset of extensions includes the extensions that we have in +# shared_preload_libraries by default. +# +######################################################################################### +FROM build-deps AS extensions-minimal + +COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ + +######################################################################################### +# +# Layer "extensions-all" +# Bundle together all the extensions +# +######################################################################################### +FROM build-deps AS extensions-all + +# Public extensions +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=postgis-build /sfcgal/* / +COPY --from=pgrouting-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / +COPY --from=postgresql-unit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgvector-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgjwt-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=ip4r-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY 
--from=prefix-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hll-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plpgsql_check-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_hint_plan-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgx_ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgx_ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_session_jwt-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rdkit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ + +######################################################################################### +# +# Layer "neon-pg-ext-build" +# Includes Postgres and all the extensions chosen by EXTENSIONS arg. +# +######################################################################################### +FROM extensions-${EXTENSIONS} AS neon-pg-ext-build + ######################################################################################### # # Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries @@ -1283,7 +1620,9 @@ FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters ARG TARGETARCH # Keep sql_exporter version same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py -RUN if [ "$TARGETARCH" = "amd64" ]; then\ +# See comment on the top of the file regading `echo`, `-e` and `\n` +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \ + if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ @@ -1307,7 +1646,8 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then\ # Clean up postgres folder before inclusion # ######################################################################################### -FROM neon-pg-ext-build AS postgres-cleanup-layer +FROM neon-ext-build AS postgres-cleanup-layer + COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) @@ -1337,65 +1677,59 @@ RUN make PG_VERSION="${PG_VERSION}" -C compute ######################################################################################### # -# Layer neon-pg-ext-test +# Layer extension-tests # ######################################################################################### -FROM neon-pg-ext-build AS neon-pg-ext-test +FROM pg-build AS extension-tests ARG PG_VERSION RUN mkdir /ext-src -#COPY --from=postgis-build /postgis.tar.gz /ext-src/ -#COPY --from=postgis-build /sfcgal/* /usr -COPY --from=plv8-build 
/plv8.tar.gz /ext-src/ -#COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ -COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ -COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ -COPY --from=vector-pg-build /pgvector.patch /ext-src/ -COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src -#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src -COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src -COPY compute/patches/pg_graphql.patch /ext-src -#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src -COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src -COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src -COPY --from=rum-pg-build /rum.tar.gz /ext-src -COPY compute/patches/rum.patch /ext-src -#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src -COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src -COPY --from=prefix-pg-build /prefix.tar.gz /ext-src -COPY --from=hll-pg-build /hll.tar.gz /ext-src -COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src -#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src -COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src +COPY --from=pg-build /postgres /postgres +#COPY --from=postgis-src /ext-src/ /ext-src/ +COPY --from=plv8-src /ext-src/ /ext-src/ +#COPY --from=h3-pg-src /ext-src/ /ext-src/ +COPY --from=postgresql-unit-src /ext-src/ /ext-src/ +COPY --from=pgvector-src /ext-src/ /ext-src/ +COPY --from=pgjwt-src /ext-src/ /ext-src/ +#COPY --from=pgrag-src /ext-src/ /ext-src/ +#COPY --from=pg_jsonschema-src /ext-src/ /ext-src/ +COPY --from=pg_graphql-src /ext-src/ /ext-src/ +#COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ +COPY --from=hypopg-src /ext-src/ /ext-src/ +COPY --from=pg_hashids-src /ext-src/ /ext-src/ +COPY --from=rum-src /ext-src/ /ext-src/ +#COPY --from=pgtap-src /ext-src/ /ext-src/ +COPY --from=ip4r-src /ext-src/ /ext-src/ +COPY --from=prefix-src /ext-src/ /ext-src/ +COPY --from=hll-src /ext-src/ /ext-src/ +COPY --from=plpgsql_check-src /ext-src/ /ext-src/ +#COPY --from=timescaledb-src /ext-src/ /ext-src/ +COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src -COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src -COPY compute/patches/pg_cron.patch /ext-src -#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src -#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src -COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src -COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src -COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src -#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src -#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src -COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src -COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN cd /ext-src/ && for f in *.tar.gz; \ - do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ - rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ - || exit 1; rm -f $f; done -RUN cd /ext-src/rum-src && patch -p1 <../rum.patch -RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch +COPY --from=pg_cron-src /ext-src/ /ext-src/ +#COPY --from=pgx_ulid-src /ext-src/ /ext-src/ +#COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ +#COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ +#COPY 
--from=rdkit-src /ext-src/ /ext-src/ +COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ +COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ +COPY --from=pg_semver-src /ext-src/ /ext-src/ +#COPY --from=pg_embedding-src /ext-src/ /ext-src/ +#COPY --from=wal2json-src /ext-src/ /ext-src/ +COPY --from=pg_ivm-src /ext-src/ /ext-src/ +COPY --from=pg_partman-src /ext-src/ /ext-src/ +#COPY --from=pg_mooncake-src /ext-src/ /ext-src/ +#COPY --from=pg_repack-src /ext-src/ /ext-src/ + COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN patch -p1 :stats_bulkreads_before; +- ?column? +----------- +- t +-(1 row) +- + CREATE ROLE regress_heaptest_role; + -- verify permissions are checked (error due to function not callable) + SET ROLE regress_heaptest_role; +@@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" + DETAIL: This operation is not supported for foreign tables. + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql +index 1745bae..3b429c3 100644 +--- a/contrib/amcheck/sql/check_heap.sql ++++ b/contrib/amcheck/sql/check_heap.sql +@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -58,9 +55,6 @@ COMMIT; + -- ALTER TABLE ... SET TABLESPACE ... + -- causing an additional bulkread, which should be reflected in pg_stat_io. + SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; + + CREATE ROLE regress_heaptest_role; + +@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', + + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out +index 33be13a..70a406c 100644 +--- a/contrib/citext/expected/create_index_acl.out ++++ b/contrib/citext/expected/create_index_acl.out +@@ -5,9 +5,6 @@ + -- owner having as few applicable privileges as possible. (The privileges.sql + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) 
+-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -66,11 +61,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + ERROR: 42704 + \set VERBOSITY default + ROLLBACK; +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql +index 10b5225..ae442e1 100644 +--- a/contrib/citext/sql/create_index_acl.sql ++++ b/contrib/citext/sql/create_index_acl.sql +@@ -6,10 +6,6 @@ + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) + +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; +- + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. 
+ INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -68,11 +62,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + \set VERBOSITY default + ROLLBACK; + +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out +index 72304e0..ebe131b 100644 +--- a/contrib/file_fdw/expected/file_fdw.out ++++ b/contrib/file_fdw/expected/file_fdw.out +@@ -4,6 +4,7 @@ + -- directory paths are passed to us in environment variables + \getenv abs_srcdir PG_ABS_SRCDIR + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql +index f0548e1..848a08c 100644 +--- a/contrib/file_fdw/sql/file_fdw.sql ++++ b/contrib/file_fdw/sql/file_fdw.sql +@@ -6,6 +6,7 @@ + \getenv abs_srcdir PG_ABS_SRCDIR + + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out +index d1adbab..38b52ac 100644 +--- a/contrib/pageinspect/expected/gist.out ++++ b/contrib/pageinspect/expected/gist.out +@@ -10,25 +10,6 @@ BEGIN; + CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM + generate_series(1,1000) i; + CREATE INDEX test_gist_idx ON test_gist USING gist (p); +--- Page 0 is the root, the rest are leaf pages +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); +- lsn | nsn | rightlink | flags +------+-----+------------+------- +- 0/1 | 0/0 | 4294967295 | {} +-(1 row) +- +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); +- lsn | nsn | rightlink | flags +------+-----+------------+-------- +- 0/1 | 0/0 | 4294967295 | {leaf} +-(1 row) +- +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); +- lsn | nsn | rightlink | flags +------+-----+-----------+-------- +- 0/1 | 0/0 | 1 | {leaf} +-(1 row) +- + COMMIT; + SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); + itemoffset | ctid | itemlen | dead | keys +diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql +index d263542..607992f 100644 +--- a/contrib/pageinspect/sql/gist.sql ++++ b/contrib/pageinspect/sql/gist.sql +@@ -12,11 +12,6 @@ CREATE TABLE test_gist AS 
SELECT point(i,i) p, i::text t FROM + generate_series(1,1000) i; + CREATE INDEX test_gist_idx ON test_gist USING gist (p); + +--- Page 0 is the root, the rest are leaf pages +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); +- + COMMIT; + + SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); diff --git a/compute/patches/contrib_pg17.patch b/compute/patches/contrib_pg17.patch new file mode 100644 index 000000000000..0d6c1203b0cc --- /dev/null +++ b/compute/patches/contrib_pg17.patch @@ -0,0 +1,196 @@ +diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out +index 979e5e8..2375b45 100644 +--- a/contrib/amcheck/expected/check_heap.out ++++ b/contrib/amcheck/expected/check_heap.out +@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); + + (1 row) + +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; +- ?column? +----------- +- t +-(1 row) +- + CREATE ROLE regress_heaptest_role; + -- verify permissions are checked (error due to function not callable) + SET ROLE regress_heaptest_role; +@@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" + DETAIL: This operation is not supported for foreign tables. + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql +index 1745bae..3b429c3 100644 +--- a/contrib/amcheck/sql/check_heap.sql ++++ b/contrib/amcheck/sql/check_heap.sql +@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -58,9 +55,6 @@ COMMIT; + -- ALTER TABLE ... SET TABLESPACE ... + -- causing an additional bulkread, which should be reflected in pg_stat_io. 
+ SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; + + CREATE ROLE regress_heaptest_role; + +@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', + + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out +index 33be13a..70a406c 100644 +--- a/contrib/citext/expected/create_index_acl.out ++++ b/contrib/citext/expected/create_index_acl.out +@@ -5,9 +5,6 @@ + -- owner having as few applicable privileges as possible. (The privileges.sql + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -66,11 +61,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + ERROR: 42704 + \set VERBOSITY default + ROLLBACK; +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql +index 10b5225..ae442e1 100644 +--- a/contrib/citext/sql/create_index_acl.sql ++++ b/contrib/citext/sql/create_index_acl.sql +@@ -6,10 +6,6 @@ + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) 
+ +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; +- + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -68,11 +62,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + \set VERBOSITY default + ROLLBACK; + +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out +index 86c148a..81bdb2c 100644 +--- a/contrib/file_fdw/expected/file_fdw.out ++++ b/contrib/file_fdw/expected/file_fdw.out +@@ -4,6 +4,7 @@ + -- directory paths are passed to us in environment variables + \getenv abs_srcdir PG_ABS_SRCDIR + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql +index f0548e1..848a08c 100644 +--- a/contrib/file_fdw/sql/file_fdw.sql ++++ b/contrib/file_fdw/sql/file_fdw.sql +@@ -6,6 +6,7 @@ + \getenv abs_srcdir PG_ABS_SRCDIR + + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b98cf706d343..47fc9cb7fe9d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -34,6 +34,7 @@ //! -r http://pg-ext-s3-gateway \ //! 
```
 
 use std::collections::HashMap;
+use std::ffi::OsString;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
@@ -44,7 +45,7 @@ use std::{thread, time::Duration};
 
 use anyhow::{Context, Result};
 use chrono::Utc;
-use clap::Arg;
+use clap::Parser;
 use compute_tools::disk_quota::set_disk_quota;
 use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
@@ -73,10 +74,75 @@ use utils::failpoint_support;
 // in-case of not-set environment var
 const BUILD_TAG_DEFAULT: &str = "latest";
 
+// Compatibility hack: if the control plane specified any remote-ext-config
+// use the default value for extension storage proxy gateway.
+// Remove this once the control plane is updated to pass the gateway URL
+fn parse_remote_ext_config(arg: &str) -> Result<String> {
+    if arg.starts_with("http") {
+        Ok(arg.trim_end_matches('/').to_string())
+    } else {
+        Ok("http://pg-ext-s3-gateway".to_string())
+    }
+}
+
+#[derive(Parser)]
+#[command(rename_all = "kebab-case")]
+struct Cli {
+    #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
+    pub pgbin: String,
+
+    #[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
+    pub remote_ext_config: Option<String>,
+
+    #[arg(long, default_value_t = 3080)]
+    pub http_port: u16,
+
+    #[arg(short = 'D', long, value_name = "DATADIR")]
+    pub pgdata: String,
+
+    #[arg(short = 'C', long, value_name = "DATABASE_URL")]
+    pub connstr: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(long, default_value = "neon-postgres")]
+    pub cgroup: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(
+        long,
+        default_value = "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor"
+    )]
+    pub filecache_connstr: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(long, default_value = "0.0.0.0:10301")]
+    pub vm_monitor_addr: String,
+
+    #[arg(long, action = clap::ArgAction::SetTrue)]
+    pub resize_swap_on_bind: bool,
+
+    #[arg(long)]
+    pub set_disk_quota_for_fs: Option<String>,
+
+    #[arg(short = 's', long = "spec", group = "spec")]
+    pub spec_json: Option<String>,
+
+    #[arg(short = 'S', long, group = "spec-path")]
+    pub spec_path: Option<OsString>,
+
+    #[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])]
+    pub compute_id: Option<String>,
+
+    #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")]
+    pub control_plane_uri: Option<String>,
+}
+
 fn main() -> Result<()> {
-    let scenario = failpoint_support::init();
+    let cli = Cli::parse();
 
-    let (build_tag, clap_args) = init()?;
+    let build_tag = init()?;
+
+    let scenario = failpoint_support::init();
 
     // enable core dumping for all child processes
     setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -85,13 +151,11 @@ fn main() -> Result<()> {
         // Enter startup tracing context
         let _startup_context_guard = startup_context_from_env();
 
-        let cli_args = process_cli(&clap_args)?;
-
-        let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
+        let cli_spec = try_spec_from_cli(&cli)?;
 
-        let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
+        let compute = wait_spec(build_tag, &cli, cli_spec)?;
 
-        start_postgres(&clap_args, wait_spec_result)?
+        start_postgres(&cli, compute)?
 
         // Startup is finished, exit the startup tracing span
     };
@@ -108,7 +172,7 @@ fn main() -> Result<()> {
     deinit_and_exit(wait_pg_result);
 }
 
-fn init() -> Result<(String, clap::ArgMatches)> {
+fn init() -> Result<String> {
     init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
 
     let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -123,66 +187,7 @@ fn init() -> Result<(String, clap::ArgMatches)> {
         .to_string();
     info!("build_tag: {build_tag}");
 
-    Ok((build_tag, cli().get_matches()))
-}
-
-fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
-    let pgbin_default = "postgres";
-    let pgbin = matches
-        .get_one::<String>("pgbin")
-        .map(|s| s.as_str())
-        .unwrap_or(pgbin_default);
-
-    let ext_remote_storage = matches
-        .get_one::<String>("remote-ext-config")
-        // Compatibility hack: if the control plane specified any remote-ext-config
-        // use the default value for extension storage proxy gateway.
-        // Remove this once the control plane is updated to pass the gateway URL
-        .map(|conf| {
-            if conf.starts_with("http") {
-                conf.trim_end_matches('/')
-            } else {
-                "http://pg-ext-s3-gateway"
-            }
-        });
-
-    let http_port = *matches
-        .get_one::<u16>("http-port")
-        .expect("http-port is required");
-    let pgdata = matches
-        .get_one::<String>("pgdata")
-        .expect("PGDATA path is required");
-    let connstr = matches
-        .get_one::<String>("connstr")
-        .expect("Postgres connection string is required");
-    let spec_json = matches.get_one::<String>("spec");
-    let spec_path = matches.get_one::<String>("spec-path");
-    let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
-    let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs");
-
-    Ok(ProcessCliResult {
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        http_port,
-        spec_json,
-        spec_path,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs,
-    })
-}
-
-struct ProcessCliResult<'clap> {
-    connstr: &'clap str,
-    pgdata: &'clap str,
-    pgbin: &'clap str,
-    ext_remote_storage: Option<&'clap str>,
-    http_port: u16,
-    spec_json: Option<&'clap String>,
-    spec_path: Option<&'clap String>,
-    resize_swap_on_bind: bool,
-    set_disk_quota_for_fs: Option<&'clap String>,
+    Ok(build_tag)
 }
 
 fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
@@ -235,19 +240,9 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
     }
 }
 
-fn try_spec_from_cli(
-    matches: &clap::ArgMatches,
-    ProcessCliResult {
-        spec_json,
-        spec_path,
-        ..
- }: &ProcessCliResult, -) -> Result { - let compute_id = matches.get_one::("compute-id"); - let control_plane_uri = matches.get_one::("control-plane-uri"); - +fn try_spec_from_cli(cli: &Cli) -> Result { // First, try to get cluster spec from the cli argument - if let Some(spec_json) = spec_json { + if let Some(ref spec_json) = cli.spec_json { info!("got spec from cli argument {}", spec_json); return Ok(CliSpecParams { spec: Some(serde_json::from_str(spec_json)?), @@ -256,7 +251,7 @@ fn try_spec_from_cli( } // Second, try to read it from the file if path is provided - if let Some(spec_path) = spec_path { + if let Some(ref spec_path) = cli.spec_path { let file = File::open(Path::new(spec_path))?; return Ok(CliSpecParams { spec: Some(serde_json::from_reader(file)?), @@ -264,17 +259,20 @@ fn try_spec_from_cli( }); } - let Some(compute_id) = compute_id else { + if cli.compute_id.is_none() { panic!( "compute spec should be provided by one of the following ways: \ --spec OR --spec-path OR --control-plane-uri and --compute-id" ); }; - let Some(control_plane_uri) = control_plane_uri else { + if cli.control_plane_uri.is_none() { panic!("must specify both --control-plane-uri and --compute-id or none"); }; - match get_spec_from_control_plane(control_plane_uri, compute_id) { + match get_spec_from_control_plane( + cli.control_plane_uri.as_ref().unwrap(), + cli.compute_id.as_ref().unwrap(), + ) { Ok(spec) => Ok(CliSpecParams { spec, live_config_allowed: true, @@ -298,21 +296,12 @@ struct CliSpecParams { fn wait_spec( build_tag: String, - ProcessCliResult { - connstr, - pgdata, - pgbin, - ext_remote_storage, - resize_swap_on_bind, - set_disk_quota_for_fs, - http_port, - .. - }: ProcessCliResult, + cli: &Cli, CliSpecParams { spec, live_config_allowed, }: CliSpecParams, -) -> Result { +) -> Result> { let mut new_state = ComputeState::new(); let spec_set; @@ -324,7 +313,7 @@ fn wait_spec( } else { spec_set = false; } - let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?; + let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; let conn_conf = postgres::config::Config::from_str(connstr.as_str()) .context("cannot build postgres config from connstr")?; let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) @@ -333,14 +322,14 @@ fn wait_spec( connstr, conn_conf, tokio_conn_conf, - pgdata: pgdata.to_string(), - pgbin: pgbin.to_string(), - pgversion: get_pg_version_string(pgbin), - http_port, + pgdata: cli.pgdata.clone(), + pgbin: cli.pgbin.clone(), + pgversion: get_pg_version_string(&cli.pgbin), + http_port: cli.http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), - ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), + ext_remote_storage: cli.remote_ext_config.clone(), ext_download_progress: RwLock::new(HashMap::new()), build_tag, }; @@ -357,7 +346,7 @@ fn wait_spec( // Launch http service first, so that we can serve control-plane requests // while configuration is still in progress. let _http_handle = - launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); + launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread"); if !spec_set { // No spec provided, hang waiting for it. 
@@ -389,27 +378,12 @@ fn wait_spec( launch_lsn_lease_bg_task_for_static(&compute); - Ok(WaitSpecResult { - compute, - resize_swap_on_bind, - set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), - }) -} - -struct WaitSpecResult { - compute: Arc, - resize_swap_on_bind: bool, - set_disk_quota_for_fs: Option, + Ok(compute) } fn start_postgres( - // need to allow unused because `matches` is only used if target_os = "linux" - #[allow(unused_variables)] matches: &clap::ArgMatches, - WaitSpecResult { - compute, - resize_swap_on_bind, - set_disk_quota_for_fs, - }: WaitSpecResult, + cli: &Cli, + compute: Arc, ) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); @@ -437,7 +411,7 @@ fn start_postgres( let mut delay_exit = false; // Resize swap to the desired size if the compute spec says so - if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) { // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion // *before* starting postgres. // @@ -464,9 +438,9 @@ fn start_postgres( // Set disk quota if the compute spec says so if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = - (disk_quota_bytes, set_disk_quota_for_fs) + (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref()) { - match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) { + match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) { Ok(()) => { let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. info!(%disk_quota_bytes, %size_mib, "set disk quota"); @@ -509,13 +483,7 @@ fn start_postgres( if #[cfg(target_os = "linux")] { use std::env; use tokio_util::sync::CancellationToken; - let vm_monitor_addr = matches - .get_one::("vm-monitor-addr") - .expect("--vm-monitor-addr should always be set because it has a default arg"); - let file_cache_connstr = matches.get_one::("filecache-connstr"); - let cgroup = matches.get_one::("cgroup"); - // Only make a runtime if we need to. // Note: it seems like you can make a runtime in an inner scope and // if you start a task in it it won't be dropped. However, make it // in the outermost scope just to be safe. @@ -538,15 +506,15 @@ fn start_postgres( let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { None } else { - file_cache_connstr.cloned() + Some(cli.filecache_connstr.clone()) }; let vm_monitor = rt.as_ref().map(|rt| { rt.spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { - cgroup: cgroup.cloned(), + cgroup: Some(cli.cgroup.clone()), pgconnstr, - addr: vm_monitor_addr.clone(), + addr: cli.vm_monitor_addr.clone(), })), token.clone(), )) @@ -702,105 +670,6 @@ fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! 
{ exit(exit_code.unwrap_or(1)) } -fn cli() -> clap::Command { - // Env variable is set by `cargo` - let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); - clap::Command::new("compute_ctl") - .version(version) - .arg( - Arg::new("http-port") - .long("http-port") - .value_name("HTTP_PORT") - .default_value("3080") - .value_parser(clap::value_parser!(u16)) - .required(false), - ) - .arg( - Arg::new("connstr") - .short('C') - .long("connstr") - .value_name("DATABASE_URL") - .required(true), - ) - .arg( - Arg::new("pgdata") - .short('D') - .long("pgdata") - .value_name("DATADIR") - .required(true), - ) - .arg( - Arg::new("pgbin") - .short('b') - .long("pgbin") - .default_value("postgres") - .value_name("POSTGRES_PATH"), - ) - .arg( - Arg::new("spec") - .short('s') - .long("spec") - .value_name("SPEC_JSON"), - ) - .arg( - Arg::new("spec-path") - .short('S') - .long("spec-path") - .value_name("SPEC_PATH"), - ) - .arg( - Arg::new("compute-id") - .short('i') - .long("compute-id") - .value_name("COMPUTE_ID"), - ) - .arg( - Arg::new("control-plane-uri") - .short('p') - .long("control-plane-uri") - .value_name("CONTROL_PLANE_API_BASE_URI"), - ) - .arg( - Arg::new("remote-ext-config") - .short('r') - .long("remote-ext-config") - .value_name("REMOTE_EXT_CONFIG"), - ) - // TODO(fprasx): we currently have default arguments because the cloud PR - // to pass them in hasn't been merged yet. We should get rid of them once - // the PR is merged. - .arg( - Arg::new("vm-monitor-addr") - .long("vm-monitor-addr") - .default_value("0.0.0.0:10301") - .value_name("VM_MONITOR_ADDR"), - ) - .arg( - Arg::new("cgroup") - .long("cgroup") - .default_value("neon-postgres") - .value_name("CGROUP"), - ) - .arg( - Arg::new("filecache-connstr") - .long("filecache-connstr") - .default_value( - "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor", - ) - .value_name("FILECACHE_CONNSTR"), - ) - .arg( - Arg::new("resize-swap-on-bind") - .long("resize-swap-on-bind") - .action(clap::ArgAction::SetTrue), - ) - .arg( - Arg::new("set-disk-quota-for-fs") - .long("set-disk-quota-for-fs") - .value_name("SET_DISK_QUOTA_FOR_FS") - ) -} - /// When compute_ctl is killed, send also termination signal to sync-safekeepers /// to prevent leakage. TODO: it is better to convert compute_ctl to async and /// wait for termination which would be easy then. 
@@ -810,7 +679,14 @@ fn handle_exit_signal(sig: i32) { exit(1); } -#[test] -fn verify_cli() { - cli().debug_assert() +#[cfg(test)] +mod test { + use clap::CommandFactory; + + use super::Cli; + + #[test] + fn verify_cli() { + Cli::command().debug_assert() + } } diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 64c338f4d7ad..00f46386e71b 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -258,14 +258,11 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { let uri = format!("{}/{}", ext_remote_storage, ext_path); - info!("Download extension {:?} from uri {:?}", ext_path, uri); + info!("Download extension {} from uri {}", ext_path, uri); match do_extension_server_request(&uri).await { Ok(resp) => { - info!( - "Successfully downloaded remote extension data {:?}", - ext_path - ); + info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL .with_label_values(&[&StatusCode::OK.to_string()]) .inc(); @@ -285,7 +282,10 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res async fn do_extension_server_request(uri: &str) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( - format!("could not perform remote extensions server request: {}", e), + format!( + "could not perform remote extensions server request: {:?}", + e + ), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -295,7 +295,7 @@ async fn do_extension_server_request(uri: &str) -> Result match resp.bytes().await { Ok(resp) => Ok(resp), Err(e) => Err(( - format!("could not read remote extensions server response: {}", e), + format!("could not read remote extensions server response: {:?}", e), // It's fine to return and report error with status as 200 OK, // because we still failed to read the response. 
status.to_string(), diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index aa3c6b01f035..7b7b042d848c 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -125,7 +125,7 @@ impl<'m> MigrationRunner<'m> { info!("Finished migration id={}", migration_id); } Err(e) => { - error!("Failed to run migration id={}: {}", migration_id, e); + error!("Failed to run migration id={}: {:?}", migration_id, e); DB_MIGRATION_FAILED .with_label_values(&[migration_id.to_string().as_str()]) .inc(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 43a820885bcd..37d5d3a1a65f 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -28,7 +28,7 @@ fn do_control_plane_request( .map_err(|e| { ( true, - format!("could not perform spec request to control plane: {}", e), + format!("could not perform spec request to control plane: {:?}", e), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -39,7 +39,7 @@ fn do_control_plane_request( Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, - format!("could not deserialize control plane response: {}", e), + format!("could not deserialize control plane response: {:?}", e), status.to_string(), )), }, diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 383c1746846b..dd37bfc4071f 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -388,6 +388,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_check_threshold' as integer")?, + image_creation_preempt_threshold: settings + .remove("image_creation_preempt_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_preempt_threshold' as integer")?, pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index d9b76b960093..985fe6b3b1d6 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -10,8 +10,8 @@ use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, - TenantPolicyRequest, + ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -800,7 +800,7 @@ async fn main() -> anyhow::Result<()> { .collect(), }; storcon_client - .dispatch::( + .dispatch::( Method::PUT, "control/v1/preferred_azs".to_string(), Some(req), diff --git a/deny.toml b/deny.toml index df00a34c60d6..b55140556801 100644 --- a/deny.toml +++ b/deny.toml @@ -32,6 +32,7 @@ reason = "the marvin attack only affects private key decryption, not public key # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] allow = [ + "0BSD", "Apache-2.0", "BSD-2-Clause", "BSD-3-Clause", diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index 61f44681dabf..b5f0f47ceb88 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > 
/etc/apt/apt.conf.d/80-retries && \ jq \ netcat-openbsd #This is required for the pg_hintplan test -RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src +RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw USER postgres diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index a05d6c043dc4..c4ff86ab663e 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -52,6 +52,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do if [ $pg_version -ge 16 ]; then docker cp ext-src $TEST_CONTAINER_NAME:/ + docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config @@ -61,17 +62,32 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR + # The following block does the same for the contrib/file_fdw test + TMPDIR=$(mktemp -d) + docker cp $TEST_CONTAINER_NAME:/postgres/contrib/file_fdw/data $TMPDIR/data + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/postgres/contrib/file_fdw/data + rm -rf $TMPDIR + # Apply patches + cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)" # We are running tests now - if ! docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ - $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt - then - FAILED=$(tail -1 testout.txt) - for d in $FAILED - do - mkdir $d - docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true - docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true - cat $d/regression.out $d/regression.diffs || true + rm -f testout.txt testout_contrib.txt + docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ + $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 + docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ + $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 + if [ $EXT_SUCCESS -eq 0 ] || [ $CONTRIB_SUCCESS -eq 0 ]; then + CONTRIB_FAILED= + FAILED= + [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}') + [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}') + for d in $FAILED $CONTRIB_FAILED; do + dn="$(basename $d)" + rm -rf $dn + mkdir $dn + docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ] + docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? 
-eq 1 ] + cat $dn/regression.out $dn/regression.diffs || true + rm -rf $dn done rm -rf $FAILED exit 1 diff --git a/docker-compose/ext-src/pgjwt-src/neon-test.sh b/docker-compose/ext-src/pgjwt-src/neon-test.sh new file mode 100755 index 000000000000..95af0be77b1f --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/neon-test.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -ex +cd "$(dirname "${0}")" +pg_prove test.sql \ No newline at end of file diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.patch b/docker-compose/ext-src/pgjwt-src/test-upgrade.patch new file mode 100644 index 000000000000..85b35654800f --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.patch @@ -0,0 +1,15 @@ +diff --git a/test.sql b/test.sql +index d7a0ca8..f15bc76 100644 +--- a/test.sql ++++ b/test.sql +@@ -9,9 +9,7 @@ + \set ON_ERROR_STOP true + \set QUIET 1 + +-CREATE EXTENSION pgcrypto; +-CREATE EXTENSION pgtap; +-CREATE EXTENSION pgjwt; ++CREATE EXTENSION IF NOT EXISTS pgtap; + + BEGIN; + SELECT plan(23); diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh new file mode 100755 index 000000000000..b7158d234000 --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +patch -p1 /dev/null; then diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index ff93b98065c4..08b1a60f2da8 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -24,7 +24,7 @@ function wait_for_ready { } function create_extensions() { for ext in ${1}; do - docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext}" + docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE" done } EXTENSIONS='[ @@ -40,7 +40,8 @@ EXTENSIONS='[ {"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"}, {"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"}, {"extname": "semver", "extdir": "pg_semver-src"}, -{"extname": "pg_ivm", "extdir": "pg_ivm-src"} +{"extname": "pg_ivm", "extdir": "pg_ivm-src"}, +{"extname": "pgjwt", "extdir": "pgjwt-src"} ]' EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index b3f18dc6da2c..2fc95c47c616 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -204,14 +204,16 @@ impl RemoteExtSpec { // Check if extension is present in public or custom. // If not, then it is not allowed to be used by this compute. 
- if let Some(public_extensions) = &self.public_extensions { - if !public_extensions.contains(&real_ext_name.to_string()) { - if let Some(custom_extensions) = &self.custom_extensions { - if !custom_extensions.contains(&real_ext_name.to_string()) { - return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); - } - } - } + if !self + .public_extensions + .as_ref() + .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + && !self + .custom_extensions + .as_ref() + .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + { + return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); } match self.extension_data.get(real_ext_name) { @@ -340,6 +342,96 @@ mod tests { use super::*; use std::fs::File; + #[test] + fn allow_installing_remote_extensions() { + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": null, + "custom_extensions": null, + "library_index": {}, + "extension_data": {}, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": null, + "library_index": {}, + "extension_data": {}, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": [], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": ["ext"], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": ["ext"], + "custom_extensions": [], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + } + #[test] fn parse_spec_file() { let file = File::open("tests/cluster_spec.json").unwrap(); diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 422da0dc95c9..a0b5feea948e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -323,6 +323,10 @@ pub struct TenantConfigToml { // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, + // How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction. + // Set to 0 to disable preemption. + pub image_creation_preempt_threshold: usize, + /// The length for an explicit LSN lease request. /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. 
#[serde(with = "humantime_serde")] @@ -547,6 +551,10 @@ pub mod tenant_conf_defaults { // Relevant: https://github.com/neondatabase/neon/issues/3394 pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image + // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we + // want to enable this feature. + pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; @@ -605,6 +613,7 @@ impl Default for TenantConfigToml { lazy_slru_download: false, timeline_get_throttle: crate::models::ThrottleConfig::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: false, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 43447c67bd0e..19beb37ab34c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -498,6 +498,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub image_layer_creation_check_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_creation_preempt_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub lsn_lease_length: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub lsn_lease_length_for_ts: FieldPatch, @@ -544,6 +546,7 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, + pub image_creation_preempt_threshold: Option, pub lsn_lease_length: Option, pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, @@ -581,6 +584,7 @@ impl TenantConfig { mut lazy_slru_download, mut timeline_get_throttle, mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, @@ -635,6 +639,9 @@ impl TenantConfig { patch .image_layer_creation_check_threshold .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); patch.lsn_lease_length.apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts @@ -679,6 +686,7 @@ impl TenantConfig { lazy_slru_download, timeline_get_throttle, image_layer_creation_check_threshold, + image_creation_preempt_threshold, lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index e205d60d747d..753f05b6fd1c 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -5,6 +5,24 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +/// Logs a critical error, similarly to `tracing::error!`. This will: +/// +/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. +/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". +/// * Trigger a pageable alert (via the metric above). 
+/// * In debug builds, panic the process. +#[macro_export] +macro_rules! critical { + ($($arg:tt)*) => { + if cfg!(debug_assertions) { + panic!($($arg)*); + } + $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); + let backtrace = std::backtrace::Backtrace::capture(); + tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); + }; +} + #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { @@ -25,7 +43,10 @@ impl LogFormat { } } -struct TracingEventCountMetric { +pub struct TracingEventCountMetric { + /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro, + /// and also emit it as a regular error. These are thus double-counted, but that seems fine. + critical: IntCounter, error: IntCounter, warn: IntCounter, info: IntCounter, @@ -33,7 +54,7 @@ struct TracingEventCountMetric { trace: IntCounter, } -static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { +pub static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { let vec = metrics::register_int_counter_vec!( "libmetrics_tracing_event_count", "Number of tracing events, by level", @@ -46,6 +67,7 @@ static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| impl TracingEventCountMetric { fn new(vec: IntCounterVec) -> Self { Self { + critical: vec.with_label_values(&["critical"]), error: vec.with_label_values(&["error"]), warn: vec.with_label_values(&["warn"]), info: vec.with_label_values(&["info"]), @@ -54,6 +76,11 @@ impl TracingEventCountMetric { } } + // Allow public access from `critical!` macro. + pub fn inc_critical(&self) { + self.critical.inc(); + } + fn inc_for_level(&self, level: tracing::Level) { let counter = match level { tracing::Level::ERROR => &self.error, diff --git a/libs/vm_monitor/src/filecache.rs b/libs/vm_monitor/src/filecache.rs index fe71e11197de..4f5bf1c1e327 100644 --- a/libs/vm_monitor/src/filecache.rs +++ b/libs/vm_monitor/src/filecache.rs @@ -177,8 +177,8 @@ impl FileCacheState { crate::spawn_with_cancel( token, |res| { - if let Err(error) = res { - error!(%error, "postgres error") + if let Err(e) = res { + error!(error = format_args!("{e:#}"), "postgres error"); } }, conn, @@ -205,7 +205,7 @@ impl FileCacheState { { Ok(rows) => Ok(rows), Err(e) => { - error!(error = ?e, "postgres error: {e} -> retrying"); + error!(error = format_args!("{e:#}"), "postgres error -> retrying"); let client = FileCacheState::connect(&self.conn_str, self.token.clone()) .await diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index 1b13c8e0b23d..0cd97d4ca122 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -191,15 +191,12 @@ async fn start_monitor( .await; let mut monitor = match monitor { Ok(Ok(monitor)) => monitor, - Ok(Err(error)) => { - error!(?error, "failed to create monitor"); + Ok(Err(e)) => { + error!(error = format_args!("{e:#}"), "failed to create monitor"); return; } Err(_) => { - error!( - ?timeout, - "creating monitor timed out (probably waiting to receive protocol range)" - ); + error!(?timeout, "creating monitor timed out"); return; } }; @@ -207,6 +204,9 @@ async fn start_monitor( match monitor.run().await { Ok(()) => info!("monitor was killed due to new connection"), - Err(e) => error!(error = ?e, "monitor terminated unexpectedly"), + Err(e) => error!( + error = format_args!("{e:#}"), + "monitor terminated unexpectedly" + ), } } diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 
8605314ba9e5..8839f5803f81 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -370,12 +370,16 @@ impl Runner { }), InboundMsgKind::InvalidMessage { error } => { warn!( - %error, id, "received notification of an invalid message we sent" + error = format_args!("{error:#}"), + id, "received notification of an invalid message we sent" ); Ok(None) } InboundMsgKind::InternalError { error } => { - warn!(error, id, "agent experienced an internal error"); + warn!( + error = format_args!("{error:#}"), + id, "agent experienced an internal error" + ); Ok(None) } InboundMsgKind::HealthCheck {} => { @@ -476,7 +480,7 @@ impl Runner { // gives the outermost cause, and the debug impl // pretty-prints the error, whereas {:#} contains all the // causes, but is compact (no newlines). - warn!(error = format!("{e:#}"), "error handling message"); + warn!(error = format_args!("{e:#}"), "error handling message"); OutboundMsg::new( OutboundMsgKind::InternalError { error: e.to_string(), @@ -492,7 +496,7 @@ impl Runner { .context("failed to send message")?; } Err(e) => warn!( - error = format!("{e}"), + error = format_args!("{e:#}"), msg = ?msg, "received error message" ), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0f3e9fdab601..94f7510a4abf 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1472,7 +1472,13 @@ async fn layer_download_handler( let downloaded = timeline .download_layer(&layer_name) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| match e { + tenant::storage_layer::layer::DownloadError::TimelineShutdown + | tenant::storage_layer::layer::DownloadError::DownloadCancelled => { + ApiError::ShuttingDown + } + other => ApiError::InternalServerError(other.into()), + })?; match downloaded { Some(true) => json_response(StatusCode::OK, ()), @@ -3169,12 +3175,16 @@ async fn put_tenant_timeline_import_basebackup( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let span = info_span!("import_basebackup", + tenant_id=%tenant_id, timeline_id=%timeline_id, shard_id=%tenant_shard_id.shard_slug(), + base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); async move { let state = get_state(&request); let tenant = state .tenant_manager - .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + .get_attached_tenant_shard(tenant_shard_id)?; let broker_client = state.broker_client.clone(); @@ -3383,7 +3393,17 @@ where let status = response.status(); info!(%status, "Cancelled request finished successfully") } - Err(e) => error!("Cancelled request finished with an error: {e:?}"), + Err(e) => match e { + ApiError::ShuttingDown | ApiError::ResourceUnavailable(_) => { + // Don't log this at error severity: they are normal during lifecycle of tenants/process + info!("Cancelled request aborted for shutdown") + } + _ => { + // Log these in a highly visible way, because we have no client to send the response to, but + // would like to know that something went wrong. 
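// A small sketch (assuming the anyhow crate) of why the logging changes above
// prefer format_args!("{e:#}"): plain Display shows only the outermost context,
// the alternate form "{:#}" shows the whole cause chain on one compact line, and
// Debug "{:?}" pretty-prints it across multiple lines.
use anyhow::Context;

fn open_config() -> anyhow::Result<()> {
    std::fs::read("/nonexistent/config.toml").context("failed to read config")?;
    Ok(())
}

fn main() {
    if let Err(e) = open_config() {
        println!("{e}");   // "failed to read config"
        println!("{e:#}"); // "failed to read config: No such file or directory (os error 2)"
        println!("{e:?}"); // multi-line report with a "Caused by:" section
    }
}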
+ error!("Cancelled request finished with an error: {e:?}") + } + }, } } // only logging for cancelled panicked request handlers is the tracing_panic_hook, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ff6af3566c82..f43cd08cf7ba 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -263,14 +263,6 @@ pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; /// data directory at pageserver startup can be automatically removed. pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp"; -/// A marker file to mark that a timeline directory was not fully initialized. -/// If a timeline directory with this marker is encountered at pageserver startup, -/// the timeline directory and the marker file are both removed. -/// Full path: `tenants//timelines/___uninit`. -pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; - -pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; - pub fn is_temporary(path: &Utf8Path) -> bool { match path.file_name() { Some(name) => name.ends_with(TEMP_FILE_SUFFIX), @@ -278,25 +270,6 @@ pub fn is_temporary(path: &Utf8Path) -> bool { } } -fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool { - match path.file_name() { - Some(name) => name.ends_with(suffix), - None => false, - } -} - -// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid -// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once -// from the name. - -pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool { - ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) -} - -pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool { - ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) -} - /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. /// diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 77c0967afc4b..6ab1178a7bd0 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -32,6 +32,7 @@ use utils::id::TimelineId; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; +use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; @@ -116,11 +117,38 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { +/// Measures layers visited per read (i.e. read amplification). +/// +/// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits +/// are amortized across the batch, and some layers may not intersect with a given key, each visited +/// layer contributes directly to the observed latency for every read in the batch, which is what we +/// care about. +pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_layers_per_read", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + &["tenant_id", "shard_id", "timeline_id"], + // Low resolution to reduce cardinality. 
+ vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0], + ) + .expect("failed to define a metric") +}); + +pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_layers_visited_per_vectored_read_global", - "Average number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + "pageserver_layers_per_read_global", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric") +}); + +pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { + // We expect this to be low because of Postgres checkpoints. Let's see if that holds. + register_histogram!( + "pageserver_deltas_per_read_global", + "Number of delta pages applied to image page per read", + vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], ) .expect("failed to define a metric") }); @@ -2351,11 +2379,40 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_observed: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, + pub(crate) values_committed_metadata_images: IntCounter, + pub(crate) values_committed_metadata_deltas: IntCounter, + pub(crate) values_committed_data_images: IntCounter, + pub(crate) values_committed_data_deltas: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, - pub(crate) clear_vm_bits_unknown: IntCounterVec, +} + +impl WalIngestMetrics { + pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) { + if stats.metadata_images > 0 { + self.values_committed_metadata_images + .inc_by(stats.metadata_images); + } + if stats.metadata_deltas > 0 { + self.values_committed_metadata_deltas + .inc_by(stats.metadata_deltas); + } + if stats.data_images > 0 { + self.values_committed_data_images.inc_by(stats.data_images); + } + if stats.data_deltas > 0 { + self.values_committed_data_deltas.inc_by(stats.data_deltas); + } + } } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { + let values_committed = register_int_counter_vec!( + "pageserver_wal_ingest_values_committed", + "Number of values committed to pageserver storage from WAL records", + &["class", "kind"], + ) + .expect("failed to define a metric"); + WalIngestMetrics { bytes_received: register_int_counter!( "pageserver_wal_ingest_bytes_received", @@ -2382,17 +2439,15 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { "Number of WAL records filtered out due to sharding" ) .expect("failed to define a metric"), + values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]), + values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]), + values_committed_data_images: values_committed.with_label_values(&["data", "image"]), + values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]), gap_blocks_zeroed_on_rel_extend: register_int_counter!( "pageserver_gap_blocks_zeroed_on_rel_extend", "Total number of zero gap blocks written on relation extends" ) .expect("failed to define a metric"), - clear_vm_bits_unknown: register_int_counter_vec!( - "pageserver_wal_ingest_clear_vm_bits_unknown", - "Number of ignored ClearVmBits operations due to unknown pages/relations", - &["entity"], - ) - .expect("failed to define a metric"), } }); @@ -2632,6 +2687,7 @@ pub(crate) struct TimelineMetrics { pub disk_consistent_lsn_gauge: IntGauge, pub 
pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub layers_per_read: Histogram, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, @@ -2729,6 +2785,10 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layers_per_read = LAYERS_PER_READ + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2793,6 +2853,7 @@ impl TimelineMetrics { disk_consistent_lsn_gauge, pitr_history_size, archival_size, + layers_per_read, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, @@ -2962,6 +3023,8 @@ impl TimelineMetrics { } } + let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -3912,7 +3975,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ - &VEC_READ_NUM_LAYERS_VISITED, + &LAYERS_PER_READ_GLOBAL, + &DELTAS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 40c657524dff..dcbf62b56c6c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -48,7 +48,7 @@ use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; -use wal_decoder::serialized_batch::SerializedValueBatch; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -1297,6 +1297,26 @@ impl DatadirModification<'_> { .is_some_and(|b| b.has_data()) } + /// Returns statistics about the currently pending modifications. + pub(crate) fn stats(&self) -> DatadirModificationStats { + let mut stats = DatadirModificationStats::default(); + for (_, _, value) in self.pending_metadata_pages.values().flatten() { + match value { + Value::Image(_) => stats.metadata_images += 1, + Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1, + Value::WalRecord(_) => stats.metadata_deltas += 1, + } + } + for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) { + match valuemeta { + ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1, + ValueMeta::Serialized(_) => stats.data_deltas += 1, + ValueMeta::Observed(_) => {} + } + } + stats + } + /// Set the current lsn pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { ensure!( @@ -2317,6 +2337,15 @@ impl DatadirModification<'_> { } } +/// Statistics for a DatadirModification. +#[derive(Default)] +pub struct DatadirModificationStats { + pub metadata_images: u64, + pub metadata_deltas: u64, + pub data_images: u64, + pub data_deltas: u64, +} + /// This struct facilitates accessing either a committed key from the timeline at a /// specific LSN, or the latest uncommitted key from a pending modification. 
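// A simplified sketch (hypothetical types) of how pending values are classified
// for the pageserver_wal_ingest_values_committed metric above: a full page image,
// or a WAL record that re-initializes the page, counts as an image; anything else
// is a delta that must later be replayed on top of an image.
enum Value { Image(Vec<u8>), WalRecord { will_init: bool } }

#[derive(Default, Debug, PartialEq)]
struct Stats { images: u64, deltas: u64 }

fn classify(values: &[Value]) -> Stats {
    let mut stats = Stats::default();
    for value in values {
        match value {
            Value::Image(_) | Value::WalRecord { will_init: true } => stats.images += 1,
            Value::WalRecord { will_init: false } => stats.deltas += 1,
        }
    }
    stats
}

fn main() {
    let pending = vec![Value::Image(vec![0u8; 8192]), Value::WalRecord { will_init: false }];
    assert_eq!(classify(&pending), Stats { images: 1, deltas: 1 });
}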
/// diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 085f76c05d98..c1b408ed72a4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -46,6 +46,7 @@ use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::compaction::CompactionOutcome; use timeline::compaction::GcCompactionQueue; use timeline::import_pgdata; use timeline::offload::offload_timeline; @@ -95,7 +96,6 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; -use crate::is_uninit_mark; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::CONCURRENT_INITDBS; use crate::metrics::INITDB_RUN_TIME; @@ -1793,11 +1793,7 @@ impl Tenant { let entry = entry.context("read timeline dir entry")?; let entry_path = entry.path(); - let purge = if crate::is_temporary(entry_path) - // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718) - || is_uninit_mark(entry_path) - || crate::is_delete_mark(entry_path) - { + let purge = if crate::is_temporary(entry_path) { true } else { match TimelineId::try_from(entry_path.file_name()) { @@ -2426,7 +2422,7 @@ impl Tenant { // Make sure the freeze_and_flush reaches remote storage. tline.remote_client.wait_completion().await.unwrap(); - let tl = uninit_tl.finish_creation()?; + let tl = uninit_tl.finish_creation().await?; // The non-test code would call tl.activate() here. tl.set_state(TimelineState::Active); Ok(tl) @@ -2912,10 +2908,10 @@ impl Tenant { self: &Arc, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { + ) -> Result { // Don't start doing work during shutdown, or when broken, we do not need those in the logs if !self.is_active() { - return Ok(false); + return Ok(CompactionOutcome::Done); } { @@ -2929,7 +2925,7 @@ impl Tenant { // to AttachedSingle state. 
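// A minimal sketch of the scheduling contract behind CompactionOutcome (the
// Duration values are illustrative): Pending means more compaction work is queued
// and the loop should come back almost immediately, Done means it can sleep for
// the full compaction period.
use std::time::Duration;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum CompactionOutcome { Done, Pending }

fn next_sleep(outcome: CompactionOutcome, period: Duration) -> Duration {
    match outcome {
        CompactionOutcome::Pending => Duration::from_secs(1),
        CompactionOutcome::Done => period,
    }
}

fn main() {
    let period = Duration::from_secs(20);
    assert_eq!(next_sleep(CompactionOutcome::Pending, period), Duration::from_secs(1));
    assert_eq!(next_sleep(CompactionOutcome::Done, period), period);
}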
if !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); - return Ok(false); + return Ok(CompactionOutcome::Done); } } @@ -2972,7 +2968,7 @@ impl Tenant { // Before doing any I/O work, check our circuit breaker if self.compaction_circuit_breaker.lock().unwrap().is_broken() { info!("Skipping compaction due to previous failures"); - return Ok(false); + return Ok(CompactionOutcome::Done); } let mut has_pending_task = false; @@ -2980,10 +2976,10 @@ impl Tenant { for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload { // pending_task_left == None: cannot compact, maybe still pending tasks - // pending_task_left == Some(true): compaction task left - // pending_task_left == Some(false): no compaction task left + // pending_task_left == Some(Pending): compaction task left + // pending_task_left == Some(Done): no compaction task left let pending_task_left = if *can_compact { - let has_pending_l0_compaction_task = timeline + let compaction_outcome = timeline .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) .await @@ -3001,27 +2997,27 @@ impl Tenant { .fail(&CIRCUIT_BREAKERS_BROKEN, e); } })?; - if has_pending_l0_compaction_task { - Some(true) + if let CompactionOutcome::Pending = compaction_outcome { + Some(CompactionOutcome::Pending) } else { let queue = { let guard = self.scheduled_compaction_tasks.lock().unwrap(); guard.get(timeline_id).cloned() }; if let Some(queue) = queue { - let has_pending_tasks = queue + let outcome = queue .iteration(cancel, ctx, &self.gc_block, timeline) .await?; - Some(has_pending_tasks) + Some(outcome) } else { - Some(false) + Some(CompactionOutcome::Done) } } } else { None }; - has_pending_task |= pending_task_left.unwrap_or(false); - if pending_task_left == Some(false) && *can_offload { + has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending); + if pending_task_left == Some(CompactionOutcome::Done) && *can_offload { pausable_failpoint!("before-timeline-auto-offload"); match offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) @@ -3041,7 +3037,11 @@ impl Tenant { .unwrap() .success(&CIRCUIT_BREAKERS_UNBROKEN); - Ok(has_pending_task) + Ok(if has_pending_task { + CompactionOutcome::Pending + } else { + CompactionOutcome::Done + }) } /// Cancel scheduled compaction tasks @@ -4702,7 +4702,7 @@ impl Tenant { ) .await?; - let new_timeline = uninitialized_timeline.finish_creation()?; + let new_timeline = uninitialized_timeline.finish_creation().await?; // Root timeline gets its layers during creation and uploads them along with the metadata. // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created. @@ -4892,10 +4892,11 @@ impl Tenant { } // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it + let pgdata_path_deferred = pgdata_path.clone(); scopeguard::defer! 
{ - if let Err(e) = fs::remove_dir_all(&pgdata_path) { + if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred) { // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call - error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}"); + error!("Failed to remove temporary initdb directory '{pgdata_path_deferred}': {e}"); } } if let Some(existing_initdb_timeline_id) = load_existing_initdb { @@ -4962,7 +4963,7 @@ impl Tenant { pgdata_lsn, pg_version, ); - let raw_timeline = self + let mut raw_timeline = self .prepare_new_timeline( timeline_id, &new_metadata, @@ -4973,42 +4974,33 @@ impl Tenant { .await?; let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; - let unfinished_timeline = raw_timeline.raw_timeline()?; - - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - unfinished_timeline.maybe_spawn_flush_loop(); - - import_datadir::import_timeline_from_postgres_datadir( - unfinished_timeline, - &pgdata_path, - pgdata_lsn, - ctx, - ) - .await - .with_context(|| { - format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}") - })?; + raw_timeline + .write(|unfinished_timeline| async move { + import_datadir::import_timeline_from_postgres_datadir( + &unfinished_timeline, + &pgdata_path, + pgdata_lsn, + ctx, + ) + .await + .with_context(|| { + format!( + "Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}" + ) + })?; - fail::fail_point!("before-checkpoint-new-timeline", |_| { - Err(CreateTimelineError::Other(anyhow::anyhow!( - "failpoint before-checkpoint-new-timeline" - ))) - }); + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); - unfinished_timeline - .freeze_and_flush() - .await - .with_context(|| { - format!( - "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}" - ) - })?; + Ok(()) + }) + .await?; // All done! - let timeline = raw_timeline.finish_creation()?; + let timeline = raw_timeline.finish_creation().await?; // Callers are responsible to wait for uploads to complete and for activating the timeline. 
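// A self-contained sketch (assuming the scopeguard crate) of the cleanup pattern
// used for the temporary initdb directory above: clone the path for the deferred
// block so the guard stays independent of later moves or borrows of the original.
fn main() -> std::io::Result<()> {
    let pgdata_path = std::env::temp_dir().join("bootstrap-example");
    std::fs::create_dir_all(&pgdata_path)?;

    let pgdata_path_deferred = pgdata_path.clone();
    scopeguard::defer! {
        if let Err(e) = std::fs::remove_dir_all(&pgdata_path_deferred) {
            eprintln!("Failed to remove temporary directory '{}': {e}", pgdata_path_deferred.display());
        }
    }

    // ... import work that reads from `pgdata_path` would run here ...
    Ok(())
}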
@@ -5499,6 +5491,9 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), + image_creation_preempt_threshold: Some( + tenant_conf.image_creation_preempt_threshold, + ), lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 139ed27bd200..972837dc442c 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -357,6 +357,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_creation_preempt_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] #[serde(default)] @@ -453,6 +456,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), + image_creation_preempt_threshold: self + .image_creation_preempt_threshold + .unwrap_or(global_conf.image_creation_preempt_threshold), lsn_lease_length: self .lsn_lease_length .unwrap_or(global_conf.lsn_lease_length), @@ -504,6 +510,7 @@ impl TenantConfOpt { mut lazy_slru_download, mut timeline_get_throttle, mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, @@ -578,6 +585,9 @@ impl TenantConfOpt { patch .image_layer_creation_check_threshold .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); patch .lsn_lease_length .map(|v| humantime::parse_duration(&v))? @@ -626,6 +636,7 @@ impl TenantConfOpt { lazy_slru_download, timeline_get_throttle, image_layer_creation_check_threshold, + image_creation_preempt_threshold, lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, @@ -689,6 +700,7 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + image_creation_preempt_threshold: value.image_creation_preempt_threshold, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), timeline_offloading: value.timeline_offloading, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index cf524fcb25e2..2e8c3946bd69 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -673,12 +673,30 @@ impl<'a> TenantDownloader<'a> { HeatMapDownload::Modified(m) => m, }; - let heatmap = serde_json::from_slice::(&heatmap_bytes)?; - - // Save the heatmap: this will be useful on restart, allowing us to reconstruct - // layer metadata without having to re-download it. 
+ // Heatmap storage location let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id); + let last_heatmap = if last_download.is_none() { + match load_heatmap(&heatmap_path, ctx).await { + Ok(htm) => htm, + Err(e) => { + tracing::warn!("Couldn't load heatmap from {heatmap_path}: {e:?}"); + None + } + } + } else { + None + }; + + let last_heatmap_timelines = last_heatmap.as_ref().map(|htm| { + htm.timelines + .iter() + .map(|tl| (tl.timeline_id, tl)) + .collect::>() + }); + + let heatmap = serde_json::from_slice::(&heatmap_bytes)?; + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); @@ -707,10 +725,17 @@ impl<'a> TenantDownloader<'a> { let timeline_state = match timeline_state { Some(t) => t, None => { + let last_heatmap = + last_heatmap_timelines + .as_ref() + .and_then(|last_heatmap_timelines| { + last_heatmap_timelines.get(&timeline.timeline_id).copied() + }); // We have no existing state: need to scan local disk for layers first. let timeline_state = init_timeline_state( self.conf, tenant_shard_id, + last_heatmap, timeline, &self.secondary_state.resident_size_metric, ) @@ -1079,12 +1104,12 @@ impl<'a> TenantDownloader<'a> { } } - if on_disk.metadata.generation_file_size() != on_disk.metadata.generation_file_size() { + if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() { tracing::info!( "Re-downloading layer {} with changed size or generation: {:?}->{:?}", layer.name, on_disk.metadata.generation_file_size(), - on_disk.metadata.generation_file_size() + layer.metadata.generation_file_size() ); return LayerAction::Download; } @@ -1277,6 +1302,7 @@ impl<'a> TenantDownloader<'a> { async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, + last_heatmap: Option<&HeatMapTimeline>, heatmap: &HeatMapTimeline, resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { @@ -1306,6 +1332,13 @@ async fn init_timeline_state( let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = + if let Some(last_heatmap) = last_heatmap { + last_heatmap.layers.iter().map(|l| (&l.name, l)).collect() + } else { + HashMap::new() + }; + while let Some(dentry) = dir .next_entry() .await @@ -1339,18 +1372,32 @@ async fn init_timeline_state( match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); + let last_meta = last_heatmap_metadata.get(&name); + let mut remove = false; match remote_meta { Some(remote_meta) => { + let last_meta_generation_file_size = last_meta + .map(|m| m.metadata.generation_file_size()) + .unwrap_or(remote_meta.metadata.generation_file_size()); // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784) - if local_meta.len() != remote_meta.metadata.file_size { - // This should not happen, because we do crashsafe write-then-rename when downloading - // layers, and layers in remote storage are immutable. Remove the local file because - // we cannot trust it. 
- tracing::warn!( + if remote_meta.metadata.generation_file_size() + != last_meta_generation_file_size + { + tracing::info!( + "Removing local layer {name} as on-disk json metadata has different generation or file size from remote: {:?} -> {:?}", + last_meta_generation_file_size, + remote_meta.metadata.generation_file_size() + ); + remove = true; + } else if local_meta.len() != remote_meta.metadata.file_size { + // This can happen in the presence of race conditions: the remote and on-disk metadata have changed, but we haven't had + // the chance yet to download the new layer to disk, before the process restarted. + tracing::info!( "Removing local layer {name} with unexpected local size {} != {}", local_meta.len(), remote_meta.metadata.file_size ); + remove = true; } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. @@ -1372,15 +1419,18 @@ async fn init_timeline_state( "Removing secondary local layer {} because it's absent in heatmap", name ); - tokio::fs::remove_file(&dentry.path()) - .await - .or_else(fs_ext::ignore_not_found) - .fatal_err(&format!( - "Removing layer {}", - dentry.path().to_string_lossy() - )); + remove = true; } } + if remove { + tokio::fs::remove_file(&dentry.path()) + .await + .or_else(fs_ext::ignore_not_found) + .fatal_err(&format!( + "Removing layer {}", + dentry.path().to_string_lossy() + )); + } } Err(_) => { // Ignore it. @@ -1391,3 +1441,18 @@ async fn init_timeline_state( detail } + +/// Loads a json-encoded heatmap file from the provided on-disk path +async fn load_heatmap( + path: &Utf8PathBuf, + ctx: &RequestContext, +) -> Result, anyhow::Error> { + let mut file = match VirtualFile::open(path, ctx).await { + Ok(file) => file, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None), + Err(e) => Err(e)?, + }; + let st = file.read_to_string(ctx).await?; + let htm = serde_json::from_str(&st)?; + Ok(Some(htm)) +} diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index c5e5e0494571..d72c33736954 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -9,13 +9,14 @@ use crate::{ metrics::SECONDARY_MODE, tenant::{ config::AttachmentMode, - mgr::GetTenantError, - mgr::TenantManager, + mgr::{GetTenantError, TenantManager}, remote_timeline_client::remote_heatmap_path, span::debug_assert_current_span_has_tenant_id, tasks::{warn_when_period_overrun, BackgroundLoopKind}, Tenant, }, + virtual_file::VirtualFile, + TEMP_FILE_SUFFIX, }; use futures::Future; @@ -32,7 +33,10 @@ use super::{ }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; -use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; +use utils::{ + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, + yielding_loop::yielding_loop, +}; pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, @@ -461,6 +465,18 @@ async fn upload_tenant_heatmap( } } + // After a successful upload persist the fresh heatmap to disk. + // When restarting, the tenant will read the heatmap from disk + // and additively generate a new heatmap (see [`Timeline::generate_heatmap`]). + // If the heatmap is stale, the additive generation can lead to keeping previously + // evicted timelines on the secondarie's disk. 
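// A simplified sketch of the load_heatmap pattern above (assuming serde_json and
// anyhow, and using std::fs in place of VirtualFile): a missing file is not an
// error, it simply means there is no previously persisted heatmap to start from.
fn load_json_if_present(path: &std::path::Path) -> anyhow::Result<Option<serde_json::Value>> {
    let bytes = match std::fs::read(path) {
        Ok(bytes) => bytes,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
        Err(e) => return Err(e.into()),
    };
    Ok(Some(serde_json::from_slice(&bytes)?))
}

fn main() -> anyhow::Result<()> {
    let heatmap = load_json_if_present(std::path::Path::new("/nonexistent/heatmap-v1.json"))?;
    assert!(heatmap.is_none());
    Ok(())
}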
+ let tenant_shard_id = tenant.get_tenant_shard_id(); + let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id); + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); + if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await { + tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}"); + } + tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index c1fe67c87c5d..3800852ccc3e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -80,6 +80,16 @@ pub(crate) struct ValueReconstructState { pub(crate) img: Option<(Lsn, Bytes)>, } +impl ValueReconstructState { + /// Returns the number of page deltas applied to the page image. + pub fn num_deltas(&self) -> usize { + match self.img { + Some(_) => self.records.len(), + None => self.records.len() - 1, // omit will_init record + } + } +} + #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub(crate) enum ValueReconstructSituation { Complete, diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 22d8b81bccfd..7da51c27df99 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -166,6 +166,10 @@ impl BatchLayerWriter { // END: catch every error and do the recovery in the above section Ok(generated_layers) } + + pub fn pending_layer_num(&self) -> usize { + self.generated_layer_writers.len() + } } /// An image writer that takes images and produces multiple image layers. diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 99e0ff1aa5df..92313afba7b7 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -340,7 +340,7 @@ impl Layer { /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. 
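// A simplified sketch (hypothetical record type) of the num_deltas() convention
// above: when there is no base image, the first record must be a will_init record
// that produces the page, so it is not counted as an applied delta.
struct ReconstructState { records: Vec<String>, img: Option<Vec<u8>> }

impl ReconstructState {
    fn num_deltas(&self) -> usize {
        match self.img {
            Some(_) => self.records.len(),
            None => self.records.len() - 1, // omit the will_init record
        }
    }
}

fn main() {
    let with_image = ReconstructState { records: vec!["d1".into(), "d2".into()], img: Some(vec![]) };
    let without_image = ReconstructState { records: vec!["init".into(), "d1".into()], img: None };
    assert_eq!(with_image.num_deltas(), 2);
    assert_eq!(without_image.num_deltas(), 1);
}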
- pub(crate) async fn download(&self) -> anyhow::Result<()> { + pub(crate) async fn download(&self) -> Result<(), DownloadError> { self.0.get_or_maybe_download(true, None).await?; Ok(()) } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 3725e2f7fcbc..d65f0991820d 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -11,6 +11,7 @@ use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::throttle::Stats; +use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; use rand::Rng; @@ -206,11 +207,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { .run(tenant.compaction_iteration(&cancel, &ctx)) .await; match output { - Ok(has_pending_task) => { + Ok(outcome) => { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if has_pending_task { - Duration::ZERO + sleep_duration = if let CompactionOutcome::Pending = outcome { + Duration::from_secs(1) } else { period }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b4b30fcd23e2..b6a349a20913 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -18,6 +18,7 @@ use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; use camino::Utf8Path; use chrono::{DateTime, Utc}; +use compaction::CompactionOutcome; use enumset::EnumSet; use fail::fail_point; use futures::{stream::FuturesUnordered, StreamExt}; @@ -51,6 +52,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::rate_limit::RateLimit; use utils::{ fs_ext, guard_arc_swap::GuardArcSwap, @@ -115,7 +117,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::TimelineMetrics; +use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -188,6 +190,19 @@ pub enum ImageLayerCreationMode { Initial, } +#[derive(Clone, Debug, Default)] +pub enum LastImageLayerCreationStatus { + Incomplete { + /// The last key of the partition (exclusive) that was processed in the last + /// image layer creation attempt. We will continue from this key in the next + /// attempt. + last_key: Key, + }, + Complete, + #[default] + Initial, +} + impl std::fmt::Display for ImageLayerCreationMode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self) @@ -340,10 +355,14 @@ pub struct Timeline { // Needed to ensure that we can't create a branch at a point that was already garbage collected pub latest_gc_cutoff_lsn: Rcu, + pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>, + // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. pub(crate) gc_info: std::sync::RwLock, + pub(crate) last_image_layer_creation_status: ArcSwap, + // It may change across major versions so for simplicity // keep it after running initdb for a timeline. 
// It is needed in checks when we want to error on some operations @@ -933,9 +952,16 @@ pub(crate) enum ShutdownMode { Hard, } -struct ImageLayerCreationOutcome { - unfinished_image_layer: Option, - next_start_key: Key, +enum ImageLayerCreationOutcome { + /// We generated an image layer + Generated { + unfinished_image_layer: ImageLayerWriter, + }, + /// The key range is empty + Empty, + /// (Only used in metadata image layer creation), after reading the metadata keys, we decide to skip + /// the image layer creation. + Skip, } /// Public interface functions @@ -1044,7 +1070,7 @@ impl Timeline { } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; - pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; + pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN /// @@ -1194,6 +1220,7 @@ impl Timeline { return (key, Err(err)); } }; + DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64); // The walredo module expects the records to be descending in terms of Lsn. // And we submit the IOs in that order, so, there shuold be no need to sort here. @@ -1221,25 +1248,28 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { - let avg = layers_visited as f64 / results.len() as f64; - if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = + // Record the total number of layers visited towards each key in the batch. While some + // layers may not intersect with a given read, and the cost of layer visits are + // amortized across the batch, each visited layer contributes directly to the observed + // latency for every read in the batch, which is what we care about. + if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { + static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { + LOG_PACER.lock().unwrap().call(|| { + let num_keys = keyspace.total_raw_size(); + let num_pages = results.len(); tracing::info!( shard_id = %self.tenant_shard_id.shard_slug(), lsn = %lsn, - "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", - keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", + ); }); } - // Note that this is an approximation. Tracking the exact number of layers visited - // per key requires virtually unbounded memory usage and is inefficient - // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); + for _ in &results { + self.metrics.layers_per_read.observe(layers_visited as f64); + LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); + } } Ok(results) @@ -1655,7 +1685,7 @@ impl Timeline { cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result { + ) -> Result { self.compact_with_options( cancel, CompactOptions { @@ -1677,7 +1707,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result { + ) -> Result { // most likely the cancellation token is from background task, but in tests it could be the // request task as well. @@ -1697,8 +1727,8 @@ impl Timeline { // compaction task goes over it's period (20s) which is quite often in production. 
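// A self-contained sketch (assuming once_cell; RateLimit here is a hypothetical
// stand-in with the same new()/call() shape as utils::rate_limit::RateLimit) of
// the pacing used for the high-read-amplification warning above: at most one log
// line per interval, shared process-wide through a static.
use once_cell::sync::Lazy;
use std::sync::Mutex;
use std::time::{Duration, Instant};

struct RateLimit { interval: Duration, last: Option<Instant> }

impl RateLimit {
    fn new(interval: Duration) -> Self { Self { interval, last: None } }
    fn call(&mut self, f: impl FnOnce()) {
        let now = Instant::now();
        if self.last.map_or(true, |t| now.duration_since(t) >= self.interval) {
            self.last = Some(now);
            f();
        }
    }
}

static LOG_PACER: Lazy<Mutex<RateLimit>> =
    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));

fn main() {
    for _ in 0..3 {
        LOG_PACER.lock().unwrap().call(|| println!("read visited many layers"));
    }
    // Prints once; the two follow-up calls inside the 60s window are suppressed.
}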
let (_guard, _permit) = tokio::select! { tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(false), - _ = cancel.cancelled() => return Ok(false), + _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done), + _ = cancel.cancelled() => return Ok(CompactionOutcome::Done), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1706,13 +1736,13 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(false); + return Ok(CompactionOutcome::Done); } let result = match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => { self.compact_tiered(cancel, ctx).await?; - Ok(false) + Ok(CompactionOutcome::Done) } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await, }; @@ -1811,7 +1841,7 @@ impl Timeline { self.last_record_lsn.shutdown(); if let ShutdownMode::FreezeAndFlush = mode { - if let Some((open, frozen)) = self + let do_flush = if let Some((open, frozen)) = self .layers .read() .await @@ -1820,43 +1850,54 @@ impl Timeline { .ok() .filter(|(open, frozen)| *open || *frozen > 0) { - tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + if self.remote_client.is_archived() == Some(true) { + // No point flushing on shutdown for an archived timeline: it is not important + // to have it nice and fresh after our restart, and trying to flush here might + // race with trying to offload it (which also stops the flush loop) + false + } else { + tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + true + } } else { - // this is double-shutdown, ignore it - } + // this is double-shutdown, it'll be a no-op + true + }; // we shut down walreceiver above, so, we won't add anything more // to the InMemoryLayer; freeze it and wait for all frozen layers // to reach the disk & upload queue, then shut the upload queue and // wait for it to drain. - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - self.remote_client.shutdown().await; - } - Err(FlushLayerError::Cancelled) => { - // this is likely the second shutdown, ignore silently. - // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 - debug_assert!(self.cancel.is_cancelled()); - } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to freeze and flush: {e:#}"); + if do_flush { + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? 
+ self.remote_client.shutdown().await; + } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); + } } - } - // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but - // we also do a final check here to ensure that the queue is empty. - if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but + // we also do a final check here to ensure that the queue is empty. + if !self.remote_client.no_pending_work() { + warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + } } } @@ -2021,8 +2062,16 @@ impl Timeline { pub(crate) async fn download_layer( &self, layer_file_name: &LayerName, - ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await? else { + ) -> Result, super::storage_layer::layer::DownloadError> { + let Some(layer) = self + .find_layer(layer_file_name) + .await + .map_err(|e| match e { + layer_manager::Shutdown => { + super::storage_layer::layer::DownloadError::TimelineShutdown + } + })? + else { return Ok(None); }; @@ -2323,6 +2372,18 @@ impl Timeline { ) } + fn get_image_creation_preempt_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_creation_preempt_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_creation_preempt_threshold, + ) + } + /// Resolve the effective WAL receiver protocol to use for this tenant. /// /// Priority order is: @@ -2432,6 +2493,7 @@ impl Timeline { shard_identity, pg_version, layers: Default::default(), + gc_compaction_layer_update_lock: tokio::sync::RwLock::new(()), walredo_mgr, walreceiver: Mutex::new(None), @@ -2472,6 +2534,10 @@ impl Timeline { gc_info: std::sync::RwLock::new(GcInfo::default()), + last_image_layer_creation_status: ArcSwap::new(Arc::new( + LastImageLayerCreationStatus::default(), + )), + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -3475,6 +3541,9 @@ impl Timeline { // image layer). let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn(); + // See `compaction::compact_with_gc` for why we need this. + let _guard = timeline.gc_compaction_layer_update_lock.read().await; + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); @@ -4012,15 +4081,20 @@ impl Timeline { } let mut layers_to_upload = Vec::new(); - layers_to_upload.extend( - self.create_image_layers( + let (generated_image_layers, is_complete) = self + .create_image_layers( &partitions, self.initdb_lsn, ImageLayerCreationMode::Initial, ctx, + LastImageLayerCreationStatus::Initial, ) - .await?, + .await?; + debug_assert!( + matches!(is_complete, LastImageLayerCreationStatus::Complete), + "init image generation mode must fully cover the keyspace" ); + layers_to_upload.extend(generated_image_layers); (layers_to_upload, None) } else { @@ -4277,7 +4351,7 @@ impl Timeline { Ok(result) } - // Is it time to create a new image layer for the given partition? 
+ // Is it time to create a new image layer for the given partition? True if we want to generate. async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { let threshold = self.get_image_creation_threshold(); @@ -4340,7 +4414,6 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, img_range: Range, - start: Key, io_concurrency: IoConcurrency, ) -> Result { let mut wrote_keys = false; @@ -4428,26 +4501,23 @@ impl Timeline { lsn }, ); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: Some(image_layer_writer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } /// Create an image layer for metadata keys. This function produces one image layer for all metadata /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it /// would not be too large to fit in a single image layer. + /// + /// Creating image layers for metadata keys are different from relational keys. Firstly, instead of + /// iterating each key and get an image for each of them, we do a `vectored_get` scan over the sparse + /// keyspace to get all images in one run. Secondly, we use a different image layer generation metrics + /// for metadata keys than relational keys, which is the number of delta files visited during the scan. #[allow(clippy::too_many_arguments)] async fn create_image_layer_for_metadata_keys( self: &Arc, @@ -4457,12 +4527,13 @@ impl Timeline { ctx: &RequestContext, img_range: Range, mode: ImageLayerCreationMode, - start: Key, io_concurrency: IoConcurrency, ) -> Result { // Metadata keys image layer creation. let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let begin = Instant::now(); + // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should + // not contain too many keys, otherwise this takes a lot of memory. let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -4487,10 +4558,7 @@ impl Timeline { ); if !trigger_generation && mode == ImageLayerCreationMode::Try { - return Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: img_range.end, - }); + return Ok(ImageLayerCreationOutcome::Skip); } if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4521,20 +4589,12 @@ impl Timeline { lsn } ); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: Some(image_layer_writer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. 
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } @@ -4590,6 +4650,8 @@ impl Timeline { decision } + /// Returns the image layers generated and an enum indicating whether the process is fully completed. + /// true = we have generate all image layers, false = we preempt the process for L0 compaction. #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4597,9 +4659,15 @@ impl Timeline { lsn: Lsn, mode: ImageLayerCreationMode, ctx: &RequestContext, - ) -> Result, CreateImageLayersError> { + last_status: LastImageLayerCreationStatus, + ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); + if partitioning.parts.is_empty() { + warn!("no partitions to create image layers for"); + return Ok((vec![], LastImageLayerCreationStatus::Complete)); + } + // We need to avoid holes between generated image layers. // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one // image layer with hole between them. In this case such layer can not be utilized by GC. @@ -4611,15 +4679,65 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; - let check_for_image_layers = self.should_check_if_image_layers_required(lsn); + let check_for_image_layers = + if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status { + info!( + "resuming image layer creation: last_status=incomplete, continue from {}", + last_key + ); + true + } else { + self.should_check_if_image_layers_required(lsn) + }; let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; - for partition in partitioning.parts.iter() { + let mut all_generated = true; + + let mut partition_processed = 0; + let mut total_partitions = partitioning.parts.len(); + let mut last_partition_processed = None; + let mut partition_parts = partitioning.parts.clone(); + + if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status { + // We need to skip the partitions that have already been processed. + let mut found = false; + for (i, partition) in partition_parts.iter().enumerate() { + if last_key <= partition.end().unwrap() { + // ```plain + // |------|--------|----------|------| + // ^last_key + // ^start from this partition + // ``` + // Why `i+1` instead of `i`? + // It is possible that the user did some writes after the previous image layer creation attempt so that + // a relation grows in size, and the last_key is now in the middle of the partition. In this case, we + // still want to skip this partition, so that we can make progress and avoid generating image layers over + // the same partition. Doing a mod to ensure we don't end up with an empty vec. + if i + 1 >= total_partitions { + // In general, this case should not happen -- if last_key is on the last partition, the previous + // iteration of image layer creation should return a complete status. + break; // with found=false + } + partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements + total_partitions = partition_parts.len(); + // Update the start key to the partition start. 
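+ // e.g. with partitions ending at keys 10, 20 and 30 and last_key = 12, we skip past the
+ // partition containing key 12 and resume from the partition that starts at key 20.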
+ start = partition_parts[0].start().unwrap(); + found = true; + break; + } + } + if !found { + // Last key is within the last partition, or larger than all partitions. + return Ok((vec![], LastImageLayerCreationStatus::Complete)); + } + } + + for partition in partition_parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); } - + partition_processed += 1; let img_range = start..partition.ranges.last().unwrap().end; let compact_metadata = partition.overlaps(&Key::metadata_key_range()); if compact_metadata { @@ -4654,6 +4772,8 @@ impl Timeline { lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), is_delta: false, }) { + // TODO: this can be processed with the BatchLayerWriter::finish_with_discard + // in the future. tracing::info!( "Skipping image layer at {lsn} {}..{}, already exists", img_range.start, @@ -4687,17 +4807,13 @@ impl Timeline { .map_err(|_| CreateImageLayersError::Cancelled)?, ); - let ImageLayerCreationOutcome { - unfinished_image_layer, - next_start_key, - } = if !compact_metadata { + let outcome = if !compact_metadata { self.create_image_layer_for_rel_blocks( partition, image_layer_writer, lsn, ctx, img_range.clone(), - start, io_concurrency, ) .await? @@ -4709,18 +4825,58 @@ impl Timeline { ctx, img_range.clone(), mode, - start, io_concurrency, ) .await? }; - start = next_start_key; - if let Some(unfinished_image_layer) = unfinished_image_layer { - batch_image_writer.add_unfinished_image_writer( + match outcome { + ImageLayerCreationOutcome::Empty => { + // No data in this partition, so we don't need to create an image layer (for now). + // The next image layer should cover this key range, so we don't advance the `start` + // key. + } + ImageLayerCreationOutcome::Generated { unfinished_image_layer, - img_range, - lsn, - ); + } => { + batch_image_writer.add_unfinished_image_writer( + unfinished_image_layer, + img_range.clone(), + lsn, + ); + // The next image layer should be generated right after this one. + start = img_range.end; + } + ImageLayerCreationOutcome::Skip => { + // We don't need to create an image layer for this partition. + // The next image layer should NOT cover this range, otherwise + // the keyspace becomes empty (reads don't go past image layers). + start = img_range.end; + } + } + + if let ImageLayerCreationMode::Try = mode { + // We have at least made some progress + if batch_image_writer.pending_layer_num() >= 1 { + // The `Try` mode is currently only used on the compaction path. We want to avoid + // image layer generation taking too long time and blocking L0 compaction. So in this + // mode, we also inspect the current number of L0 layers and skip image layer generation + // if there are too many of them. + let num_of_l0_layers = { + let layers = self.layers.read().await; + layers.layer_map()?.level0_deltas().len() + }; + let image_preempt_threshold = self.get_image_creation_preempt_threshold() + * self.get_compaction_threshold(); + if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { + tracing::info!( + "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}", + partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers + ); + last_partition_processed = Some(partition.clone()); + all_generated = false; + break; + } + } } } @@ -4735,14 +4891,42 @@ impl Timeline { .open_mut()? 
.track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); - timer.stop_and_record(); + let duration = timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered if !image_layers.is_empty() { self.update_layer_visibility().await?; } - Ok(image_layers) + let total_layer_size = image_layers + .iter() + .map(|l| l.metadata().file_size) + .sum::(); + + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + + Ok(( + image_layers, + if all_generated { + LastImageLayerCreationStatus::Complete + } else { + LastImageLayerCreationStatus::Incomplete { + last_key: if let Some(last_partition_processed) = last_partition_processed { + last_partition_processed.end().unwrap_or(Key::MIN) + } else { + // This branch should be unreachable, but in case it happens, we can just return the start key. + Key::MIN + }, + } + }, + )) } /// Wait until the background initial logical size calculation is complete, or diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5f7b5f1af584..cfde0704424f 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,7 +11,7 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - RecordedDuration, Timeline, + LastImageLayerCreationStatus, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -33,6 +33,7 @@ use crate::page_cache; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; +use crate::tenant::layer_map::LayerMap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -262,13 +263,13 @@ impl GcCompactionQueue { ctx: &RequestContext, gc_block: &GcBlock, timeline: &Arc, - ) -> Result { + ) -> Result { let _one_op_at_a_time_guard = self.consumer_lock.lock().await; let has_pending_tasks; let (id, item) = { let mut guard = self.inner.lock().unwrap(); let Some((id, item)) = guard.queued.pop_front() else { - return Ok(false); + return Ok(CompactionOutcome::Done); }; guard.running = Some((id, item.clone())); has_pending_tasks = !guard.queued.is_empty(); @@ -323,7 +324,11 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.running = None; } - Ok(has_pending_tasks) + Ok(if has_pending_tasks { + CompactionOutcome::Pending + } else { + CompactionOutcome::Done + }) } #[allow(clippy::type_complexity)] @@ -434,6 +439,11 @@ impl KeyHistoryRetention { if dry_run { return true; } + if LayerMap::is_l0(&key.key_range, key.is_delta) { + // gc-compaction should not produce L0 deltas, otherwise it will break the layer order. + // We should ignore such layers. + return true; + } let layer_generation; { let guard = tline.layers.read().await; @@ -589,6 +599,17 @@ impl CompactionStatistics { } } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] +pub enum CompactionOutcome { + #[default] + /// No layers need to be compacted after this round. Compaction doesn't need + /// to be immediately scheduled. + Done, + /// Still has pending layers to be compacted after this round. 
Ideally, the scheduler + /// should immediately schedule another compaction. + Pending, +} + impl Timeline { /// TODO: cancellation /// @@ -598,7 +619,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result { + ) -> Result { if options .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) @@ -606,7 +627,7 @@ impl Timeline { self.compact_with_gc(cancel, options, ctx) .await .map_err(CompactionError::Other)?; - return Ok(false); + return Ok(CompactionOutcome::Done); } if options.flags.contains(CompactFlags::DryRun) { @@ -666,9 +687,9 @@ impl Timeline { // Define partitioning schema if needed // 1. L0 Compact - let fully_compacted = { + let l0_compaction_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); - let fully_compacted = self + let l0_compaction_outcome = self .compact_level0( target_file_size, options.flags.contains(CompactFlags::ForceL0Compaction), @@ -676,15 +697,15 @@ impl Timeline { ) .await?; timer.stop_and_record(); - fully_compacted + l0_compaction_outcome }; - if !fully_compacted { + if let CompactionOutcome::Pending = l0_compaction_outcome { // Yield and do not do any other kind of compaction. True means // that we have pending L0 compaction tasks and the compaction scheduler // will prioritize compacting this tenant/timeline again. info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); - return Ok(true); + return Ok(CompactionOutcome::Pending); } // 2. Repartition and create image layers if necessary @@ -709,7 +730,7 @@ impl Timeline { .extend(sparse_partitioning.into_dense().parts); // 3. Create new image layers for partitions that have been modified "enough". - let image_layers = self + let (image_layers, outcome) = self .create_image_layers( &partitioning, lsn, @@ -722,10 +743,22 @@ impl Timeline { ImageLayerCreationMode::Try }, &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), ) .await?; + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { + // Yield and do not do any other kind of compaction. + info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + return Ok(CompactionOutcome::Pending); + } partitioning.parts.len() } Err(err) => { @@ -753,7 +786,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(false) + Ok(CompactionOutcome::Done) } /// Check for layers that are elegible to be rewritten: @@ -1010,11 +1043,11 @@ impl Timeline { target_file_size: u64, force_compaction_ignore_threshold: bool, ctx: &RequestContext, - ) -> Result { + ) -> Result { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - fully_compacted, + outcome, } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); @@ -1043,12 +1076,12 @@ impl Timeline { if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do - return Ok(true); + return Ok(CompactionOutcome::Done); } self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) .await?; - Ok(fully_compacted) + Ok(outcome) } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. 
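As a rough illustration of how a caller might consume the new `CompactionOutcome` return value, here is a minimal driver-loop sketch. It is not part of the patch itself; `drive_compaction`, the closure-based `pass`, and the sleep intervals are assumptions made only for the example.

use std::time::Duration;

// Sketch: drive repeated compaction passes based on the reported outcome.
// `pass()` stands in for one call into `Timeline::compact_legacy`.
async fn drive_compaction<F, Fut, E>(mut pass: F)
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<CompactionOutcome, E>>,
    E: std::fmt::Display,
{
    loop {
        match pass().await {
            // More L0s queued (or image layer creation was preempted): go again immediately.
            Ok(CompactionOutcome::Pending) => continue,
            // Nothing pending: wait for the regular compaction period before the next pass.
            Ok(CompactionOutcome::Done) => tokio::time::sleep(Duration::from_secs(20)).await,
            Err(e) => {
                eprintln!("compaction pass failed: {e}");
                tokio::time::sleep(Duration::from_secs(2)).await;
            }
        }
    }
}

The enum's own doc comments capture the contract the sketch relies on: `Pending` asks the scheduler to run another round right away, while `Done` lets it fall back to the periodic trigger.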
@@ -1503,11 +1536,9 @@ impl Timeline { .await .map_err(CompactionError::Other)?; } else { - let shard = self.shard_identity.shard_index(); let owner = self.shard_identity.get_shard_number(&key); - if cfg!(debug_assertions) { - panic!("key {key} does not belong on shard {shard}, owned by {owner}"); - } + + // This happens after a shard split, when we're compacting an L0 created by our parent shard debug!("dropping key {key} during compaction (it belongs on shard {owner})"); } @@ -1592,7 +1623,11 @@ impl Timeline { .into_iter() .map(|x| x.drop_eviction_guard()) .collect::>(), - fully_compacted, + outcome: if fully_compacted { + CompactionOutcome::Done + } else { + CompactionOutcome::Pending + }, }) } } @@ -1603,7 +1638,7 @@ struct CompactLevel0Phase1Result { deltas_to_compact: Vec, // Whether we have included all L0 layers, or selected only part of them due to the // L0 compaction size limit. - fully_compacted: bool, + outcome: CompactionOutcome, } #[derive(Default)] @@ -2919,10 +2954,45 @@ impl Timeline { // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only // operate on L1 layers. { + // Gc-compaction will rewrite the history of a key. This could happen in two ways: + // + // 1. We create an image layer to replace all the deltas below the compact LSN. In this case, assume + // we have 2 delta layers A and B, both below the compact LSN. We create an image layer I to replace + // A and B at the compact LSN. If the read path finishes reading A, yields, and now we update the layer + // map, the read path then cannot find any keys below A, reporting a missing key error, while the key + // now gets stored in I at the compact LSN. + // + // --------------- --------------- + // delta1@LSN20 image1@LSN20 + // --------------- (read path collects delta@LSN20, => --------------- (read path cannot find anything + // delta1@LSN10 yields) below LSN 20) + // --------------- + // + // 2. We create a delta layer to replace all the deltas below the compact LSN, and in the delta layers, + // we combines the history of a key into a single image. For example, we have deltas at LSN 1, 2, 3, 4, + // Assume one delta layer contains LSN 1, 2, 3 and the other contains LSN 4. + // + // We let gc-compaction combine delta 2, 3, 4 into an image at LSN 4, which produces a delta layer that + // contains the delta at LSN 1, the image at LSN 4. If the read path finishes reading the original delta + // layer containing 4, yields, and we update the layer map to put the delta layer. + // + // --------------- --------------- + // delta1@LSN4 image1@LSN4 + // --------------- (read path collects delta@LSN4, => --------------- (read path collects LSN4 and LSN1, + // delta1@LSN1-3 yields) delta1@LSN1 which is an invalid history) + // --------------- --------------- + // + // Therefore, the gc-compaction layer update operation should wait for all ongoing reads, block all pending reads, + // and only allow reads to continue after the update is finished. + + let update_guard = self.gc_compaction_layer_update_lock.write().await; + // Acquiring the update guard ensures current read operations end and new read operations are blocked. + // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect? let mut guard = self.layers.write().await; guard .open_mut()? 
- .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) + .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics); + drop(update_guard); // Allow new reads to start ONLY after we finished updating the layer map. }; // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json. @@ -3199,11 +3269,7 @@ impl TimelineAdaptor { ranges: self.get_keyspace(key_range, lsn, ctx).await?, }; // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly - let start = Key::MIN; - let ImageLayerCreationOutcome { - unfinished_image_layer, - next_start_key: _, - } = self + let outcome = self .timeline .create_image_layer_for_rel_blocks( &keyspace, @@ -3211,13 +3277,15 @@ impl TimelineAdaptor { lsn, ctx, key_range.clone(), - start, IoConcurrency::sequential(), ) .await?; - if let Some(image_layer_writer) = unfinished_image_layer { - let (desc, path) = image_layer_writer.finish(ctx).await?; + if let ImageLayerCreationOutcome::Generated { + unfinished_image_layer, + } = outcome + { + let (desc, path) = unfinished_image_layer.finish(ctx).await?; let image_layer = Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_images.push(image_layer); diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 80a09b4840d0..3074463384fb 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -1,4 +1,4 @@ -use std::{collections::hash_map::Entry, fs, sync::Arc}; +use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; @@ -8,7 +8,8 @@ use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; use crate::{ context::RequestContext, import_datadir, - tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, + span::debug_assert_current_span_has_tenant_and_timeline_id, + tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, }; use super::Timeline; @@ -24,6 +25,9 @@ pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, raw_timeline: Option<(Arc, TimelineCreateGuard)>, + /// Whether we spawned the inner Timeline's tasks such that we must later shut it down + /// if aborting the timeline creation + needs_shutdown: bool, } impl<'t> UninitializedTimeline<'t> { @@ -36,6 +40,50 @@ impl<'t> UninitializedTimeline<'t> { owning_tenant, timeline_id, raw_timeline, + needs_shutdown: false, + } + } + + /// When writing data to this timeline during creation, use this wrapper: it will take care of + /// setup of Timeline tasks required for I/O (flush loop) and making sure they are torn down + /// later. 
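+ ///
+ /// Intended call pattern (a sketch; `import_initial_data` is a hypothetical helper returning
+ /// `Result<(), CreateTimelineError>`):
+ ///
+ /// ```ignore
+ /// uninitialized_timeline
+ /// .write(|timeline| async move { import_initial_data(&timeline, ctx).await })
+ /// .await?;
+ /// ```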
+ pub(crate) async fn write(&mut self, f: F) -> anyhow::Result<()> + where + F: FnOnce(Arc) -> Fut, + Fut: Future>, + { + debug_assert_current_span_has_tenant_and_timeline_id(); + + // Remember that we did I/O (spawned the flush loop), so that we can check we shut it down on drop + self.needs_shutdown = true; + + let timeline = self.raw_timeline()?; + + // Spawn flush loop so that the Timeline is ready to accept writes + timeline.maybe_spawn_flush_loop(); + + // Invoke the provided function, which will write some data into the new timeline + if let Err(e) = f(timeline.clone()).await { + self.abort().await; + return Err(e.into()); + } + + // Flush the underlying timeline's ephemeral layers to disk + if let Err(e) = timeline + .freeze_and_flush() + .await + .context("Failed to flush after timeline creation writes") + { + self.abort().await; + return Err(e); + } + + Ok(()) + } + + pub(crate) async fn abort(&self) { + if let Some((raw_timeline, _)) = self.raw_timeline.as_ref() { + raw_timeline.shutdown(super::ShutdownMode::Hard).await; } } @@ -44,11 +92,13 @@ impl<'t> UninitializedTimeline<'t> { /// This function launches the flush loop if not already done. /// /// The caller is responsible for activating the timeline (function `.activate()`). - pub(crate) fn finish_creation(mut self) -> anyhow::Result> { + pub(crate) async fn finish_creation(mut self) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_shard_id = self.owning_tenant.tenant_shard_id; if self.raw_timeline.is_none() { + self.abort().await; + return Err(anyhow::anyhow!( "No timeline for initialization found for {tenant_shard_id}/{timeline_id}" )); @@ -62,16 +112,25 @@ impl<'t> UninitializedTimeline<'t> { .0 .get_disk_consistent_lsn(); - anyhow::ensure!( - new_disk_consistent_lsn.is_valid(), - "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" - ); + if !new_disk_consistent_lsn.is_valid() { + self.abort().await; + + return Err(anyhow::anyhow!( + "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" + )); + } let mut timelines = self.owning_tenant.timelines.lock().unwrap(); match timelines.entry(timeline_id) { - Entry::Occupied(_) => anyhow::bail!( + Entry::Occupied(_) => { + // Unexpected, bug in the caller. Tenant is responsible for preventing concurrent creation of the same timeline. + // + // We do not call Self::abort here. Because we don't cleanly shut down our Timeline, [`Self::drop`] should + // skip trying to delete the timeline directory too. + anyhow::bail!( "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" - ), + ) + } Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not // cleanup after and would block for example the tenant deletion @@ -93,36 +152,31 @@ impl<'t> UninitializedTimeline<'t> { /// Prepares timeline data by loading it from the basebackup archive. 
pub(crate) async fn import_basebackup_from_tar( - self, + mut self, tenant: Arc, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> anyhow::Result> { - let raw_timeline = self.raw_timeline()?; + self.write(|raw_timeline| async move { + import_datadir::import_basebackup_from_tar(&raw_timeline, copyin_read, base_lsn, ctx) + .await + .context("Failed to import basebackup") + .map_err(CreateTimelineError::Other)?; - import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx) - .await - .context("Failed to import basebackup")?; - - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - raw_timeline.maybe_spawn_flush_loop(); - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - anyhow::bail!("failpoint before-checkpoint-new-timeline"); - }); + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); - raw_timeline - .freeze_and_flush() - .await - .context("Failed to flush after basebackup import")?; + Ok(()) + }) + .await?; // All the data has been imported. Insert the Timeline into the tenant's timelines map - let tl = self.finish_creation()?; + let tl = self.finish_creation().await?; tl.activate(tenant, broker_client, None, ctx); Ok(tl) } @@ -143,12 +197,19 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { - if let Some((_, create_guard)) = self.raw_timeline.take() { + if let Some((timeline, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); - // This is unusual, but can happen harmlessly if the pageserver is stopped while - // creating a timeline. - info!("Timeline got dropped without initializing, cleaning its files"); - cleanup_timeline_directory(create_guard); + if self.needs_shutdown && !timeline.gate.close_complete() { + // This should not happen: caller should call [`Self::abort`] on failures + tracing::warn!( + "Timeline not shut down after initialization failure, cannot clean up files" + ); + } else { + // This is unusual, but can happen harmlessly if the pageserver is stopped while + // creating a timeline. + info!("Timeline got dropped without initializing, cleaning its files"); + cleanup_timeline_directory(create_guard); + } } } } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d69e7dbd3245..de917377cb54 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -355,6 +355,19 @@ pub(super) async fn handle_walreceiver_connection( // advances it to its end LSN. 0 is just an initialization placeholder. 
let mut modification = timeline.begin_modification(Lsn(0)); + async fn commit( + modification: &mut DatadirModification<'_>, + ctx: &RequestContext, + uncommitted: &mut u64, + ) -> anyhow::Result<()> { + let stats = modification.stats(); + modification.commit(ctx).await?; + WAL_INGEST.records_committed.inc_by(*uncommitted); + WAL_INGEST.inc_values_committed(&stats); + *uncommitted = 0; + Ok(()) + } + if !records.is_empty() { timeline .metrics @@ -366,8 +379,7 @@ pub(super) async fn handle_walreceiver_connection( if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 { - modification.commit(&ctx).await?; - uncommitted_records = 0; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } let local_next_record_lsn = interpreted.next_record_lsn; @@ -396,8 +408,7 @@ pub(super) async fn handle_walreceiver_connection( || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { - modification.commit(&ctx).await?; - uncommitted_records = 0; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } } @@ -415,7 +426,7 @@ pub(super) async fn handle_walreceiver_connection( if uncommitted_records > 0 || needs_last_record_lsn_advance { // Commit any uncommitted records - modification.commit(&ctx).await?; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } if !caught_up && streaming_lsn >= end_of_wal { @@ -442,10 +453,12 @@ pub(super) async fn handle_walreceiver_connection( filtered: &mut u64, ctx: &RequestContext, ) -> anyhow::Result<()> { + let stats = modification.stats(); + modification.commit(ctx).await?; WAL_INGEST .records_committed .inc_by(*uncommitted - *filtered); - modification.commit(ctx).await?; + WAL_INGEST.inc_values_committed(&stats); *uncommitted = 0; *filtered = 0; Ok(()) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 8a7f4a4bf5fd..9d539198c7ae 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -234,6 +234,19 @@ impl VirtualFile { ) -> (FullSlice, Result) { self.inner.write_all(buf, ctx).await } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + self.inner.read_to_end(buf, ctx).await + } + + pub(crate) async fn read_to_string( + &mut self, + ctx: &RequestContext, + ) -> Result { + let mut buf = Vec::new(); + self.read_to_end(&mut buf, ctx).await?; + Ok(String::from_utf8(buf)?) 
+ } } /// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing @@ -993,6 +1006,24 @@ impl VirtualFileInner { (buf, result) }) } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + let mut tmp = vec![0; 128]; + loop { + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, ctx).await; + match res { + Ok(0) => return Ok(()), + Ok(n) => { + self.pos += n as u64; + buf.extend_from_slice(&slice[..n]); + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + tmp = slice.into_inner(); + } + } } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 @@ -1237,10 +1268,6 @@ impl VirtualFile { ) -> Result, std::io::Error> { self.inner.read_blk(blknum, ctx).await } - - async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - self.inner.read_to_end(buf, ctx).await - } } #[cfg(test)] @@ -1260,24 +1287,6 @@ impl VirtualFileInner { slice.into_inner(), )) } - - async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - let mut tmp = vec![0; 128]; - loop { - let slice = tmp.slice(..128); - let (slice, res) = self.read_at(slice, self.pos, ctx).await; - match res { - Ok(0) => return Ok(()), - Ok(n) => { - self.pos += n as u64; - buf.extend_from_slice(&slice[..n]); - } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), - } - tmp = slice.into_inner(); - } - } } impl Drop for VirtualFileInner { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e0283d99e0fb..04edb3e3f47b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -28,17 +28,9 @@ use std::time::Duration; use std::time::Instant; use std::time::SystemTime; -use pageserver_api::shard::ShardIdentity; -use postgres_ffi::fsm_logical_to_physical; -use postgres_ffi::walrecord::*; -use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; -use wal_decoder::models::*; - use anyhow::{bail, Result}; use bytes::{Buf, Bytes}; use tracing::*; -use utils::failpoint_support; -use utils::rate_limit::RateLimit; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; @@ -50,11 +42,18 @@ use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use postgres_ffi::fsm_logical_to_physical; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::walrecord::*; use postgres_ffi::TransactionId; +use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; +use utils::rate_limit::RateLimit; +use utils::{critical, failpoint_support}; +use wal_decoder::models::*; enum_pgversion! {CheckPoint, pgv::CheckPoint} @@ -327,93 +326,75 @@ impl WalIngest { let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - // Sometimes, Postgres seems to create heap WAL records with the - // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is - // not set. In fact, it's possible that the VM page does not exist at all. 
- // In that case, we don't want to store a record to clear the VM bit; - // replaying it would fail to find the previous image of the page, because - // it doesn't exist. So check if the VM page(s) exist, and skip the WAL - // record if it doesn't. - // - // TODO: analyze the metrics and tighten this up accordingly. This logic - // implicitly assumes that VM pages see explicit WAL writes before - // implicit ClearVmBits, and will otherwise silently drop updates. + // VM bits can only be cleared on the shard(s) owning the VM relation, and must be within + // its view of the VM relation size. Out of caution, error instead of failing WAL ingestion, + // as there has historically been cases where PostgreSQL has cleared spurious VM pages. See: + // https://github.com/neondatabase/neon/pull/10634. let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["relation"]) - .inc(); + critical!("clear_vm_bits for unknown VM relation {vm_rel}"); return Ok(()); }; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["new_page"]) - .inc(); + critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); new_vm_blk = None; } } if let Some(blknum) = old_vm_blk { if blknum >= vm_size { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["old_page"]) - .inc(); + critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); old_vm_blk = None; } } - if new_vm_blk.is_some() || old_vm_blk.is_some() { - if new_vm_blk == old_vm_blk { - // An UPDATE record that needs to clear the bits for both old and the - // new page, both of which reside on the same VM page. + if new_vm_blk.is_none() && old_vm_blk.is_none() { + return Ok(()); + } else if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the new page, both of + // which reside on the same VM page. + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk.unwrap(), + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + }, + ctx, + ) + .await?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on different VM pages. + if let Some(new_vm_blk) = new_vm_blk { self.put_rel_wal_record( modification, vm_rel, - new_vm_blk.unwrap(), + new_vm_blk, NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, + old_heap_blkno: None, + flags, + }, + ctx, + ) + .await?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + old_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, old_heap_blkno, flags, }, ctx, ) .await?; - } else { - // Clear VM bits for one heap page, or for two pages that reside on - // different VM pages. 
- if let Some(new_vm_blk) = new_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - new_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno: None, - flags, - }, - ctx, - ) - .await?; - } - if let Some(old_vm_blk) = old_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - old_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno: None, - old_heap_blkno, - flags, - }, - ctx, - ) - .await?; - } } } - Ok(()) } diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 08b76521756c..01da61f84b56 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -563,8 +563,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, LWLockRelease(lfc_lock); -#if USE_ASSERT_CHECKING - do { +#ifdef USE_ASSERT_CHECKING + { int count = 0; for (int j = 0; j < nblocks; j++) @@ -574,7 +574,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } Assert(count == found); - } while (false); + } #endif return found; diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 4460e3b40c11..22aeb2e2d658 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -36,6 +36,11 @@ #include "pagestore_client.h" #include "walproposer.h" +#ifdef __linux__ +#include +#include +#endif + #define PageStoreTrace DEBUG5 #define MIN_RECONNECT_INTERVAL_USEC 1000 @@ -728,11 +733,36 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer) INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS) { + int sndbuf = -1; + int recvbuf = -1; +#ifdef __linux__ + int socketfd; +#endif + since_start = now; INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)", + +#ifdef __linux__ + /* + * get kernel's send and recv queue size via ioctl + * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 + */ + socketfd = PQsocket(pageserver_conn); + if (socketfd != -1) { + int ioctl_err; + ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf); + if (ioctl_err!= 0) { + sndbuf = -errno; + } + ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf); + if (ioctl_err != 0) { + recvbuf = -errno; + } + } +#endif + neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received); + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); last_log_ts = now; logged = true; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 54cacea98448..012bd479bcd6 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -916,7 +916,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, { uint64 min_ring_index; PrefetchRequest hashkey; -#if USE_ASSERT_CHECKING +#ifdef USE_ASSERT_CHECKING bool any_hits = false; #endif /* We will never read further ahead than our buffer can store. 
*/ @@ -955,7 +955,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, else lsns = NULL; -#if USE_ASSERT_CHECKING +#ifdef USE_ASSERT_CHECKING any_hits = true; #endif diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f362a4503537..d7880ea7b964 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,14 +19,15 @@ aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true +boxcar = "0.2.8" bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true chrono.workspace = true clap = { workspace = true, features = ["derive", "env"] } +clashmap.workspace = true compute_api.workspace = true consumption_metrics.workspace = true -dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true futures.workspace = true @@ -42,6 +43,7 @@ hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } +gettid = "0.1.3" indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true @@ -50,6 +52,8 @@ lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true +opentelemetry = { workspace = true, features = ["trace"] } +papaya = "0.1.8" parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true @@ -89,6 +93,9 @@ tokio = { workspace = true, features = ["signal"] } tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +tracing-log.workspace = true +tracing-serde.workspace = true +tracing-opentelemetry.workspace = true try-lock.workspace = true typed-json.workspace = true url.workspace = true @@ -112,6 +119,7 @@ rsa = "0.9" workspace_hack.workspace = true [dev-dependencies] +assert-json-diff.workspace = true camino-tempfile.workspace = true fallible-iterator.workspace = true flate2.workspace = true diff --git a/proxy/README.md b/proxy/README.md index 4b98342d7275..ecd54fbbd864 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -106,17 +106,7 @@ cases where it is hard to use rows represented as objects (e.g. when several fie Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`. -Let's create self-signed certificate by running: -```sh -openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" -``` - -Then we need to build proxy with 'testing' feature and run, e.g.: -```sh -RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://proxy:password@endpoint.localtest.me:5432/postgres' --is-private-access-proxy true -c server.crt -k server.key -``` - -We will also need to have a postgres instance. Assuming that we have setted up docker we can set it up as follows: +We will need to have a postgres instance. 
Assuming that we have set up docker we can set it up as follows: ```sh docker run \ --detach \ @@ -133,8 +123,18 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';" ``` +Let's create self-signed certificate by running: +```sh +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" +``` + +Then we need to build proxy with 'testing' feature and run, e.g.: +```sh +RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +``` + Now from client you can start a new session: ```sh PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full" -``` \ No newline at end of file +``` diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 1cbf91d3ae73..9be29c38c938 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -7,8 +7,8 @@ use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; -use super::{ComputeCredentialKeys, ControlPlaneApi}; -use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; +use super::ComputeCredentialKeys; +use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; @@ -84,26 +84,15 @@ pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } -#[async_trait] -impl BackendIpAllowlist for ConsoleRedirectBackend { - async fn get_allowed_ips( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result> { - self.api - .get_allowed_ips_and_secret(ctx, user_info) - .await - .map(|(ips, _)| ips.as_ref().clone()) - .map_err(|e| e.into()) - } -} - impl ConsoleRedirectBackend { pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self { Self { console_uri, api } } + pub(crate) fn get_api(&self) -> &cplane_proxy_v1::NeonControlPlaneClient { + &self.api + } + pub(crate) async fn authenticate( &self, ctx: &RequestContext, @@ -191,6 +180,15 @@ async fn authenticate( } } + // Check if the access over the public internet is allowed, otherwise block. Note that + // the console redirect is not behind the VPC service endpoint, so we don't need to check + // the VPC endpoint ID. 
+ if let Some(public_access_allowed) = db_info.public_access_allowed { + if !public_access_allowed { + return Err(auth::AuthError::NetworkNotAllowed); + } + } + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; // This config should be self-contained, because we won't diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index df716f8455f0..e05a693cee27 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; -use dashmap::DashMap; +use clashmap::ClashMap; use jose_jwk::crypto::KeyInfo; use reqwest::{redirect, Client}; use reqwest_retry::policies::ExponentialBackoff; @@ -64,7 +64,7 @@ pub(crate) struct AuthRule { pub struct JwkCache { client: reqwest_middleware::ClientWithMiddleware, - map: DashMap<(EndpointId, RoleName), Arc>, + map: ClashMap<(EndpointId, RoleName), Arc>, } pub(crate) struct JwkCacheEntry { @@ -469,7 +469,7 @@ impl Default for JwkCache { JwkCache { client, - map: DashMap::default(), + map: ClashMap::default(), } } } diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index d17d91a56d96..7ef096207aed 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -26,10 +26,12 @@ use crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::{ - self, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, + self, AccessBlockerFlags, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, }; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; +use crate::protocol2::ConnectionInfoExtra; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; @@ -99,6 +101,13 @@ impl Backend<'_, T> { Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } + + pub(crate) fn get_api(&self) -> &ControlPlaneClient { + match self { + Self::ControlPlane(api, _) => api, + Self::Local(_) => panic!("Local backend has no API"), + } + } } impl<'a, T> Backend<'a, T> { @@ -247,15 +256,6 @@ impl AuthenticationConfig { } } -#[async_trait::async_trait] -pub(crate) trait BackendIpAllowlist { - async fn get_allowed_ips( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result>; -} - /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. 
/// @@ -282,23 +282,51 @@ async fn auth_quirks( Ok(info) => (info, None), }; - debug!("fetching user's authentication info"); - let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; + debug!("fetching authentication info and allowlists"); // check allowed list - if config.ip_allowlist_check_enabled - && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) - { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + let allowed_ips = if config.ip_allowlist_check_enabled { + let allowed_ips = api.get_allowed_ips(ctx, &info).await?; + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + } + allowed_ips + } else { + Cached::new_uncached(Arc::new(vec![])) + }; + + // check if a VPC endpoint ID is coming in and if yes, if it's allowed + let access_blocks = api.get_block_public_or_vpc_access(ctx, &info).await?; + if config.is_vpc_acccess_proxy { + if access_blocks.vpc_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + + let incoming_vpc_endpoint_id = match ctx.extra() { + None => return Err(AuthError::MissingEndpointName), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + let allowed_vpc_endpoint_ids = api.get_allowed_vpc_endpoint_ids(ctx, &info).await?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. + if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id) + { + return Err(AuthError::vpc_endpoint_id_not_allowed( + incoming_vpc_endpoint_id, + )); + } + } else if access_blocks.public_access_blocked { + return Err(AuthError::NetworkNotAllowed); } if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } - let cached_secret = match maybe_secret { - Some(secret) => secret, - None => api.get_role_secret(ctx, &info).await?, - }; + let cached_secret = api.get_role_secret(ctx, &info).await?; let (cached_entry, secret) = cached_secret.take_value(); let secret = if let Some(secret) = secret { @@ -440,34 +468,38 @@ impl Backend<'_, ComputeUserInfo> { } } - pub(crate) async fn get_allowed_ips_and_secret( + pub(crate) async fn get_allowed_ips( + &self, + ctx: &RequestContext, + ) -> Result { + match self { + Self::ControlPlane(api, user_info) => api.get_allowed_ips(ctx, user_info).await, + Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), + } + } + + pub(crate) async fn get_allowed_vpc_endpoint_ids( &self, ctx: &RequestContext, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + ) -> Result { match self { Self::ControlPlane(api, user_info) => { - api.get_allowed_ips_and_secret(ctx, user_info).await + api.get_allowed_vpc_endpoint_ids(ctx, user_info).await } - Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), } } -} -#[async_trait::async_trait] -impl BackendIpAllowlist for Backend<'_, ()> { - async fn get_allowed_ips( + pub(crate) async fn get_block_public_or_vpc_access( &self, ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result> { - let auth_data = match self { - Self::ControlPlane(api, ()) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Self::Local(_) => 
Ok((Cached::new_uncached(Arc::new(vec![])), None)), - }; - - auth_data - .map(|(ips, _)| ips.as_ref().clone()) - .map_err(|e| e.into()) + ) -> Result { + match self { + Self::ControlPlane(api, user_info) => { + api.get_block_public_or_vpc_access(ctx, user_info).await + } + Self::Local(_) => Ok(Cached::new_uncached(AccessBlockerFlags::default())), + } } } @@ -514,7 +546,10 @@ mod tests { use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; use crate::context::RequestContext; - use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret}; + use crate::control_plane::{ + self, AccessBlockerFlags, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, + }; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; use crate::scram::threadpool::ThreadPool; @@ -523,6 +558,8 @@ mod tests { struct Auth { ips: Vec, + vpc_endpoint_ids: Vec, + access_blocker_flags: AccessBlockerFlags, secret: AuthSecret, } @@ -535,17 +572,31 @@ mod tests { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( + &self, + _ctx: &RequestContext, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedAllowedIps::new_uncached(Arc::new(self.ips.clone()))) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + _ctx: &RequestContext, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedAllowedVpcEndpointIds::new_uncached(Arc::new( + self.vpc_endpoint_ids.clone(), + ))) + } + + async fn get_block_public_or_vpc_access( &self, _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, - ) -> Result< - (CachedAllowedIps, Option), - control_plane::errors::GetAuthInfoError, - > { - Ok(( - CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())), - Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))), + ) -> Result { + Ok(CachedAccessBlockerFlags::new_uncached( + self.access_blocker_flags.clone(), )) } @@ -575,6 +626,7 @@ mod tests { rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: false, console_redirect_confirmation_timeout: std::time::Duration::from_secs(5), @@ -642,6 +694,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -722,6 +776,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -774,6 +830,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0198cc306e08..6082695a6b1b 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -55,6 +55,12 @@ pub(crate) enum AuthError { )] MissingEndpointName, + #[error( + "VPC endpoint ID is not specified. \ + This endpoint requires a VPC endpoint ID to connect." 
+ )] + MissingVPCEndpointId, + #[error("password authentication failed for user '{0}'")] PasswordFailed(Box), @@ -69,6 +75,15 @@ pub(crate) enum AuthError { )] IpAddressNotAllowed(IpAddr), + #[error("This connection is trying to access this endpoint from a blocked network.")] + NetworkNotAllowed, + + #[error( + "This VPC endpoint id {0} is not allowed to connect to this endpoint. \ + Please add it to the allowed list in the Neon console." + )] + VpcEndpointIdNotAllowed(String), + #[error("Too many connections to this endpoint. Please try again later.")] TooManyConnections, @@ -95,6 +110,10 @@ impl AuthError { AuthError::IpAddressNotAllowed(ip) } + pub(crate) fn vpc_endpoint_id_not_allowed(id: String) -> Self { + AuthError::VpcEndpointIdNotAllowed(id) + } + pub(crate) fn too_many_connections() -> Self { AuthError::TooManyConnections } @@ -122,8 +141,11 @@ impl UserFacingError for AuthError { Self::BadAuthMethod(_) => self.to_string(), Self::MalformedPassword(_) => self.to_string(), Self::MissingEndpointName => self.to_string(), + Self::MissingVPCEndpointId => self.to_string(), Self::Io(_) => "Internal error".to_string(), Self::IpAddressNotAllowed(_) => self.to_string(), + Self::NetworkNotAllowed => self.to_string(), + Self::VpcEndpointIdNotAllowed(_) => self.to_string(), Self::TooManyConnections => self.to_string(), Self::UserTimeout(_) => self.to_string(), Self::ConfirmationTimeout(_) => self.to_string(), @@ -142,8 +164,11 @@ impl ReportableError for AuthError { Self::BadAuthMethod(_) => crate::error::ErrorKind::User, Self::MalformedPassword(_) => crate::error::ErrorKind::User, Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::MissingVPCEndpointId => crate::error::ErrorKind::User, Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::NetworkNotAllowed => crate::error::ErrorKind::User, + Self::VpcEndpointIdNotAllowed(_) => crate::error::ErrorKind::User, Self::TooManyConnections => crate::error::ErrorKind::RateLimit, Self::UserTimeout(_) => crate::error::ErrorKind::User, Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index ee8b3d4ef579..7a855bf54b41 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -284,6 +284,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig rate_limiter: BucketRateLimiter::new(vec![]), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: true, console_redirect_confirmation_timeout: Duration::ZERO, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index e1affe8391a6..de685a82c627 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -630,6 +630,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_vpc_acccess_proxy: args.is_private_access_proxy, is_auth_broker: args.is_auth_broker, accept_jwts: args.is_auth_broker, console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 0136446d6dfb..b5c42cd23db7 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -3,7 +3,7 @@ use std::future::pending; use 
std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; -use dashmap::DashSet; +use clashmap::ClashSet; use redis::streams::{StreamReadOptions, StreamReadReply}; use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; @@ -55,9 +55,9 @@ impl TryFrom<&Value> for ControlPlaneEvent { pub struct EndpointsCache { config: EndpointCacheConfig, - endpoints: DashSet, - branches: DashSet, - projects: DashSet, + endpoints: ClashSet, + branches: ClashSet, + projects: ClashSet, ready: AtomicBool, limiter: Arc>, } @@ -69,9 +69,9 @@ impl EndpointsCache { config.limiter_info.clone(), ))), config, - endpoints: DashSet::new(), - branches: DashSet::new(), - projects: DashSet::new(), + endpoints: ClashSet::new(), + branches: ClashSet::new(), + projects: ClashSet::new(), ready: AtomicBool::new(false), } } diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index cab0b8b90594..7651eb71a2e0 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; -use dashmap::DashMap; +use clashmap::ClashMap; use rand::{thread_rng, Rng}; use smol_str::SmolStr; use tokio::sync::Mutex; @@ -15,13 +15,16 @@ use tracing::{debug, info}; use super::{Cache, Cached}; use crate::auth::IpPattern; use crate::config::ProjectInfoCacheOptions; -use crate::control_plane::AuthSecret; -use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::control_plane::{AccessBlockerFlags, AuthSecret}; +use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::types::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); + fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec); + fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt); + fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); async fn decrement_active_listeners(&self); async fn increment_active_listeners(&self); @@ -51,6 +54,8 @@ impl From for Entry { struct EndpointInfo { secret: std::collections::HashMap>>, allowed_ips: Option>>>, + block_public_or_vpc_access: Option>, + allowed_vpc_endpoint_ids: Option>>>, } impl EndpointInfo { @@ -92,9 +97,52 @@ impl EndpointInfo { } None } + pub(crate) fn get_allowed_vpc_endpoint_ids( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(Arc>, bool)> { + if let Some(allowed_vpc_endpoint_ids) = &self.allowed_vpc_endpoint_ids { + if valid_since < allowed_vpc_endpoint_ids.created_at { + return Some(( + allowed_vpc_endpoint_ids.value.clone(), + Self::check_ignore_cache( + ignore_cache_since, + allowed_vpc_endpoint_ids.created_at, + ), + )); + } + } + None + } + pub(crate) fn get_block_public_or_vpc_access( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(AccessBlockerFlags, bool)> { + if let Some(block_public_or_vpc_access) = &self.block_public_or_vpc_access { + if valid_since < block_public_or_vpc_access.created_at { + return Some(( + block_public_or_vpc_access.value.clone(), + Self::check_ignore_cache( + ignore_cache_since, + block_public_or_vpc_access.created_at, + ), + )); + } + } + None + } + pub(crate) fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } + pub(crate) fn 
invalidate_allowed_vpc_endpoint_ids(&mut self) { + self.allowed_vpc_endpoint_ids = None; + } + pub(crate) fn invalidate_block_public_or_vpc_access(&mut self) { + self.block_public_or_vpc_access = None; + } pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { self.secret.remove(&role_name); } @@ -108,9 +156,11 @@ impl EndpointInfo { /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. pub struct ProjectInfoCacheImpl { - cache: DashMap, + cache: ClashMap, - project2ep: DashMap>, + project2ep: ClashMap>, + // FIXME(stefan): we need a way to GC the account2ep map. + account2ep: ClashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -120,6 +170,63 @@ pub struct ProjectInfoCacheImpl { #[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { + fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec) { + info!( + "invalidating allowed vpc endpoint ids for projects `{}`", + project_ids + .iter() + .map(|id| id.to_string()) + .collect::>() + .join(", ") + ); + for project_id in project_ids { + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + } + } + + fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt) { + info!( + "invalidating allowed vpc endpoint ids for org `{}`", + account_id + ); + let endpoints = self + .account2ep + .get(&account_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + } + + fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt) { + info!( + "invalidating block public or vpc access for project `{}`", + project_id + ); + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_block_public_or_vpc_access(); + } + } + } + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); let endpoints = self @@ -176,8 +283,9 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { impl ProjectInfoCacheImpl { pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self { Self { - cache: DashMap::new(), - project2ep: DashMap::new(), + cache: ClashMap::new(), + project2ep: ClashMap::new(), + account2ep: ClashMap::new(), config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), @@ -226,6 +334,49 @@ impl ProjectInfoCacheImpl { } Some(Cached::new_uncached(value)) } + pub(crate) fn get_allowed_vpc_endpoint_ids( + &self, + endpoint_id: &EndpointId, + ) -> Option>>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_allowed_vpc_endpoint_ids(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + 
token: Some(( + self, + CachedLookupInfo::new_allowed_vpc_endpoint_ids(endpoint_id), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub(crate) fn get_block_public_or_vpc_access( + &self, + endpoint_id: &EndpointId, + ) -> Option> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_block_public_or_vpc_access(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_block_public_or_vpc_access(endpoint_id), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub(crate) fn insert_role_secret( &self, project_id: ProjectIdInt, @@ -256,6 +407,43 @@ impl ProjectInfoCacheImpl { self.insert_project2endpoint(project_id, endpoint_id); self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into()); } + pub(crate) fn insert_allowed_vpc_endpoint_ids( + &self, + account_id: Option, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + allowed_vpc_endpoint_ids: Arc>, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + if let Some(account_id) = account_id { + self.insert_account2endpoint(account_id, endpoint_id); + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id) + .or_default() + .allowed_vpc_endpoint_ids = Some(allowed_vpc_endpoint_ids.into()); + } + pub(crate) fn insert_block_public_or_vpc_access( + &self, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + access_blockers: AccessBlockerFlags, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id) + .or_default() + .block_public_or_vpc_access = Some(access_blockers.into()); + } + fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) { if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) { endpoints.insert(endpoint_id); @@ -264,6 +452,14 @@ impl ProjectInfoCacheImpl { .insert(project_id, HashSet::from([endpoint_id])); } } + fn insert_account2endpoint(&self, account_id: AccountIdInt, endpoint_id: EndpointIdInt) { + if let Some(mut endpoints) = self.account2ep.get_mut(&account_id) { + endpoints.insert(endpoint_id); + } else { + self.account2ep + .insert(account_id, HashSet::from([endpoint_id])); + } + } fn get_cache_times(&self) -> (Instant, Option) { let mut valid_since = Instant::now() - self.config.ttl; // Only ignore cache if ttl is disabled. 
@@ -302,7 +498,7 @@ impl ProjectInfoCacheImpl { let mut removed = 0; let shard = self.project2ep.shards()[shard].write(); for (_, endpoints) in shard.iter() { - for endpoint in endpoints.get() { + for endpoint in endpoints { self.cache.remove(endpoint); removed += 1; } @@ -334,11 +530,25 @@ impl CachedLookupInfo { lookup_type: LookupType::AllowedIps, } } + pub(self) fn new_allowed_vpc_endpoint_ids(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::AllowedVpcEndpointIds, + } + } + pub(self) fn new_block_public_or_vpc_access(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::BlockPublicOrVpcAccess, + } + } } enum LookupType { RoleSecret(RoleNameInt), AllowedIps, + AllowedVpcEndpointIds, + BlockPublicOrVpcAccess, } impl Cache for ProjectInfoCacheImpl { @@ -360,6 +570,16 @@ impl Cache for ProjectInfoCacheImpl { endpoint_info.invalidate_allowed_ips(); } } + LookupType::AllowedVpcEndpointIds => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + LookupType::BlockPublicOrVpcAccess => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_block_public_or_vpc_access(); + } + } } } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 34f708a36b90..4d919f374a2d 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,3 +1,4 @@ +use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; @@ -8,23 +9,22 @@ use pq_proto::CancelKeyData; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; -use tokio::sync::mpsc; +use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; -use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; +use crate::auth::backend::ComputeUserInfo; use crate::auth::{check_peer_addr_is_in_list, AuthError}; use crate::config::ComputeConfig; use crate::context::RequestContext; +use crate::control_plane::ControlPlaneApi; use crate::error::ReportableError; use crate::ext::LockExt; -use crate::metrics::CancelChannelSizeGuard; -use crate::metrics::{CancellationRequest, Metrics, RedisMsgKind}; +use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind}; +use crate::protocol2::ConnectionInfoExtra; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::keys::KeyPrefix; use crate::redis::kv_ops::RedisKVClient; use crate::tls::postgres_rustls::MakeRustlsConnect; -use std::convert::Infallible; -use tokio::sync::oneshot; type IpSubnetKey = IpNet; @@ -135,6 +135,9 @@ pub(crate) enum CancelError { #[error("IP is not allowed")] IpNotAllowed, + #[error("VPC endpoint id is not allowed to connect")] + VpcEndpointIdNotAllowed, + #[error("Authentication backend error")] AuthError(#[from] AuthError), @@ -154,8 +157,9 @@ impl ReportableError for CancelError { } CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, - CancelError::IpNotAllowed => crate::error::ErrorKind::User, - CancelError::NotFound => crate::error::ErrorKind::User, + CancelError::IpNotAllowed + | CancelError::VpcEndpointIdNotAllowed + | CancelError::NotFound => crate::error::ErrorKind::User, CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane, CancelError::InternalError => crate::error::ErrorKind::Service, } @@ -267,11 +271,12 @@ impl CancellationHandler { /// Will fetch IP allowlist internally. 
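+    /// It also fetches the public/VPC access blocker flags and, when
+    /// `check_vpc_allowed` is set, the allowed VPC endpoint IDs.
+    ///
+    /// Minimal call sketch (illustrative only; `handler`, `key`, `ctx` and the
+    /// control-plane client `api` are assumed to be in scope):
+    ///
+    /// ```ignore
+    /// handler
+    ///     .cancel_session(key, ctx, /* check_ip_allowed */ true, /* check_vpc_allowed */ false, &api)
+    ///     .await?;
+    /// ```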
/// /// return Result primarily for tests - pub(crate) async fn cancel_session( + pub(crate) async fn cancel_session( &self, key: CancelKeyData, ctx: RequestContext, - check_allowed: bool, + check_ip_allowed: bool, + check_vpc_allowed: bool, auth_backend: &T, ) -> Result<(), CancelError> { let subnet_key = match ctx.peer_addr() { @@ -306,11 +311,11 @@ impl CancellationHandler { return Err(CancelError::NotFound); }; - if check_allowed { + if check_ip_allowed { let ip_allowlist = auth_backend .get_allowed_ips(&ctx, &cancel_closure.user_info) .await - .map_err(CancelError::AuthError)?; + .map_err(|e| CancelError::AuthError(e.into()))?; if !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { // log it here since cancel_session could be spawned in a task @@ -322,6 +327,40 @@ impl CancellationHandler { } } + // check if a VPC endpoint ID is coming in and if yes, if it's allowed + let access_blocks = auth_backend + .get_block_public_or_vpc_access(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + + if check_vpc_allowed { + if access_blocks.vpc_access_blocked { + return Err(CancelError::AuthError(AuthError::NetworkNotAllowed)); + } + + let incoming_vpc_endpoint_id = match ctx.extra() { + None => return Err(CancelError::AuthError(AuthError::MissingVPCEndpointId)), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + + let allowed_vpc_endpoint_ids = auth_backend + .get_allowed_vpc_endpoint_ids(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. 
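+            // Illustrative semantics of the check below (endpoint IDs are hypothetical):
+            //   allowed = []                                    -> any VPC endpoint is accepted (see TODO above)
+            //   allowed = ["vpce-0abc"], incoming = "vpce-0abc" -> accepted
+            //   allowed = ["vpce-0abc"], incoming = "vpce-0def" -> Err(CancelError::VpcEndpointIdNotAllowed)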
+ if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id) + { + return Err(CancelError::VpcEndpointIdNotAllowed); + } + } else if access_blocks.public_access_blocked { + return Err(CancelError::VpcEndpointIdNotAllowed); + } + Metrics::get() .proxy .cancellation_requests_total diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8502edcfab09..1dcd37712ea2 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -68,6 +68,7 @@ pub struct AuthenticationConfig { pub rate_limiter: AuthRateLimiter, pub rate_limit_ip_subnet: u8, pub ip_allowlist_check_enabled: bool, + pub is_vpc_acccess_proxy: bool, pub jwks_cache: JwkCache, pub is_auth_broker: bool, pub accept_jwts: bool, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 78bfb6deacc3..c4548a7ddd95 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -182,7 +182,8 @@ pub(crate) async fn handle_client( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, - backend, + config.authentication_config.is_vpc_acccess_proxy, + backend.get_api(), ) .await .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index a9fb513d3ceb..3236b2e1bfb0 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -19,7 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, }; -use crate::protocol2::ConnectionInfo; +use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra}; use crate::types::{DbName, EndpointId, RoleName}; pub mod parquet; @@ -312,6 +312,15 @@ impl RequestContext { .ip() } + pub(crate) fn extra(&self) -> Option { + self.0 + .try_lock() + .expect("should not deadlock") + .conn_info + .extra + .clone() + } + pub(crate) fn cold_start_info(&self) -> ColdStartInfo { self.0 .try_lock() diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index ece03156d1fa..ef6621fc598a 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -22,7 +22,8 @@ use crate::control_plane::errors::{ use crate::control_plane::locks::ApiLocks; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; use crate::control_plane::{ - AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, + AccessBlockerFlags, AuthInfo, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; use crate::metrics::{CacheOutcome, Metrics}; use crate::rate_limiter::WakeComputeRateLimiter; @@ -137,9 +138,6 @@ impl NeonControlPlaneClient { } }; - // Ivan: don't know where it will be used, so I leave it here - let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); - let secret = if body.role_secret.is_empty() { None } else { @@ -153,10 +151,23 @@ impl NeonControlPlaneClient { .proxy .allowed_ips_number .observe(allowed_ips.len() as f64); + let allowed_vpc_endpoint_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_vpc_endpoint_ids + .observe(allowed_vpc_endpoint_ids.len() as f64); + let block_public_connections = body.block_public_connections.unwrap_or_default(); + let block_vpc_connections = 
body.block_vpc_connections.unwrap_or_default(); Ok(AuthInfo { secret, allowed_ips, + allowed_vpc_endpoint_ids, project_id: body.project_id, + account_id: body.account_id, + access_blocker_flags: AccessBlockerFlags { + public_access_blocked: block_public_connections, + vpc_access_blocked: block_vpc_connections, + }, }) } .inspect_err(|e| tracing::debug!(error = ?e)) @@ -299,6 +310,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let account_id = auth_info.account_id; if let Some(project_id) = auth_info.project_id { let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( @@ -312,24 +324,35 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { normalized_ep_int, Arc::new(auth_info.allowed_ips), ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_vpc_endpoint_ids), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + auth_info.access_blocker_flags, + ); ctx.set_project_id(project_id); } // When we just got a secret, we don't need to invalidate it. Ok(Cached::new_uncached(auth_info.secret)) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { Metrics::get() .proxy - .allowed_ips_cache_misses + .allowed_ips_cache_misses // TODO SR: Should we rename this variable to something like allowed_ip_cache_stats? 
.inc(CacheOutcome::Hit); - return Ok((allowed_ips, None)); + return Ok(allowed_ips); } Metrics::get() .proxy @@ -337,7 +360,10 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; let user = &user_info.user; + let account_id = auth_info.account_id; if let Some(project_id) = auth_info.project_id { let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( @@ -351,12 +377,136 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { normalized_ep_int, allowed_ips.clone(), ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags, + ); + ctx.set_project_id(project_id); + } + Ok(Cached::new_uncached(allowed_ips)) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_vpc_endpoint_ids) = self + .caches + .project_info + .get_allowed_vpc_endpoint_ids(normalized_ep) + { + Metrics::get() + .proxy + .vpc_endpoint_id_cache_stats + .inc(CacheOutcome::Hit); + return Ok(allowed_vpc_endpoint_ids); + } + + Metrics::get() + .proxy + .vpc_endpoint_id_cache_stats + .inc(CacheOutcome::Miss); + + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; + let user = &user_info.user; + let account_id = auth_info.account_id; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags, + ); + ctx.set_project_id(project_id); + } + Ok(Cached::new_uncached(allowed_vpc_endpoint_ids)) + } + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(access_blocker_flags) = self + .caches + .project_info + .get_block_public_or_vpc_access(normalized_ep) + { + Metrics::get() + .proxy + .access_blocker_flags_cache_stats + .inc(CacheOutcome::Hit); + return Ok(access_blocker_flags); + } + + Metrics::get() + .proxy + .access_blocker_flags_cache_stats + .inc(CacheOutcome::Miss); + + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; + let user = &user_info.user; + let account_id = 
auth_info.account_id; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags.clone(), + ); ctx.set_project_id(project_id); } - Ok(( - Cached::new_uncached(allowed_ips), - Some(Cached::new_uncached(auth_info.secret)), - )) + Ok(Cached::new_uncached(access_blocker_flags)) } #[tracing::instrument(skip_all)] diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 5f8bda0f35ae..1e6cde8fb080 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -13,12 +13,14 @@ use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; use crate::context::RequestContext; -use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret}; +use crate::control_plane::client::{ + CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedRoleSecret, +}; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, }; use crate::control_plane::messages::MetricsAuxInfo; -use crate::control_plane::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::control_plane::{AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; use crate::error::io_error; use crate::intern::RoleNameInt; use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; @@ -121,7 +123,10 @@ impl MockControlPlane { Ok(AuthInfo { secret, allowed_ips, + allowed_vpc_endpoint_ids: vec![], project_id: None, + account_id: None, + access_blocker_flags: AccessBlockerFlags::default(), }) } @@ -214,16 +219,35 @@ impl super::ControlPlaneApi for MockControlPlane { )) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( + &self, + _ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info).await?.allowed_ips, + ))) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + _ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info) + .await? 
+ .allowed_vpc_endpoint_ids, + ))) + } + + async fn get_block_public_or_vpc_access( &self, _ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - Ok(( - Cached::new_uncached(Arc::new( - self.do_get_auth_info(user_info).await?.allowed_ips, - )), - None, + ) -> Result { + Ok(Cached::new_uncached( + self.do_get_auth_info(user_info).await?.access_blocker_flags, )) } diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index d559d96bbc61..a06943726e50 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -6,7 +6,7 @@ use std::hash::Hash; use std::sync::Arc; use std::time::Duration; -use dashmap::DashMap; +use clashmap::ClashMap; use tokio::time::Instant; use tracing::{debug, info}; @@ -17,7 +17,8 @@ use crate::cache::project_info::ProjectInfoCacheImpl; use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; use crate::context::RequestContext; use crate::control_plane::{ - errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, + errors, CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, + CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, }; use crate::error::ReportableError; use crate::metrics::ApiLockMetrics; @@ -55,17 +56,45 @@ impl ControlPlaneApi for ControlPlaneClient { } } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { + ) -> Result { match self { - Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::ProxyV1(api) => api.get_allowed_ips(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::PostgresMock(api) => api.get_allowed_ips(ctx, user_info).await, #[cfg(test)] - Self::Test(api) => api.get_allowed_ips_and_secret(), + Self::Test(api) => api.get_allowed_ips(), + } + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::ProxyV1(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.get_allowed_vpc_endpoint_ids(), + } + } + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::ProxyV1(api) => api.get_block_public_or_vpc_access(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_block_public_or_vpc_access(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.get_block_public_or_vpc_access(), } } @@ -102,9 +131,15 @@ impl ControlPlaneApi for ControlPlaneClient { pub(crate) trait TestControlPlaneClient: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips_and_secret( + fn get_allowed_ips(&self) -> Result; + + fn get_allowed_vpc_endpoint_ids( + &self, + ) -> Result; + + fn get_block_public_or_vpc_access( &self, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + ) -> Result; fn dyn_clone(&self) -> Box; } @@ -148,7 +183,7 @@ impl ApiCaches { /// Various caches for [`control_plane`](super). 
pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: ClashMap>, config: RateLimiterConfig, timeout: Duration, epoch: std::time::Duration, @@ -180,7 +215,7 @@ impl ApiLocks { ) -> prometheus::Result { Ok(Self { name, - node_locks: DashMap::with_shard_amount(shards), + node_locks: ClashMap::with_shard_amount(shards), config, timeout, epoch, @@ -238,7 +273,7 @@ impl ApiLocks { let mut lock = shard.write(); let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock - .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) + .extract_if(|(_, semaphore)| Arc::strong_count(semaphore) == 1) .count(); drop(lock); self.metrics.semaphores_unregistered.inc_by(count as u64); diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index d068614b24df..5883d02b92c7 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -4,7 +4,7 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use crate::auth::IpPattern; -use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. @@ -227,8 +227,11 @@ pub(crate) struct UserFacingMessage { pub(crate) struct GetEndpointAccessControl { pub(crate) role_secret: Box, pub(crate) allowed_ips: Option>, + pub(crate) allowed_vpc_endpoint_ids: Option>, pub(crate) project_id: Option, - pub(crate) allowed_vpc_endpoint_ids: Option>, + pub(crate) account_id: Option, + pub(crate) block_public_connections: Option, + pub(crate) block_vpc_connections: Option, } /// Response which holds compute node's `host:port` pair. @@ -282,6 +285,10 @@ pub(crate) struct DatabaseInfo { pub(crate) aux: MetricsAuxInfo, #[serde(default)] pub(crate) allowed_ips: Option>, + #[serde(default)] + pub(crate) allowed_vpc_endpoint_ids: Option>, + #[serde(default)] + pub(crate) public_access_allowed: Option, } // Manually implement debug to omit sensitive info. @@ -293,6 +300,7 @@ impl fmt::Debug for DatabaseInfo { .field("dbname", &self.dbname) .field("user", &self.user) .field("allowed_ips", &self.allowed_ips) + .field("allowed_vpc_endpoint_ids", &self.allowed_vpc_endpoint_ids) .finish_non_exhaustive() } } @@ -457,19 +465,31 @@ mod tests { #[test] fn parse_get_role_secret() -> anyhow::Result<()> { - // Empty `allowed_ips` field. + // Empty `allowed_ips` and `allowed_vpc_endpoint_ids` field. 
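+        // All access-control fields are optional; a response may also carry the
+        // new fields, e.g. (illustrative values only):
+        //   { "role_secret": "secret",
+        //     "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"],
+        //     "account_id": "some-account",
+        //     "block_public_connections": false,
+        //     "block_vpc_connections": true }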
+ let json = json!({ + "role_secret": "secret", + }); + serde_json::from_str::(&json.to_string())?; + let json = json!({ + "role_secret": "secret", + "allowed_ips": ["8.8.8.8"], + }); + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], }); serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], }); serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], "project_id": "project", }); serde_json::from_str::(&json.to_string())?; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 1dca26d6866c..f92e4f3f6055 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -19,6 +19,7 @@ use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; +use crate::intern::AccountIdInt; use crate::intern::ProjectIdInt; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, scram}; @@ -52,8 +53,14 @@ pub(crate) struct AuthInfo { pub(crate) secret: Option, /// List of IP addresses allowed for the autorization. pub(crate) allowed_ips: Vec, + /// List of VPC endpoints allowed for the autorization. + pub(crate) allowed_vpc_endpoint_ids: Vec, /// Project ID. This is used for cache invalidation. pub(crate) project_id: Option, + /// Account ID. This is used for cache invalidation. + pub(crate) account_id: Option, + /// Are public connections or VPC connections blocked? + pub(crate) access_blocker_flags: AccessBlockerFlags, } /// Info for establishing a connection to a compute node. @@ -95,11 +102,21 @@ impl NodeInfo { } } +#[derive(Clone, Default, Eq, PartialEq, Debug)] +pub(crate) struct AccessBlockerFlags { + pub public_access_blocked: bool, + pub vpc_access_blocked: bool, +} + pub(crate) type NodeInfoCache = TimedLru>>; pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type CachedAllowedVpcEndpointIds = + Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type CachedAccessBlockerFlags = + Cached<&'static ProjectInfoCacheImpl, AccessBlockerFlags>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. 
@@ -113,11 +130,23 @@ pub(crate) trait ControlPlaneApi { user_info: &ComputeUserInfo, ) -> Result; - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result; + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result; + + async fn get_block_public_or_vpc_access( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + ) -> Result; async fn get_endpoint_jwks( &self, diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 79c6020302af..0d1382679c81 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -7,7 +7,7 @@ use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; -use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; +use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName}; pub trait InternId: Sized + 'static { fn get_interner() -> &'static StringInterner; @@ -206,6 +206,26 @@ impl From for ProjectIdInt { } } +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct AccountIdTag; +impl InternId for AccountIdTag { + fn get_interner() -> &'static StringInterner { + static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type AccountIdInt = InternedString; +impl From<&AccountId> for AccountIdInt { + fn from(value: &AccountId) -> Self { + AccountIdTag::get_interner().get_or_intern(value) + } +} +impl From for AccountIdInt { + fn from(value: AccountId) -> Self { + AccountIdTag::get_interner().get_or_intern(&value) + } +} + #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 41f10f052ffa..97c9f5a59c27 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,10 +1,23 @@ -use tracing::Subscriber; +use std::cell::{Cell, RefCell}; +use std::collections::HashMap; +use std::hash::BuildHasher; +use std::{env, io}; + +use chrono::{DateTime, Utc}; +use opentelemetry::trace::TraceContextExt; +use scopeguard::defer; +use serde::ser::{SerializeMap, Serializer}; +use tracing::span; +use tracing::subscriber::Interest; +use tracing::{callsite, Event, Metadata, Span, Subscriber}; +use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; use tracing_subscriber::fmt::time::SystemTime; use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; -use tracing_subscriber::registry::LookupSpan; +use tracing_subscriber::registry::{LookupSpan, SpanRef}; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -15,6 +28,8 @@ use tracing_subscriber::registry::LookupSpan; /// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. 
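+/// The log format itself is selected with the `LOGFMT` environment variable
+/// introduced below: `text` (the default) or `json`, e.g. `LOGFMT=json`.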
/// See pub async fn init() -> anyhow::Result { + let logfmt = LogFormat::from_env()?; + let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() @@ -29,17 +44,36 @@ pub async fn init() -> anyhow::Result { .expect("this should be a valid filter directive"), ); - let fmt_layer = tracing_subscriber::fmt::layer() - .with_ansi(false) - .with_writer(std::io::stderr) - .with_target(false); - let otlp_layer = tracing_utils::init_tracing("proxy").await; + let json_log_layer = if logfmt == LogFormat::Json { + Some(JsonLoggingLayer { + clock: RealClock, + skipped_field_indices: papaya::HashMap::default(), + writer: StderrWriter { + stderr: std::io::stderr(), + }, + }) + } else { + None + }; + + let text_log_layer = if logfmt == LogFormat::Text { + Some( + tracing_subscriber::fmt::layer() + .with_ansi(false) + .with_writer(std::io::stderr) + .with_target(false), + ) + } else { + None + }; + tracing_subscriber::registry() .with(env_filter) .with(otlp_layer) - .with(fmt_layer) + .with(json_log_layer) + .with(text_log_layer) .try_init()?; Ok(LoggingGuard) @@ -94,3 +128,857 @@ impl Drop for LoggingGuard { tracing_utils::shutdown_tracing(); } } + +// TODO: make JSON the default +#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)] +enum LogFormat { + #[default] + Text = 1, + Json, +} + +impl LogFormat { + fn from_env() -> anyhow::Result { + let logfmt = env::var("LOGFMT"); + Ok(match logfmt.as_deref() { + Err(_) => LogFormat::default(), + Ok("text") => LogFormat::Text, + Ok("json") => LogFormat::Json, + Ok(logfmt) => anyhow::bail!("unknown log format: {logfmt}"), + }) + } +} + +trait MakeWriter { + fn make_writer(&self) -> impl io::Write; +} + +struct StderrWriter { + stderr: io::Stderr, +} + +impl MakeWriter for StderrWriter { + #[inline] + fn make_writer(&self) -> impl io::Write { + self.stderr.lock() + } +} + +// TODO: move into separate module or even separate crate. +trait Clock { + fn now(&self) -> DateTime; +} + +struct RealClock; + +impl Clock for RealClock { + #[inline] + fn now(&self) -> DateTime { + Utc::now() + } +} + +/// Name of the field used by tracing crate to store the event message. +const MESSAGE_FIELD: &str = "message"; + +thread_local! { + /// Protects against deadlocks and double panics during log writing. + /// The current panic handler will use tracing to log panic information. + static REENTRANCY_GUARD: Cell = const { Cell::new(false) }; + /// Thread-local instance with per-thread buffer for log writing. + static EVENT_FORMATTER: RefCell = RefCell::new(EventFormatter::new()); + /// Cached OS thread ID. + static THREAD_ID: u64 = gettid::gettid(); +} + +/// Implements tracing layer to handle events specific to logging. +struct JsonLoggingLayer { + clock: C, + skipped_field_indices: papaya::HashMap, + writer: W, +} + +impl Layer for JsonLoggingLayer +where + S: Subscriber + for<'a> LookupSpan<'a>, +{ + fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) { + use std::io::Write; + + // TODO: consider special tracing subscriber to grab timestamp very + // early, before OTel machinery, and add as event extension. 
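+        // The reentrancy guard below falls back to a freshly allocated formatter
+        // when this thread is already writing a log line (e.g. the panic hook
+        // logging through tracing), since re-borrowing the thread-local
+        // formatter would panic on the double borrow.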
+ let now = self.clock.now(); + + let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { + if entered.get() { + let mut formatter = EventFormatter::new(); + formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + self.writer.make_writer().write_all(formatter.buffer()) + } else { + entered.set(true); + defer!(entered.set(false);); + + EVENT_FORMATTER.with_borrow_mut(move |formatter| { + formatter.reset(); + formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + self.writer.make_writer().write_all(formatter.buffer()) + }) + } + }); + + // In case logging fails we generate a simpler JSON object. + if let Err(err) = res { + if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( { + "timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + "level": "ERROR", + "message": format_args!("cannot log event: {err:?}"), + "fields": { + "event": format_args!("{event:?}"), + }, + })) { + line.push(b'\n'); + self.writer.make_writer().write_all(&line).ok(); + } + } + } + + /// Registers a SpanFields instance as span extension. + fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("span must exist"); + let fields = SpanFields::default(); + fields.record_fields(attrs); + // This could deadlock when there's a panic somewhere in the tracing + // event handling and a read or write guard is still held. This includes + // the OTel subscriber. + span.extensions_mut().insert(fields); + } + + fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("span must exist"); + let ext = span.extensions(); + if let Some(data) = ext.get::() { + data.record_fields(values); + } + } + + /// Called (lazily) whenever a new log call is executed. We quickly check + /// for duplicate field names and record duplicates as skippable. Last one + /// wins. + fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { + if !metadata.is_event() { + // Must not be never because we wouldn't get trace and span data. + return Interest::always(); + } + + let mut field_indices = SkippedFieldIndices::default(); + let mut seen_fields = HashMap::<&'static str, usize>::new(); + for field in metadata.fields() { + use std::collections::hash_map::Entry; + match seen_fields.entry(field.name()) { + Entry::Vacant(entry) => { + // field not seen yet + entry.insert(field.index()); + } + Entry::Occupied(mut entry) => { + // replace currently stored index + let old_index = entry.insert(field.index()); + // ... and append it to list of skippable indices + field_indices.push(old_index); + } + } + } + + if !field_indices.is_empty() { + self.skipped_field_indices + .pin() + .insert(metadata.callsite(), field_indices); + } + + Interest::always() + } +} + +/// Stores span field values recorded during the spans lifetime. +#[derive(Default)] +struct SpanFields { + // TODO: Switch to custom enum with lasso::Spur for Strings? + fields: papaya::HashMap<&'static str, serde_json::Value>, +} + +impl SpanFields { + #[inline] + fn record_fields(&self, fields: R) { + fields.record(&mut SpanFieldsRecorder { + fields: self.fields.pin(), + }); + } +} + +/// Implements a tracing field visitor to convert and store values. 
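+/// For example, a span created with `info_span!("s", x = 42)` ends up storing
+/// the entry `("x", serde_json::json!(42))` in [`SpanFields`].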
+struct SpanFieldsRecorder<'m, S, G> { + fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>, +} + +impl tracing::field::Visit for SpanFieldsRecorder<'_, S, G> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if let Ok(value) = i64::try_from(value) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } else { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if let Ok(value) = u64::try_from(value) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } else { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value:?}"))); + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } +} + +/// List of field indices skipped during logging. Can list duplicate fields or +/// metafields not meant to be logged. +#[derive(Clone, Default)] +struct SkippedFieldIndices { + bits: u64, +} + +impl SkippedFieldIndices { + #[inline] + fn is_empty(&self) -> bool { + self.bits == 0 + } + + #[inline] + fn push(&mut self, index: usize) { + self.bits |= 1u64 + .checked_shl(index as u32) + .expect("field index too large"); + } + + #[inline] + fn contains(&self, index: usize) -> bool { + self.bits + & 1u64 + .checked_shl(index as u32) + .expect("field index too large") + != 0 + } +} + +/// Formats a tracing event and writes JSON to its internal buffer including a newline. 
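+/// A formatted line looks roughly like the following (field order as produced
+/// by `format`; values purely illustrative):
+///
+/// ```json
+/// {"timestamp":"2024-01-01T00:00:00.000000Z","level":"INFO","message":"hi",
+///  "process_id":1234,"thread_id":5678,"target":"proxy::foo","src":"proxy/src/foo.rs:42","spans":{}}
+/// ```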
+// TODO: buffer capacity management, truncate if too large +struct EventFormatter { + logline_buffer: Vec, +} + +impl EventFormatter { + #[inline] + fn new() -> Self { + EventFormatter { + logline_buffer: Vec::new(), + } + } + + #[inline] + fn buffer(&self) -> &[u8] { + &self.logline_buffer + } + + #[inline] + fn reset(&mut self) { + self.logline_buffer.clear(); + } + + fn format( + &mut self, + now: DateTime, + event: &Event<'_>, + ctx: &Context<'_, S>, + skipped_field_indices: &papaya::HashMap, + ) -> io::Result<()> + where + S: Subscriber + for<'a> LookupSpan<'a>, + { + let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true); + + use tracing_log::NormalizeEvent; + let normalized_meta = event.normalized_metadata(); + let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata()); + + let skipped_field_indices = skipped_field_indices.pin(); + let skipped_field_indices = skipped_field_indices.get(&meta.callsite()); + + let mut serialize = || { + let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer); + + let mut serializer = serializer.serialize_map(None)?; + + // Timestamp comes first, so raw lines can be sorted by timestamp. + serializer.serialize_entry("timestamp", ×tamp)?; + + // Level next. + serializer.serialize_entry("level", &meta.level().as_str())?; + + // Message next. + serializer.serialize_key("message")?; + let mut message_extractor = + MessageFieldExtractor::new(serializer, skipped_field_indices); + event.record(&mut message_extractor); + let mut serializer = message_extractor.into_serializer()?; + + let mut fields_present = FieldsPresent(false, skipped_field_indices); + event.record(&mut fields_present); + if fields_present.0 { + serializer.serialize_entry( + "fields", + &SerializableEventFields(event, skipped_field_indices), + )?; + } + + let pid = std::process::id(); + if pid != 1 { + serializer.serialize_entry("process_id", &pid)?; + } + + THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?; + + // TODO: tls cache? name could change + if let Some(thread_name) = std::thread::current().name() { + if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" { + serializer.serialize_entry("thread_name", thread_name)?; + } + } + + if let Some(task_id) = tokio::task::try_id() { + serializer.serialize_entry("task_id", &format_args!("{task_id}"))?; + } + + serializer.serialize_entry("target", meta.target())?; + + if let Some(module) = meta.module_path() { + if module != meta.target() { + serializer.serialize_entry("module", module)?; + } + } + + if let Some(file) = meta.file() { + if let Some(line) = meta.line() { + serializer.serialize_entry("src", &format_args!("{file}:{line}"))?; + } else { + serializer.serialize_entry("src", file)?; + } + } + + { + let otel_context = Span::current().context(); + let otel_spanref = otel_context.span(); + let span_context = otel_spanref.span_context(); + if span_context.is_valid() { + serializer.serialize_entry( + "trace_id", + &format_args!("{}", span_context.trace_id()), + )?; + } + } + + serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?; + + serializer.end() + }; + + serialize().map_err(io::Error::other)?; + self.logline_buffer.push(b'\n'); + Ok(()) + } +} + +/// Extracts the message field that's mixed will other fields. 
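+/// If the event carries no usable `message` field, an empty string is
+/// serialized instead (see `into_serializer`), so the `message` key is always
+/// present in the output.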
+struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> { + serializer: S, + skipped_field_indices: Option<&'a SkippedFieldIndices>, + state: Option>, +} + +impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> { + #[inline] + fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + Self { + serializer, + skipped_field_indices, + state: None, + } + } + + #[inline] + fn into_serializer(mut self) -> Result { + match self.state { + Some(Ok(())) => {} + Some(Err(err)) => return Err(err), + None => self.serializer.serialize_value("")?, + } + Ok(self.serializer) + } + + #[inline] + fn accept_field(&self, field: &tracing::field::Field) -> bool { + self.state.is_none() + && field.name() == MESSAGE_FIELD + && !self + .skipped_field_indices + .is_some_and(|i| i.contains(field.index())) + } +} + +impl tracing::field::Visit for MessageFieldExtractor<'_, S> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}"))); + } + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}"))); + } + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value}"))); + } + } +} + +/// Checks if there's any fields and field values present. If not, the JSON subobject +/// can be skipped. +// This is entirely optional and only cosmetic, though maybe helps a +// bit during log parsing in dashboards when there's no field with empty object. +struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>); + +// Even though some methods have an overhead (error, bytes) it is assumed the +// compiler won't include this since we ignore the value entirely. 
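+// The acceptance check mirrors `MessageFieldSkipper::accept_field`, so the flag
+// ends up true only if at least one field would survive into the `fields` object.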
+impl tracing::field::Visit for FieldsPresent<'_> {
+    #[inline]
+    fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) {
+        if !self.1.is_some_and(|i| i.contains(field.index()))
+            && field.name() != MESSAGE_FIELD
+            && !field.name().starts_with("log.")
+        {
+            self.0 |= true;
+        }
+    }
+}
+
+/// Serializes the fields directly supplied with a log event.
+struct SerializableEventFields<'a, 'event>(
+    &'a tracing::Event<'event>,
+    Option<&'a SkippedFieldIndices>,
+);
+
+impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        use serde::ser::SerializeMap;
+        let serializer = serializer.serialize_map(None)?;
+        let mut message_skipper = MessageFieldSkipper::new(serializer, self.1);
+        self.0.record(&mut message_skipper);
+        let serializer = message_skipper.into_serializer()?;
+        serializer.end()
+    }
+}
+
+/// A tracing field visitor that skips the message field.
+struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> {
+    serializer: S,
+    skipped_field_indices: Option<&'a SkippedFieldIndices>,
+    state: Result<(), S::Error>,
+}
+
+impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
+    #[inline]
+    fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
+        Self {
+            serializer,
+            skipped_field_indices,
+            state: Ok(()),
+        }
+    }
+
+    #[inline]
+    fn accept_field(&self, field: &tracing::field::Field) -> bool {
+        self.state.is_ok()
+            && field.name() != MESSAGE_FIELD
+            && !field.name().starts_with("log.")
+            && !self
+                .skipped_field_indices
+                .is_some_and(|i| i.contains(field.index()))
+    }
+
+    #[inline]
+    fn into_serializer(self) -> Result<S, S::Error> {
+        self.state?;
+        Ok(self.serializer)
+    }
+}
+
+impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<'_, S> {
+    #[inline]
+    fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
+        if self.accept_field(field) {
+            self.state = self.serializer.serialize_entry(field.name(), &value);
+        }
+    }
+
+    #[inline]
+    fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
+        if self.accept_field(field) {
+            self.state = self.serializer.serialize_entry(field.name(), &value);
+        }
+    }
+
+    #[inline]
+    fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
+        if self.accept_field(field) {
+            self.state = self.serializer.serialize_entry(field.name(), &value);
+        }
+    }
+
+    #[inline]
+    fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
+        if self.accept_field(field) {
+            self.state = self.serializer.serialize_entry(field.name(), &value);
+        }
+    }
+
+    #[inline]
+    fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
+        if self.accept_field(field) {
+            self.state = self.serializer.serialize_entry(field.name(), &value);
+        }
+    }
+
+    #[inline]
+    fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
+        if self.accept_field(field) {
+            self.state = self.serializer.serialize_entry(field.name(), &value);
+        }
+    }
+
+    #[inline]
+    fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
+        if self.accept_field(field) {
+            self.state = self
+                .serializer
+                .serialize_entry(field.name(), &format_args!("{value:x?}"));
+        }
+    }
+
+    #[inline]
+    fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
+        if self.accept_field(field) {
+            self.state = self.serializer.serialize_entry(field.name(), &value);
+        }
+    }
+
+    #[inline]
+    fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
+        if self.accept_field(field) {
+            self.state = self
+                .serializer
+                .serialize_entry(field.name(), &format_args!("{value:?}"));
+        }
+    }
+
+    #[inline]
+    fn record_error(
+        &mut self,
+        field: &tracing::field::Field,
+        value: &(dyn std::error::Error + 'static),
+    ) {
+        if self.accept_field(field) {
+            self.state = self.serializer.serialize_value(&format_args!("{value}"));
+        }
+    }
+}
+
+/// Serializes the span stack from root to leaf (parent of event) enumerated
+/// inside an object where the keys are just the number padded with zeroes
+/// to retain sorting order.
+// The object is necessary because Loki cannot flatten arrays.
+struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>)
+where
+    Span: Subscriber + for<'lookup> LookupSpan<'lookup>;
+
+impl<Span> serde::ser::Serialize for SerializableSpanStack<'_, '_, Span>
+where
+    Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
+{
+    fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
+    where
+        Ser: serde::ser::Serializer,
+    {
+        let mut serializer = serializer.serialize_map(None)?;
+
+        if let Some(leaf_span) = self.0.lookup_current() {
+            for (i, span) in leaf_span.scope().from_root().enumerate() {
+                serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?;
+            }
+        }
+
+        serializer.end()
+    }
+}
+
+/// Serializes a single span. Include the span ID, name and its fields as
+/// recorded up to this point.
+struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>)
+where
+    Span: for<'lookup> LookupSpan<'lookup>;
+
+impl<Span> serde::ser::Serialize for SerializableSpan<'_, '_, Span>
+where
+    Span: for<'lookup> LookupSpan<'lookup>,
+{
+    fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
+    where
+        Ser: serde::ser::Serializer,
+    {
+        let mut serializer = serializer.serialize_map(None)?;
+        // TODO: the span ID is probably only useful for debugging tracing.
+ serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?; + serializer.serialize_entry("span_name", self.0.metadata().name())?; + + let ext = self.0.extensions(); + if let Some(data) = ext.get::() { + for (key, value) in &data.fields.pin() { + serializer.serialize_entry(key, value)?; + } + } + + serializer.end() + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use std::sync::{Arc, Mutex, MutexGuard}; + + use assert_json_diff::assert_json_eq; + use tracing::info_span; + + use super::*; + + struct TestClock { + current_time: Mutex>, + } + + impl Clock for Arc { + fn now(&self) -> DateTime { + *self.current_time.lock().expect("poisoned") + } + } + + struct VecWriter<'a> { + buffer: MutexGuard<'a, Vec>, + } + + impl MakeWriter for Arc>> { + fn make_writer(&self) -> impl io::Write { + VecWriter { + buffer: self.lock().expect("poisoned"), + } + } + } + + impl io::Write for VecWriter<'_> { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.buffer.write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } + } + + #[test] + fn test_field_collection() { + let clock = Arc::new(TestClock { + current_time: Mutex::new(Utc::now()), + }); + let buffer = Arc::new(Mutex::new(Vec::new())); + let log_layer = JsonLoggingLayer { + clock: clock.clone(), + skipped_field_indices: papaya::HashMap::default(), + writer: buffer.clone(), + }; + + let registry = tracing_subscriber::Registry::default().with(log_layer); + + tracing::subscriber::with_default(registry, || { + info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| { + info_span!("span2").in_scope(|| { + tracing::error!( + a = 1, + a = 2, + a = 3, + message = "explicit message field", + "implicit message field" + ); + }); + }); + }); + + let buffer = Arc::try_unwrap(buffer) + .expect("no other reference") + .into_inner() + .expect("poisoned"); + let actual: serde_json::Value = serde_json::from_slice(&buffer).expect("valid JSON"); + let expected: serde_json::Value = serde_json::json!( + { + "timestamp": clock.now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + "level": "ERROR", + "message": "explicit message field", + "fields": { + "a": 3, + }, + "spans": { + "00":{ + "span_id": "0000000000000001", + "span_name": "span1", + "x": 42, + }, + "01": { + "span_id": "0000000000000002", + "span_name": "span2", + } + }, + "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(), + "target": "proxy::logging::tests", + "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(), + "thread_id": actual.as_object().unwrap().get("thread_id").unwrap().as_number().unwrap(), + "thread_name": "logging::tests::test_field_collection", + } + ); + + assert_json_eq!(actual, expected); + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f3d281a26b59..25bcc81108b6 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -96,6 +96,16 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_ips_number: Histogram<10>, + /// Number of cache hits/misses for VPC endpoint IDs. + pub vpc_endpoint_id_cache_stats: CounterVec>, + + /// Number of cache hits/misses for access blocker flags. 
+ pub access_blocker_flags_cache_stats: CounterVec>, + + /// Number of allowed VPC endpoints IDs + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_vpc_endpoint_ids: Histogram<10>, + /// Number of connections (per sni). pub accepted_connections_by_sni: CounterVec>, @@ -570,6 +580,9 @@ pub enum RedisEventsCount { CancelSession, PasswordUpdate, AllowedIpsUpdate, + AllowedVpcEndpointIdsUpdateForProjects, + AllowedVpcEndpointIdsUpdateForAllProjectsInOrg, + BlockPublicOrVpcAccessUpdate, } pub struct ThreadPoolWorkers(usize); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 3336a9556a5b..861f1766e84c 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -201,25 +201,26 @@ impl CopyBuffer { W: AsyncWrite + ?Sized, { loop { - // If our buffer is empty, then we need to read some data to - // continue. - if self.pos == self.cap && !self.read_done { - self.pos = 0; - self.cap = 0; - + // If there is some space left in our buffer, then we try to read some + // data to continue, thus maximizing the chances of a large write. + if self.cap < self.buf.len() && !self.read_done { match self.poll_fill_buf(cx, reader.as_mut()) { Poll::Ready(Ok(())) => (), Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))), Poll::Pending => { - // Try flushing when the reader has no progress to avoid deadlock - // when the reader depends on buffered writer. - if self.need_flush { - ready!(writer.as_mut().poll_flush(cx)) - .map_err(ErrorDirection::Write)?; - self.need_flush = false; + // Ignore pending reads when our buffer is not empty, because + // we can try to write data immediately. + if self.pos == self.cap { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx)) + .map_err(ErrorDirection::Write)?; + self.need_flush = false; + } + + return Poll::Pending; } - - return Poll::Pending; } } } @@ -246,9 +247,13 @@ impl CopyBuffer { "writer returned length larger than input slice" ); + // All data has been written, the buffer can be considered empty again + self.pos = 0; + self.cap = 0; + // If we've written all the data and we've seen EOF, flush out the // data and finish the transfer. 
- if self.pos == self.cap && self.read_done { + if self.read_done { ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?; return Poll::Ready(Ok(self.amt)); } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index ab173bd0d052..8a407c811971 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -283,7 +283,8 @@ pub(crate) async fn handle_client( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, - auth_backend, + config.authentication_config.is_vpc_acccess_proxy, + auth_backend.get_api(), ) .await .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 10db2bcb303f..d8c00a9b4177 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -26,7 +26,7 @@ use crate::config::{ComputeConfig, RetryConfig}; use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{ - self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, + self, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, NodeInfo, NodeInfoCache, }; use crate::error::ErrorKind; use crate::tls::client_config::compute_client_config_with_certs; @@ -526,9 +526,19 @@ impl TestControlPlaneClient for TestConnectMechanism { } } - fn get_allowed_ips_and_secret( + fn get_allowed_ips(&self) -> Result { + unimplemented!("not used in tests") + } + + fn get_allowed_vpc_endpoint_ids( + &self, + ) -> Result { + unimplemented!("not used in tests") + } + + fn get_block_public_or_vpc_access( &self, - ) -> Result<(CachedAllowedIps, Option), control_plane::errors::GetAuthInfoError> + ) -> Result { unimplemented!("not used in tests") } diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index bff800f0a2f0..9645eaf725b1 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -2,7 +2,7 @@ use std::hash::Hash; use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; -use dashmap::DashMap; +use clashmap::ClashMap; use rand::{thread_rng, Rng}; use tokio::time::Instant; use tracing::info; @@ -14,7 +14,7 @@ use crate::intern::EndpointIdInt; pub type EndpointRateLimiter = LeakyBucketRateLimiter; pub struct LeakyBucketRateLimiter { - map: DashMap, + map: ClashMap, config: utils::leaky_bucket::LeakyBucketConfig, access_count: AtomicUsize, } @@ -27,7 +27,7 @@ impl LeakyBucketRateLimiter { pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { Self { - map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), + map: ClashMap::with_hasher_and_shard_amount(RandomState::new(), shards), config: config.into(), access_count: AtomicUsize::new(0), } @@ -58,7 +58,7 @@ impl LeakyBucketRateLimiter { let shard = thread_rng().gen_range(0..n); self.map.shards()[shard] .write() - .retain(|_, value| !value.get().bucket_is_empty(now)); + .retain(|(_, value)| !value.bucket_is_empty(now)); } } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index ec080f270b77..ef6c39f230f4 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -5,7 +5,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Mutex; use anyhow::bail; -use dashmap::DashMap; +use clashmap::ClashMap; use itertools::Itertools; use rand::rngs::StdRng; use 
rand::{Rng, SeedableRng}; @@ -62,7 +62,7 @@ impl GlobalRateLimiter { pub type WakeComputeRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { - map: DashMap, Hasher>, + map: ClashMap, Hasher>, info: Cow<'static, [RateBucketInfo]>, access_count: AtomicUsize, rand: Mutex, @@ -202,7 +202,7 @@ impl BucketRateLimiter { info!(buckets = ?info, "endpoint rate limiter"); Self { info, - map: DashMap::with_hasher_and_shard_amount(hasher, 64), + map: ClashMap::with_hasher_and_shard_amount(hasher, 64), access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request rand: Mutex::new(rand), } diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index dddc7e2054db..dcb9a59f873b 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -1,7 +1,8 @@ +use std::io::ErrorKind; + use anyhow::Ok; use pq_proto::{id_to_cancel_key, CancelKeyData}; use serde::{Deserialize, Serialize}; -use std::io::ErrorKind; pub mod keyspace { pub const CANCEL_PREFIX: &str = "cancel"; diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index dcc6aac51bb7..3689bf7ae29b 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -1,7 +1,6 @@ use redis::{AsyncCommands, ToRedisArgs}; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; - use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; pub struct RedisKVClient { diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 19fdd3280dfc..1a7024588aa1 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; -use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -86,9 +86,7 @@ pub(crate) struct BlockPublicOrVpcAccessUpdated { #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct AllowedVpcEndpointsUpdatedForOrg { - // TODO: change type once the implementation is more fully fledged. - // See e.g. https://github.com/neondatabase/neon/pull/10073. - account_id: ProjectIdInt, + account_id: AccountIdInt, } #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] @@ -205,6 +203,24 @@ impl MessageHandler { .proxy .redis_events_count .inc(RedisEventsCount::PasswordUpdate); + } else if matches!( + msg, + Notification::AllowedVpcEndpointsUpdatedForProjects { .. } + ) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForProjects); + } else if matches!(msg, Notification::AllowedVpcEndpointsUpdatedForOrg { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg); + } else if matches!(msg, Notification::BlockPublicOrVpcAccessUpdated { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BlockPublicOrVpcAccessUpdate); } // TODO: add additional metrics for the other event types. 
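// Editor's sketch, not part of the patch: the hunk above bumps the new Redis event
// counters with a chain of `else if matches!(..)` arms, mirroring the existing
// AllowedIpsUpdate/PasswordUpdate handling. The same notification-to-counter mapping
// can be written as one exhaustive `match`, so a notification variant without a counter
// becomes a compile-time decision rather than a silent fall-through (see the TODO above).
// The enums below are simplified stand-ins for the crate's `Notification` and
// `RedisEventsCount` types; the real ones carry payloads and more variants.
enum NotificationKind {
    AllowedIpsUpdate,
    PasswordUpdate,
    AllowedVpcEndpointsUpdatedForProjects,
    AllowedVpcEndpointsUpdatedForOrg,
    BlockPublicOrVpcAccessUpdated,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RedisEvent {
    AllowedIpsUpdate,
    PasswordUpdate,
    AllowedVpcEndpointIdsUpdateForProjects,
    AllowedVpcEndpointIdsUpdateForAllProjectsInOrg,
    BlockPublicOrVpcAccessUpdate,
}

/// Maps every notification kind to the counter it should increment; exhaustive by construction.
fn redis_event_for(kind: &NotificationKind) -> RedisEvent {
    match kind {
        NotificationKind::AllowedIpsUpdate => RedisEvent::AllowedIpsUpdate,
        NotificationKind::PasswordUpdate => RedisEvent::PasswordUpdate,
        NotificationKind::AllowedVpcEndpointsUpdatedForProjects => {
            RedisEvent::AllowedVpcEndpointIdsUpdateForProjects
        }
        NotificationKind::AllowedVpcEndpointsUpdatedForOrg => {
            RedisEvent::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg
        }
        NotificationKind::BlockPublicOrVpcAccessUpdated => RedisEvent::BlockPublicOrVpcAccessUpdate,
    }
}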
@@ -230,20 +246,26 @@ fn invalidate_cache(cache: Arc, msg: Notification) { Notification::AllowedIpsUpdate { allowed_ips_update } => { cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id); } + Notification::BlockPublicOrVpcAccessUpdated { + block_public_or_vpc_access_updated, + } => cache.invalidate_block_public_or_vpc_access_for_project( + block_public_or_vpc_access_updated.project_id, + ), + Notification::AllowedVpcEndpointsUpdatedForOrg { + allowed_vpc_endpoints_updated_for_org, + } => cache.invalidate_allowed_vpc_endpoint_ids_for_org( + allowed_vpc_endpoints_updated_for_org.account_id, + ), + Notification::AllowedVpcEndpointsUpdatedForProjects { + allowed_vpc_endpoints_updated_for_projects, + } => cache.invalidate_allowed_vpc_endpoint_ids_for_projects( + allowed_vpc_endpoints_updated_for_projects.project_ids, + ), Notification::PasswordUpdate { password_update } => cache .invalidate_role_secret_for_project( password_update.project_id, password_update.role_name, ), - Notification::BlockPublicOrVpcAccessUpdated { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } - Notification::AllowedVpcEndpointsUpdatedForOrg { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } - Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } Notification::UnknownTopic => unreachable!(), } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6d5fb13681e9..0fb4a8a6cc70 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -30,6 +30,7 @@ use crate::control_plane::locks::ApiLocks; use crate::control_plane::CachedNodeInfo; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; +use crate::protocol2::ConnectionInfoExtra; use crate::proxy::connect_compute::ConnectMechanism; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; @@ -57,23 +58,52 @@ impl PoolingBackend { let user_info = user_info.clone(); let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); - let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; + let allowed_ips = backend.get_allowed_ips(ctx).await?; + if self.config.authentication_config.ip_allowlist_check_enabled && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } + + let access_blocker_flags = backend.get_block_public_or_vpc_access(ctx).await?; + if self.config.authentication_config.is_vpc_acccess_proxy { + if access_blocker_flags.vpc_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + + let extra = ctx.extra(); + let incoming_endpoint_id = match extra { + None => String::new(), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + + if incoming_endpoint_id.is_empty() { + return Err(AuthError::MissingVPCEndpointId); + } + + let allowed_vpc_endpoint_ids = backend.get_allowed_vpc_endpoint_ids(ctx).await?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. 
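// Editor's sketch, not part of the patch: the check below, stated in positive form. An
// empty allow-list currently means "no restriction" (see the TODO above); otherwise the
// incoming VPC endpoint ID must appear in the list. Standalone version over plain
// strings; the function and parameter names here are chosen for illustration only.
fn vpc_endpoint_allowed(allowed_vpc_endpoint_ids: &[String], incoming_endpoint_id: &str) -> bool {
    // Empty list: allow everything (temporary semantics per the TODO above).
    allowed_vpc_endpoint_ids.is_empty()
        // Otherwise the incoming ID has to be explicitly allowed.
        || allowed_vpc_endpoint_ids
            .iter()
            .any(|id| id.as_str() == incoming_endpoint_id)
}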
+ if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_endpoint_id) + { + return Err(AuthError::vpc_endpoint_id_not_allowed(incoming_endpoint_id)); + } + } else if access_blocker_flags.public_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + if !self .endpoint_rate_limiter .check(user_info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } - let cached_secret = match maybe_secret { - Some(secret) => secret, - None => backend.get_role_secret(ctx).await?, - }; - + let cached_secret = backend.get_role_secret(ctx).await?; let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 44eac77e8f94..a300198de449 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -5,7 +5,7 @@ use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use std::time::Duration; -use dashmap::DashMap; +use clashmap::ClashMap; use parking_lot::RwLock; use postgres_client::ReadyForQueryStatus; use rand::Rng; @@ -351,11 +351,11 @@ where // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - pub(crate) global_pool: DashMap>>, + pub(crate) global_pool: ClashMap>>, /// Number of endpoint-connection pools /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// [`ClashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. pub(crate) global_pool_size: AtomicUsize, @@ -396,7 +396,7 @@ where pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), + global_pool: ClashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), config, global_connections_count: Arc::new(AtomicUsize::new(0)), @@ -442,10 +442,10 @@ where .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; - shard.retain(|endpoint, x| { + shard.retain(|(endpoint, x)| { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. - if let Some(pool) = Arc::get_mut(x.get_mut()) { + if let Some(pool) = Arc::get_mut(x) { let endpoints = pool.get_mut(); clients_removed = endpoints.clear_closed(); diff --git a/proxy/src/types.rs b/proxy/src/types.rs index 6e0bd61c9442..d5952d1d8b0a 100644 --- a/proxy/src/types.rs +++ b/proxy/src/types.rs @@ -97,6 +97,8 @@ smol_str_wrapper!(EndpointId); smol_str_wrapper!(BranchId); // 90% of project strings are 23 characters or less. smol_str_wrapper!(ProjectId); +// 90% of account strings are 23 characters or less. 
+smol_str_wrapper!(AccountId); // will usually equal endpoint ID smol_str_wrapper!(EndpointCacheKey); diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index e1cc7e87b4a4..d369e3742f82 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -10,9 +10,9 @@ use anyhow::{bail, Context}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; +use clashmap::mapref::entry::Entry; +use clashmap::ClashMap; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::mapref::entry::Entry; -use dashmap::DashMap; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; @@ -137,7 +137,7 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub(crate) struct Metrics { - endpoints: DashMap, FastHasher>, + endpoints: ClashMap, FastHasher>, } impl Metrics { @@ -213,7 +213,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result( - endpoints: &DashMap, FastHasher>, + endpoints: &ClashMap, FastHasher>, ) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); @@ -271,7 +271,7 @@ fn create_event_chunks<'a>( #[expect(clippy::too_many_arguments)] #[instrument(skip_all)] async fn collect_metrics_iteration( - endpoints: &DashMap, FastHasher>, + endpoints: &ClashMap, FastHasher>, client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, storage: Option<&GenericRemoteStorage>, diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 06746d3e1dd5..38a7f202ba0a 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.84.0" +channel = "1.84.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 4e5b34a9bf65..5ecb23e8e04b 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -535,6 +535,10 @@ pub async fn main_task( // limit concurrent uploads let _upload_permit = tokio::select! 
{ acq = limiter.acquire_partial_backup() => acq, + _ = backup.tli.cancel.cancelled() => { + info!("timeline canceled"); + return None; + } _ = cancel.cancelled() => { info!("task canceled"); return None; diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 9860bd5d0e44..63f43cdf62e0 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -32,6 +32,7 @@ postgres_connection.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true +rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true @@ -39,9 +40,12 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +rustls.workspace = true scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true +tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 3884a6df4601..5bc3c81f02c8 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -225,7 +225,7 @@ pub(crate) enum NotifyError { // We shutdown while sending #[error("Shutting down")] ShuttingDown, - // A response indicates we will never succeed, such as 400 or 404 + // A response indicates we will never succeed, such as 400 or 403 #[error("Non-retryable error {0}")] Fatal(StatusCode), diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 35eb15b29791..c4e5b3958912 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1,6 +1,7 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; @@ -9,8 +10,11 @@ use diesel::prelude::*; use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; use diesel_async::pooled_connection::bb8::Pool; use diesel_async::pooled_connection::AsyncDieselConnectionManager; +use diesel_async::pooled_connection::ManagerConfig; +use diesel_async::AsyncPgConnection; use diesel_async::RunQueryDsl; -use diesel_async::{AsyncConnection, AsyncPgConnection}; +use futures::future::BoxFuture; +use futures::FutureExt; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; @@ -23,6 +27,9 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; +use rustls::client::WebPkiServerVerifier; +use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; @@ -156,7 +163,13 @@ impl Persistence { const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); pub async fn new(database_url: String) -> Self { - let manager = AsyncDieselConnectionManager::::new(database_url); + let mut mgr_config = ManagerConfig::default(); + mgr_config.custom_setup = Box::new(establish_connection_rustls); + + let manager = AsyncDieselConnectionManager::::new_with_config( + database_url, + mgr_config, + ); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to 
optimize time // to execute queries (database queries are not generally on latency-sensitive paths). @@ -181,8 +194,10 @@ impl Persistence { timeout: Duration, ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); + log_postgres_connstr_info(database_url) + .map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?; loop { - match AsyncPgConnection::establish(database_url).await { + match establish_connection_rustls(database_url).await { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -1256,6 +1271,130 @@ impl Persistence { } } +pub(crate) fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + anyhow::bail!("could not parse certificates: {:?}", der_certs.errors); + } + + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs.certs); + Ok(Arc::new(store)) +} + +#[derive(Debug)] +/// A verifier that accepts all certificates (but logs an error still) +struct AcceptAll(Arc); +impl ServerCertVerifier for AcceptAll { + fn verify_server_cert( + &self, + end_entity: &rustls::pki_types::CertificateDer<'_>, + intermediates: &[rustls::pki_types::CertificateDer<'_>], + server_name: &rustls::pki_types::ServerName<'_>, + ocsp_response: &[u8], + now: rustls::pki_types::UnixTime, + ) -> Result { + let r = + self.0 + .verify_server_cert(end_entity, intermediates, server_name, ocsp_response, now); + if let Err(err) = r { + tracing::info!( + ?server_name, + "ignoring db connection TLS validation error: {err:?}" + ); + return Ok(ServerCertVerified::assertion()); + } + r + } + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result { + self.0.verify_tls12_signature(message, cert, dss) + } + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result { + self.0.verify_tls13_signature(message, cert, dss) + } + fn supported_verify_schemes(&self) -> Vec { + self.0.supported_verify_schemes() + } +} + +/// Loads the root certificates and constructs a client config suitable for connecting. +/// This function is blocking. +fn client_config_with_root_certs() -> anyhow::Result { + let client_config = + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions"); + static DO_CERT_CHECKS: std::sync::OnceLock = std::sync::OnceLock::new(); + let do_cert_checks = + DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_DB_CERT_CHECKS").is_ok()); + Ok(if *do_cert_checks { + client_config + .with_root_certificates(load_certs()?) + .with_no_client_auth() + } else { + let verifier = AcceptAll( + WebPkiServerVerifier::builder_with_provider( + load_certs()?, + Arc::new(ring::default_provider()), + ) + .build()?, + ); + client_config + .dangerous() + .with_custom_certificate_verifier(Arc::new(verifier)) + .with_no_client_auth() + }) +} + +fn establish_connection_rustls(config: &str) -> BoxFuture> { + let fut = async { + // We first set up the way we want rustls to work. 
+ let rustls_config = client_config_with_root_certs() + .map_err(|err| ConnectionError::BadConnection(format!("{err:?}")))?; + let tls = tokio_postgres_rustls::MakeRustlsConnect::new(rustls_config); + let (client, conn) = tokio_postgres::connect(config, tls) + .await + .map_err(|e| ConnectionError::BadConnection(e.to_string()))?; + + AsyncPgConnection::try_from_client_and_connection(client, conn).await + }; + fut.boxed() +} + +#[cfg_attr(test, test)] +fn test_config_debug_censors_password() { + let has_pw = + "host=/var/lib/postgresql,localhost port=1234 user=specialuser password='NOT ALLOWED TAG'"; + let has_pw_cfg = has_pw.parse::().unwrap(); + assert!(format!("{has_pw_cfg:?}").contains("specialuser")); + // Ensure that the password is not leaked by the debug impl + assert!(!format!("{has_pw_cfg:?}").contains("NOT ALLOWED TAG")); +} + +fn log_postgres_connstr_info(config_str: &str) -> anyhow::Result<()> { + let config = config_str + .parse::() + .map_err(|_e| anyhow::anyhow!("Couldn't parse config str"))?; + // We use debug formatting here, and use a unit test to ensure that we don't leak the password. + // To make extra sure the test gets ran, run it every time the function is called + // (this is rather cold code, we can afford it). + #[cfg(not(test))] + test_config_debug_censors_password(); + tracing::info!("database connection config: {config:?}"); + Ok(()) +} + /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive( QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 03db94726315..58bc0ba1cd86 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -115,6 +115,15 @@ impl ReconcilerConfigBuilder { } } + pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self { + Self { + config: ReconcilerConfig { + tenant_creation_hint: hint, + ..self.config + }, + } + } + pub(crate) fn build(self) -> ReconcilerConfig { self.config } @@ -129,6 +138,10 @@ pub(crate) struct ReconcilerConfig { // During live migrations this is the amount of time that // the pagserver will hold our poll. secondary_download_request_timeout: Option, + + // A hint indicating whether this reconciliation is done on the + // creation of a new tenant. This only informs logging behaviour. + tenant_creation_hint: bool, } impl ReconcilerConfig { @@ -143,6 +156,10 @@ impl ReconcilerConfig { self.secondary_download_request_timeout .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT) } + + pub(crate) fn tenant_creation_hint(&self) -> bool { + self.tenant_creation_hint + } } /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O @@ -934,16 +951,35 @@ impl Reconciler { ) .await; if let Err(e) = &result { + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it + // needs to retry at some point. + self.compute_notify_failure = true; + // It is up to the caller whether they want to drop out on this error, but they don't have to: // in general we should avoid letting unavailability of the cloud control plane stop us from // making progress. - if !matches!(e, NotifyError::ShuttingDown) { - tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + match e { + // 404s from cplane during tenant creation are expected. + // Cplane only persists the shards to the database after + // creating the tenant and the timeline. 
If we notify before + // that, we'll get a 404. + // + // This is fine because tenant creations happen via /location_config + // and that returns the list of locations in the response. Hence, we + // silence the error and return Ok(()) here. Reconciliation will still + // be retried because we set [`Reconciler::compute_notify_failure`] above. + NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND) + if self.reconciler_config.tenant_creation_hint() => + { + return Ok(()); + } + NotifyError::ShuttingDown => {} + _ => { + tracing::warn!( + "Failed to notify compute of attached pageserver {node}: {e}" + ); + } } - - // Set this flag so that in our ReconcileResult we will set the flag on the shard that it - // needs to retry at some point. - self.compute_notify_failure = true; } result } else { diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index f5cab9dd5746..f9e72862ae3d 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -774,8 +774,9 @@ impl Scheduler { if !matches!(context.mode, ScheduleMode::Speculative) { tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", - scores.iter().map(|i| i.node_id().0).collect::>() + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", + scores.iter().map(|i| i.node_id().0).collect::>(), + preferred_az, ); } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 9ac9ee17cad4..4028cd702343 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2238,9 +2238,14 @@ impl Service { let waiters = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); + let config = ReconcilerConfigBuilder::new() + .tenant_creation_hint(true) + .build(); tenants .range_mut(TenantShardId::tenant_range(tenant_id)) - .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) + .filter_map(|(_shard_id, shard)| { + self.maybe_configured_reconcile_shard(shard, nodes, config) + }) .collect::>() }; diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 98034421d6ea..91d7183fde8d 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -96,28 +96,37 @@ impl ChaosInjector { let batch_size = 128; let mut inner = self.service.inner.write().unwrap(); let (nodes, tenants, scheduler) = inner.parts_mut(); - let tenant_ids = tenants.keys().cloned().collect::>(); // Prefer to migrate tenants that are currently outside their home AZ. This avoids the chaos injector // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some // random tenants to move, and then on next chaos iteration moving them back, then picking some new // random tenants on the next iteration. 
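// Editor's sketch, not part of the patch: the selection implemented below partitions the
// shards into "attached outside home AZ" and "in home AZ", prefers the former, and only
// tops the batch up with a random sample of the latter. The same policy over plain IDs,
// using the `rand` calls the surrounding code already relies on (`thread_rng` plus
// `SliceRandom::shuffle`); the function name is illustrative only.
use rand::seq::SliceRandom;
use rand::thread_rng;

fn pick_chaos_victims(
    mut out_of_home_az: Vec<u64>,
    mut in_home_az: Vec<u64>,
    batch_size: usize,
) -> Vec<u64> {
    let mut victims = Vec::with_capacity(batch_size);
    if out_of_home_az.len() >= batch_size {
        // Plenty of misplaced shards: migrate a random subset of them back home.
        out_of_home_az.shuffle(&mut thread_rng());
        victims.extend(out_of_home_az.into_iter().take(batch_size));
    } else {
        // Take every misplaced shard, then fill the rest of the batch at random.
        victims.extend(out_of_home_az);
        in_home_az.shuffle(&mut thread_rng());
        victims.extend(in_home_az.into_iter().take(batch_size - victims.len()));
    }
    victims
}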
- let mut victims = Vec::with_capacity(batch_size); - for shard in tenants.values() { - if shard.is_attached_outside_preferred_az(nodes) { - victims.push(shard.tenant_shard_id); - } + let (out_of_home_az, in_home_az): (Vec<_>, Vec<_>) = tenants + .values() + .map(|shard| { + ( + shard.tenant_shard_id, + shard.is_attached_outside_preferred_az(nodes), + ) + }) + .partition(|(_id, is_outside)| *is_outside); + + let mut out_of_home_az: Vec<_> = out_of_home_az.into_iter().map(|(id, _)| id).collect(); + let mut in_home_az: Vec<_> = in_home_az.into_iter().map(|(id, _)| id).collect(); - if victims.len() >= batch_size { - break; - } - } + let mut victims = Vec::with_capacity(batch_size); + if out_of_home_az.len() >= batch_size { + tracing::info!("Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", out_of_home_az.len()); - let choose_random = batch_size.saturating_sub(victims.len()); - tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len()); + out_of_home_az.shuffle(&mut thread_rng()); + victims.extend(out_of_home_az.into_iter().take(batch_size)); + } else { + tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", out_of_home_az.len(), std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len())); - let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random); - victims.extend(random_victims); + victims.extend(out_of_home_az); + in_home_az.shuffle(&mut thread_rng()); + victims.extend(in_home_az.into_iter().take(batch_size - victims.len())); + } for victim in victims { self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index cbc2696b260d..219c0dffe7c6 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -707,10 +707,15 @@ impl TenantShard { if let Some(node_id) = self.intent.get_attached() { // Populate secondary by demoting the attached node self.intent.demote_attached(scheduler, *node_id); + modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard::( + // + // We use [`AttachedShardTag`] because when a secondary location is the only one + // a shard has, we expect that its next use will be as an attached location: we want + // the tenant to be ready to warm up and run fast in their preferred AZ. + let node_id = scheduler.schedule_shard::( &[], &self.intent.preferred_az_id, context, @@ -719,9 +724,17 @@ impl TenantShard { modified = true; } while self.intent.secondary.len() > 1 { - // We have no particular preference for one secondary location over another: just - // arbitrarily drop from the end - self.intent.pop_secondary(scheduler); + // If we have multiple secondaries (e.g. when transitioning from Attached to Secondary and + // having just demoted our attached location), then we should prefer to keep the location + // in our preferred AZ. Tenants in Secondary mode want to be in the preferred AZ so that + // they have a warm location to become attached when transitioning back into Attached. 
+ + let mut candidates = self.intent.get_secondary().clone(); + // Sort to get secondaries outside preferred AZ last + candidates + .sort_by_key(|n| scheduler.get_node_az(n).as_ref() != self.preferred_az()); + let secondary_to_remove = candidates.pop().unwrap(); + self.intent.remove_secondary(scheduler, secondary_to_remove); modified = true; } } @@ -967,24 +980,51 @@ impl TenantShard { ), ) }) - .collect::>(); + .collect::>(); if secondary_scores.iter().any(|score| score.1.is_none()) { - // Don't have full list of scores, so can't make a good decision about which to drop unless - // there is an obvious one in the wrong AZ - for secondary in self.intent.get_secondary() { - if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + // Trivial case: if we only have one secondary, drop that one + if self.intent.get_secondary().len() == 1 { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary( + *self.intent.get_secondary().first().unwrap(), + ), + }); + } + + // Try to find a "good" secondary to keep, without relying on scores (one or more nodes is in a state + // where its score can't be calculated), and drop the others. This enables us to make progress in + // most cases, even if some nodes are offline or have scheduling=pause set. + + debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this + // logic presumes we are in a mode where we want secondaries to be in non-home AZ + if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| { + let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id; + let is_available = secondary_scores + .get(n) + .expect("Built from same list of nodes") + .is_some(); + is_available && !in_home_az + }) { + // Great, we found one to retain. Pick some other to drop. + if let Some(victim) = self + .intent + .get_secondary() + .iter() + .find(|n| n != &retain_secondary) + { return Some(ScheduleOptimization { sequence: self.sequence, - action: ScheduleOptimizationAction::RemoveSecondary(*secondary), + action: ScheduleOptimizationAction::RemoveSecondary(*victim), }); } } // Fall through: we didn't identify one to remove. This ought to be rare. tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", - self.intent.get_secondary() - ); + self.intent.get_secondary() + ); } else { let victim = secondary_scores .iter() @@ -993,7 +1033,7 @@ impl TenantShard { .0; return Some(ScheduleOptimization { sequence: self.sequence, - action: ScheduleOptimizationAction::RemoveSecondary(victim), + action: ScheduleOptimizationAction::RemoveSecondary(*victim), }); } } @@ -1079,12 +1119,31 @@ impl TenantShard { None => vec![], }; - let replacement = self.find_better_location::( - scheduler, - &schedule_context, - *secondary, - &exclude, - ); + let replacement = match &self.policy { + PlacementPolicy::Attached(_) => { + // Secondaries for an attached shard should be scheduled using `SecondaryShardTag` + // to avoid placing them in the preferred AZ. + self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ) + } + PlacementPolicy::Secondary => { + // In secondary-only mode, we want our secondary locations in the preferred AZ, + // so that they're ready to take over as an attached location when we transition + // into PlacementPolicy::Attached. 
+ self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ) + } + PlacementPolicy::Detached => None, + }; + assert!(replacement != Some(*secondary)); if let Some(replacement) = replacement { // We have found a candidate and confirmed that its score is preferable @@ -1806,7 +1865,7 @@ impl TenantShard { .get(&node_id) .expect("referenced node exists") .get_availability_zone_id(), - ) == self.intent.preferred_az_id.as_ref() + ) != self.intent.preferred_az_id.as_ref() }) .unwrap_or(false) } @@ -2348,6 +2407,110 @@ pub(crate) mod tests { Ok(()) } + /// Test how the optimisation code behaves with an extra secondary + #[test] + fn optimize_removes_secondary() -> anyhow::Result<()> { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let mut nodes = make_test_nodes( + 4, + &[ + az_a_tag.clone(), + az_b_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut schedule_context = ScheduleContext::default(); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(az_a_tag.clone()); + shard_a + .schedule(&mut scheduler, &mut schedule_context) + .unwrap(); + + // Attached on node 1, secondary on node 2 + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(2)]); + + // Initially optimiser is idle + assert_eq!( + shard_a.optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shard_a.optimize_secondary(&mut scheduler, &schedule_context), + None + ); + + // A spare secondary in the home AZ: it should be removed -- this is the situation when we're midway through a graceful migration, after cutting over + // to our new location + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + + // A spare secondary in the non-home AZ, and one of them is offline + shard_a.intent.push_secondary(&mut scheduler, NodeId(4)); + nodes + .get_mut(&NodeId(4)) + .unwrap() + .set_availability(NodeAvailability::Offline); + scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(4)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + + // A spare secondary when should have none + shard_a.policy = PlacementPolicy::Attached(0); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![]); + + // Check that in secondary mode, we preserve the secondary in the preferred AZ + let mut schedule_context = ScheduleContext::default(); // Fresh context, we're about to call 
schedule() + shard_a.policy = PlacementPolicy::Secondary; + shard_a + .schedule(&mut scheduler, &mut schedule_context) + .unwrap(); + assert_eq!(shard_a.intent.get_attached(), &None); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); + assert_eq!( + shard_a.optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shard_a.optimize_secondary(&mut scheduler, &schedule_context), + None + ); + + shard_a.intent.clear(&mut scheduler); + + Ok(()) + } + // Optimize til quiescent: this emulates what Service::optimize_all does, when // called repeatedly in the background. // Returns the applied optimizations @@ -2687,4 +2850,108 @@ pub(crate) mod tests { } Ok(()) } + + /// Check how the shard's scheduling behaves when in PlacementPolicy::Secondary mode. + #[test] + fn tenant_secondary_scheduling() -> anyhow::Result<()> { + let az_a = AvailabilityZone("az-a".to_string()); + let nodes = make_test_nodes( + 3, + &[ + az_a.clone(), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); + + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Secondary); + tenant_shard.intent.preferred_az_id = Some(az_a.clone()); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_none()); + + // Should have scheduled into the preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + // Switch to PlacementPolicy::Attached + tenant_shard.policy = PlacementPolicy::Attached(1); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_some()); + // Secondary should now be in non-preferred AZ + assert_ne!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + // Attached should be in preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.attached.unwrap()) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + // Switch back to PlacementPolicy::Secondary + tenant_shard.policy = PlacementPolicy::Secondary; + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_none()); + // When we picked a location to keep, we should have kept the one in the preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + tenant_shard.intent.clear(&mut 
scheduler); + + Ok(()) + } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fd7e193778bb..83a1a8761153 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -158,6 +158,9 @@ def counter(name: str) -> str: "pageserver_pitr_history_size", "pageserver_layer_bytes", "pageserver_layer_count", + "pageserver_layers_per_read_bucket", + "pageserver_layers_per_read_count", + "pageserver_layers_per_read_sum", "pageserver_visible_physical_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7e3cc1982960..7c4991ffabda 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2766,6 +2766,11 @@ def read_tenant_location_conf( log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}") raise + def heatmap_content(self, tenant_shard_id: TenantId | TenantShardId) -> Any: + path = self.tenant_dir(tenant_shard_id) / "heatmap-v1.json" + with open(path) as f: + return json.load(f) + def tenant_create( self, tenant_id: TenantId, @@ -4996,13 +5001,35 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) +# wait for subscriber to catch up with publisher def logical_replication_sync( subscriber: PgProtocol, publisher: PgProtocol, + # pass subname explicitly to avoid confusion + # when multiple subscriptions are present + subname: str, sub_dbname: str | None = None, pub_dbname: str | None = None, -) -> Lsn: +): """Wait logical replication subscriber to sync with publisher.""" + + def initial_sync(): + # first check if the subscription is active `s`=`synchronized`, `r` = `ready` + query = f"""SELECT 1 FROM pg_subscription_rel join pg_catalog.pg_subscription + on pg_subscription_rel.srsubid = pg_subscription.oid + WHERE srsubstate NOT IN ('r', 's') and subname='{subname}'""" + + if sub_dbname is not None: + res = subscriber.safe_psql(query, dbname=sub_dbname) + else: + res = subscriber.safe_psql(query) + + assert (res is None) or (len(res) == 0) + + wait_until(initial_sync) + + # wait for the subscription to catch up with current state of publisher + # caller is responsible to call checkpoint before calling this function if pub_dbname is not None: publisher_lsn = Lsn( publisher.safe_psql("SELECT pg_current_wal_flush_lsn()", dbname=pub_dbname)[0][0] @@ -5010,23 +5037,23 @@ def logical_replication_sync( else: publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - while True: + def subscriber_catch_up(): + query = f"select latest_end_lsn from pg_catalog.pg_stat_subscription where latest_end_lsn is NOT NULL and subname='{subname}'" + if sub_dbname is not None: - res = subscriber.safe_psql( - "select latest_end_lsn from pg_catalog.pg_stat_subscription", dbname=sub_dbname - )[0][0] + res = subscriber.safe_psql(query, dbname=sub_dbname) else: - res = subscriber.safe_psql( - "select latest_end_lsn from pg_catalog.pg_stat_subscription" - )[0][0] - - if res: - log.info(f"subscriber_lsn={res}") - subscriber_lsn = Lsn(res) - log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") - if subscriber_lsn >= publisher_lsn: - return subscriber_lsn - time.sleep(0.5) + res = subscriber.safe_psql(query) + + assert res is not None + + res_lsn = res[0][0] + log.info(f"subscriber_lsn={res_lsn}") + subscriber_lsn = Lsn(res_lsn) + log.info(f"Subscriber LSN={subscriber_lsn}, publisher 
LSN={publisher_lsn}") + assert subscriber_lsn >= publisher_lsn + + wait_until(subscriber_catch_up) def tenant_get_shards( diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md index 8eca056dda23..449e56e21df6 100644 --- a/test_runner/logical_repl/README.md +++ b/test_runner/logical_repl/README.md @@ -1,13 +1,18 @@ # Logical replication tests +> [!NOTE] +> Neon project should have logical replication enabled: +> +> https://neon.tech/docs/guides/logical-replication-postgres#enable-logical-replication-in-the-source-neon-project + ## Clickhouse ```bash export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb -docker compose -f clickhouse/docker-compose.yml up -d -pytest -m remote_cluster -k test_clickhouse -docker compose -f clickhouse/docker-compose.yml down +docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml up -d +./scripts/pytest -m remote_cluster -k test_clickhouse +docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml down ``` ## Debezium @@ -15,8 +20,7 @@ docker compose -f clickhouse/docker-compose.yml down ```bash export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb -docker compose -f debezium/docker-compose.yml up -d -pytest -m remote_cluster -k test_debezium -docker compose -f debezium/docker-compose.yml down - -``` \ No newline at end of file +docker compose -f test_runner/logical_repl/debezium/docker-compose.yml up -d +./scripts/pytest -m remote_cluster -k test_debezium +docker compose -f test_runner/logical_repl/debezium/docker-compose.yml down +``` diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index efc7fa59dbea..6c009440058c 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -34,16 +34,20 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): cur.execute("set log_statement = 'all'") cur.execute("create table t(x integer)") for _ in range(n_iters): - cur.execute(f"insert into t values (generate_series(1,{n_records}))") + with zenbenchmark.record_duration(f"insert into t values (generate_series(1,{n_records}))"): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") time.sleep(1) - cur.execute("vacuum t") + with zenbenchmark.record_duration("vacuum t"): + cur.execute("vacuum t") - with zenbenchmark.record_duration("test_query"): + with zenbenchmark.record_duration("SELECT count(*) from t"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) - flush_ep_to_pageserver(env, endpoint, tenant, timeline) - env.pageserver.http_client().timeline_checkpoint( - tenant, timeline, compact=False, wait_until_uploaded=True - ) + with zenbenchmark.record_duration("flush_ep_to_pageserver"): + flush_ep_to_pageserver(env, endpoint, tenant, timeline) + with zenbenchmark.record_duration("timeline_checkpoint"): + env.pageserver.http_client().timeline_checkpoint( + tenant, timeline, compact=False, wait_until_uploaded=True + ) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 9d653d1a1ef8..fdc56cc496e5 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -44,13 +44,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # 
Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") pg_bin.run_capture(["pgbench", "-c10", "-T100", "-Mprepared", endpoint.connstr()]) # Wait logical replication to sync start = time.time() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") log.info(f"Sync with master took {time.time() - start} seconds") sum_master = cast("int", endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]) diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index 76c3ad01a4a7..e5a9f17da8c2 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -247,7 +247,7 @@ def assert_all_split(): log.info(f"{shard_zero_id} timeline: {timeline_info}") # Run compaction for all tenants, restart endpoint so that on subsequent reads we will - # definitely hit pageserver for reads. This compaction passis expected to drop unwanted + # definitely hit pageserver for reads. This compaction pass is expected to drop unwanted # layers but not do any rewrites (we're still in the same generation) for tenant_id, tenant_state in tenants.items(): tenant_state.endpoint.stop() @@ -296,6 +296,16 @@ def assert_all_split(): for fut in pgbench_futs: fut.result() + # Run a full forced compaction, to detect any data corruption. + for tenant_id, tenant_state in tenants.items(): + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_compact( + shard_id, + tenant_state.timeline_id, + force_image_layer_creation=True, + force_l0_compaction=True, + ) + # Assert that some rewrites happened # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers) diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 354fc1574510..0b138bf1677a 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.66" +version = "0.10.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" +checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.103" +version = "0.9.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" +checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" dependencies = [ "cc", "libc", diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index e88d245c8ff4..a4b9eabf8e63 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -184,6 +184,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "gc_compaction_enabled": True, "gc_compaction_initial_threshold_kb": 1024000, 
"gc_compaction_ratio_percent": 200, + "image_creation_preempt_threshold": 5, } vps_http = env.storage_controller.pageserver_api() diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 2edfc884add6..f3347b594e32 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -29,6 +29,21 @@ # "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later } +PREEMPT_COMPACTION_TENANT_CONF = { + "gc_period": "5s", + "compaction_period": "5s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 1, + "image_creation_preempt_threshold": 1, + # compact more frequently + "compaction_threshold": 3, + "compaction_upper_limit": 6, + "lsn_lease_length": "0s", +} + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( @@ -36,7 +51,8 @@ [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], ) def test_pageserver_compaction_smoke( - neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol + neon_env_builder: NeonEnvBuilder, + wal_receiver_protocol: PageserverWalReceiverProtocol, ): """ This is a smoke test that compaction kicks in. The workload repeatedly churns @@ -54,7 +70,8 @@ def test_pageserver_compaction_smoke( page_cache_size=10 """ - env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF) + conf = AGGRESSIVE_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -86,9 +103,9 @@ def test_pageserver_compaction_smoke( log.info("Checking layer access metrics ...") layer_access_metric_names = [ - "pageserver_layers_visited_per_vectored_read_global_sum", - "pageserver_layers_visited_per_vectored_read_global_count", - "pageserver_layers_visited_per_vectored_read_global_bucket", + "pageserver_layers_per_read_global_sum", + "pageserver_layers_per_read_global_count", + "pageserver_layers_per_read_global_bucket", ] metrics = env.pageserver.http_client().get_metrics() @@ -96,8 +113,8 @@ def test_pageserver_compaction_smoke( layer_access_metrics = metrics.query_all(name) log.info(f"Got metrics: {layer_access_metrics}") - vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") - vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_sum = metrics.query_one("pageserver_layers_per_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_per_read_global_count") if vectored_count.value > 0: assert vectored_sum.value > 0 vectored_average = vectored_sum.value / vectored_count.value @@ -113,6 +130,41 @@ def test_pageserver_compaction_smoke( assert vectored_average < 8 +@skip_in_debug_build("only run with release build") +def test_pageserver_compaction_preempt( + neon_env_builder: NeonEnvBuilder, +): + # Ideally we should be able to do unit tests for this, but we need real Postgres + # WALs in order to do unit testing... 
+ + conf = PREEMPT_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 200000 + churn_rounds = 10 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=False) + workload.validate(env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True) + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + # ensure image layer creation gets preempted and then resumed + env.pageserver.assert_log_contains("resuming image layer creation") + + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( "with_branches", @@ -250,6 +302,9 @@ def compaction_finished(): workload.churn_rows(row_count, env.pageserver.id) # compact 3 times if mode is before_restart n_compactions = 3 if compaction_mode == "before_restart" else 1 + ps_http.timeline_compact( + tenant_id, timeline_id, force_l0_compaction=True, wait_until_uploaded=True + ) for _ in range(n_compactions): # Force refresh gc info to have gc_cutoff generated ps_http.timeline_gc(tenant_id, timeline_id, None) diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index f0878b2631d1..50a922a616b0 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -183,6 +183,7 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');") cursor.execute("CREATE TABLE t(a int)") cursor.execute("INSERT INTO t VALUES (1)") + cursor.execute("CHECKPOINT") # connect to the subscriber_db and create a subscription # Note that we need to create subscription with @@ -195,7 +196,11 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # wait for the subscription to be active logical_replication_sync( - endpoint, endpoint, sub_dbname="subscriber_db", pub_dbname="publisher_db" + endpoint, + endpoint, + "mysub", + sub_dbname="subscriber_db", + pub_dbname="publisher_db", ) # Check that replication is working diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index d7e6e9de5642..7f12c140731c 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -95,6 +95,8 @@ def endpoint_handler_build_tag(request: Request) -> Response: # mock remote_extensions spec spec: dict[str, Any] = { + "public_extensions": ["anon"], + "custom_extensions": None, "library_index": { "anon": "anon", }, diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 182f715b0ef4..6b35f3c6bb37 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -59,6 +59,9 @@ def handler(request: Request) -> Response: neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() + # The test needs LocalFs support, which is only built in testing mode. 
+ env.pageserver.is_testing_enabled_or_skip() + env.pageserver.patch_config_toml_nonrecursive( { "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api" @@ -67,6 +70,12 @@ def handler(request: Request) -> Response: env.pageserver.stop() env.pageserver.start() + # By default our tests run with a tiny shared_buffers=1MB setting. That + # doesn't allow any prefetching on v17 and above, where the new streaming + # read machinery keeps buffers pinned while prefetching them. Use a higher + # setting to enable prefetching and speed up the tests + ep_config = ["shared_buffers=64MB"] + # # Put data in vanilla pg # @@ -243,7 +252,11 @@ def validate_vanilla_equivalence(ep): # ro_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, endpoint_id="ro", tenant_id=tenant_id, lsn=last_record_lsn + branch_name=import_branch_name, + endpoint_id="ro", + tenant_id=tenant_id, + lsn=last_record_lsn, + config_lines=ep_config, ) validate_vanilla_equivalence(ro_endpoint) @@ -273,7 +286,10 @@ def validate_vanilla_equivalence(ep): # validate that we can write # rw_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, endpoint_id="rw", tenant_id=tenant_id + branch_name=import_branch_name, + endpoint_id="rw", + tenant_id=tenant_id, + config_lines=ep_config, ) rw_endpoint.safe_psql("create table othertable(values text)") rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) @@ -293,7 +309,7 @@ def validate_vanilla_equivalence(ep): ancestor_start_lsn=rw_lsn, ) br_tip_endpoint = env.endpoints.create_start( - branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config ) validate_vanilla_equivalence(br_tip_endpoint) br_tip_endpoint.safe_psql("select * from othertable") @@ -306,7 +322,10 @@ def validate_vanilla_equivalence(ep): ancestor_start_lsn=initdb_lsn, ) br_initdb_endpoint = env.endpoints.create_start( - branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + branch_name="br-initdb", + endpoint_id="br-initdb-ro", + tenant_id=tenant_id, + config_lines=ep_config, ) validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index d9043fef7fb3..0260704ebf47 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -63,7 +63,7 @@ def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): cur.execute("set statement_timeout=0") cur.execute("select create_snapshots(10000)") # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline) env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, timeline, compact=False) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 89087631092d..3a92f0d1d1f1 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -55,13 +55,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established - 
logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") # insert some data cur.execute("insert into t values (generate_series(1,1000), 0)") # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 1000 # now stop subscriber... @@ -78,7 +78,7 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr vanilla_pg.start() # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") # Check that subscribers receives all data assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 2000 @@ -148,7 +148,7 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr endpoint.start() vanilla_pg.start() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") eq_q = "select testcolumn1, testcolumn2, testcolumn3 from replication_example order by 1, 2, 3" assert vanilla_pg.safe_psql(eq_q) == endpoint.safe_psql(eq_q) log.info("rewriteheap synced") @@ -285,7 +285,7 @@ def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg: V vanilla_pg.safe_psql("create table t(a int)") connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") vanilla_pg.stop() @@ -321,13 +321,13 @@ def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg: V sk_http = sk.http_client() sk_http.configure_failpoints([("sk-pause-send", "off")]) - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2] # Check that local reads also work with endpoint.connect().cursor() as cur: cur.execute("insert into t values (3)") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3] log_path = vanilla_pg.pgdatadir / "pg.log" @@ -365,7 +365,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres) log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") vanilla_pg.stop() wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) @@ -375,7 +375,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres) # this should flush current wal page cur.execute("insert into replication_example values (3, 4)") vanilla_pg.start() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql( "select sum(somedata) from replication_example" ) == endpoint.safe_psql("select sum(somedata) from replication_example") @@ -409,18 +409,18 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): # Test simple insert, update, delete. 
But with very large values value = random_string(10_000_000) cur.execute(f"INSERT INTO reptbl VALUES (1, '{value}')") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(1, value)] # Test delete, and reinsert another value cur.execute("DELETE FROM reptbl WHERE id = 1") cur.execute(f"INSERT INTO reptbl VALUES (2, '{value}')") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] value = random_string(10_000_000) cur.execute(f"UPDATE reptbl SET largeval='{value}'") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] endpoint.stop() @@ -428,7 +428,7 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): cur = endpoint.connect().cursor() value = random_string(10_000_000) cur.execute(f"UPDATE reptbl SET largeval='{value}'") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] @@ -608,7 +608,7 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van for i in range(0, 1000): pcur.execute("INSERT into t values (%s, random()*100000)", (i,)) # wait until sub receives all data - logical_replication_sync(sub, vanilla_pg) + logical_replication_sync(sub, vanilla_pg, "sub") # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data # as flushed we'll now lose it if subscriber restars. 
That's why # logical_replication_wait_flush_lsn_sync is expected to hang while diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 028d1c2e49b8..c344f30f4d9c 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -27,6 +27,7 @@ ) from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import query_scalar, wait_until +from urllib3 import Retry if TYPE_CHECKING: from typing import Any @@ -676,16 +677,14 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu "compaction_period": "0s", } ) - client = env.pageserver.http_client() + + # Disable retries, because we'll hit code paths that can give us + # 503 and want to see that directly + client = env.pageserver.http_client(retries=Retry(status=0)) + failpoint = "before-downloading-layer-stream-pausable" client.configure_failpoints((failpoint, "pause")) - env.pageserver.allowed_errors.extend( - [ - ".*downloading failed, possibly for shutdown.*", - ] - ) - info = client.layer_map_info(env.initial_tenant, env.initial_timeline) assert len(info.delta_layers()) == 1 @@ -720,13 +719,9 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu client.configure_failpoints((failpoint, "off")) - with pytest.raises( - PageserverApiException, match="downloading failed, possibly for shutdown" - ): + with pytest.raises(PageserverApiException, match="Shutting down"): download.result() - env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*") - detach.result() client.configure_failpoints((failpoint, "pause")) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 1292682f9e3d..590093d23c37 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -443,7 +443,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): workload.write_rows(256, env.pageservers[0].id) env.pageserver.http_client().tenant_heatmap_upload(tenant_id) - def validate_heatmap(heatmap): + def validate_heatmap(heatmap, on_disk_heatmap): assert len(heatmap["timelines"]) == 1 assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id) assert len(heatmap["timelines"][0]["layers"]) > 0 @@ -452,10 +452,13 @@ def validate_heatmap(heatmap): # Each layer appears at most once assert len(set(layer["name"] for layer in layers)) == len(layers) + assert heatmap == on_disk_heatmap + # Download and inspect the heatmap that the pageserver uploaded heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_first_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_first}") - validate_heatmap(heatmap_first) + validate_heatmap(heatmap_first, heatmap_first_on_disk) # Do some more I/O to generate more layers workload.churn_rows(64, env.pageservers[0].id) @@ -463,9 +466,10 @@ def validate_heatmap(heatmap): # Ensure that another heatmap upload includes the new layers heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_second_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_second}") assert heatmap_second != heatmap_first - validate_heatmap(heatmap_second) + validate_heatmap(heatmap_second, heatmap_second_on_disk) def list_elegible_layers( diff --git 
a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py index 3f9824ee677a..229439106b82 100644 --- a/test_runner/regress/test_physical_and_logical_replicaiton.py +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -43,7 +43,7 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE s_cur.execute("select count(*) from t") assert s_cur.fetchall()[0][0] == n_records - logical_replication_sync(vanilla_pg, primary) + logical_replication_sync(vanilla_pg, primary, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records # Check that LR slot is not copied to replica @@ -87,7 +87,7 @@ def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg): s_con = secondary.connect() s_cur = s_con.cursor() - logical_replication_sync(vanilla_pg, primary) + logical_replication_sync(vanilla_pg, primary, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records s_cur.execute("select count(*) from t") diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 350fe31099fb..11a4d0920210 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -373,6 +373,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up but imports the generation number. """ + neon_env_builder.num_azs = 3 env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -409,6 +410,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up "node_secondary" ][0] + # Check that the secondary's scheduling is stable + assert env.storage_controller.reconcile_all() == 0 + # Call into storage controller to onboard the tenant generation += 1 r = virtual_ps_http.tenant_location_conf( @@ -460,6 +464,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) assert len(r["shards"]) == 1 + # Check that onboarding did not result in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + # We should see the tenant is now attached to the pageserver managed # by the sharding service origin_tenants = origin_ps.http_client().tenant_list() diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 1304d302b79c..0f4e5688a957 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -32,6 +32,12 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 env = neon_env_builder.init_start() + # We restart pageserver(s), which will cause storage controller + # requests to fail and warn. 
+ env.storage_controller.allowed_errors.append(".*management API still failed.*") + env.storage_controller.allowed_errors.append( ".*Reconcile error.*error sending request for url.*" ) tenant_id = env.initial_tenant timeline_id = env.initial_timeline branch = "main" @@ -65,6 +71,10 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: else: tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + # Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable + # as it won't overlap with migrations + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 645572da8e79..849d4f024d65 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -3,7 +3,7 @@ import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar, wait_until @@ -208,7 +208,6 @@ def insert_data(pub, start): # wake the sub and ensure that it catches up with the new data sub.start(create_test_user=True) with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: - logical_replication_sync(sub, pub) wait_until(check_that_changes_propagated) scur.execute("SELECT count(*) FROM t") res = scur.fetchall() diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index bec827058270..306e9716578e 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -582,12 +582,12 @@ def worker(): # This is expected: we are injecting chaos, API calls will sometimes fail. # TODO: can we narrow this to assert we are getting friendly 503s? 
log.info(f"Iteration error, will retry: {e}") - shutdown.wait(random.random()) + shutdown.wait(random.random() * 0.5) except requests.exceptions.RetryError as e: # Retryable error repeated more times than `requests` is configured to tolerate, this # is expected when a pageserver remains unavailable for a couple seconds log.info(f"Iteration error, will retry: {e}") - shutdown.wait(random.random()) + shutdown.wait(random.random() * 0.5) except Exception as e: log.warning( f"Unexpected worker exception (current timeline {state.timeline_id}): {e}" @@ -632,7 +632,7 @@ def worker(): # Make sure we're up for as long as we spent restarting, to ensure operations can make progress log.info(f"Staying alive for {restart_duration}s") - time.sleep(restart_duration) + time.sleep(restart_duration * 2) else: # Migrate our tenant between pageservers origin_ps = env.get_tenant_pageserver(tenant_shard_id) @@ -651,7 +651,7 @@ def worker(): # Sanity check that during our run we did exercise some full timeline lifecycles, in case # one of our workers got stuck - assert len(timelines_deleted) > 10 + assert len(timelines_deleted) > 5 # That no invariant-violations were reported by workers assert violations == [] diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3cf7ce1afab7..86d9ea96ebb9 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3cf7ce1afab75027716d14223f95ddb300754162 +Subproject commit 86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index b654fa88b6fd..8dfd5a7030d3 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit b654fa88b6fd2ad24a03a14a7cd417ec66e518f9 +Subproject commit 8dfd5a7030d3e8a98b60265ebe045788892ac7f3 diff --git a/vendor/revisions.json b/vendor/revisions.json index 982f53769238..efddaef46a4f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,11 +1,11 @@ { "v17": [ "17.2", - "b654fa88b6fd2ad24a03a14a7cd417ec66e518f9" + "8dfd5a7030d3e8a98b60265ebe045788892ac7f3" ], "v16": [ "16.6", - "3cf7ce1afab75027716d14223f95ddb300754162" + "86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c" ], "v15": [ "15.10", diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index a3dffa8f195a..2c65401154a8 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -92,6 +92,7 @@ tonic = { version = "0.12", default-features = false, features = ["codegen", "pr tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-log = { version = "0.2" } url = { version = "2", features = ["serde"] } zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["derive", "serde"] }