From 570f7a3e3725afeebd035b0c0cb544b75308749b Mon Sep 17 00:00:00 2001 From: katmayb Date: Mon, 7 Apr 2025 15:34:27 -0400 Subject: [PATCH] Docs for PCR on Cloud cluster v25.2 --- .../v25.1/sidebar-data/cloud-deployments.json | 6 + .../physical-cluster-replication.md | 5 +- .../v25.2/sidebar-data/cloud-deployments.json | 6 + .../physical-cluster-replication.md | 256 ++++++++++++++++++ 4 files changed, 272 insertions(+), 1 deletion(-) create mode 100644 src/current/cockroachcloud/physical-cluster-replication.md diff --git a/src/current/_includes/v25.1/sidebar-data/cloud-deployments.json b/src/current/_includes/v25.1/sidebar-data/cloud-deployments.json index 6eeec003a17..93cb9b6b5c4 100644 --- a/src/current/_includes/v25.1/sidebar-data/cloud-deployments.json +++ b/src/current/_includes/v25.1/sidebar-data/cloud-deployments.json @@ -563,6 +563,12 @@ } ] }, + { + "title": "Physical Cluster Replication", + "urls": [ + "/cockroachcloud/physical-cluster-replication.html" + ] + }, { "title": "Billing Management", "urls": [ diff --git a/src/current/_includes/v25.2/known-limitations/physical-cluster-replication.md b/src/current/_includes/v25.2/known-limitations/physical-cluster-replication.md index c8a872968ef..ed9737fa4e6 100644 --- a/src/current/_includes/v25.2/known-limitations/physical-cluster-replication.md +++ b/src/current/_includes/v25.2/known-limitations/physical-cluster-replication.md @@ -1,3 +1,6 @@ -- Physical cluster replication is supported in CockroachDB {{ site.data.products.core }} clusters on v23.2 or later. The primary cluster can be a [new]({% link {{ page.version.version }}/set-up-physical-cluster-replication.md %}#step-1-create-the-primary-cluster) or [existing]({% link {{ page.version.version }}/set-up-physical-cluster-replication.md %}#set-up-pcr-from-an-existing-cluster) cluster. 
The standby cluster must be a [new cluster started with the `--virtualized-empty` flag]({% link {{ page.version.version }}/set-up-physical-cluster-replication.md %}#step-2-create-the-standby-cluster). +- Physical cluster replication is supported in: + - CockroachDB self-hosted in new clusters on v23.2 or above. Physical Cluster Replication cannot be enabled on clusters that have been upgraded from a previous version of CockroachDB. + - CockroachDB {{ site.data.products.advanced }} in new clusters on v25.1 with the `"support_physical_cluster_replication"` field enabled. +- The primary and standby clusters must have the same [zone configurations]({% link {{ page.version.version }}/configure-replication-zones.md %}) in CockroachDB self-hosted. - The primary and standby clusters must have the same [zone configurations]({% link {{ page.version.version }}/configure-replication-zones.md %}). - Before failover to the standby, the standby cluster does not support running [backups]({% link {{ page.version.version }}/backup-and-restore-overview.md %}) or [changefeeds]({% link {{ page.version.version }}/change-data-capture-overview.md %}). 
diff --git a/src/current/_includes/v25.2/sidebar-data/cloud-deployments.json b/src/current/_includes/v25.2/sidebar-data/cloud-deployments.json index 6eeec003a17..93cb9b6b5c4 100644 --- a/src/current/_includes/v25.2/sidebar-data/cloud-deployments.json +++ b/src/current/_includes/v25.2/sidebar-data/cloud-deployments.json @@ -563,6 +563,12 @@ } ] }, + { + "title": "Physical Cluster Replication", + "urls": [ + "/cockroachcloud/physical-cluster-replication.html" + ] + }, { "title": "Billing Management", "urls": [ diff --git a/src/current/cockroachcloud/physical-cluster-replication.md b/src/current/cockroachcloud/physical-cluster-replication.md new file mode 100644 index 00000000000..ff644194c12 --- /dev/null +++ b/src/current/cockroachcloud/physical-cluster-replication.md @@ -0,0 +1,256 @@ +--- +title: Physical Cluster Replication +summary: Set up physical cluster replication (PCR) in a Cloud deployment. +toc: true +--- + +{{site.data.alerts.callout_info}} +{% include feature-phases/preview.md %} +{{site.data.alerts.end}} + +CockroachDB **physical cluster replication (PCR)** continuously sends all data at the byte level from a _primary_ cluster to an independent _standby_ cluster. Existing data and ongoing changes on the active primary cluster, which is serving application data, replicate asynchronously to the passive standby cluster. + +In a disaster recovery scenario, you can [fail over](#step-4-fail-over-to-the-standby-cluster) from the unavailable primary cluster to the standby cluster. This will stop the PCR stream, reset the standby cluster to a point in time where all ingested data is consistent, and mark the standby as ready to accept application traffic. 
+ +## Set up PCR on CockroachDB {{ site.data.products.advanced }} + +In this guide, you'll use the [{{ site.data.products.cloud }} API]({% link cockroachcloud/cloud-api.md %}) to set up PCR from a primary cluster to a standby cluster, monitor the PCR stream, and fail over from the primary to the standby cluster. + +{{site.data.alerts.callout_info}} +PCR is supported on CockroachDB {{ site.data.products.advanced }} and CockroachDB self-hosted clusters. For a guide to setting up PCR on CockroachDB self-hosted, refer to the [Set Up Physical Cluster Replication]({% link {{ site.current_cloud_version }}/set-up-physical-cluster-replication.md %}) tutorial. +{{site.data.alerts.end}} + +### Before you begin + +You'll need the following: + +- [Cloud API Access]({% link cockroachcloud/managing-access.md %}#api-access). + + To set up and manage PCR on CockroachDB {{ site.data.products.advanced }} clusters, you'll use the `'https://cockroachlabs.cloud/api/v1/replication-streams'` endpoint. Access to the `replication-streams` endpoint requires a valid CockroachDB {{ site.data.products.cloud }} [service account]({% link cockroachcloud/managing-access.md %}#manage-service-accounts) with the correct permissions. + + The following describes the required roles for the `replication-streams` endpoint methods: + + Method | Required roles | Description + -------+----------------+------------ + `POST` | [Cluster Administrator]({% link cockroachcloud/authorization.md %}#cluster-administrator) | Create a PCR stream. + `GET` | [Cluster Administrator]({% link cockroachcloud/authorization.md %}#cluster-administrator), [Cluster Operator]({% link cockroachcloud/authorization.md %}#cluster-operator), [Cluster Developer]({% link cockroachcloud/authorization.md %}#cluster-developer) | Retrieve information for the PCR stream. + `PATCH` | [Cluster Administrator]({% link cockroachcloud/authorization.md %}#cluster-administrator) | Update the PCR stream to fail over. 
+ + {{site.data.alerts.callout_success}} + We recommend creating service accounts with the [principle of least privilege](https://wikipedia.org/wiki/Principle_of_least_privilege), and giving each application that accesses the API its own service account and API key. This allows fine-grained access to the cluster and PCR streams. + {{site.data.alerts.end}} + +- Read the [Configuration](#configuration) section for details on cloud provider and region setup. + +### Configuration + +To set up PCR successfully: + +- Clusters must be in the same cloud (AWS, GCP, or Azure). +- Clusters must be single [region]({% link cockroachcloud/regions.md %}) (multiple availability zones per cluster are supported). +- The primary and standby clusters in AWS and Azure must be in different regions. +- The primary and standby clusters in GCP can be in the same region, but must not have overlapping CIDR ranges. +- Clusters can have different [node topology]({% link cockroachcloud/plan-your-cluster-advanced.md %}#cluster-topology) and [hardware configurations]({% link cockroachcloud/plan-your-cluster-advanced.md %}#cluster-sizing-and-scaling). For disaster recovery purposes (failover and redirecting application traffic to a standby), we recommend configuring the primary and standby clusters with similar hardware. + +### Step 1. Create the clusters + +To use PCR, you must create the **standby** cluster with the `support_physical_cluster_replication` field set to `true`, which indicates that a cluster should start using an architecture that supports PCR. For details on supported cluster cloud provider and region setup, refer to [Configuration](#configuration). + +1. 
Send a `POST` request to create the primary cluster: + + {% include_cached copy-clipboard.html %} + ~~~ shell + curl --location --request POST 'https://cockroachlabs.cloud/api/v1/clusters' --header "Authorization: Bearer api_secret_key" --header 'Content-Type: application/json' --data '{"name": "primary_cluster_name", "provider": "AWS", "spec": {"dedicated": {"cockroachVersion": "v24.3", "hardware": {"disk_iops": 0, "machine_spec": {"num_virtual_cpus": 4}, "storage_gib": 16}, "region_nodes": {"us-east-1": 3}, "support_physical_cluster_replication": true}}}' + ~~~ + + Ensure that you replace each of the values for the cluster specification as per your requirements. For details on the cluster specifications, refer to [Create a cluster]({% link cockroachcloud/cloud-api.md %}#create-a-cluster). Also, replace `api_secret_key` with your API secret key. You can include `support_physical_cluster_replication` set to `true`, but it is not a requirement for the primary cluster. + +1. Send a `POST` request to create the standby cluster that includes your necessary cluster specification. Ensure that you include `support_physical_cluster_replication` set to `true`: + + {% include_cached copy-clipboard.html %} + ~~~ shell + curl --location --request POST 'https://cockroachlabs.cloud/api/v1/clusters' --header "Authorization: Bearer api_secret_key" --header 'Content-Type: application/json' --data '{"name": "standby_cluster_name", "provider": "AWS", "spec": {"dedicated": {"cockroachVersion": "v24.3", "hardware": {"disk_iops": 0, "machine_spec": {"num_virtual_cpus": 4}, "storage_gib": 16}, "region_nodes": {"us-east-2": 3}, "support_physical_cluster_replication": true}}}' + ~~~ + + If you're creating clusters in AWS or Azure, you must start the primary and standby clusters in different regions. + +{{site.data.alerts.callout_success}} +We recommend [enabling Prometheus metrics export]({% link cockroachcloud/export-metrics.md %}) on your cluster before starting a PCR stream. 
For details on metrics to track, refer to [Monitor the PCR stream](#step-3-monitor-the-pcr-stream). +{{site.data.alerts.end}} + +### Step 2. Start the PCR stream + +{{site.data.alerts.callout_info}} +The standby cluster must be empty upon starting PCR. It is possible to write to both clusters before initiating the PCR stream; however, we recommend keeping the standby empty, that is, not writing to the standby prior to starting PCR. When you initiate the PCR stream, CockroachDB {{ site.data.products.cloud }} will take a full cluster backup of the standby cluster, delete all data from the standby, and then start the PCR stream. +{{site.data.alerts.end}} + +With the primary and standby clusters created, you can now start the PCR stream. + +Send a `POST` request to the `/v1/replication-streams` endpoint to start the PCR stream: + +{% include_cached copy-clipboard.html %} +~~~ shell +curl --location --request POST 'https://cockroachlabs.cloud/api/v1/replication-streams' --header "Authorization: Bearer api_secret_key" --header 'Content-Type: application/json' --data '{"sourceClusterId": "primary_cluster_id","targetClusterId": "standby_cluster_id"}' +~~~ + +Replace: + +- `api_secret_key` with your API secret key. +- `primary_cluster_id` with the cluster ID returned after creating the primary cluster. +- `standby_cluster_id` with the cluster ID returned after creating the standby cluster. + +You can find the cluster IDs in the cluster creation output, or in the URL of the single cluster overview page: `https://cockroachlabs.cloud/cluster/{your_cluster_id}/overview`. The ID will resemble `ad1e8630-729a-40f3-87e4-9f72eb3347a0`. + +Once you have started PCR, the standby cluster cannot accept reads or writes; therefore, the [Cloud Console]({% link cockroachcloud/cluster-overview-page.md %}) and SQL shell will be unavailable prior to failover. 
+ +~~~ json +{ + "id": "7487d7a6-868b-4c6f-aa60-cc306cc525fe", + "status": "STARTING", + "source_cluster_id": "ad1e8630-729a-40f3-87e4-9f72eb3347a0", + "target_cluster_id": "e64b56c3-09f2-42ee-9f5a-d9b13985a897", + "created_at": "2025-01-13T16:41:20.467781Z", + "retained_time": null, + "replicated_time": null, + "failover_at": null, + "activation_at": null +} +~~~ + +To start PCR between clusters, CockroachDB {{ site.data.products.cloud }} sets up VPC peering between clusters and validates the connectivity. As a result, it may take around 5 minutes to initialize the PCR job, during which the status will be `STARTING`. + +### Step 3. Monitor the PCR stream + +To monitor the current status of the PCR stream, send a `GET` request to the `/v1/replication-streams` endpoint along with the primary cluster ID, the standby cluster ID, or the ID of the PCR stream: + +{% include_cached copy-clipboard.html %} +~~~ shell +curl --location --request GET "https://cockroachlabs.cloud/api/v1/replication-streams?cluster_id=primary/standby_cluster_id" --header "Authorization: Bearer api_secret_key" --header 'Content-Type: application/json' +~~~ + +Replace: + +- `api_secret_key` with your API secret key. +- `primary/standby_cluster_id` with the primary or standby cluster's ID. + +This will return: + +~~~json +{ + "replication_streams": [ + { + "id": "7487d7a6-868b-4c6f-aa60-cc306cc525fe", + "status": "REPLICATING", + "source_cluster_id": "ad1e8630-729a-40f3-87e4-9f72eb3347a0", + "target_cluster_id": "e64b56c3-09f2-42ee-9f5a-d9b13985a897", + "created_at": "2025-01-13T16:41:20.467781Z", + "retained_time": "2025-01-13T19:35:14.472670Z", + "replicated_time": "2025-01-13T19:46:15Z", + "failover_at": null, + "activation_at": null + } + ] +} +~~~ + +- `id`: The ID of the PCR stream. +- `status`: The status of the PCR stream. For descriptions, refer to [Status](#status). +- `source_cluster_id`: The cluster ID of the primary cluster. +- `target_cluster_id`: The cluster ID of the standby cluster. 
+- `created_at`: The timestamp when the PCR stream was started. +- `retained_time`: The timestamp indicating the lower bound that the PCR stream can fail over to. The tracked replicated time and the advancing [protected timestamp]({% link {{ site.current_cloud_version }}/architecture/storage-layer.md %}#protected-timestamps) allow PCR to also track [_retained time_](#technical-reference). +- `replicated_time`: The latest time at which the standby cluster has consistent data. +- `failover_at`: The requested timestamp for failover. This will return `null` until you have requested failover. If you used `"status": "FAILING_OVER"` to initiate the failover and omitted `failover_at`, the failover time will default to the latest consistent replicated time. For more details, refer to [Fail over to the standby cluster](#step-4-fail-over-to-the-standby-cluster). +- `activation_at`: The CockroachDB system time at which failover is finalized, which could be different from the time that failover was requested. This field will return a value when the PCR stream is in [`COMPLETED` status](#status). + +For details on the returned statuses, refer to the [CockroachDB {{ site.data.products.cloud }} API reference](https://www.cockroachlabs.com/docs/api/cloud/v1). + +#### Status + +Status | Description +-------+------------ +`STARTING` | Setting up VPC peering between clusters and validating the connectivity. +`REPLICATING` | Completing an initial scan and then continuing ongoing replication between the primary and standby clusters. +`FAILING_OVER` | Initiating the failover from the primary to the standby cluster. +`COMPLETED` | The failover is complete and the standby cluster is now independent from the primary cluster. + +#### Metrics + +For continual monitoring of PCR, track the following metrics with [Prometheus]({% link cockroachcloud/export-metrics.md %}): + +- `physical_replication.logical_bytes`: The logical bytes (the sum of all keys and values) ingested by all PCR streams. 
+- `physical_replication.sst_bytes`: The SST bytes (compressed) sent to the KV layer by all PCR streams. +- `physical_replication.replicated_time_seconds`: The replicated time of the PCR stream in seconds since the Unix epoch. + +### Step 4. Fail over to the standby cluster + +Failing over from the primary cluster to the standby cluster will stop the PCR stream, reset the standby cluster to a point in time where all ingested data is consistent, and mark the standby as ready to accept application traffic. You can schedule the failover to: + +- The latest consistent time. +- A time in the past within the [`retained_time`](#technical-reference). +- A time up to 1 hour in the future. + +To specify a timestamp, send a `PATCH` request to the `/v1/replication-streams` endpoint along with the primary cluster, standby cluster, or the ID of the PCR stream. Include the `failover_at` timestamp and the `"status": "FAILING_OVER"` field: + +{% include_cached copy-clipboard.html %} +~~~ shell +curl --location --request PATCH "https://cockroachlabs.cloud/api/v1/replication-streams/7487d7a6-868b-4c6f-aa60-cc306cc525fe" --header "Authorization: Bearer api_secret_key" --header 'Content-Type:application/json' --data '{"status": "FAILING_OVER", "failover_at": "2025-01-13T19:35:14.472670Z"}' +~~~ + +To fail over to the latest consistent time, you only need to include `"status": "FAILING_OVER"` in your request with one of the cluster IDs or PCR stream ID: + +{% include_cached copy-clipboard.html %} +~~~ shell +curl --location --request PATCH "https://cockroachlabs.cloud/api/v1/replication-streams/7487d7a6-868b-4c6f-aa60-cc306cc525fe" --header "Authorization: Bearer api_secret_key" --header 'Content-Type:application/json' --data '{"status": "FAILING_OVER"}' +~~~ +~~~json +{ + "id": "f1ea3b5a-5b9b-4101-9db4-780734554b14", + "status": "FAILING_OVER", + "source_cluster_id": "560cc798-7881-4a5b-baf0-321354a42c19", + "target_cluster_id": "24a9d47c-418c-4abc-940b-d2f665d95715", + "created_at": 
"2025-01-23T16:54:15.926325Z", + "retained_time": null, + "replicated_time": null, + "failover_at": null, + "activation_at": null +} +~~~ + +After the failover is complete, both clusters can receive traffic and operate as separate clusters. It is necessary to redirect application traffic manually. + +{{site.data.alerts.callout_info}} +PCR is on the cluster level, which means that the job also replicates all system tables. Users that need to access the standby cluster after failover should use the user roles for the primary cluster, because the standby cluster is a copy of the primary cluster. PCR overwrites all previous system tables on the standby cluster. +{{site.data.alerts.end}} + +### Fail back to the primary cluster + +To fail back from the standby to the primary cluster, start another PCR stream with the standby cluster as the `sourceClusterId` and the original primary cluster as the `targetClusterId`. + +## Technical reference + +The replication happens at the byte level, which means that the job is unaware of databases, tables, row boundaries, and so on. However, when a [failover](#step-4-fail-over-to-the-standby-cluster) to the standby cluster is initiated, the PCR job ensures that the cluster is in a transactionally consistent state as of a certain point in time. Beyond the application data, the job will also replicate users, privileges, and schema changes. + +At startup, the PCR job will set up VPC peering between the primary and standby {{ site.data.products.advanced }} clusters and validate the connectivity. + +During the job, [rangefeeds]({% link {{ site.current_cloud_version }}/create-and-configure-changefeeds.md %}#enable-rangefeeds) are periodically emitting _resolved timestamps_, which is the time where the ingested data is known to be consistent. Resolved timestamps provide a guarantee that there are no new writes from before that timestamp. 
This allows the [protected timestamp]({% link {{ site.current_cloud_version }}/architecture/storage-layer.md %}#protected-timestamps) to move forward as the replicated timestamp advances, which permits the [garbage collection]({% link {{ site.current_cloud_version }}/architecture/storage-layer.md %}#garbage-collection) to continue as the PCR stream on the standby cluster advances. + +{{site.data.alerts.callout_info}} +If the primary cluster does not receive replicated time information from the standby after 24 hours, it cancels the replication job. This ensures that an inactive replication job will not prevent garbage collection. +{{site.data.alerts.end}} + +The tracked replicated time and the advancing protected timestamp provide the PCR job with enough information to track _retained time_, which is a timestamp in the past indicating the lower bound that the PCR stream could fail over to. Therefore, the _failover window_ for a PCR stream falls between the retained time and the replicated time. + +Timeline showing how the failover window is between the retained time and replicated time. + +_Replication lag_ is the time between the most up-to-date replicated time and the actual time. While the PCR stream keeps as current as possible to the actual time, this replication lag window is where there is potential for data loss. + +For the [failover process](#step-4-fail-over-to-the-standby-cluster), the standby cluster waits until it has reached the specified failover time, which can be in the past (up to the retained time), the latest consistent timestamp, or in the future (up to 1 hour). Once that timestamp has been reached, the PCR stream stops and any data in the standby cluster that is **above** the failover time is removed. Depending on how much data the standby needs to revert, this can affect the duration of [RTO (recovery time objective)]({% link {{ site.current_cloud_version }}/disaster-recovery-overview.md %}). 
+ +After reverting any necessary data, the standby cluster is promoted as available to serve traffic. + +## See also + +- [Physical Cluster Replication Overview]({% link {{ site.current_cloud_version }}/physical-cluster-replication-overview.md %}) +- [CockroachDB {{ site.data.products.cloud }} API reference](https://www.cockroachlabs.com/docs/api/cloud/v1) +- [Disaster Recovery Overview]({% link {{ site.current_cloud_version }}/disaster-recovery-overview.md %}) \ No newline at end of file