diff --git a/.cspell.json b/.cspell.json
index 0b829d6e4..91ad69188 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -1,16 +1,8 @@
 {
   "version": "0.1",
   "allowCompoundWords": true,
-  "enabledLanguageIds": [
-    "json",
-    "jsonc",
-    "markdown",
-    "yaml",
-    "yml"
-  ],
-  "ignoreRegExpList": [
-    "/'s\\b/"
-  ],
+  "enabledLanguageIds": ["json", "jsonc", "markdown", "yaml", "yml"],
+  "ignoreRegExpList": ["/'s\\b/"],
   "ignoreWords": [
     "AGE-SECRET-KEY-1KTYK6RVLN5TAPE7VF6FQQSKZ9HWWCDSKUGXXNUQDWZ7XXT5YK5LSF3UTKQ",
     "FPpLvZyAdAmuzc3N",
@@ -112,7 +104,10 @@
     "favourite",
     "WPUE",
     "wsbtpg",
-    "uxqf"
+    "uxqf",
+    "xvjf",
+    "initdb",
+    "creds"
   ],
   "language": "en",
   "words": [
@@ -179,6 +174,7 @@
     "prio",
     "rabbitmq",
     "rbac",
+    "rclone",
     "redkubes",
     "rego",
     "repos",
diff --git a/docs/for-ops/disaster-recovery/gitea.md b/docs/for-ops/disaster-recovery/gitea.md
new file mode 100644
index 000000000..12a4fe88f
--- /dev/null
+++ b/docs/for-ops/disaster-recovery/gitea.md
@@ -0,0 +1,78 @@
---
slug: gitea
title: Gitea repositories and database
sidebar_label: Gitea
---

## Introduction

Gitea stores the platform configuration (in the `values` repository), the workload catalog (in the `charts` repository), and user-created repositories.

The recovery procedure described here uses the application-level backup of Gitea, i.e. the `gitea dump` command. This includes a current SQL dump of the database as well as all repositories and data. However, the [Gitea documentation](https://docs.gitea.com/administration/backup-and-restore) recommends different methods for restoring the database, due to potential compatibility issues.

A restore from this backup is advised if only Gitea has been affected by a severe operational event leading to data corruption or loss. It is also possible to restore the complete database or single repositories. Be aware that after a partial restore there may be mismatches between the repository information and the database.

## Retrieving backups

Backups are uploaded to the configured object storage bucket and are also retained on a local volume for one day. After the local retention period has expired, archives can be retrieved from the remote storage.

Note that `rclone` is installed during the first upload of a Gitea backup. If it is not present, it can be obtained from the releases page at https://github.com/rclone/rclone/releases/. Variables such as `$BUCKET_NAME` and the storage authentication are pre-configured in the container, so they do not need to be changed.

```sh
##
## In the local terminal
##
kubectl exec -it -n gitea gitea-0 -- /bin/bash

##
## The following to be run in the remote container
##

## If needed, obtain and install Rclone
mkdir -p /backup/.bin
cd /backup/.bin
curl -fsSL -o rclone.zip https://github.com/rclone/rclone/releases/download/v1.69.0/rclone-v1.69.0-linux-amd64.zip
unzip -oj rclone.zip
cd /backup

## Optional, not required if backup is available locally
.bin/rclone lsf gitea:/$BUCKET_NAME                                  # List files
.bin/rclone copy gitea:/$BUCKET_NAME/<backup-name>.tar.bz2 /backup/  # Retrieve file from remote

## Extract the backup
mkdir restore
tar xvjf <backup-name>.tar.bz2 -C restore
cd restore
```

## Restoring a single repository

Repositories are stored in the mounted container path `/data/git/gitea-repositories`, with the owning user or organization as a subdirectory.
To restore a single repository, find the backup in the backup's `data/repos/` directory and copy it into the appropriate owner subdirectory under `/data/git/gitea-repositories/`.

Note that it is not recommended to restore the `otomi/values` repository with this method after restoring a full cluster.

```sh
## ... commands above to obtain and extract the backup
cp -R repos/otomi/charts.git /data/git/gitea-repositories/otomi/
```

## Other assets

Gitea file assets such as avatar images are found in the `data` directory of the backup. Similarly, they can be copied into the container's `/data/` directory as needed, e.g.

```sh
## ... commands above to obtain and extract the backup
cp -R data/avatars /data/
```

## Restoring the database

For restoring the Gitea database, refer to the [platform database instructions](platform-databases.md).

## Cleaning up

Remove any extracted files from the local backup directory to free up space. They are not removed automatically; only compressed backups with the `.tar.bz2` extension are cleaned up after one day.

```sh
cd /backup
rm -R restore
```
diff --git a/docs/for-ops/disaster-recovery/overview.md b/docs/for-ops/disaster-recovery/overview.md
new file mode 100644
index 000000000..d96aaeb86
--- /dev/null
+++ b/docs/for-ops/disaster-recovery/overview.md
@@ -0,0 +1,35 @@
---
slug: overview
title: Disaster Recovery Overview
sidebar_label: Overview
---

## Prerequisites

This section covers potential scenarios in which a complete or partial restore of the platform is required.

This guide has the following prerequisites and limitations, which should be checked regularly:

1. The following items should be backed up regularly by the platform administrator:
   - The Kubernetes secret ending in "-wildcard-cert" in namespace "istio-system" (if installed via the Linode cloud console, or when using your own certificate).
   - The Kubernetes secret "otomi-sops-secrets" in namespace "otomi-pipelines".
   - A download of the complete values in Platform -> Maintenance. Depending on whether these are downloaded with or without secrets, some passwords might have to be reset after recovery.
   - Optionally, manual backups of the databases should be taken, as covered in this guide for the CloudNativePG operator.

2. Object storage needs to be set up for all backup types referred to. Credentials should be added to Platform Settings -> Object Storage.

3. All backup types should be activated in Platform Settings -> Backup.

4. This guide does not cover the partial or complete loss of attached object storage. For production environments, it is advised to set up additional object storage in a different region, to which all contents of the platform object storage are mirrored and from which they can be retrieved in the event of accidental deletion, data center availability issues, etc. The transfer to and from these remote storage locations is not covered in this guide.

5. Workloads may store data in local storage, object storage, different types of databases, message queues, etc. The backup and recovery strategy for workload storage is not covered here.

6. Currently it is not supported to reinstall a cluster in-place that has been provisioned directly using the Linode API or Console. Such an LKE cluster must instead be reprovisioned with the application platform through a Helm install. However, since the cluster ID changes, the `domainSuffix` will also change. Adjustments need to be made to the values file before the restore.
Also, you will need a domain name set up with a DNS provider supported by App Platform, and the provider credentials should be added to the values file.

7. All instructions assume you are familiar with essential Kubernetes tools such as `kubectl` and have access to the Kubernetes API. Using TUI applications such as `k9s` from the administration terminal is strongly recommended.

## Guides

* [Gitea](gitea.md): Restoring the platform's Gitea database and repositories from the application backup
* [Databases](platform-databases.md): Backup and restore of the CNPG databases
* [Reinstall](platform-reinstall.md): Restoring the complete platform, including settings and data
diff --git a/docs/for-ops/disaster-recovery/platform-databases.md b/docs/for-ops/disaster-recovery/platform-databases.md
new file mode 100644
index 000000000..caefb5145
--- /dev/null
+++ b/docs/for-ops/disaster-recovery/platform-databases.md
@@ -0,0 +1,399 @@
---
slug: databases
title: Restoring platform databases
sidebar_label: Databases
---

It is generally recommended to become familiar with the [CNPG documentation](https://cloudnative-pg.io/documentation/current/recovery/) on how to restore a PostgreSQL database. The steps here are written specifically for App Platform for LKE.

## Initial notes

Changes to the `values` repository can usually be made through the Gitea UI after signing in with the `platform-admin` user. As this requires both Keycloak and Gitea to operate normally, the risk can be reduced by creating an application token and pulling/pushing changes to a local clone of the repository instead. In Gitea, go to the user settings, open the `Applications` tab, enter a token name, and select `repo` as the scope. After creating the token, you can include it in the repository URL, e.g.

```sh
git clone https://<token>@gitea.example.com/otomi/values.git
```

In the event that the platform-critical services Gitea and Keycloak are not able to start, required changes to the database configuration can be applied directly in the following Argo CD applications in the `argocd` namespace. These changes persist and are synchronized into the cluster until the next Tekton pipeline run overwrites them:

* Gitea database: `gitea-gitea-otomi-db`
* Keycloak database: `keycloak-keycloak-otomi-db`

Where applicable, in these manifests the `initdb` section in `clusterSpec.bootstrap` can be replaced with `recovery` and `externalClusters`, just as instructed below. Note that `recovery` and `externalClusters` do not need to be reflected in the values file later, since they are only considered when initializing the cluster; even if Tekton reverts these changes, they no longer have an effect after a successful recovery.

## Regular recovery with backup in same cluster

Follow this procedure if the database has gotten into an unhealthy state, e.g. because of volume filesystem corruption. For reverting undesired updates, also consider the additional instructions for a point-in-time recovery described in the following sections.

Recovering any of the platform databases should be performed in the following order:

1. Note the name of the `Backup` resource that you intend to run the recovery from.

2. Make adjustments to the values as described in this section. This needs to be done within the values repository directly, since these settings are not exposed via the platform API.

3. Shut down the service accessing the database (see below).

4. Halt ArgoCD auto-sync and delete the database `Cluster` resource.
5. Re-enable ArgoCD sync and restart the dependent services.

6. Re-enable the backup disabled in step 2. This is also possible via the Console.

### Listing backup resources

Available backups can be listed using the following command. Consider only `completed` ones for recovery.
Note that the timestamps in the backup names are universal time (UTC).

```sh
kubectl get backup -n <namespace>
```
where `<namespace>` is to be replaced with `gitea`, `harbor`, or `keycloak`.

### Adjustments to the backup configuration

After the recovery, new backups will be created. To avoid accidental mixing and overwriting of backups, CloudNativePG does not allow the new backups and the recovery source to be in the same location. Therefore, the backups should be temporarily disabled, and the path suffix (the directory inside the object storage) has to be adjusted.

Inside `env/settings.yaml`, locate the path `platformBackups.database.<app>` (where `<app>` is either `gitea`, `harbor`, or `keycloak`) and set the values `enabled: false` and `pathSuffix: <app>-1`. The path suffix may also be set to something completely different, but must not exist in the object storage.

Example:

```yaml
# ...
platformBackups:
  database:
    gitea:
      enabled: false
      retentionPolicy: 7d
      schedule: 0 0 * * *
      pathSuffix: gitea-1
# ...
```

### Adjustments to the database configuration

The following change only has an effect on an initial database cluster. Therefore it can be made ahead of shutting down platform-critical services.

In the file `env/databases/<app>.yaml`, update the structure of `databases.<app>.recovery` as follows, depending on the app, inserting the backup name as determined above:

For Gitea:
```yaml
databases:
  gitea:
    # ...
    recovery:
      backup:
        name: <backup-name>
      database: gitea
      owner: gitea
      secret:
        name: gitea-db-secret
```

For Harbor:
```yaml
databases:
  harbor:
    # ...
    recovery:
      backup:
        name: <backup-name>
      database: registry
      owner: harbor
```

For Keycloak:
```yaml
databases:
  keycloak:
    # ...
    recovery:
      backup:
        name: <backup-name>
      database: keycloak
      owner: keycloak
```

Note that ArgoCD may show a sync error, pointing out that there are multiple `bootstrap` configurations on an existing database cluster. This will be resolved in the following steps.

### Shutting down services

Check the Tekton pipelines to ensure that the values changes have been deployed as expected. After this, during a backup or recovery of the database, the application should be shut down to avoid any write operations leading to inconsistencies.
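As a quick sketch, the pipeline status can also be checked from the terminal; this assumes the Tekton `PipelineRun` resources live in the `otomi-pipelines` namespace (the namespace used for the platform pipelines):

```sh
## List the Tekton pipeline runs, newest last (assumed namespace: otomi-pipelines)
kubectl get pipelineruns -n otomi-pipelines --sort-by=.metadata.creationTimestamp
## The most recent run should report SUCCEEDED=True before continuing with the shutdown
```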
For temporarily disabling Gitea:
```sh
## Disable ArgoCD auto-sync during the changes
kubectl patch application -n argocd gitea-gitea --patch '[{"op": "remove", "path": "/spec/syncPolicy/automated"}]' --type=json
## Scale the Gitea statefulset to zero
kubectl patch statefulset -n gitea gitea --patch '[{"op": "replace", "path": "/spec/replicas", "value": 0}]' --type=json
## Verify that pods are shut down
kubectl get statefulset -n gitea gitea  # Should show READY 0/0
```

For temporarily disabling Keycloak:
```sh
## Disable ArgoCD auto-sync during the changes
kubectl patch application -n argocd keycloak-keycloak-operator --patch '[{"op": "remove", "path": "/spec/syncPolicy/automated"}]' --type=json
## Scale Keycloak instances to zero via the Keycloak custom resource
kubectl patch keycloak -n keycloak keycloak --patch '[{"op": "replace", "path": "/spec/instances", "value": 0}]' --type=json
## Verify that pods are shut down
kubectl get statefulset -n keycloak keycloak  # Should show READY 0/0
```

For temporarily disabling Harbor:
```sh
## Disable ArgoCD auto-sync during the changes
kubectl patch application -n argocd harbor-harbor --patch '[{"op": "remove", "path": "/spec/syncPolicy/automated"}]' --type=json
## Scale the Harbor core deployment to zero
kubectl patch deploy -n harbor harbor-core --patch '[{"op": "replace", "path": "/spec/replicas", "value": 0}]' --type=json
## Verify that pods are shut down
kubectl get deploy -n harbor harbor-core  # Should show READY 0/0
```

### Removing the existing database

After deploying the values changes and shutting down the applications accessing the database, delete the database cluster.

For Gitea:
```sh
## Disable ArgoCD auto-sync during the changes
kubectl patch application -n argocd gitea-gitea-otomi-db --patch '[{"op": "remove", "path": "/spec/syncPolicy/automated"}]' --type=json
## Delete the cluster
kubectl delete cluster -n gitea gitea-db
## Re-enable ArgoCD auto-sync
kubectl patch application -n argocd gitea-gitea-otomi-db --patch '[{"op": "add", "path": "/spec/syncPolicy/automated", "value": {"prune": true, "allowEmpty": true}}]' --type=json
```

For Harbor:
```sh
## Disable ArgoCD auto-sync during the changes
kubectl patch application -n argocd harbor-harbor-otomi-db --patch '[{"op": "remove", "path": "/spec/syncPolicy/automated"}]' --type=json
## Delete the cluster
kubectl delete cluster -n harbor harbor-otomi-db
## Re-enable ArgoCD auto-sync
kubectl patch application -n argocd harbor-harbor-otomi-db --patch '[{"op": "add", "path": "/spec/syncPolicy/automated", "value": {"prune": true, "allowEmpty": true}}]' --type=json
```

For Keycloak:
```sh
## Disable ArgoCD auto-sync during the changes
kubectl patch application -n argocd keycloak-keycloak-otomi-db --patch '[{"op": "remove", "path": "/spec/syncPolicy/automated"}]' --type=json
## Delete the cluster
kubectl delete cluster -n keycloak keycloak-db
## Re-enable ArgoCD auto-sync
kubectl patch application -n argocd keycloak-keycloak-otomi-db --patch '[{"op": "add", "path": "/spec/syncPolicy/automated", "value": {"prune": true, "allowEmpty": true}}]' --type=json
```

The cluster should now be recreated from the backup. Wait until the `Cluster` status shows `Cluster in healthy state` and then restart the dependent services.
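For example, the recovery of the Gitea database can be followed from the terminal (use the namespace and cluster name of the database you are recovering):

```sh
## Watch the CNPG cluster until STATUS reports "Cluster in healthy state"
kubectl get cluster -n gitea gitea-db -w
## Optionally inspect events and conditions for details on the recovery progress
kubectl describe cluster -n gitea gitea-db
```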
### Restarting services

For restoring Gitea processes:
```sh
## Re-enable ArgoCD auto-sync, which should also scale the Gitea statefulset back up
kubectl patch application -n argocd gitea-gitea --patch '[{"op": "add", "path": "/spec/syncPolicy/automated", "value": {"prune": true, "allowEmpty": true}}]' --type=json
## Optional: scale up directly, for not having to wait for the ArgoCD re-sync
kubectl patch statefulset -n gitea gitea --patch '[{"op": "replace", "path": "/spec/replicas", "value": 1}]' --type=json
```

For restoring Keycloak processes:
```sh
## Re-enable ArgoCD auto-sync
kubectl patch application -n argocd keycloak-keycloak-operator-cr --patch '[{"op": "add", "path": "/spec/syncPolicy/automated", "value": {"prune": true, "allowEmpty": true}}]' --type=json
## Optional: scale up directly, for not having to wait for the ArgoCD re-sync
kubectl patch keycloak -n keycloak keycloak --patch '[{"op": "replace", "path": "/spec/instances", "value": 1}]' --type=json
## Required: force a restart of the platform Keycloak operator; ArgoCD re-creates the Deployment
kubectl delete deploy -n apl-keycloak-operator apl-keycloak-operator
```

For restoring Harbor processes:
```sh
## Re-enable ArgoCD auto-sync
kubectl patch application -n argocd harbor-harbor --patch '[{"op": "add", "path": "/spec/syncPolicy/automated", "value": {"prune": true, "allowEmpty": true}}]' --type=json
## Optional: scale up directly, for not having to wait for the ArgoCD re-sync
kubectl patch deploy -n harbor harbor-core --patch '[{"op": "replace", "path": "/spec/replicas", "value": 1}]' --type=json
```

## Obtaining a backup outside the cluster

The following instructions apply, for example, to Gitea in the last step of [reinstalling a platform setup on a new cluster](platform-reinstall.md). If the backup to recover from is not available as a `Backup` resource within the cluster, but is stored in attached object storage, follow the instructions above, except for making the following change to `env/databases/<app>.yaml` in the `values` repository:

Adjust the object storage parameters below as needed, at least replacing the `<bucket-name>` and `<region>` placeholders. Typically `serverName` should remain unchanged. `linode-creds` refers to the account credentials set up by the platform and can be reused provided that they have access to the storage.

env/databases/gitea.yaml:
```yaml
databases:
  gitea:
    # ...
    recovery:
      source: gitea-backup
      database: gitea
      owner: gitea
      externalClusters:
        - name: gitea-backup
          barmanObjectStore:
            serverName: gitea-db
            destinationPath: s3://<bucket-name>/gitea
            endpointURL: https://<region>.linodeobjects.com
            s3Credentials:
              accessKeyId:
                name: linode-creds
                key: S3_STORAGE_ACCOUNT
              secretAccessKey:
                name: linode-creds
                key: S3_STORAGE_KEY
            wal:
              compression: gzip
              maxParallel: 8
            data:
              compression: gzip
```

env/databases/harbor.yaml:
```yaml
databases:
  harbor:
    # ...
    recovery:
      source: harbor-backup
      database: registry
      owner: harbor
      externalClusters:
        - name: harbor-backup
          barmanObjectStore:
            serverName: harbor-otomi-db
            destinationPath: s3://<bucket-name>/harbor
            endpointURL: https://<region>.linodeobjects.com
            s3Credentials:
              accessKeyId:
                name: linode-creds
                key: S3_STORAGE_ACCOUNT
              secretAccessKey:
                name: linode-creds
                key: S3_STORAGE_KEY
            wal:
              compression: gzip
              maxParallel: 8
            data:
              compression: gzip
```

env/databases/keycloak.yaml:
```yaml
databases:
  keycloak:
    # ...
    recovery:
      source: keycloak-backup
      database: keycloak
      owner: keycloak
      externalClusters:
        - name: keycloak-backup
          barmanObjectStore:
            serverName: keycloak-db
            destinationPath: s3://<bucket-name>/keycloak
            endpointURL: https://<region>.linodeobjects.com
            s3Credentials:
              accessKeyId:
                name: linode-creds
                key: S3_STORAGE_ACCOUNT
              secretAccessKey:
                name: linode-creds
                key: S3_STORAGE_KEY
            wal:
              compression: gzip
              maxParallel: 8
            data:
              compression: gzip
```

## Point-in-time recovery

For restoring a backup only up to a specific point in time, add a recovery target to the `recovery` sections above, according to the [CloudNativePG docs](https://cloudnative-pg.io/documentation/current/recovery/#point-in-time-recovery-pitr). For example, to restore Gitea to its state right before an undesired change that was made after 2023-03-06 08:00:39 CET, add the following value:

```yaml
databases:
  gitea:
    # ...
    recovery:
      source: gitea-backup
      database: gitea
      owner: gitea
      recoveryTarget:
        # Time-based target for the recovery
        targetTime: "2023-03-06 08:00:39+01"
      externalClusters:
        # ...
```

You can also use a [named backup resource](#regular-recovery-with-backup-in-same-cluster). However, the backup must be from **before** the timestamp you choose as a recovery target, considering that backups are named with a timestamp in universal time (UTC).

Note that the timestamp above is not exactly in the common ISO 8601 format such as `2023-03-06T08:00:39Z`. Instead, date and time must be separated by a space, and the timezone should be written out explicitly, such as `+00` (for UTC) or `+01` (for CET without DST). For all valid formats, refer to the [specific section of the PostgreSQL documentation](https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-DATETIME-INPUT-TIME-STAMPS).

## Emergency backup and restore

The methods using the built-in PostgreSQL tools `pg_dump` and `pg_restore` should be used if the operator is not available. This type of backup can also be used as an additional safety measure before using any of the aforementioned methods. Be aware that the backups are stored on the computer where the commands are executed. This requires a stable connection to the database pods during the time of the backup and recovery.

1. Scale the application that is using the database cluster to zero ([see above](#shutting-down-services)).
2. Perform the backup or the restore as needed (following commands).
3. Restore the application processes ([see above](#restarting-services)).

Note that in contrast to the commands documented on the [CNPG site](https://cloudnative-pg.io/documentation/current/troubleshooting/#emergency-backup), the following `pg_restore` commands include the `--clean` flag, which clears tables before the import. Otherwise the import will likely fail, as the database is usually not empty after the application has initialized it on startup. Nevertheless, **use this flag with care**!

In the following steps, the `-n` suffix of each pod name (e.g. `gitea-db-n`) needs to be replaced with the number of the primary pod instance (e.g. `kubectl exec -n gitea gitea-db-1 ...`).
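If you prefer to read the primary instance name directly from the `Cluster` status instead of the table output, a minimal sketch for the Gitea database looks like this (it assumes the `status.currentPrimary` field exposed by recent CloudNativePG versions):

```sh
## Read the current primary instance from the Cluster status
## (the currentPrimary status field is assumed to be available in this CNPG version)
PRIMARY=$(kubectl get cluster -n gitea gitea-db -o jsonpath='{.status.currentPrimary}')
echo "$PRIMARY"  # e.g. gitea-db-1
```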
### Gitea database

Determine the primary instance:
```sh
kubectl get cluster -n gitea gitea-db
```

Backup:
```sh
kubectl exec -n gitea gitea-db-n -c postgres \
  -- pg_dump -Fc -d gitea > gitea.dump
```

Restore:
```sh
kubectl exec -i -n gitea gitea-db-n -c postgres \
  -- pg_restore --no-owner --role=gitea -d gitea --verbose --clean < gitea.dump
```

### Keycloak database

Determine the primary instance:
```sh
kubectl get cluster -n keycloak keycloak-db
```

Backup:
```sh
kubectl exec -n keycloak keycloak-db-n -c postgres \
  -- pg_dump -Fc -d keycloak > keycloak.dump
```

Restore:
```sh
kubectl exec -i -n keycloak keycloak-db-n -c postgres \
  -- pg_restore --no-owner --role=keycloak -d keycloak --verbose --clean < keycloak.dump
```

### Harbor database

Determine the primary instance:
```sh
kubectl get cluster -n harbor harbor-otomi-db
```

Backup:
```sh
kubectl exec -n harbor harbor-otomi-db-n -c postgres \
  -- pg_dump -Fc -d registry > harbor.dump
```

Restore:
```sh
kubectl exec -i -n harbor harbor-otomi-db-n -c postgres \
  -- pg_restore --no-owner --role=harbor -d registry --verbose --clean < harbor.dump
```
diff --git a/docs/for-ops/disaster-recovery/platform-reinstall.md b/docs/for-ops/disaster-recovery/platform-reinstall.md
new file mode 100644
index 000000000..7dfb29725
--- /dev/null
+++ b/docs/for-ops/disaster-recovery/platform-reinstall.md
@@ -0,0 +1,199 @@
---
slug: reinstall
title: Reinstalling the platform
sidebar_label: Reinstall
---

## Prerequisites

- All the required backups exist in object storage, and the storage has not been corrupted.

- You should have downloaded a (non-redacted) values file using Platform -> Maintenance.

- You should have your own domain name for the cluster. The old and the reinstalled cluster may use the same domain, but in that case the old cluster should no longer be running. To avoid issues with DNS caching, it is advised to use a new (sub-)domain.

- Re-installing on an LKE cluster that was created with App Platform for LKE enabled is currently not supported. The platform can only be installed in a new LKE cluster without the pre-installed platform, using your own domain.

- Gitea cannot be restored directly onto a new installation of App Platform. The data and a database backup can, however, be restored after the installation with an initial database.

## Steps

The following steps are described in more detail:

1. Prepare a new cluster.

2. Prepare the values for reinstallation.

3. Make sure to stop any write operations to object storage.

4. Reinstall the platform on the new cluster.

5. Restore the Gitea database and repositories.

## Provision a new cluster

Provision a new cluster suitable for running the configuration. Do not install the platform directly when creating an LKE cluster.
Example:

```sh
# Update variables as needed
CLUSTER_LABEL=new-cluster
CLUSTER_REGION=nl-ams

# Create the new cluster
linode-cli lke cluster-create \
  --label "$CLUSTER_LABEL" \
  --region "$CLUSTER_REGION" \
  --k8s_version 1.31 \
  --control_plane.high_availability true \
  --node_pools.type g6-dedicated-8 \
  --node_pools.count 3
# Retrieve the kubeconfig
linode-cli get-kubeconfig --label "$CLUSTER_LABEL"
# Set cluster id variable
CLUSTER_ID=$(linode-cli lke clusters-list --label "$CLUSTER_LABEL" --json | jq ".[0].id")
# Set new default context
kubectl config use-context lke$CLUSTER_ID-ctx
```

## Values file adjustments

Make a copy of the downloaded values file and adjust:

- `cluster.domainSuffix`
- `dns.domainFilters`
- The [DNS configuration](../../get-started/installation/dns.md), which must be updated if the previous platform was provisioned directly through the Linode API
- `cluster.name` (preferably to the label of the new cluster from the previous step)
- Other credentials (e.g. access tokens) that will change
- Domains of any services that are changed

First relocate the new cluster's backups, provided they are using the same object storage (buckets), by updating the `pathSuffix`. The backups can be activated, except for Gitea, which should only be activated after the recovery (in the last step).

```yaml
# ...
platformBackups:
  database:
    gitea:
      # To be activated later
      enabled: false
      retentionPolicy: 7d
      schedule: 0 0 * * *
      pathSuffix: gitea-1
    harbor:
      enabled: true
      retentionPolicy: 7d
      schedule: 0 0 * * *
      pathSuffix: harbor-1
    keycloak:
      enabled: true
      retentionPolicy: 7d
      schedule: 0 0 * * *
      pathSuffix: keycloak-1
```

Then prepare the databases to be initialized with the backup data from the attached object storage.
In the following examples, replace the `<bucket-name>` and `<region>` placeholders. If the source platform itself has been recovered from a backup before, also update the last portion of the `destinationPath`, e.g. to `harbor-1`. In that case, change the aforementioned `pathSuffix` to a different value, e.g. `harbor-2`.

In the section `databases.harbor`:

```yaml
# ...
databases:
  # ...
  harbor:
    # ...
    recovery:
      source: harbor-backup
      database: registry
      owner: harbor
      externalClusters:
        - name: harbor-backup
          barmanObjectStore:
            serverName: harbor-otomi-db
            destinationPath: s3://<bucket-name>/harbor
            endpointURL: https://<region>.linodeobjects.com
            s3Credentials:
              accessKeyId:
                name: linode-creds
                key: S3_STORAGE_ACCOUNT
              secretAccessKey:
                name: linode-creds
                key: S3_STORAGE_KEY
            wal:
              compression: gzip
              maxParallel: 8
            data:
              compression: gzip
```

In `databases.keycloak`:

```yaml
# ...
databases:
  # ...
  keycloak:
    # ...
    recovery:
      source: keycloak-backup
      database: keycloak
      owner: keycloak
      externalClusters:
        - name: keycloak-backup
          barmanObjectStore:
            serverName: keycloak-db
            destinationPath: s3://<bucket-name>/keycloak
            endpointURL: https://<region>.linodeobjects.com
            s3Credentials:
              accessKeyId:
                name: linode-creds
                key: S3_STORAGE_ACCOUNT
              secretAccessKey:
                name: linode-creds
                key: S3_STORAGE_KEY
            wal:
              compression: gzip
              maxParallel: 8
            data:
              compression: gzip
```

## Stopping write operations

If the old cluster is still running, make sure to halt any write operations to the object storage that the new one will be using for recovery.
While this is very specific to your applications, for the platform it means the following:

Where applicable, on the Console of **the old cluster**:
* Go to Settings -> Backup, and disable all options. Also clear the `linodeApiToken`.
* Go to Settings -> Object storage, and set the provider to `Disabled`.

Note that apps strictly dependent on object storage (e.g. Harbor) will become unavailable on the old cluster.

## Re-installing the platform

Follow the basic [installation instructions](../../get-started/installation/helm.md) to set up the Helm repository. Then install the platform using the edited values file:

```sh
helm install -f updated-values.yaml apl apl/apl
```

Note that due to some race conditions during the Helm execution, some recoverable errors may occur during the installation process. Currently known issues are:

- The database recovery pods (suffixed with `-full-recovery`) may fail to start, reporting a missing secret. This can usually be fixed by deleting the pod. It will be recreated automatically.
- The Istio operator can sometimes take a long time to start.

When the installation has completed, you should be able to log in to the Console using the credentials known from the previous platform.

## Restoring Gitea repositories

Once the platform is installed, Gitea can also be restored to the state preserved in the backups. For restoring the database, refer to the [instructions on platform databases](platform-databases.md).

1. [Adjust the values file in the repository](platform-databases.md#regular-recovery-with-backup-in-same-cluster), taking into account that the [cluster is restored from a remote backup](platform-databases.md#obtaining-a-backup-outside-the-cluster).

2. [Shut down Gitea](platform-databases.md#shutting-down-services).

3. Delete the Gitea database.

4. [Start Gitea](platform-databases.md#restarting-services).

5. Activate backups for Gitea in the platform Settings -> Backups.

The contents of the code repositories can be retrieved following the [Gitea-specific steps](gitea.md).
diff --git a/sidebar-docs.js b/sidebar-docs.js
index b68a68f59..e977ccd61 100644
--- a/sidebar-docs.js
+++ b/sidebar-docs.js
@@ -1,8 +1,6 @@
 module.exports = {
   mainSidebar: {
-    "Akamai App Platform": [
-      "akamai-app-platform/introduction",
-    ],
+    "Akamai App Platform": ["akamai-app-platform/introduction"],
     "Getting Started": [
       "get-started/overview",
       {
@@ -68,7 +66,6 @@
           "for-devs/console/settings",
           "for-devs/console/sealed-secrets",
           "for-devs/console/shell",
-
         ],
       },
     ],
@@ -118,7 +115,15 @@
       ],
     },
     {
-      "CLI": [
+      "Disaster recovery": [
+        "for-ops/disaster-recovery/overview",
+        "for-ops/disaster-recovery/gitea",
+        "for-ops/disaster-recovery/platform-databases",
+        "for-ops/disaster-recovery/platform-reinstall",
+      ],
+    },
+    {
+      CLI: [
         "for-ops/cli/installation",
         "for-ops/cli/deploying",
         "for-ops/cli/known-issues",