diff --git a/.template-infra/app-analytics.yml b/.template-infra/app-analytics.yml index 2465d7fe77..3e2b058539 100644 --- a/.template-infra/app-analytics.yml +++ b/.template-infra/app-analytics.yml @@ -1,5 +1,5 @@ # Changes here will be overwritten by Copier -_commit: v0.15.7 +_commit: v0.17.0 _src_path: https://github.com/navapbc/template-infra app_name: analytics template: app diff --git a/.template-infra/app-api.yml b/.template-infra/app-api.yml index bf03dc0435..9401be7e56 100644 --- a/.template-infra/app-api.yml +++ b/.template-infra/app-api.yml @@ -1,5 +1,5 @@ # Changes here will be overwritten by Copier -_commit: v0.15.7 +_commit: v0.17.0 _src_path: https://github.com/navapbc/template-infra app_name: api template: app diff --git a/.template-infra/app-frontend.yml b/.template-infra/app-frontend.yml index 7b44852f61..bc3d8893a9 100644 --- a/.template-infra/app-frontend.yml +++ b/.template-infra/app-frontend.yml @@ -1,5 +1,5 @@ # Changes here will be overwritten by Copier -_commit: v0.15.7 +_commit: v0.17.0 _src_path: https://github.com/navapbc/template-infra app_name: frontend template: app diff --git a/.template-infra/app-nofos.yml b/.template-infra/app-nofos.yml index 8bb14391bb..6eab8539cc 100644 --- a/.template-infra/app-nofos.yml +++ b/.template-infra/app-nofos.yml @@ -1,5 +1,5 @@ # Changes here will be overwritten by Copier -_commit: v0.15.7 +_commit: v0.17.0 _src_path: https://github.com/navapbc/template-infra app_name: nofos template: app diff --git a/.template-infra/base.yml b/.template-infra/base.yml index 571e0f616e..2d8a528f56 100644 --- a/.template-infra/base.yml +++ b/.template-infra/base.yml @@ -1,5 +1,5 @@ # Changes here will be overwritten by Copier -_commit: v0.15.7 +_commit: v0.17.0 _src_path: https://github.com/navapbc/template-infra base_code_repository_url: https://github.com/navapbc/simpler-grants-gov base_default_region: us-east-2 diff --git a/docs/infra/deletion-protection-and-temporary-environments.md 
b/docs/infra/deletion-protection-and-temporary-environments.md new file mode 100644 index 0000000000..e9af46318d --- /dev/null +++ b/docs/infra/deletion-protection-and-temporary-environments.md @@ -0,0 +1,91 @@ +# Deletion protection and temporary environments + +Resources like databases, load balancers, and S3 buckets have deletion protection enabled to prevent accidental data loss in permanent environments. However, temporary environments — such as [pull request environments](./pull-request-environments.md) and CI test environments — need to be destroyed automatically when they're no longer needed. This document explains the convention used to conditionally disable deletion protection in temporary environments so that cleanup workflows can destroy resources without manual intervention. + +## How temporary environments are identified + +Temporary environments are identified using [Terraform workspaces](./develop-and-test-infrastructure-in-isolation-using-workspaces.md). Each Terraform root module defines a local variable `is_temporary` based on whether the current workspace is the `default` workspace: + +```hcl +# infra/<APP_NAME>/service/main.tf + +locals { + # All non-default terraform workspaces are considered temporary. + # Temporary environments do not have deletion protection enabled. + # Examples: pull request preview environments are temporary. + is_temporary = terraform.workspace != "default" +} +``` + +This local is then passed to child modules as a variable: + +```hcl +module "service" { + ... + is_temporary = local.is_temporary +} +``` + +The `default` workspace is used for permanent environments (e.g. dev, staging, prod). Any other workspace — whether created by a PR environment workflow, a CI pipeline, or a developer testing in isolation — is considered temporary. 
+ +## The `is_temporary` pattern for gating deletion protection + +Each module that manages a deletion-protected resource accepts an `is_temporary` variable and uses it to conditionally disable deletion protection. The variable defaults to `false` so that resources are protected unless explicitly marked as temporary. + +Each module defines the variable the same way: + +```hcl +variable "is_temporary" { + description = "Whether the service is meant to be spun up temporarily (e.g. for automated infra tests). This is used to disable deletion protection." + type = bool + default = false +} +``` + +The expression used depends on the resource's deletion protection attribute. For boolean attributes, negate `is_temporary`: + +```hcl +# Boolean deletion protection (e.g. ALB, RDS) +enable_deletion_protection = !var.is_temporary + +# S3 force destroy +force_destroy = var.is_temporary +``` + +Some resources use non-boolean values. For example, Cognito user pools use string values: + +```hcl +deletion_protection = var.is_temporary ? "INACTIVE" : "ACTIVE" +``` + +Always keep the attribute on its own line with the standard comment so that the pattern is easy to find with grep: + +```hcl +# Use a separate line to support automated terraform destroy commands +force_destroy = var.is_temporary +``` + +Search for `is_temporary` across the codebase to see all resources currently using this pattern. + +## What happens if you don't gate deletion protection + +If a deletion-protected resource does not use the `is_temporary` pattern: + +- **Cleanup workflows fail.** The PR environment destroy workflow and CI cleanup jobs run `terraform destroy` in non-default workspaces. If a resource has deletion protection unconditionally enabled, the destroy will fail. +- **Orphaned resources accumulate.** Failed destroys leave resources running in AWS, accruing costs. +- **Manual intervention is required.** Someone has to manually disable deletion protection and delete the orphaned resources. 
+- **CI pipelines break.** Subsequent CI runs may fail due to naming conflicts with orphaned resources. + +## Cleanup mechanisms that depend on this pattern + +Automated cleanup workflows rely on `is_temporary` being properly gated: + +- **PR environment destroy workflows** — When a pull request is merged or closed, [pr-environment-destroy.yml](/.github/workflows/pr-environment-destroy.yml) runs `terraform destroy` in the PR's workspace and then deletes the workspace. See [Pull request environments](./pull-request-environments.md) for details. +- **Developer workspace cleanup** — Developers working in [isolated workspaces](./develop-and-test-infrastructure-in-isolation-using-workspaces.md) run `terraform destroy` to clean up after merging their changes. +- **CI infrastructure checks** — The [ci-infra.yml](/.github/workflows/ci-infra.yml) workflow creates temporary workspaces for infrastructure validation and destroys them after checks complete. + +## See also + +- [Pull request environments](./pull-request-environments.md) +- [Develop and test infrastructure in isolation using workspaces](./develop-and-test-infrastructure-in-isolation-using-workspaces.md) +- [Destroy infrastructure](./destroy-infrastructure.md) diff --git a/docs/infra/temporary-environments-and-out-of-band-resources.md b/docs/infra/temporary-environments-and-out-of-band-resources.md new file mode 100644 index 0000000000..4103117ccc --- /dev/null +++ b/docs/infra/temporary-environments-and-out-of-band-resources.md @@ -0,0 +1,95 @@ +# Temporary Environments and Out-of-Band Resources + +This document explains how temporary environments handle resources that require **out-of-band processes** — that is, resources that can't be created or destroyed purely through Terraform. + +## Background + +Temporary environments are short-lived infrastructure instances used for testing and review. 
Template-infra supports three types: + +- **PR environments** — created automatically when a pull request is opened and destroyed when it's merged or closed. See [Pull Request Environments](./pull-request-environments.md). +- **Workspace-based environments** — created manually using Terraform workspaces for isolated development and testing. See [Develop and Test Infrastructure in Isolation Using Workspaces](./develop-and-test-infrastructure-in-isolation-using-workspaces.md). +- **CI end-to-end environments** — created and torn down by [`template-only-ci-infra.yml`](/.github/workflows/template-only-ci-infra.yml) to test creating a new project from scratch, deploying infrastructure (tests live in `infra/test/`), and tearing it all down. + +All three types need to handle the fact that some resources can't simply be `terraform apply`'d into existence and `terraform destroy`'d away. + +## What Are Out-of-Band Resources? + +An "out-of-band resource" is any resource whose lifecycle involves steps outside of Terraform. Common reasons a resource might require out-of-band processes include: + +- **External coordination** — the resource requires manual configuration in an external system (e.g., adding DNS NS records at a domain registrar) +- **Long provisioning times** — creating the resource takes so long that it's impractical for short-lived environments (e.g., database provisioning can take 20–40 minutes) +- **Shared state dependencies** — the resource holds state that is valuable to reuse across environments rather than recreate (e.g., a database with seed data, or an identity provider with test user accounts) +- **Global uniqueness constraints** — the resource has naming or configuration constraints that make it difficult to create multiple instances (e.g., certain DNS configurations) + +## How Temporary Environments Handle Out-of-Band Resources + +Temporary environments use two strategies for handling out-of-band resources: **sharing** and **exclusion**. 
+ +### Strategy 1: Sharing Resources + +Some out-of-band resources are shared across temporary environments rather than being provisioned independently for each one. This sharing can take two forms: + +- **Cross-layer sharing** — temporary environments in one infrastructure layer (also called a root module — see [Module Architecture](./module-architecture.md)) share resources managed by a different layer (e.g., the service layer's temporary environments share the database provisioned by the database layer) +- **Same-layer sharing** — temporary environments using non-default Terraform workspaces share resources with the default workspace (e.g., a PR environment's service workspace shares resources that only exist in the default workspace) + +**Resources that use this strategy:** + +| Resource | Why It's Shared | Implications | +| --- | --- | --- | +| Database | Provisioning a new database takes 20–40 minutes, which is impractical for short-lived environments | Cross-layer sharing: service layer temporary environments use the database from the database layer. Database migrations from the PR branch are **not** automatically run. Schema changes should be isolated into separate PRs and merged before application changes that depend on them. | +| Identity provider (Cognito) | Cognito user pools contain test user accounts and configuration that would need to be recreated for each environment | Same-layer sharing: temporary environments in non-default workspaces share the Cognito user pool from the default workspace. Changes to identity provider configuration in a PR will not be reflected until merged. | + +**What this means for developers:** If your change introduces a new resource that requires out-of-band processes, and that resource holds shared state (like user data or application data), consider whether temporary environments should share the resource. If so, you'll need to: + +1. 
Ensure the Terraform configuration supports pointing temporary environments at the shared instance of the resource (whether cross-layer or same-layer) + +### Strategy 2: Exclusion from Temporary Environments + +Some out-of-band resources are simply not created in temporary environments. The Terraform configuration detects when it's running in a non-default workspace and skips these resources entirely. + +**Resources that use this strategy:** + +| Resource | Why It's Excluded | Implications | +| --- | --- | --- | +| DNS records / custom domains | Requires manual NS record configuration at the domain registrar, and temporary environments don't need custom domain names | Temporary environments are accessed via their default AWS-provided URLs rather than custom domains | +| Resources with external approval workflows | Some resources may require manual approval or configuration steps that can't be automated | The functionality provided by these resources can't be tested in temporary environments | + +**What this means for developers:** If your change introduces a new resource that requires out-of-band setup and doesn't hold shared state, consider whether the resource should simply be excluded from temporary environments. If so, you'll need to: + +1. Add conditional logic to the Terraform configuration to skip the resource in non-default workspaces +2. Ensure the application can function without the resource (graceful degradation) + +## Decision Framework + +When introducing a new resource that requires out-of-band processes, use this framework to decide how temporary environments should handle it: + +```mermaid +flowchart TD + A{"Does the resource hold state that is\nvaluable to share?\ne.g. 
user data, test data"} + B["Share across temporary environments\n• Configure Terraform to reference\n the shared instance\n (cross-layer or same-layer)"] + C{"Can the application function\nwithout this resource?"} + D["Exclude from temporary environments\n• Add workspace-conditional logic\n• Ensure graceful degradation"] + E["Design problem — solve in the tech spec\n• Consider redesigning to avoid\n out-of-band processes\n• Consider a lightweight substitute\n for temporary environments\n• Discuss with the team before proceeding"] + + A -->|Yes| B + A -->|No| C + C -->|Yes| D + C -->|No| E +``` + +## Cleanup Considerations + +Out-of-band resources create cleanup challenges for temporary environments: + +- **Shared resources** don't need cleanup (they're managed elsewhere), but be aware that temporary environments may leave artifacts in the shared resource (e.g., database rows, identity provider sessions) that aren't cleaned up automatically +- **Excluded resources** don't need cleanup (they were never created), but if a resource is partially provisioned before an out-of-band step is needed, the Terraform state may contain references to incomplete resources + +For more on cleanup of temporary environment resources, see [Pull Request Environments — Cleanup](./pull-request-environments.md) and the CI cleanup workflows. 
+ +## Related Documentation + +- [Pull Request Environments](./pull-request-environments.md) — how PR environments are created and destroyed +- [Develop and Test Infrastructure in Isolation Using Workspaces](./develop-and-test-infrastructure-in-isolation-using-workspaces.md) — how workspace-based temporary environments work +- [Custom Domains](./custom-domains.md) — an example of a resource requiring out-of-band DNS configuration +- [Identity Provider](./identity-provider.md) — Cognito setup and configuration +- [Set Up Database](./set-up-database.md) — database provisioning diff --git a/infra/modules/storage/access_control.tf b/infra/modules/storage/access_control.tf index ee34e6ac40..3861dafd88 100644 --- a/infra/modules/storage/access_control.tf +++ b/infra/modules/storage/access_control.tf @@ -57,6 +57,7 @@ data "aws_iam_policy_document" "storage_access" { "arn:aws:s3:::${var.name}/*" ] } + statement { actions = ["kms:GenerateDataKey", "kms:Decrypt"] effect = "Allow" diff --git a/infra/modules/storage/encryption.tf b/infra/modules/storage/encryption.tf index 30a92e7359..2a2abd14db 100644 --- a/infra/modules/storage/encryption.tf +++ b/infra/modules/storage/encryption.tf @@ -1,3 +1,53 @@ +data "aws_caller_identity" "current" {} +data "aws_region" "current" {} + +data "aws_iam_policy_document" "kms_key_policy" { + # checkov:skip=CKV_AWS_109:Root account requires full KMS permissions to enable IAM-based access control + # checkov:skip=CKV_AWS_111:Root account requires full KMS permissions to enable IAM-based access control + # checkov:skip=CKV_AWS_356:In a key policy, the wildcard character in the Resource element represents the KMS key to which the key policy is attached. + + # This gives the AWS account that owns the KMS key full access to the KMS key, + # deferring specific access rules to IAM roles. 
+ # + # This is the default key policy for programmatically generated KMS keys in + # general, see: https://docs.aws.amazon.com/kms/latest/developerguide/key-policy-default.html#key-policy-default-allow-root-enable-iam + statement { + sid = "Enable IAM User Permissions" + effect = "Allow" + principals { + type = "AWS" + identifiers = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"] + } + actions = ["kms:*"] + resources = ["*"] + } + + # Optional: Allow the given AWS service principals to use the key via S3, for situations + # where the service can't use more explicitly configured IAM roles + dynamic "statement" { + for_each = length(var.service_principals_with_access) > 0 ? [1] : [] + content { + sid = "AllowViaS3Service" + effect = "Allow" + principals { + type = "Service" + identifiers = var.service_principals_with_access + } + actions = [ + "kms:Decrypt", + "kms:GenerateDataKey", + "kms:DescribeKey" + ] + resources = ["*"] + condition { + test = "StringEquals" + variable = "kms:ViaService" + values = ["s3.${data.aws_region.current.name}.amazonaws.com"] + } + } + } +} + resource "aws_kms_key" "storage" { description = "KMS key for bucket ${var.name}" # The waiting period, specified in number of days. After the waiting period ends, AWS KMS deletes the KMS key. @@ -5,7 +55,7 @@ resource "aws_kms_key" "storage" { # Generates new cryptographic material every 365 days, this is used to encrypt your data. The KMS key retains the old material for decryption purposes. 
enable_key_rotation = "true" - # checkov:skip=CKV2_AWS_64:We're encrpying with KMS, I don't feel the need to specify a KMS policy as well + policy = data.aws_iam_policy_document.kms_key_policy.json } resource "aws_s3_bucket_server_side_encryption_configuration" "storage" { diff --git a/infra/modules/storage/providers.tf b/infra/modules/storage/providers.tf new file mode 100644 index 0000000000..f2702bf6e8 --- /dev/null +++ b/infra/modules/storage/providers.tf @@ -0,0 +1,7 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } +} diff --git a/infra/modules/storage/tests/storage.tftest.hcl b/infra/modules/storage/tests/storage.tftest.hcl index 0a1e556476..33c7da3883 100644 --- a/infra/modules/storage/tests/storage.tftest.hcl +++ b/infra/modules/storage/tests/storage.tftest.hcl @@ -1,4 +1,14 @@ -mock_provider "aws" {} +mock_provider "aws" { + # aws_iam_policy_document.kms_key_policy is used as the policy for + # aws_kms_key.storage, which validates the value as JSON. The default + # mock_provider value is not valid JSON, so provide a valid stub. + override_data { + target = data.aws_iam_policy_document.kms_key_policy + values = { + json = "{}" + } + } +} variables { name = "my-test-bucket-12345" diff --git a/infra/modules/storage/variables.tf b/infra/modules/storage/variables.tf index 61358ba527..7b08b4aea7 100644 --- a/infra/modules/storage/variables.tf +++ b/infra/modules/storage/variables.tf @@ -8,3 +8,17 @@ variable "name" { type = string description = "Name of the AWS S3 bucket. Needs to be globally unique across all regions." } + +variable "service_principals_with_access" { + description = <<-EOT + Storage access should generally be controlled via attaching the `access_policy_arn` + output to an IAM role, but there are some situations where that may not be possible. + Generally when an AWS service doesn't have a way to assume a particular IAM role for operations. 
+ + This list of AWS service principals (e.g., bedrock.amazonaws.com) will be used to configure + resources (e.g., the bucket's KMS key) such that the services will be able + to access the bucket's objects. + EOT + type = list(string) + default = [] +} diff --git a/infra/project-config/aws_services.tf b/infra/project-config/aws_services.tf index 8109271ade..a277db7ee5 100644 --- a/infra/project-config/aws_services.tf +++ b/infra/project-config/aws_services.tf @@ -15,8 +15,14 @@ locals { // AWS Backup – Centralized service for automating backups across AWS resources. "backup", + // AWS Bedrock – Provides access to large language models and generative AI services. + "bedrock", + "cloudfront", + // AWS CloudFormation – Infrastructure as Code (IaC) service for provisioning and managing AWS resources. + "cloudformation", + // Amazon CloudWatch – Monitors and logs AWS resources and applications. "cloudwatch",