fix(AWS): Replace acl public-read (#93)
feat(infra): push to Supabase

BREAKING CHANGES: Pushes the harvested DWD data to Supabase Storage instead of S3. Also includes unrelated changes to the deprecated AWS/Terraform setup.
ff6347 authored Jul 12, 2023
1 parent 1ad1673 commit 1da9e94
Showing 15 changed files with 1,027 additions and 68 deletions.
10 changes: 10 additions & 0 deletions .all-contributorsrc
@@ -77,6 +77,16 @@
"code",
"bug"
]
},
{
"login": "Jaszkowic",
"name": "Jonas Jaszkowic",
"avatar_url": "https://avatars.githubusercontent.com/u/10830180?v=4",
"profile": "https://github.com/Jaszkowic",
"contributions": [
"code",
"infra"
]
}
],
"contributorsPerLine": 7
10 changes: 5 additions & 5 deletions .github/workflows/test-harvest.yml
@@ -17,9 +17,8 @@ env:
PG_USER: postgres
PG_PASS: postgres
PG_DB: postgres
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_TEST }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_TEST }}
S3_BUCKET: ${{ secrets.AWS_S3_BUCKET_TEST }}
SUPABASE_URL: http://localhost:54321
SUPABASE_BUCKET_NAME: data_assets
OUTPUT: "True"
MAPBOXUSERNAME: "123"
MAPBOXTOKEN: "456"
@@ -58,9 +57,10 @@ jobs:
- name: build the harvester
run: cd harvester && docker build --tag technologiestiftung/giessdenkiez-de-dwd-harvester:test .
- name: Start the api
run: cd api && supabase start && cd ..
id: api-start
run: cd api && supabase start | grep -w "service_role key" | cut -d ":" -f 2 | xargs | tr -d '\n' | awk '{print "service_role_key="$1}' >> "$GITHUB_OUTPUT" && cd ..
- name: run the harvester
run: docker run --env PG_SERVER='0.0.0.0' --env PG_DB --env PG_PORT --env PG_USER --env PG_PASS --env AWS_SECRET_ACCESS_KEY --env AWS_ACCESS_KEY_ID --env S3_BUCKET --env MAPBOXTOKEN --env MAPBOXUSERNAME --env LOGGING --env OUTPUT --network host technologiestiftung/giessdenkiez-de-dwd-harvester:test
run: docker run --env PG_SERVER='0.0.0.0' --env PG_DB --env PG_PORT --env PG_USER --env PG_PASS --env SUPABASE_URL --env SUPABASE_ACCESS_TOKEN='${{ steps.api-start.outputs.service_role_key }}' --env SUPABASE_BUCKET_NAME --env MAPBOXTOKEN --env MAPBOXUSERNAME --env LOGGING --env OUTPUT --network host technologiestiftung/giessdenkiez-de-dwd-harvester:test
- name: stop the api
run: cd api && supabase stop && cd ..
release:
1 change: 1 addition & 0 deletions .gitignore
@@ -197,6 +197,7 @@ override.tf.json
# example: *tfplan*
terraform/rds/terraform.tfvars
terraform/ecs-harvester/terraform.tfvars
terraform/s3-bucket/terraform.tfvars
terraform/.vscode/settings.json
# Logs
logs
13 changes: 10 additions & 3 deletions README.md
@@ -5,9 +5,9 @@

# giessdenkiez-de-dwd-harvester

- Gather precipitation data from DWD's radolan data set, for the region of Berlin and connect to the giessdenkiez.de postgres DB (AWS RDS)
- Gathers precipitation data from DWD's radolan data set for the region of Berlin and connects to the giessdenkiez.de postgres DB
- Uploads trees combined with weather data to Mapbox and uses its API to create vector tiles for use on mobile devices
- Generates CSV and GeoJSON files that contain trees locations and weather data (grid) and uploads them to a AWS S3
- Generates CSV and GeoJSON files that contain tree locations and weather data (grid) and uploads them to a Supabase Storage bucket

## Pre-Install

@@ -49,15 +49,22 @@ Copy the `sample.env` file and rename to `.env` then update the parameters, most

## Running

### Preparing the Buffer Shape
`harvester/prepare.py` shows how `assets/buffer.shp` was created. If a bigger buffer is needed, change `line 10` accordingly and re-run.
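
A minimal sketch of how such a buffer could be produced (an illustration only, not the actual `prepare.py`; the boundary file name, the CRS and the 2 km distance are assumptions):

```python
# Sketch: derive assets/buffer.shp from a city boundary, assuming GeoPandas is installed.
import geopandas as gpd

# Assumption: a polygon of the target city in a projected (metric) CRS,
# e.g. EPSG:25833 for Berlin, so the buffer distance is in metres.
boundary = gpd.read_file("city-boundary.shp").to_crs(epsg=25833)

# Grow the outline (here by 2 km) so rain-grid cells on the edge are kept.
buffered = boundary.buffer(2000)

gpd.GeoDataFrame(geometry=buffered).to_file("assets/buffer.shp")
```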

### Creating the Grid Structure
`harvester/grid/grid.py` can be used to populate the radolan_geometry table. This table contains vector data for the target city. The data is needed by the harvest process to find the rain data for the target city area.

This tool currently works for Berlin. To use it for another city, replace the `harvester/grid/buffer.shp` file with a suitable shape (which can be generated by `harvester/prepare.py`, for example; see above).
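
A quick way to check that the table was filled is a plain count query, sketched here with the same connection settings the harvester uses (only the table name `radolan_geometry` is taken from above; everything else comes from `.env`):

```python
# Sketch: verify that grid.py populated the radolan_geometry table.
import os

import psycopg2
from dotenv import load_dotenv

load_dotenv()
conn = psycopg2.connect(
    host=os.getenv("PG_SERVER", "localhost"),
    port=os.getenv("PG_PORT", "5432"),
    dbname=os.getenv("PG_DB"),
    user=os.getenv("PG_USER"),
    password=os.getenv("PG_PASS"),
)
with conn, conn.cursor() as cur:
    cur.execute("SELECT count(*) FROM radolan_geometry")
    print("grid cells:", cur.fetchone()[0])
conn.close()
```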

### Running the Harvest Process
`harvester/harvester.py` is the actual file for harvesting the data. Simply run it; there are no command line parameters, all settings are taken from `.env`.

The code in `harvester/harvester.py` tries to clean up after itself. Still, when running it in a container, since the script is completely standalone, it is probably best to destroy the container and start from scratch the next time.

## Docker

To have a local database for testing you need Docker and docker-compose installed. You will also have to create a public S3 Bucket. You also need to update the `.env` file with the values from `sample.env` below the line `# for your docker environment`.
To have a local database for testing you need Docker and docker-compose installed. You will also have to create a public Supabase Storage bucket and update the `.env` file with the values from `sample.env` below the line `# for your docker environment`.
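
A sketch of how the public bucket could be created against the Supabase Storage REST API (the bucket name `data_assets` mirrors the test workflow; sending the `service_role` key as bearer token is an assumption for a local setup):

```python
# Sketch: create a public Supabase Storage bucket for the harvester output.
import os

import requests

SUPABASE_URL = os.getenv("SUPABASE_URL", "http://localhost:54321")
SUPABASE_ACCESS_TOKEN = os.environ["SUPABASE_ACCESS_TOKEN"]  # e.g. the local service_role key

response = requests.post(
    f"{SUPABASE_URL}/storage/v1/bucket",
    json={"id": "data_assets", "name": "data_assets", "public": True},
    headers={"Authorization": f"Bearer {SUPABASE_ACCESS_TOKEN}"},
)
response.raise_for_status()
print(response.json())
```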

To start only the database, run

18 changes: 9 additions & 9 deletions action.yml
@@ -17,14 +17,14 @@ inputs:
PG_DB:
required: true
description: "The name of the PostgreSQL database"
AWS_ACCESS_KEY_ID:
SUPABASE_URL:
required: true
description: ""
AWS_SECRET_ACCESS_KEY:
description: "The base URL to Supabase, e.g. http://localhost:54321 for a local Supabase"
SUPABASE_ACCESS_TOKEN:
required: true
description: ""
S3_BUCKET:
description: ""
description: "The access token for accessing the Supabase bucket"
SUPABASE_BUCKET_NAME:
description: "The bucket name where the harvested data should be pushed to"
required: true
OUTPUT:
description: ""
@@ -55,9 +55,9 @@ runs:
PG_USER: ${{ inputs.PG_USER }}
PG_PASS: ${{ inputs.PG_PASS }}
PG_DB: ${{ inputs.PG_DB }}
AWS_ACCESS_KEY_ID: ${{ inputs.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.AWS_SECRET_ACCESS_KEY }}
S3_BUCKET: ${{ inputs.S3_BUCKET }}
SUPABASE_URL: ${{ inputs.SUPABASE_URL }}
SUPABASE_ACCESS_TOKEN: ${{ inputs.SUPABASE_ACCESS_TOKEN }}
SUPABASE_BUCKET_NAME: ${{ inputs.SUPABASE_BUCKET_NAME }}
MAPBOXUSERNAME: ${{ inputs.MAPBOXUSERNAME }}
MAPBOXTOKEN: ${{ inputs.MAPBOXTOKEN }}
MAPBOXTILESET: ${{ inputs.MAPBOXTILESET }}
6 changes: 3 additions & 3 deletions docker-compose.yml
@@ -16,9 +16,9 @@ services:
# PG_PORT: 5432
# PG_USER: fangorn
# PG_PASS: ent
# AWS_ACCESS_KEY_ID: 123
# AWS_SECRET_ACCESS_KEY: 456
# S3_BUCKET: xyz
# SUPABASE_PROJECT_ID=
# SUPABASE_ACCESS_TOKEN=
# SUPABASE_BUCKET_NAME=data_assets
# by overwriting these here we make sure
# we dont push all the time to mapbox 💵
MAPBOXUSERNAME: "123"
906 changes: 906 additions & 0 deletions harvester/grid/grid-germany.asc

Large diffs are not rendered by default.

17 changes: 9 additions & 8 deletions harvester/grid/grid.py
@@ -2,7 +2,7 @@
Input:
buffer.shp: Shapefile containing the outline of the area of interest as a polygon
grid.asc: An example raster file conatining precipitation data for whole Germany
grid-germany.asc: An example raster file containing precipitation data for the whole of Germany
"""

import sys
@@ -58,19 +58,20 @@
conn = None

# we need to give each grid cell a unique value, otherwise gdal_polygonize will combine cells with equal values
asc_data = numpy.loadtxt(temp + "/grid.asc", skiprows=6)
base_grid_file = "grid-germany.asc"
asc_data = numpy.loadtxt(base_grid_file, skiprows=6)
col_value = 1
for r_idx, row in enumerate(asc_data):
    for c_idx, col in enumerate(row):
        asc_data[r_idx][c_idx] = col_value
        col_value += 1

header = linecache.getline(temp + "/grid.asc", 1) + \
linecache.getline(temp + "/grid.asc", 2) + \
linecache.getline(temp + "/grid.asc", 3) + \
linecache.getline(temp + "/grid.asc", 4) + \
linecache.getline(temp + "/grid.asc", 5) + \
linecache.getline(temp + "/grid.asc", 6)
header = linecache.getline(base_grid_file, 1) + \
linecache.getline(base_grid_file, 2) + \
linecache.getline(base_grid_file, 3) + \
linecache.getline(base_grid_file, 4) + \
linecache.getline(base_grid_file, 5) + \
linecache.getline(base_grid_file, 6)

numpy.savetxt(temp + "/grid-transform.asc", asc_data,
header=header.rstrip(), comments='', fmt='%i')
67 changes: 40 additions & 27 deletions harvester/harvester.py
@@ -39,7 +39,7 @@
load_dotenv()

# check if all required environmental variables are accessible
for env_var in ["PG_DB", "PG_PORT", "PG_USER", "PG_PASS", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "S3_BUCKET"]:
for env_var in ["PG_DB", "PG_PORT", "PG_USER", "PG_PASS", "SUPABASE_URL", "SUPABASE_BUCKET_NAME", "SUPABASE_ACCESS_TOKEN"]:
    if env_var not in os.environ:
        logging.error(
            "❌Environmental Variable {} does not exist".format(env_var))
@@ -58,6 +58,10 @@
# get last day of insert
last_date = None

SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_BUCKET_NAME = os.getenv('SUPABASE_BUCKET_NAME')
SUPABASE_ACCESS_TOKEN = os.getenv('SUPABASE_ACCESS_TOKEN')

try:
    conn = psycopg2.connect(dsn)
    logging.info("🗄 Database connection established")
@@ -287,10 +291,8 @@

values = None

# generate gejson for map and upload to S3
# generate geojson for map and upload to Supabase Storage
logging.info("generate geojson 🗺️")
s3 = boto3.client('s3', aws_access_key_id=os.getenv(
"AWS_ACCESS_KEY_ID"), aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"))

features = []
features_light = []
@@ -302,6 +304,37 @@
    features_light.append(feature_template.format(
        cell[1], cell[0], sum(clean[cellindex])))

def check_file_exists_in_supabase_storage(file_name):
    url = f'{SUPABASE_URL}/storage/v1/object/info/public/{SUPABASE_BUCKET_NAME}/{file_name}'
    response = requests.get(url)
    return response.status_code == 200

def upload_file_to_supabase_storage(file_path, file_name):
    try:
        file = open(file_path, 'rb')
        file_url = f'{SUPABASE_URL}/storage/v1/object/{SUPABASE_BUCKET_NAME}/{file_name}'
        # update via PUT if the object already exists, otherwise create it via POST
        r = requests.put if check_file_exists_in_supabase_storage(file_name) else requests.post
        response = r(
            file_url,
            files={'file': file},
            headers={
                'Authorization': f'Bearer {SUPABASE_ACCESS_TOKEN}',
                'ContentType': 'application/geo+json',
                'AcceptEncoding': 'gzip, deflate, br'
            },
        )

        if response.status_code == 200:
            logging.info("✅ Uploaded {} to supabase storage".format(file_name))
        else:
            logging.warning(response.status_code)
            logging.warning(response.content)
            logging.warning("❌ Could not upload {} to supabase storage".format(file_name))

    except Exception as error:
        logging.warning(error)
        logging.warning("❌ Could not upload {} to supabase storage".format(file_name))

def finishGeojson(feature_list, file_name):
    geojson = '{{"type":"FeatureCollection","properties":{{"start":"{}","end":"{}"}},"features":[{}]}}'.format(
        startdate, enddate, ",".join(feature_list))
@@ -311,16 +344,7 @@ def finishGeojson(feature_list, file_name):
    text_file.close()
    n = None

    s3.upload_file(path + file_name, os.getenv("S3_BUCKET"), file_name)

    # add an additional gzip version
    geojson_file = open(path + file_name, "rb")
    geojson_data = geojson_file.read()
    geojson_bindata = bytearray(geojson_data)
    with gzip.open(path + file_name + ".gz", "wb") as f:
        f.write(geojson_bindata)
    s3.upload_file(path + file_name + ".gz", os.getenv("S3_BUCKET"), file_name + ".gz",
                   ExtraArgs={'ContentType': 'application/json', 'ContentEncoding': 'gzip'})
    upload_file_to_supabase_storage(path + file_name, file_name)

finishGeojson(features, "weather.geojson")
finishGeojson(features_light, "weather_light.geojson")
@@ -368,21 +392,10 @@ def finishGeojson(feature_list, file_name):
text_file.close()
n = None

s3.upload_file(path + "trees.csv", os.getenv("S3_BUCKET"), "trees.csv")
csv_data = bytes(trees_csv, "utf-8")
with gzip.open(path + "trees.csv.gz", "wb") as f:
f.write(csv_data)
s3.upload_file(path + "trees.csv.gz", os.getenv("S3_BUCKET"), "trees.csv.gz",
ExtraArgs={'ContentType': 'text/csv', 'ContentEncoding': 'gzip'})
upload_file_to_supabase_storage(path + "trees.csv", "trees.csv")

for i in range(4):
    s3.upload_file(path + "trees-p{}.csv".format(i + 1),
                   os.getenv("S3_BUCKET"), "trees-p{}.csv".format(i + 1))
    csv_data = bytes(singleCSVs[i], "utf-8")
    with gzip.open(path + "trees-p{}.csv.gz".format(i + 1), "wb") as f:
        f.write(csv_data)
    s3.upload_file(path + "trees-p{}.csv.gz".format(i + 1), os.getenv("S3_BUCKET"),
                   "trees-p{}.csv.gz".format(i + 1), ExtraArgs={'ContentType': 'text/csv', 'ContentEncoding': 'gzip'})
    upload_file_to_supabase_storage(path + "trees-p{}.csv".format(i + 1), "trees-p{}.csv".format(i + 1))

# send the updated csv to mapbox

1 change: 0 additions & 1 deletion harvester/requirements.txt
@@ -18,7 +18,6 @@ pyproj==2.6.0
python-dateutil==2.8.1
python-dotenv==0.19.0
pytz==2019.3
s3transfer==0.3.3
Shapely==1.7.0
six==1.14.0
urllib3==1.25.8
14 changes: 7 additions & 7 deletions harvester/sample.env
@@ -5,10 +5,10 @@ PG_USER=
PG_PASS=
PG_DB=

# AWS S3 Access data to store the resulting geojson/csv files
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
S3_BUCKET=
# Supabase Storage access data to store the resulting geojson/csv files
SUPABASE_URL=
SUPABASE_ACCESS_TOKEN=
SUPABASE_BUCKET_NAME=

# Generation of maptiles through mapbox requires api credentials and desired tileset name
MAPBOXUSERNAME=
@@ -25,7 +25,7 @@ LOGGING=INFO
# PG_USER=postgres
# PG_PASS=postgres_password
# PG_DB=postgres
# AWS_ACCESS_KEY_ID=
# AWS_SECRET_ACCESS_KEY=
# S3_BUCKET=
# SUPABASE_URL=
# SUPABASE_ACCESS_TOKEN=
# SUPABASE_BUCKET_NAME=
# OUTPUT=True
1 change: 1 addition & 0 deletions terraform/.tool-versions
@@ -0,0 +1 @@
terraform 1.4.6
22 changes: 21 additions & 1 deletion terraform/s3-bucket/main.tf
@@ -7,14 +7,34 @@ provider "aws" {
output "s3-bucket-name-radolan" {
value = "${aws_s3_bucket.radolan.bucket_domain_name}"
}

resource "aws_s3_bucket_public_access_block" "uploads" {
bucket = "${var.prefix}-${var.name}-${var.env}"
block_public_policy = false
restrict_public_buckets = false
}

resource "aws_s3_bucket" "radolan" {
bucket = "${var.prefix}-${var.name}-${var.env}"
acl = "public-read"
force_destroy = true
versioning {
enabled = false
}

policy = jsonencode({
"Version" = "2012-10-17"
"Id" = "Policy-public-read-1"
"Statement" = [
{
"Sid" = "AllowPublicRead"
"Effect" = "Allow"
"Principal" = "*"
"Action" = "s3:GetObject"
"Resource" = "arn:aws:s3:::${var.prefix}-${var.name}-${var.env}/*"
}
]
})

# Should be added for production
cors_rule {
allowed_headers = ["*"]
4 changes: 0 additions & 4 deletions terraform/s3-bucket/terraform.tfvars

This file was deleted.

5 changes: 5 additions & 0 deletions terraform/s3-bucket/terraform.tfvars.example
@@ -0,0 +1,5 @@
# rename me to terraform.tfvars
profile = "you"
region = "eu-central-1"
env = "dev"
allowed_origins = [""]
