Sen2Cube-at · fkroeber · May 10, 2025 · May 10, 2025 · May 10, 2025 · May 10, 2025
diff --git a/Readme.md b/Readme.md
@@ -11,7 +11,8 @@ This package builds on top of [semantique](https://zgis.github.io/semantique/#)
 3. Scaling mechanisms that allow to evaluate recipes for large spatio-temporal extents up to the mesoscale, with internal automatic handling of the required chunking of the processing into smaller parts
 
 ## Installation
-At this moment the package can only be installed from source. This can be done in several ways:
+### A. Local setup 
+It is strongly recommended to create a virtual environment before installing the package. The package installation itself can be done in several ways:
 
 1) Using pip to install directly from GitHub:
 
@@ -27,6 +28,38 @@ cd gsemantique
 pip install .
 ```
 
+### B. Cloud-based setup
+Gsemantique can be deployed on any cloud infrastructure. The following steps describe how to do so within an AWS EC2 computing environment. To set up an AWS EC2 instance with gsemantique, it is necessary to...
+
+1) Launch an EC2 instance as described [here](https://docs.aws.amazon.com/codedeploy/latest/userguide/instances-ec2-create.html#instances-ec2-create-console)
+    * OS: choose an Ubuntu image
+    * instance type: choose an r-instance (high RAM per CPU ratio) with the desired amount of RAM/CPUs 
+    * storage configuration: depending on the size of results to be saved (minimum of 10GB recommended)
+
+2) Access the remote EC2 server
+    * download the private key certificate .pem & change its permissions to 400 (on Windows, move the file to the WSL home directory first, then change the permissions there)
+    * run the following CLI command to access the server: ssh -i "xxx.pem" ubuntu@server_address.amazonaws.com
+
+3) Configure the server environment
+
+    ```
+    # 3.1 Update apt & install core tools
+    sudo apt update && sudo apt upgrade -y
+    sudo apt install -y git python3 python3-venv python3-pip python3-dev libpq-dev
+
+    # 3.2 Create virtual environment
+    mkdir -p venv
+    python3 -m venv venv/gsemantique
+    source venv/gsemantique/bin/activate
+
+    # 3.3 Install gsemantique
+    mkdir repos
+    cd repos
+    git clone https://github.com/Sen2Cube-at/gsemantique.git
+    cd gsemantique
+    pip install .
+    ```
+
 ## Usage
 [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Sen2Cube-at/gsemantique/main)
 [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](http://colab.research.google.com/github/Sen2Cube-at/gsemantique/blob/main)  

diff --git a/demo/Readme.md b/demo/Readme.md
@@ -0,0 +1,12 @@
+This directory contains several files demonstrating the usage of gsemantique. The following scripts are contained:
+
+* `basics_data.ipynb` -> explains the fundamentals of addressing data with gsemantique
+* `basics_processing.ipynb` -> explains the fundamentals of model creation & execution with gsemantique
+* `example_clouds.ipynb` -> showcases how to create cloud-free composites
+* `example_forests.ipynb` -> showcases how to create several models for forest disturbance assessments
+* `large_scale_clouds.py` -> calculates cloud-free statistics
+* `large_scale_clouds.sh` -> wrapper around `large_scale_clouds.py`  
+
+To reproduce the results from our paper, look at the following files:
+* Section 5.1 (Application I – cloud-free scenes): `large_scale_clouds.py` & `large_scale_clouds.sh`
+* Section 5.2 (Application II – forest disturbances): `example_forests.ipynb`
diff --git a/demo/files/aoi_europe.geojson b/demo/files/aoi_europe.geojson
diff --git a/demo/large_scale_clouds.py b/demo/large_scale_clouds.py
@@ -0,0 +1,195 @@
+import logging
+import geopandas as gpd
+import gsemantique as gsq
+import json
+import os
+import pandas as pd
+import semantique as sq
+import warnings
+from datetime import datetime
+from shapely.geometry import Polygon
+
+logger = logging.getLogger("gsq.data.search")
+logger.setLevel(logging.INFO)
+stream_handler = logging.StreamHandler()
+logger.addHandler(stream_handler)
+
+warnings.filterwarnings("ignore")
+
+output_dir = f"results/{datetime.now().strftime('%H%M%S')}"
+
+# read European NUTS regions as AOI
+nuts = gpd.read_file("files/aoi_europe.geojson")
+aoi_polygons = nuts[nuts["LEVL_CODE"] == 1]
+excl_list = ["RUP FR — Régions Ultrapériphériques Françaises"]
+aoi_polygons = aoi_polygons[~aoi_polygons["NUTS_NAME"].isin(excl_list)]
+
+# define spatio-temporal extent
+res = 500
+epsg = 3035
+t_start, t_end = "2022-01-01", "2022-02-01"
+aoi = aoi_polygons.to_crs(4326)
+space = sq.SpatialExtent(aoi)
+
+# load data catalog
+ds_catalog = gsq.DatasetCatalog()
+ds_catalog.load()
+
+# define sentinel mapping & parameters
+s2_map = sq.mapping.Semantique()
+s2_map["entity"] = {}
+s2_map["entity"]["valid"] = {
+    "color": sq.layer("Planetary", "classification", "scl").evaluate("not_equal", 0)
+}
+s2_map["entity"]["cloud"] = {
+    "color": sq.layer("Planetary", "classification", "scl").evaluate("in", [8, 9, 10])
+}
+
+params = {
+    "sentinel": {
+        "layer_key": ("Planetary", "classification", "scl"),
+        "mapping": s2_map,
+        "cloud_meta_col": "eo:cloud_cover",
+    }
+}
+
+
+class MetaSearch:
+    # define recipe for cloud-free search
+    recipe = sq.QueryRecipe()
+    recipe["all"] = (
+        sq.entity("valid")
+        .groupby_time(["year", "dayofyear"])
+        .reduce("mode", "time")
+        .concatenate("new_time")
+        .reduce("count", "new_time")
+        .apply_custom("update_na", na_value=-99)
+        .apply_custom("change_dtype", dtype="int16")
+    )
+    recipe["cloudless"] = (
+        sq.entity("valid")
+        .evaluate("not_missing")
+        .filter(sq.self())
+        .filter(sq.entity("cloud").evaluate("not"))
+        .groupby_time(["year", "dayofyear"])
+        .reduce("mode", "time")
+        .concatenate("new_time")
+        .reduce("count", "new_time")
+        .apply_custom("update_na", na_value=-99)
+        .apply_custom("change_dtype", dtype="int16")
+    )
+
+    def __init__(
+        self, layer_search_key, sat_mapping, t_start, t_end, cloud_thres, output_dir
+    ):
+        """
+        Search for cloud-free data based on metadata statistics of cloud coverage.
+
+        Args:
+            layer_search_key (tuple): The layer key to search for.
+            sat_mapping (dict): The mapping specfic to the satellite data.
+            t_start (str): The start date in the format YYYY-MM-DD.
+            t_end (str): The end date in the format YYYY-MM-DD.
+            cloud_thres (int): The cloud coverage threshold.
+            output_dir (str): The output path to save the results.
+        """
+        self.layer_search_key = layer_search_key
+        self.sat_mapping = sat_mapping
+        self.t_start = t_start
+        self.t_end = t_end
+        self.cloud_thres = cloud_thres
+        self.output_dir = output_dir
+        self.fdr = None
+        self.th = None
+
+    def run(self):
+        # search for data
+        bounds_df = aoi.to_crs(4326).bounds
+        aoi["geometry"] = bounds_df.apply(
+            lambda row: Polygon(
+                [
+                    (row["minx"], row["miny"]),
+                    (row["maxx"], row["miny"]),
+                    (row["maxx"], row["maxy"]),
+                    (row["minx"], row["maxy"]),
+                ]
+            ),
+            axis=1,
+        )
+        self.fdr = gsq.Finder(ds_catalog, self.t_start, self.t_end, aoi)
+        self.fdr.search_man(self.layer_search_key)
+
+        # filter by cloud cover
+        stac_json = self.fdr.item_coll.to_dict()
+        gdf = gpd.GeoDataFrame.from_features(stac_json, "epsg:4326")
+        keep_idx = gdf[gdf["eo:cloud_cover"] <= self.cloud_thres].index.values
+        item_coll = [x for idx, x in enumerate(self.fdr.item_coll) if idx in keep_idx]
+
+        # construct datacube
+        with open(gsq.LAYOUT_PATH, "r") as file:
+            dc = sq.datacube.STACCube(
+                json.load(file),
+                src=item_coll,
+                group_by_solar_day=False,
+            )
+
+        # create TileHandler instance & execute processing
+        context = dict(
+            recipe=MetaSearch.recipe,
+            datacube=dc,
+            mapping=self.sat_mapping,
+            space=space,
+            time=sq.TemporalExtent(pd.Timestamp(t_start), pd.Timestamp(t_end)),
+            spatial_resolution=[-res, res],
+            crs=epsg,
+            chunksize_t="1W",
+            chunksize_s=512,
+            merge_mode="merged",
+            out_dir=self.output_dir,
+            reauth=True,
+            custom_verbs={"update_na": gsq.update_na, "change_dtype": gsq.change_dtype},
+        )
+        th = gsq.TileHandlerParallel(n_procs=os.cpu_count(), **context)
+        th.execute()
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Run semantic cloud search.")
+    parser.add_argument(
+        "--t_start",
+        type=str,
+        default=t_start,
+        help="The start date in the format YYYY-MM-DD",
+    )
+    parser.add_argument(
+        "--t_end", type=str, default=t_end, help="The end date in the format YYYY-MM-DD"
+    )
+    parser.add_argument(
+        "--cloud_thresh", type=int, default=100, help="The cloud coverage threshold"
+    )
+    parser.add_argument(
+        "--output_dir", type=str, default=output_dir, help="The output directory"
+    )
+    args = parser.parse_args()
+
+    t_start = args.t_start
+    t_end = args.t_end
+    root_dir = args.output_dir
+    cloud_thresh = args.cloud_thresh
+
+    sub_dir = (
+        f"sentinel_{t_start.replace('-','')}_{t_end.replace('-','')}_c{cloud_thresh}"
+    )
+    out_dir = os.path.join(root_dir, sub_dir)
+
+    cfs = MetaSearch(
+        params["sentinel"]["layer_key"],
+        params["sentinel"]["mapping"],
+        t_start,
+        t_end,
+        cloud_thresh,
+        out_dir,
+    )
+    cfs.run()
diff --git a/demo/large_scale_clouds.sh b/demo/large_scale_clouds.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# define the base name for the output directory
+output_dir_base="/home/ubuntu/projects/clouds/results"
+mkdir -p "$output_dir_base"
+
+# define the cloud thresholds to test
+cloud_thresholds=(100)
+
+# define the log file
+log_file="${output_dir_base}/process.log"
+
+# run for each year and month
+for year in {2021..2023}
+do
+    for month in {01..12}
+    do
+        # define the start and end dates for this month
+        t_start="${year}-${month}-01"
+        if [ "$month" -eq 12 ]; then
+            t_end="$(($year+1))-01-01"
+        else
+            t_end="${year}-$(printf "%02d" $((10#$month + 1)))-01"
+        fi
+
+        # loop over each cloud threshold
+        for cloud_thresh in ${cloud_thresholds[@]}
+        do
+            # define the output directory for this month and cloud threshold
+            output_dir="${output_dir_base}"
+
+            # print the parameters to the console and the log file
+            echo "Year: $year, Month: $month, Cloud Threshold: $cloud_thresh" | tee -a "$log_file"
+
+            # call the Python script with these parameters and log the output
+            /home/ubuntu/venv/gsemantique/bin/python3 large_scale_clouds.py \
+                --t_start "$t_start" \
+                --t_end "$t_end" \
+                --cloud_thresh "$cloud_thresh" \
+                --output_dir "$output_dir" 2>&1 | tee -a "$log_file"
+        done
+    done
+done