Commit 1fe4fdf
Clay pipeline v04 (#173)
* Catch case where datacube size is 0 on S1
* Reduce tile size to 256 and bump pipeline version to 04
* Update MGRS sampling strategy for v0.2
* Allow extracting index from array job index
* Fix variable name in docs
* Mute S3 sync command
* Increase dates per location and S1 match attempts
* Use logging instead of printing
* Move sync out of loop
* Convert batch command to array job
* Update submit script memory requirements and sample source
* Update default sample source
1 parent 50094ba commit 1fe4fdf

6 files changed: +138 -86 lines

docs/data_datacube.md (3 additions, 3 deletions)

@@ -29,11 +29,11 @@ Build the docker image and push it to a ecr repository.
 ```bash
 ecr_repo_id=12345
 cd scripts/pipeline/batch
-docker build -t $ecr_repo_iud.dkr.ecr.us-east-1.amazonaws.com/fetch-and-run .
+docker build -t $ecr_repo_id.dkr.ecr.us-east-1.amazonaws.com/fetch-and-run .
 
-aws ecr get-login-password --profile clay --region us-east-1 | docker login --username AWS --password-stdin $ecr_repo_iud.dkr.ecr.us-east-1.amazonaws.com
+aws ecr get-login-password --profile clay --region us-east-1 | docker login --username AWS --password-stdin $ecr_repo_id.dkr.ecr.us-east-1.amazonaws.com
 
-docker push $ecr_repo_iud.dkr.ecr.us-east-1.amazonaws.com/fetch-and-run:latest
+docker push $ecr_repo_id.dkr.ecr.us-east-1.amazonaws.com/fetch-and-run
 ```
 
 ### Prepare AWS batch
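The fix above corrects the `$ecr_repo_iud` typo so all three commands reference the same `$ecr_repo_id` variable. As a side note, the same login step can be done from Python; this is a hedged sketch using only the documented boto3 ECR API (the shell pipeline above remains the documented path):

```python
# Sketch: obtain an ECR docker login credential via boto3, equivalent to the
# `aws ecr get-login-password | docker login` pipe in the docs above.
import base64

import boto3

ecr = boto3.client("ecr", region_name="us-east-1")
token = ecr.get_authorization_token()["authorizationData"][0]
# The token decodes to "AWS:<password>"; use it against token["proxyEndpoint"].
user, password = base64.b64decode(token["authorizationToken"]).decode().split(":")
```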

docs/data_sampling.md (28 additions, 1 deletion)

@@ -22,8 +22,12 @@ In addition to the landcover classes, we also added diversity by selecting 500
 tiles out of the 3000 tiles with the highest count of land cover classes present
 in the tile.
 
+After selecting MGRS tiles for each of these criteria, we removed duplicates.
+
 The following table summarizes the selection criteria for each class.
 
+## For model version v0.1
+
 | Class | Nr of Tiles | From highest |
 |---|---|---|
 Diversity | 500 | 3000
@@ -39,9 +43,32 @@ Bare / sparse vegetation | 50 | 500
 Snow and Ice | 50 | 500
 Permanent water bodies | 100 | 1000
 
-After selecting MGRS tiles for each of these criteria, we removed duplicates.
 This resulted in a sample of 1517 MGRS tiles total in our sample.
 
 The resulting sample file can be downloaded from the following link
 
 https://clay-mgrs-samples.s3.amazonaws.com/mgrs_sample.fgb
+
+## For model version v0.2
+
+| Class | Nr of Tiles | From highest |
+|---|---|---|
+Diversity | 400 | 2000
+Built-up | 300 | 300
+Built-up | 1000 | 1500
+Herbaceous wetland | 50 | 500
+Mangroves | 50 | 500
+Moss and lichen | 50 | 500
+Cropland | 800 | 3600
+Tree cover | 150 | 750
+Shrubland | 100 | 500
+Grassland | 200 | 500
+Bare / sparse vegetation | 50 | 500
+Snow and Ice | 25 | 500
+Permanent water bodies | 50 | 1000
+
+This resulted in a sample of 2728 MGRS tiles total in our sample.
+
+The resulting sample file can be downloaded from the following link
+
+https://clay-mgrs-samples.s3.amazonaws.com/mgrs_sample_v02.fgb
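Both sample files are FlatGeobuf layers, and the pipeline reads them with geopandas (see the `datacube.py` diff below). A minimal sketch of inspecting the v0.2 sample, assuming a geopandas/GDAL build with FlatGeobuf support:

```python
# Sketch: load the v0.2 MGRS sample and inspect it, mirroring how datacube.py
# reads the file and selects a tile by row index.
import geopandas as gpd

tiles = gpd.read_file("https://clay-mgrs-samples.s3.amazonaws.com/mgrs_sample_v02.fgb")
print(len(tiles))             # 2728 tiles, per the table above
print(tiles.iloc[0]["name"])  # MGRS tile name, as used by the pipeline
```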

scripts/pipeline/batch/submit.py (33 additions, 24 deletions)

@@ -1,31 +1,40 @@
+import os
+
 import boto3
 
 batch = boto3.client("batch", region_name="us-east-1")
 
-NR_OF_TILES_IN_SAMPLE_FILE = 1517
+NR_OF_TILES_IN_SAMPLE_FILE = 2728
+
+PC_KEY = os.environ["PC_SDK_SUBSCRIPTION_KEY"]
 
-PC_KEY = "***"
 
-for i in range(NR_OF_TILES_IN_SAMPLE_FILE):
-    job = {
-        "jobName": f"fetch-and-run-{i}",
-        "jobQueue": "fetch-and-run",
-        "jobDefinition": "fetch-and-run",
-        "containerOverrides": {
-            "command": ["datacube.py", "--index", f"{i}", "--bucket", "clay-tiles-02"],
-            "environment": [
-                {"name": "BATCH_FILE_TYPE", "value": "zip"},
-                {
-                    "name": "BATCH_FILE_S3_URL",
-                    "value": "s3://clay-fetch-and-run-packages/batch-fetch-and-run.zip",
-                },
-                {"name": "PC_SDK_SUBSCRIPTION_KEY", "value": f"{PC_KEY}"},
-            ],
-            "resourceRequirements": [
-                {"type": "MEMORY", "value": "8000"},
-                {"type": "VCPU", "value": "4"},
-            ],
-        },
-    }
+job = {
+    "jobName": "fetch-and-run-datacube",
+    "jobQueue": "fetch-and-run",
+    "jobDefinition": "fetch-and-run",
+    "containerOverrides": {
+        "command": [
+            "datacube.py",
+            "--bucket",
+            "clay-tiles-04-sample-v02",
+            "--sample",
+            "https://clay-mgrs-samples.s3.amazonaws.com/mgrs_sample_v02.fgb",
+        ],
+        "environment": [
+            {"name": "BATCH_FILE_TYPE", "value": "zip"},
+            {
+                "name": "BATCH_FILE_S3_URL",
+                "value": "s3://clay-fetch-and-run-packages/batch-fetch-and-run.zip",
+            },
+            {"name": "PC_SDK_SUBSCRIPTION_KEY", "value": f"{PC_KEY}"},
+        ],
+        "resourceRequirements": [
+            {"type": "MEMORY", "value": "15500"},
+            {"type": "VCPU", "value": "4"},
+        ],
+    },
+    "arrayProperties": {"size": NR_OF_TILES_IN_SAMPLE_FILE},
+}
 
-    print(batch.submit_job(**job))
+print(batch.submit_job(**job))
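The loop of per-tile `submit_job` calls becomes a single AWS Batch array job: `arrayProperties.size` fans one submission out into 2728 child jobs, and each child receives its position in the `AWS_BATCH_JOB_ARRAY_INDEX` environment variable, which `datacube.py` now reads as its default index (see the diff that follows). A hedged sketch of confirming the submission, using only documented boto3 Batch client methods:

```python
# Hypothetical follow-up to submit.py above: describe_jobs accepts the parent
# job ID returned by submit_job and reports the array job's status.
response = batch.submit_job(**job)  # `job` and `batch` as defined above
parent = batch.describe_jobs(jobs=[response["jobId"]])["jobs"][0]
print(parent["status"], parent.get("arrayProperties", {}).get("size"))
```

Moving the API key to `os.environ["PC_SDK_SUBSCRIPTION_KEY"]` also keeps the Planetary Computer secret out of the source file.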

scripts/pipeline/datacube.py (37 additions, 17 deletions)

@@ -26,6 +26,8 @@
 Process Sentinel-2, Sentinel-1, and DEM data for a specified time range,
 area of interest, and resolution.
 """
+import logging
+import os
 import random
 from datetime import timedelta
 
@@ -46,8 +48,15 @@
 CLOUD_COVER_PERCENTAGE = 50
 NODATA_PIXEL_PERCENTAGE = 20
 NODATA = 0
-S1_MATCH_ATTEMPTS = 20
-DATES_PER_LOCATION = 3
+S1_MATCH_ATTEMPTS = 40
+DATES_PER_LOCATION = 4
+
+logger = logging.getLogger("datacube")
+hdr = logging.StreamHandler()
+formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s")
+hdr.setFormatter(formatter)
+logger.addHandler(hdr)
+logger.setLevel(logging.INFO)
 
 
 def get_surrounding_days(reference, interval_days):
@@ -123,7 +132,7 @@ def search_sentinel2(
     )
 
     s2_items = search.item_collection()
-    print(f"Found {len(s2_items)} Sentinel-2 items")
+    logger.info(f"Found {len(s2_items)} Sentinel-2 items")
     if not len(s2_items):
         return None, None
 
@@ -143,7 +152,7 @@ def search_sentinel2(
     bbox = least_clouds.geometry.bounds
 
     epsg = s2_item.properties["proj:epsg"]
-    print("EPSG code based on Sentinel-2 item: ", epsg)
+    logger.info(f"EPSG code based on Sentinel-2 item: {epsg}")
 
     return s2_item, bbox
 
@@ -187,7 +196,7 @@ def search_sentinel1(bbox, catalog, date_range):
         },
     )
     s1_items = search.item_collection()
-    print(f"Found {len(s1_items)} Sentinel-1 items")
+    logger.info(f"Found {len(s1_items)} Sentinel-1 items")
 
     if not len(s1_items):
         return
@@ -203,7 +212,7 @@ def search_sentinel1(bbox, catalog, date_range):
     s1_gdf = s1_gdf.sort_values(by="overlap", ascending=False)
 
     most_overlap_orbit = s1_gdf.iloc[0]["sat:orbit_state"]
-    print("Most overlapped orbit: ", most_overlap_orbit)
+    logger.info(f"Most overlapped orbit: {most_overlap_orbit}")
     selected_item_ids = []
     intersection = None
     orbit = None
@@ -246,7 +255,7 @@ def search_dem(bbox, catalog):
     """
     search = catalog.search(collections=["cop-dem-glo-30"], bbox=bbox)
     dem_items = search.item_collection()
-    print(f"Found {len(dem_items)} DEM items")
+    logger.info(f"Found {len(dem_items)} DEM items")
 
     return dem_items
 
@@ -354,15 +363,15 @@ def process(
             continue
 
         surrounding_days = get_surrounding_days(s2_item.datetime, interval_days=3)
-        print("Searching S1 in date range", surrounding_days)
+        logger.info(f"Searching S1 in date range {surrounding_days}")
 
         s1_items = search_sentinel1(bbox, catalog, surrounding_days)
 
         if s1_items:
            break
 
         if i == S1_MATCH_ATTEMPTS - 1:
-            print(
+            logger.info(
                 "No match for S1 scenes found for date range "
                 f"{date_range} after {S1_MATCH_ATTEMPTS} attempts."
             )
@@ -379,6 +388,10 @@ def process(
         resolution,
     )
 
+    if 0 in (dat.shape[0] for dat in result):
+        logger.info("S2/S1 pixel coverages do not overlap although bounds do")
+        return None, None
+
     return date, result
 
 
@@ -404,13 +417,13 @@ def convert_attrs_and_coords_objects_to_str(data):
 @click.option(
     "--sample",
     required=False,
-    default="https://clay-mgrs-samples.s3.amazonaws.com/mgrs_sample.fgb",
+    default="https://clay-mgrs-samples.s3.amazonaws.com/mgrs_sample_v02.fgb",
     help="Location of MGRS tile sample",
 )
 @click.option(
     "--index",
     required=False,
-    default=0,
+    default=None,
     help="Index of MGRS tile from sample file that should be processed",
 )
 @click.option(
@@ -441,13 +454,20 @@ def convert_attrs_and_coords_objects_to_str(data):
     type=str,
     help="Comma separated list of date ranges, each provided as YYYY-MM-DD/YYYY-MM-DD.",
 )
-def main(sample, index, subset, bucket, localpath, dateranges):
-    index = int(index)
+@click.option("-v", "--verbose", is_flag=True)
+def main(sample, index, subset, bucket, localpath, dateranges, verbose):  # noqa: PLR0913
+    if verbose:
+        logger.setLevel(logging.DEBUG)
+
+    if index is None:
+        index = int(os.environ.get("AWS_BATCH_JOB_ARRAY_INDEX", 0))
+    else:
+        index = int(index)
     tiles = gpd.read_file(sample)
     tile = tiles.iloc[index]
     mgrs = tile["name"]
 
-    print(f"Starting algorithm for MGRS tile {tile['name']} with index {index}")
+    logger.info(f"Starting algorithm for MGRS tile {tile['name']} with index {index}")
 
     if subset:
         subset = [int(dat) for dat in subset.split(",")]
@@ -466,7 +486,7 @@ def main(sample, index, subset, bucket, localpath, dateranges):
 
     match_count = 0
     for date_range in date_ranges:
-        print(f"Processing data for date range {date_range}")
+        logger.info(f"Processing data for date range {date_range}")
         date, pixels = process(
             tile.geometry,
             date_range,
@@ -480,7 +500,7 @@ def main(sample, index, subset, bucket, localpath, dateranges):
         match_count += 1
 
         if subset:
-            print(f"Subsetting to {subset}")
+            logger.info(f"Subsetting to {subset}")
             pixels = [
                 part[:, subset[1] : subset[3], subset[0] : subset[2]] for part in pixels
             ]
@@ -493,7 +513,7 @@ def main(sample, index, subset, bucket, localpath, dateranges):
             break
 
     if not match_count:
-        raise ValueError("No matching data found")
+        logger.info("No matching data found")
 
 
 if __name__ == "__main__":
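The new guard in `process()` is the "datacube size is 0 on S1" fix from the commit message: an S2 scene and an S1 scene can have intersecting bounding boxes yet share no pixels once both are clipped onto a common grid. An illustration of the check, with hypothetical array shapes standing in for the real datacube parts:

```python
# Illustrative only: `result` stands in for the stacked datacube parts built in
# process(). A zero-length leading dimension in any part means the S2/S1
# footprints overlap in bounds but not in pixels, so the tile date is skipped.
import numpy as np

result = [np.zeros((0, 256, 256)), np.ones((3, 256, 256))]  # hypothetical shapes
if 0 in (dat.shape[0] for dat in result):
    print("Skipping: empty datacube part")
```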

scripts/pipeline/landcover.py (11 additions, 18 deletions)

@@ -13,9 +13,7 @@
 WGS84 = CRS.from_epsg(4326)
 NODATA = 0
 WATER = 80
-WATER_LOWER_TH = 0.2
-WATER_UPPER_TH = 0.7
-RANDOM_SEED = 42
+RANDOM_SEED = 23
 CLASSES = {
     10: "Tree cover",
     20: "Shrubland",
@@ -156,31 +154,25 @@ def percentages(data):
 def sample(wd):
     """
     Sample the mgrs tiles based on landcover statistics.
-
-    Target: ~1000 tiles
-    Set very small counts to zero. Exclude high latitudes.
-    200 samples from the 2000 most diverse
-    50 samples from the 1000 highest for all other categories except water
-    100 samples from all tiles with water between 30% an 70% (making sure we
-    capture some, but exclude only purely water so we catch coasts)
     """
     data = geopandas.read_file(Path(wd, "mgrs_stats.fgb"))
 
     data_norm = percentages(data.loc[:, data.columns != "count"])
     data[data_norm.columns] = data_norm
 
-    diversity = split_highest(data, "count", 500, 3000)
-    urban = split_highest(data, "Built-up", 400)
+    diversity = split_highest(data, "count", 400, 2000)
+    urban = split_highest(data, "Built-up", 300, 300)
+    urban = split_highest(data, "Built-up", 1000, 1500)
     wetland = split_highest(data, "Herbaceous wetland", 50, 500)
     mangroves = split_highest(data, "Mangroves", 50, 500)
     moss = split_highest(data, "Moss and lichen", 50, 500)
-    cropland = split_highest(data, "Cropland", 100, 500)
-    trees = split_highest(data, "Tree cover", 100, 500)
-    shrubland = split_highest(data, "Shrubland", 50, 500)
-    grassland = split_highest(data, "Grassland", 50, 500)
+    cropland = split_highest(data, "Cropland", 800, 3600)
+    trees = split_highest(data, "Tree cover", 150, 750)
+    shrubland = split_highest(data, "Shrubland", 100, 500)
+    grassland = split_highest(data, "Grassland", 200, 500)
     bare = split_highest(data, "Bare / sparse vegetation", 50, 500)
-    snow = split_highest(data, "Snow and Ice", 50, 500)
-    water = split_highest(data, "Permanent water bodies", 100, 1000)
+    snow = split_highest(data, "Snow and Ice", 25, 250)
+    water = split_highest(data, "Permanent water bodies", 50, 1000)
 
     result = pandas.concat(
         [
@@ -200,6 +192,7 @@ def sample(wd):
     )
 
     result = result.drop_duplicates(subset=["name"])
+    print(f"Found {len(result)} MGRS tiles")
 
     result.to_file(Path(wd, "mgrs_sample.fgb", driver="FlatGeobuf"))
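`split_highest` itself is untouched by this commit, so its implementation does not appear in the diff. Judging from the call sites and the docs table ("Nr of Tiles" drawn "From highest"), a plausible reconstruction is sketched below; the body is an assumption, only the signature is taken from the code above. Note that the two consecutive `Built-up` calls both assign to `urban`, so only the second selection (1000 from the top 1500) reaches the final concat, even though the docs table lists both.

```python
# Hypothetical reconstruction of split_highest, NOT part of this commit:
# assume it takes the `pool` rows with the highest values in `col` and
# randomly samples `nr` of them, reproducibly via the module's RANDOM_SEED.
import pandas

RANDOM_SEED = 23  # matches the constant updated in the diff above


def split_highest(data: pandas.DataFrame, col: str, nr: int, pool: int) -> pandas.DataFrame:
    candidates = data.nlargest(pool, col)  # top `pool` tiles for this class
    return candidates.sample(nr, random_state=RANDOM_SEED)  # draw `nr` of them
```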
