Merge pull request #173 from nismod/feature/discard_duplicate_tracks
Discard duplicate tracks
thomas-fred authored Jan 22, 2024
2 parents f2ff907 + 8be8105 commit 1a28cf0
Showing 5 changed files with 28 additions and 0 deletions.
1 change: 1 addition & 0 deletions workflow/rules/download/STORM.smk
@@ -7,6 +7,7 @@ Reference
 https://data.4tu.nl/articles/dataset/STORM_tropical_cyclone_wind_speed_return_periods/12705164/3
 https://data.4tu.nl/articles/dataset/STORM_climate_change_tropical_cyclone_wind_speed_return_periods/14510817/3
 https://data.4tu.nl/articles/dataset/STORM_IBTrACS_present_climate_synthetic_tropical_cyclone_tracks/12706085/4
+https://data.4tu.nl/articles/dataset/STORM_Climate_Change_synthetic_tropical_cyclone_tracks/14237678
 """
2 changes: 2 additions & 0 deletions workflow/rules/preprocess/join_data.smk
@@ -45,6 +45,8 @@ rule concat_storm_tracks:
     """
     input:
         by_sample=all_storm_tracks_by_sample
+    resources:
+        mem_mb = 48_000
     output:
         tracks_from_all_samples="{OUTPUT_DIR}/storm_tracks/{STORM_SET}/tracks.geoparquet",
     run:
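The new resources declaration tells the Snakemake scheduler that concatenating every sample's tracks claims roughly 48 GB of memory, so jobs declaring mem_mb can be held back once a global memory budget is exhausted. A usage sketch (--resources is standard Snakemake; the exact target path depends on your config, so it is omitted here):

    # cap the combined mem_mb claim of concurrently running jobs at 48 GB
    snakemake --cores 4 --resources mem_mb=48000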
7 changes: 7 additions & 0 deletions workflow/scripts/preprocess/parse_IBTrACS.py
@@ -4,6 +4,7 @@
 The output schema is a superset of the STORM synthetic track data
 """
 
+import logging
 import os
 from typing import Union
 
@@ -171,6 +172,12 @@ def saffir_simpson_classifier(wind_speed_ms: float) -> Union[int, float]:
     # change geometry from 0-360 to -180-180
     df.lon = np.where(df.lon > 180, df.lon - 360, df.lon)
 
+    df["point_id"] = df.apply(lambda row: f"{row.track_id}_{row.lat}_{row.lon}", axis=1)
+    n_rows_raw = len(df)
+    logging.info(f"Collated {n_rows_raw} track points")
+    df = df.drop_duplicates(subset="point_id").drop(columns=["point_id"])
+    logging.info(f"Dropped {n_rows_raw - len(df)} track points as duplicates")
+
     # construct geometry from lat and long
     df = gpd.GeoDataFrame(
         data=df,
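The duplicate key above is built row by row with DataFrame.apply, which is slow on multi-million-point track sets. For reference, a minimal sketch of an equivalent vectorised formulation (the toy frame is hypothetical; passing the column subset to drop_duplicates keeps the first row per (track_id, lat, lon), just like the string point_id key):

    import pandas as pd

    # hypothetical frame with one duplicated track point
    df = pd.DataFrame({
        "track_id": ["A", "A", "A"],
        "lat": [10.0, 10.0, 10.5],
        "lon": [120.0, 120.0, 120.5],
        "max_wind_speed_ms": [30.0, 30.0, 32.0],
    })

    n_rows_raw = len(df)
    # deduplicate on the raw columns, no intermediate string key needed
    df = df.drop_duplicates(subset=["track_id", "lat", "lon"])
    print(f"Dropped {n_rows_raw - len(df)} track points as duplicates")  # Dropped 1 ...

The two approaches agree even on NaN coordinates: drop_duplicates treats NaNs as equal, and the f-string key formats them all to the same "nan" text.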
9 changes: 9 additions & 0 deletions workflow/scripts/preprocess/parse_IRIS.py
@@ -6,6 +6,7 @@
 """
 
 from glob import glob
+import logging
 import os
 import re
 from typing import List
@@ -38,6 +39,8 @@
 
 if __name__ == "__main__":
 
+    logging.basicConfig(format="%(asctime)s %(process)d %(filename)s %(message)s", level=logging.INFO)
+
     csv_dir = snakemake.input.csv_dir
     parquet_path = snakemake.output.parquet
     sample = snakemake.wildcards.SAMPLE
@@ -94,6 +97,12 @@
 
     df = pd.concat(data)
 
+    df["point_id"] = df.apply(lambda row: f"{row.track_id}_{row.lat}_{row.lon}", axis=1)
+    n_rows_raw = len(df)
+    logging.info(f"Collated {n_rows_raw} track points")
+    df = df.drop_duplicates(subset="point_id").drop(columns=["point_id"])
+    logging.info(f"Dropped {n_rows_raw - len(df)} track points as duplicates")
+
     # construct geometry from lat and long
     df = gpd.GeoDataFrame(
         data=df,
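Note that the same five-line dedup block now appears verbatim in parse_IBTrACS.py, parse_IRIS.py and parse_STORM.py. A hypothetical shared helper (not part of this change, just a sketch of how the repetition could be factored out):

    import logging

    import pandas as pd


    def drop_duplicate_track_points(df: pd.DataFrame) -> pd.DataFrame:
        """Keep the first row per (track_id, lat, lon), logging how many were dropped."""
        n_rows_raw = len(df)
        logging.info(f"Collated {n_rows_raw} track points")
        df = df.drop_duplicates(subset=["track_id", "lat", "lon"])
        logging.info(f"Dropped {n_rows_raw - len(df)} track points as duplicates")
        return df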
9 changes: 9 additions & 0 deletions workflow/scripts/preprocess/parse_STORM.py
@@ -6,6 +6,7 @@
 """
 
 from glob import glob
+import logging
 import os
 import re
 from typing import List
@@ -46,6 +47,8 @@
 
 if __name__ == "__main__":
 
+    logging.basicConfig(format="%(asctime)s %(process)d %(filename)s %(message)s", level=logging.INFO)
+
     csv_dir = snakemake.input.csv_dir
     parquet_path = snakemake.output.parquet
     sample = snakemake.wildcards.SAMPLE
@@ -97,6 +100,12 @@
 
     # rescale winds to 1-minutely
     df.max_wind_speed_ms /= STORM_1MIN_WIND_FACTOR
 
+    df["point_id"] = df.apply(lambda row: f"{row.track_id}_{row.lat}_{row.lon}", axis=1)
+    n_rows_raw = len(df)
+    logging.info(f"Collated {n_rows_raw} track points")
+    df = df.drop_duplicates(subset="point_id").drop(columns=["point_id"])
+    logging.info(f"Dropped {n_rows_raw - len(df)} track points as duplicates")
+
     # construct geometry from lat and long
     df = gpd.GeoDataFrame(
         data=df,
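For context on the logging setup added to the IRIS and STORM parsers: the basicConfig format string prefixes each message with a timestamp, the process id and the source filename. A small illustrative sketch (the output values are made up):

    import logging

    logging.basicConfig(format="%(asctime)s %(process)d %(filename)s %(message)s", level=logging.INFO)
    logging.info("Collated 1000000 track points")
    # emits something like:
    # 2024-01-22 10:15:03,512 4242 parse_STORM.py Collated 1000000 track points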
