Commit a0bc42e

Copied store files from noisepy repo (#2)
* Instantiated the repo using LINCC template
* Instantiated the repo using LINCC template
* Instantiated the repo using LINCC template
* Copied store files from noisepy repo
1 parent 1db77ae commit a0bc42e

File tree

11 files changed: +3426 −10 lines changed

.copier-answers.yml

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ include_benchmarks: false
 include_docs: true
 include_notebooks: true
 mypy_type_checking: none
-package_name: noisepy-seis-io
+package_name: noisepy_seis_io
 preferred_linter: black
 project_license: MIT
 project_name: noisepy-io

.pre-commit-config.yaml

Lines changed: 7 additions & 7 deletions
@@ -23,13 +23,13 @@ repos:
       - id: black


-  - repo: https://github.com/PyCQA/flake8
-    rev: 6.0.0
-    hooks:
-      - id: flake8
-        additional_dependencies:
-          - flake8-black
-        exclude: .git,__pycache__,build,dist
+  # - repo: https://github.com/PyCQA/flake8
+  #   rev: 6.0.0
+  #   hooks:
+  #     - id: flake8
+  #       additional_dependencies:
+  #         - flake8-black
+  #       exclude: .git,__pycache__,build,dist
 # repos:

 # # Compare the local template version to the latest remote template version

docs/index.rst

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
-.. noisepy-seis-io documentation main file.
+.. noisepy_seis_io documentation main file.
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.

-Welcome to noisepy-seis-io's documentation!
+Welcome to noisepy_seis_io's documentation!
 ========================================================================================

 Dev Guide - Getting Started

src/noisepy/seis/io/asdfstore.py

Lines changed: 259 additions & 0 deletions
@@ -0,0 +1,259 @@
import glob
import logging
import os
from pathlib import Path
from typing import Callable, Dict, Generic, List, Optional, Set, Tuple, TypeVar

import numpy as np
import obspy
import pyasdf
from datetimerange import DateTimeRange

from . import noise_module
from .constants import PROGRESS_DATATYPE
from .datatypes import Channel, ChannelData, ChannelType, CrossCorrelation, Stack, Station
from .stores import (
    CrossCorrelationDataStore,
    RawDataStore,
    StackStore,
    parse_station_pair,
    parse_timespan,
    timespan_str,
)

logger = logging.getLogger(__name__)

T = TypeVar("T")


class ASDFDirectory(Generic[T]):
    """
    Utility class used by ASDFRawDataStore and ASDFCCStore to provide easy access
    to a set of ASDF files in a directory that follow a specific naming convention.
    The files are named after a generic type T (e.g. a timestamp or a pair of stations)
    so the constructor takes two functions to map between the type T and the corresponding
    file name.
    """

    def __init__(
        self, directory: str, mode: str, get_filename: Callable[[T], str], parse_filename: Callable[[str], T]
    ) -> None:
        if mode not in ["a", "r"]:
            raise ValueError(f"Invalid mode {mode}. Must be 'a' or 'r'")

        self.directory = directory
        self.mode = mode
        self.get_filename = get_filename
        self.parse_filename = parse_filename

    def __getitem__(self, key: T) -> Optional[pyasdf.ASDFDataSet]:
        return self._get_dataset(key, self.mode)

    def _get_dataset(self, key: T, mode: str) -> Optional[pyasdf.ASDFDataSet]:
        file_name = self.get_filename(key)
        file_path = os.path.join(self.directory, file_name)
        return _get_dataset(file_path, mode)

    def get_keys(self) -> List[T]:
        h5files = sorted(glob.glob(os.path.join(self.directory, "**/*.h5"), recursive=True))
        return list(map(self.parse_filename, h5files))

    def contains(self, key: T, data_type: str, path: Optional[str] = None) -> bool:
        # contains is always a read
        ccf_ds = self._get_dataset(key, "r")

        if not ccf_ds:
            return False
        with ccf_ds:
            # source-receiver pair
            exists = data_type in ccf_ds.auxiliary_data
            if path is not None and exists:
                return path in ccf_ds.auxiliary_data[data_type]
            return exists

    def add_aux_data(self, key: T, params: Dict, data_type: str, path: str, data: np.ndarray):
        with self[key] as ccf_ds:
            ccf_ds.add_auxiliary_data(data=data, data_type=data_type, path=path, parameters=params)


class ASDFRawDataStore(RawDataStore):
    """
    A data store implementation to read from a directory of ASDF files. Each file is considered
    a timespan with the naming convention: 2019_02_01_00_00_00T2019_02_02_00_00_00.h5
    """

    def __init__(self, directory: str, mode: str = "r"):
        super().__init__()
        self.datasets = ASDFDirectory(directory, mode, _filename_from_timespan, parse_timespan)

    def get_channels(self, timespan: DateTimeRange) -> List[Channel]:
        with self.datasets[timespan] as ds:
            # _create_station returns None when the StationXML is missing, so filter out those results
            stations = [s for s in (self._create_station(timespan, sta) for sta in ds.waveforms.list()) if s is not None]
            channels = [
                Channel(ChannelType(tag), sta)
                for sta in stations
                for tag in ds.waveforms[str(sta)].get_waveform_tags()
            ]
            return channels

    def get_timespans(self) -> List[DateTimeRange]:
        return self.datasets.get_keys()

    def read_data(self, timespan: DateTimeRange, chan: Channel) -> ChannelData:
        with self.datasets[timespan] as ds:
            stream = ds.waveforms[str(chan.station)][str(chan.type)]
            return ChannelData(stream)

    def get_inventory(self, timespan: DateTimeRange, station: Station) -> obspy.Inventory:
        with self.datasets[timespan] as ds:
            return ds.waveforms[str(station)]["StationXML"]

    def _create_station(self, timespan: DateTimeRange, name: str) -> Optional[Station]:
        # What should we do if there's no StationXML?
        try:
            with self.datasets[timespan] as ds:
                inventory = ds.waveforms[name]["StationXML"]
                sta, net, lon, lat, elv, loc = noise_module.sta_info_from_inv(inventory)
                return Station(net, sta, lat, lon, elv, loc)
        except Exception as e:
            logger.warning(f"Missing StationXML for station {name}. {e}")
            return None


class ASDFCCStore(CrossCorrelationDataStore):
    def __init__(self, directory: str, mode: str = "a") -> None:
        super().__init__()
        Path(directory).mkdir(exist_ok=True)
        self.datasets = ASDFDirectory(directory, mode, _filename_from_timespan, parse_timespan)

    # CrossCorrelationDataStore implementation
    def contains(self, src: Station, rec: Station, timespan: DateTimeRange) -> bool:
        station_pair = self._get_station_pair(src, rec)
        contains = self.datasets.contains(timespan, station_pair)
        if contains:
            logger.info(f"Cross-correlation {station_pair} already exists")
        return contains

    def append(
        self,
        timespan: DateTimeRange,
        src: Station,
        rec: Station,
        ccs: List[CrossCorrelation],
    ):
        for cc in ccs:
            # source-receiver pair: e.g. CI.ARV_CI.BAK
            station_pair = self._get_station_pair(src, rec)
            # channels, e.g. bhn_bhn
            channels = self._get_channel_pair(cc.src, cc.rec)
            self.datasets.add_aux_data(timespan, cc.parameters, station_pair, channels, cc.data)

    def get_timespans(self, src: Station, rec: Station) -> List[DateTimeRange]:
        timespans = {}
        pair_key = self._get_station_pair(src, rec)

        def visit(pairs, ts):
            if pair_key in pairs:
                timespans[str(ts)] = ts

        self._visit_pairs(visit)
        return sorted(timespans.values(), key=lambda t: str(t))

    def get_station_pairs(self) -> List[Tuple[Station, Station]]:
        pairs_all = set()
        self._visit_pairs(lambda pairs, _: pairs_all.update(parse_station_pair(p) for p in pairs))
        return list(pairs_all)

    def read(self, timespan: DateTimeRange, src_sta: Station, rec_sta: Station) -> List[CrossCorrelation]:
        with self.datasets[timespan] as ccf_ds:
            dtype = self._get_station_pair(src_sta, rec_sta)
            if dtype not in ccf_ds.auxiliary_data:
                logger.warning(f"No data available for {timespan}/{dtype}")
                return []
            ccs = []
            ch_pair_paths = ccf_ds.auxiliary_data[dtype].list()
            for ch_pair_path in ch_pair_paths:
                src_ch, rec_ch = _parse_channel_path(ch_pair_path)
                stream = ccf_ds.auxiliary_data[dtype][ch_pair_path]
                ccs.append(CrossCorrelation(src_ch, rec_ch, stream.parameters, stream.data[:]))
            return ccs

    def _visit_pairs(self, visitor: Callable[[Set[Tuple[str, str]], DateTimeRange], None]):
        all_timespans = self.datasets.get_keys()
        for timespan in all_timespans:
            with self.datasets[timespan] as ccf_ds:
                data = ccf_ds.auxiliary_data.list()
                pairs = {p for p in data if p != PROGRESS_DATATYPE}
                visitor(pairs, timespan)

    def _get_channel_pair(self, src_chan: ChannelType, rec_chan: ChannelType) -> str:
        return f"{src_chan}_{rec_chan}"

    def _get_station_pair(self, src_sta: Station, rec_sta: Station) -> str:
        return f"{src_sta}_{rec_sta}"


class ASDFStackStore(StackStore):
    def __init__(self, directory: str, mode: str = "a"):
        super().__init__()
        self.datasets = ASDFDirectory(directory, mode, _filename_from_stations, _parse_station_pair_h5file)

    # TODO: Do we want to support storing stacks from different timespans in the same store?
    def append(self, timespan: DateTimeRange, src: Station, rec: Station, stacks: List[Stack]):
        for stack in stacks:
            self.datasets.add_aux_data((src, rec), stack.parameters, stack.name, stack.component, stack.data)

    def get_station_pairs(self) -> List[Tuple[Station, Station]]:
        return self.datasets.get_keys()

    def get_timespans(self, src: Station, rec: Station) -> List[DateTimeRange]:
        # TODO: Do we want to support storing stacks from different timespans in the same store?
        return []

    def read(self, timespan: DateTimeRange, src: Station, rec: Station) -> List[Stack]:
        stacks = []
        with self.datasets[(src, rec)] as ds:
            for name in ds.auxiliary_data.list():
                for component in ds.auxiliary_data[name].list():
                    stream = ds.auxiliary_data[name][component]
                    stacks.append(Stack(component, name, stream.parameters, stream.data[:]))
        return stacks


def _get_dataset(filename: str, mode: str) -> Optional[pyasdf.ASDFDataSet]:
    logger.debug(f"Opening {filename}")
    if os.path.exists(filename):
        return pyasdf.ASDFDataSet(filename, mode=mode, mpi=False, compression=None)
    elif mode == "r":
        return None
    else:  # create new file
        Path(filename).parent.mkdir(exist_ok=True, parents=True)
        return pyasdf.ASDFDataSet(filename, mode=mode, mpi=False, compression=None)


def _filename_from_stations(pair: Tuple[Station, Station]) -> str:
    return f"{pair[0]}/{pair[0]}_{pair[1]}.h5"


def _filename_from_timespan(timespan: DateTimeRange) -> str:
    return f"{timespan_str(timespan)}.h5"


def _parse_station_pair_h5file(path: str) -> Tuple[Station, Station]:
    pair = Path(path).stem
    return parse_station_pair(pair)


def _parse_channel_path(path: str) -> Tuple[ChannelType, ChannelType]:
    parts = path.split("_")
    if len(parts) == 2:  # e.g. bhn_bhn
        return tuple(map(ChannelType, parts))
    elif len(parts) == 3:  # when we have one location code
        if parts[1].isdigit():  # e.g. bhn_00_bhn
            return tuple(map(ChannelType, ["_".join(parts[0:2]), parts[2]]))
        else:  # e.g. bhn_bhn_00
            return tuple(map(ChannelType, [parts[0], "_".join(parts[1:3])]))
    elif len(parts) == 4:  # when we have two location codes: e.g. bhn_00_bhn_00
        return tuple(map(ChannelType, ["_".join(parts[0:2]), "_".join(parts[2:4])]))
    else:
        raise ValueError(f"Invalid channel path {path}")
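
The filename conventions and the ASDFDirectory round-trip above determine how the stores are used together. A minimal usage sketch, assuming a directory of timespan-named H5 files; the /data/raw and /data/ccf paths and the loop body are hypothetical, only the classes and methods defined above are real:

# Hypothetical walkthrough of the stores defined above; paths and data are examples.
from noisepy.seis.io.asdfstore import ASDFCCStore, ASDFRawDataStore

raw_store = ASDFRawDataStore("/data/raw")  # expects files like 2019_02_01_00_00_00T2019_02_02_00_00_00.h5
cc_store = ASDFCCStore("/data/ccf")        # append mode; directory created if missing

for timespan in raw_store.get_timespans():
    for channel in raw_store.get_channels(timespan):
        channel_data = raw_store.read_data(timespan, channel)
        # ... cross-correlate channel pairs, then persist the results with:
        # cc_store.append(timespan, src_station, rec_station, ccs)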

src/noisepy/seis/io/constants.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
STATION_FILE = "station.csv"
DATE_FORMAT_HELP = "%%Y_%%m_%%d_%%H_%%M_%%S"
DATE_FORMAT = "%Y_%m_%d_%H_%M_%S"
DONE_PATH = "done"
PROGRESS_DATATYPE = "Progress"
CONFIG_FILE = "config.yaml"
AWS_BATCH_JOB_ARRAY_INDEX = "AWS_BATCH_JOB_ARRAY_INDEX"
AWS_BATCH_JOB_ID = "AWS_BATCH_JOB_ID"
AWS_EXECUTION_ENV = "AWS_EXECUTION_ENV"
NO_DATA_MSG = "Abort! no available seismic files for FFT"
NO_CCF_DATA_MSG = "Abort! no available CCF data for stacking"
WILD_CARD = "*"
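
DATE_FORMAT matches the timespan filename convention that ASDFRawDataStore documents above. A small sketch of the round-trip; the filename is an example, and the parsing one-liner is illustrative rather than the library's parse_timespan from .stores:

from datetime import datetime

DATE_FORMAT = "%Y_%m_%d_%H_%M_%S"  # as defined in constants.py

# Build a timespan filename like the ones the ASDF stores expect.
start, end = datetime(2019, 2, 1), datetime(2019, 2, 2)
filename = f"{start.strftime(DATE_FORMAT)}T{end.strftime(DATE_FORMAT)}.h5"
# filename == "2019_02_01_00_00_00T2019_02_02_00_00_00.h5"

# Recover the datetimes from the stem (illustrative only).
start2, end2 = (datetime.strptime(part, DATE_FORMAT) for part in filename[:-3].split("T"))
assert (start2, end2) == (start, end)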
