import glob
import logging
import os
from pathlib import Path
from typing import Callable, Dict, Generic, List, Optional, Set, Tuple, TypeVar

import numpy as np
import obspy
import pyasdf
from datetimerange import DateTimeRange

from . import noise_module
from .constants import PROGRESS_DATATYPE
from .datatypes import Channel, ChannelData, ChannelType, CrossCorrelation, Stack, Station
from .stores import (
    CrossCorrelationDataStore,
    RawDataStore,
    StackStore,
    parse_station_pair,
    parse_timespan,
    timespan_str,
)

logger = logging.getLogger(__name__)

T = TypeVar("T")


class ASDFDirectory(Generic[T]):
    """
    Utility class used by ASDFRawDataStore and ASDFCCStore to provide easy access
    to a set of ASDF files in a directory that follow a specific naming convention.
    The files are named after a generic type T (e.g. a timestamp or a pair of stations),
    so the constructor takes two functions to map between the type T and the
    corresponding file name.
    """

    def __init__(
        self, directory: str, mode: str, get_filename: Callable[[T], str], parse_filename: Callable[[str], T]
    ) -> None:
        if mode not in ["a", "r"]:
            raise ValueError(f"Invalid mode {mode}. Must be 'a' or 'r'")

        self.directory = directory
        self.mode = mode
        self.get_filename = get_filename
        self.parse_filename = parse_filename

    def __getitem__(self, key: T) -> pyasdf.ASDFDataSet:
        return self._get_dataset(key, self.mode)

    def _get_dataset(self, key: T, mode: str) -> pyasdf.ASDFDataSet:
        file_name = self.get_filename(key)
        file_path = os.path.join(self.directory, file_name)
        return _get_dataset(file_path, mode)

    def get_keys(self) -> List[T]:
        h5files = sorted(glob.glob(os.path.join(self.directory, "**/*.h5"), recursive=True))
        return list(map(self.parse_filename, h5files))

    def contains(self, key: T, data_type: str, path: Optional[str] = None) -> bool:
        # contains is always a read
        ccf_ds = self._get_dataset(key, "r")

        if not ccf_ds:
            return False
        with ccf_ds:
            # source-receiver pair
            exists = data_type in ccf_ds.auxiliary_data
            if path is not None and exists:
                return path in ccf_ds.auxiliary_data[data_type]
            return exists

    def add_aux_data(self, key: T, params: Dict, data_type: str, path: str, data: np.ndarray):
        with self[key] as ccf_ds:
            ccf_ds.add_auxiliary_data(data=data, data_type=data_type, path=path, parameters=params)


class ASDFRawDataStore(RawDataStore):
    """
    A data store implementation to read from a directory of ASDF files. Each file is considered
    a timespan with the naming convention: 2019_02_01_00_00_00T2019_02_02_00_00_00.h5
    """
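
    # A read-path sketch (the directory path is hypothetical):
    #   store = ASDFRawDataStore("/data/raw")
    #   for timespan in store.get_timespans():
    #       for channel in store.get_channels(timespan):
    #           channel_data = store.read_data(timespan, channel)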

    def __init__(self, directory: str, mode: str = "r"):
        super().__init__()
        self.datasets = ASDFDirectory(directory, mode, _filename_from_timespan, parse_timespan)

    def get_channels(self, timespan: DateTimeRange) -> List[Channel]:
        with self.datasets[timespan] as ds:
            # _create_station returns None when the StationXML is missing, so filter after mapping
            maybe_stations = (self._create_station(timespan, sta) for sta in ds.waveforms.list())
            stations = [sta for sta in maybe_stations if sta is not None]
            channels = [
                Channel(ChannelType(tag), sta)
                for sta in stations
                for tag in ds.waveforms[str(sta)].get_waveform_tags()
            ]
            return channels

    def get_timespans(self) -> List[DateTimeRange]:
        return self.datasets.get_keys()

    def read_data(self, timespan: DateTimeRange, chan: Channel) -> ChannelData:
        with self.datasets[timespan] as ds:
            stream = ds.waveforms[str(chan.station)][str(chan.type)]
            return ChannelData(stream)

    def get_inventory(self, timespan: DateTimeRange, station: Station) -> obspy.Inventory:
        with self.datasets[timespan] as ds:
            return ds.waveforms[str(station)]["StationXML"]

    def _create_station(self, timespan: DateTimeRange, name: str) -> Optional[Station]:
        # What should we do if there's no StationXML?
        try:
            with self.datasets[timespan] as ds:
                inventory = ds.waveforms[name]["StationXML"]
                sta, net, lon, lat, elv, loc = noise_module.sta_info_from_inv(inventory)
                return Station(net, sta, lat, lon, elv, loc)
        except Exception as e:
            logger.warning(f"Missing StationXML for station {name}. {e}")
            return None


class ASDFCCStore(CrossCorrelationDataStore):
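    """
    A CrossCorrelationDataStore that writes cross-correlations to a directory of ASDF files,
    one file per timespan. Within each file, data is stored as auxiliary data keyed by
    station pair (e.g. CI.ARV_CI.BAK) and channel pair (e.g. bhn_bhn).
    """
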
    def __init__(self, directory: str, mode: str = "a") -> None:
        super().__init__()
        Path(directory).mkdir(exist_ok=True)
        self.datasets = ASDFDirectory(directory, mode, _filename_from_timespan, parse_timespan)

    # CrossCorrelationDataStore implementation
    def contains(self, src: Station, rec: Station, timespan: DateTimeRange) -> bool:
        station_pair = self._get_station_pair(src, rec)
        contains = self.datasets.contains(timespan, station_pair)
        if contains:
            logger.info(f"Cross-correlation {station_pair} already exists")
        return contains

    def append(
        self,
        timespan: DateTimeRange,
        src: Station,
        rec: Station,
        ccs: List[CrossCorrelation],
    ):
        # source-receiver pair: e.g. CI.ARV_CI.BAK
        station_pair = self._get_station_pair(src, rec)
        for cc in ccs:
            # channels, e.g. bhn_bhn
            channels = self._get_channel_pair(cc.src, cc.rec)
            self.datasets.add_aux_data(timespan, cc.parameters, station_pair, channels, cc.data)

    def get_timespans(self, src: Station, rec: Station) -> List[DateTimeRange]:
        timespans = {}
        pair_key = self._get_station_pair(src, rec)

        def visit(pairs, ts):
            if pair_key in pairs:
                timespans[str(ts)] = ts

        self._visit_pairs(visit)
        return sorted(timespans.values(), key=lambda t: str(t))

    def get_station_pairs(self) -> List[Tuple[Station, Station]]:
        pairs_all = set()
        self._visit_pairs(lambda pairs, _: pairs_all.update(parse_station_pair(p) for p in pairs))
        return list(pairs_all)

    def read(self, timespan: DateTimeRange, src_sta: Station, rec_sta: Station) -> List[CrossCorrelation]:
        with self.datasets[timespan] as ccf_ds:
            dtype = self._get_station_pair(src_sta, rec_sta)
            if dtype not in ccf_ds.auxiliary_data:
                logger.warning(f"No data available for {timespan}/{dtype}")
                return []
            ccs = []
            ch_pair_paths = ccf_ds.auxiliary_data[dtype].list()
            for ch_pair_path in ch_pair_paths:
                src_ch, rec_ch = _parse_channel_path(ch_pair_path)
                stream = ccf_ds.auxiliary_data[dtype][ch_pair_path]
                ccs.append(CrossCorrelation(src_ch, rec_ch, stream.parameters, stream.data[:]))
            return ccs

    def _visit_pairs(self, visitor: Callable[[Set[Tuple[str, str]], DateTimeRange], None]):
        all_timespans = self.datasets.get_keys()
        for timespan in all_timespans:
            with self.datasets[timespan] as ccf_ds:
                data = ccf_ds.auxiliary_data.list()
                pairs = {p for p in data if p != PROGRESS_DATATYPE}
                visitor(pairs, timespan)

    def _get_channel_pair(self, src_chan: ChannelType, rec_chan: ChannelType) -> str:
        return f"{src_chan}_{rec_chan}"

    def _get_station_pair(self, src_sta: Station, rec_sta: Station) -> str:
        return f"{src_sta}_{rec_sta}"


class ASDFStackStore(StackStore):
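    """
    A StackStore that writes one ASDF file per station pair, nested as {src}/{src}_{rec}.h5
    (see _filename_from_stations below), with one auxiliary data type per stack name and
    one path per component.
    """
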
    def __init__(self, directory: str, mode: str = "a"):
        super().__init__()
        self.datasets = ASDFDirectory(directory, mode, _filename_from_stations, _parse_station_pair_h5file)

    # TODO: Do we want to support storing stacks from different timespans in the same store?
    def append(self, timespan: DateTimeRange, src: Station, rec: Station, stacks: List[Stack]):
        for stack in stacks:
            self.datasets.add_aux_data((src, rec), stack.parameters, stack.name, stack.component, stack.data)

    def get_station_pairs(self) -> List[Tuple[Station, Station]]:
        return self.datasets.get_keys()

    def get_timespans(self, src: Station, rec: Station) -> List[DateTimeRange]:
        # TODO: Do we want to support storing stacks from different timespans in the same store?
        return []

    def read(self, timespan: DateTimeRange, src: Station, rec: Station) -> List[Stack]:
        stacks = []
        with self.datasets[(src, rec)] as ds:
            for name in ds.auxiliary_data.list():
                for component in ds.auxiliary_data[name].list():
                    stream = ds.auxiliary_data[name][component]
                    stacks.append(Stack(component, name, stream.parameters, stream.data[:]))
        return stacks


def _get_dataset(filename: str, mode: str) -> pyasdf.ASDFDataSet:
    logger.debug(f"Opening {filename}")
    if os.path.exists(filename):
        return pyasdf.ASDFDataSet(filename, mode=mode, mpi=False, compression=None)
    elif mode == "r":
        return None
    else:  # create new file
        Path(filename).parent.mkdir(exist_ok=True, parents=True)
        return pyasdf.ASDFDataSet(filename, mode=mode, mpi=False, compression=None)


def _filename_from_stations(pair: Tuple[Station, Station]) -> str:
    return f"{pair[0]}/{pair[0]}_{pair[1]}.h5"


def _filename_from_timespan(timespan: DateTimeRange) -> str:
    return f"{timespan_str(timespan)}.h5"


def _parse_station_pair_h5file(path: str) -> Tuple[Station, Station]:
    pair = Path(path).stem
    return parse_station_pair(pair)


def _parse_channel_path(path: str) -> Tuple[ChannelType, ChannelType]:
    parts = path.split("_")
    if len(parts) == 2:  # e.g. bhn_bhn
        return tuple(map(ChannelType, parts))
    elif len(parts) == 3:  # when we have one location code
        if parts[1].isdigit():  # e.g. bhn_00_bhn
            return tuple(map(ChannelType, ["_".join(parts[0:2]), parts[2]]))
        else:  # e.g. bhn_bhn_00
            return tuple(map(ChannelType, [parts[0], "_".join(parts[1:3])]))
    elif len(parts) == 4:  # when we have two location codes: e.g. bhn_00_bhn_00
        return tuple(map(ChannelType, ["_".join(parts[0:2]), "_".join(parts[2:4])]))
    else:
        raise ValueError(f"Invalid channel path {path}")
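

# Illustrative results of _parse_channel_path, matching the inline examples above:
#   "bhn_bhn"       -> (ChannelType("bhn"), ChannelType("bhn"))
#   "bhn_00_bhn"    -> (ChannelType("bhn_00"), ChannelType("bhn"))
#   "bhn_bhn_00"    -> (ChannelType("bhn"), ChannelType("bhn_00"))
#   "bhn_00_bhn_00" -> (ChannelType("bhn_00"), ChannelType("bhn_00"))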