-import logging
-import pandas as pd
-import os
-import numpy as np
-import datetime
-import json
 import pkg_resources
-from datetime import timedelta
 
-from fact.io import to_h5py
-from fact.instrument import camera_distance_mm_to_deg
 
-from . import datacheck_conditions as dcc
-from .datacheck import get_runs, get_drs_runs
-from .hdf_utils import rename_columns
-
-logger = logging.getLogger(__name__)
-
-
-def add_theta_deg_columns(df):
-    for i in range(6):
-        incol = 'theta' if i == 0 else 'theta_off_{}'.format(i)
-        outcol = 'theta_deg' if i == 0 else 'theta_deg_off_{}'.format(i)
-        if incol in df.columns:
-            df[outcol] = camera_distance_mm_to_deg(df[incol])
-
-
-
-def build_path(row, path_to_data, extension):
-    """
-    Build the path to a FACT data file given the night, extension and filename.
-    """
-    night = str(row.night)
-    year = night[0:4]
-    month = night[4:6]
-    day = night[6:8]
-    res = os.path.join(path_to_data, year, month, day, row.filename + extension)
-    return res
-
-def test_drs_path(df, key):
-    """
-    Test whether the drs paths given in the key column exist.
-    """
-    mask = df[key].apply(os.path.exists)
-    df['drs_file_exists'] = mask
-
-    return df
-
-
-def test_data_path(df, key):
-    """
-    Test whether the data paths given in the key column exist. Both possible
-    file extensions [.fz, .gz] are tried and the path is corrected if necessary.
-    """
-    mask = df[key].apply(os.path.exists)
-    df['data_file_exists'] = mask
-    df.loc[~mask, key] = df.loc[~mask, key].str.replace('.fz', '.gz')
-    df.loc[~mask, 'data_file_exists'] = df.loc[~mask, key].apply(os.path.exists)
-
-    return df
-
-def build_filename(night, run_id):
-    return night.astype(str) + '_' + run_id.map('{:03d}'.format)
-
-
-def mc_drs_file():
-    '''
-    Return the path to the drs file used for Monte Carlo files.
-    '''
-    drs_path = pkg_resources.resource_filename(
-        __name__, 'resources/mc_drs_constants.drs.fits.gz'
-    )
-    return drs_path
-
-
-def ensure_output(output_path):
-    '''
-    Make sure the output file does not exist yet and
-    create the directory for the new output file if necessary.
-    '''
-    if os.path.exists(output_path):
-        raise FileExistsError('The output file already exists.')
-    directory = os.path.dirname(output_path)
-    if directory:
-        os.makedirs(directory, exist_ok=True)
-
-
-def collect_output(job_outputs, output_path, df_started_runs=None, **kwargs):
-    '''
-    Collect the output from the list of job_outputs and merge it into a DataFrame.
-    The DataFrame is then written to the file specified by output_path.
-    The DataFrame df_started_runs is joined with the job outputs to get the real ontime.
-    '''
-    logger.info("Concatenating results from each job and writing result to {}".format(output_path))
-    frames = [f for f in job_outputs if isinstance(f, pd.DataFrame)]
-
-    if len(frames) != len(job_outputs):
-        logger.warning("Only {} jobs returned a proper DataFrame.".format(len(frames)))
-
-    if len(frames) == 0:
-        return
-
-    df_returned_data = pd.concat(frames, ignore_index=True)
-    logger.info("There are a total of {} events in the result".format(len(df_returned_data)))
-
-    if len(df_returned_data) == 0:
-        logger.info("No events were returned, something must have gone wrong.")
-        return
-
-    if df_started_runs is not None:
-        logger.info("Number of started runs: {}".format(len(df_started_runs)))
-
-        if set(['night', 'run_id']).issubset(df_started_runs.columns) and set(['night', 'run_id']).issubset(df_returned_data.columns):
-            df_merged = pd.merge(df_started_runs, df_returned_data, on=['night', 'run_id'], how='outer', indicator=True)
-        elif set(['data_path', 'bunch_index']).issubset(df_started_runs.columns) and set(['data_path', 'bunch_index']).issubset(df_returned_data.columns):
-            df_merged = pd.merge(df_started_runs, df_returned_data, on=['data_path', 'bunch_index'], how='outer', indicator=True)
-        else:
-            df_merged = df_started_runs
-            df_merged["_merge"] = "both"
-
-        df_merged["failed"] = (df_merged["_merge"] != "both")
-        df_merged.drop("_merge", axis=1, inplace=True)
-
-        df_successful = df_merged.query("failed == False")
-        df_failed = df_merged.query("failed == True")
-
-        if 'ontime' in df_successful.columns:
-            total_on_time_in_seconds = df_successful.ontime.sum()
-            logger.info("Effective on time: {}. That's {} hours.".format(datetime.timedelta(seconds=total_on_time_in_seconds), total_on_time_in_seconds / 3600))
-
-            df_returned_data["total_on_time_in_seconds"] = total_on_time_in_seconds
-
-        logger.info("Number of failed runs: {}".format(len(df_failed)))
-        if len(df_failed) > 0:
-            name, extension = os.path.splitext(output_path)
-            failed_file_list_path = name + "_failed_runs.csv"
-
-            logger.info("Writing list of failed runs to: {}".format(failed_file_list_path))
-            df_failed.to_csv(failed_file_list_path, columns=df_started_runs.columns, **kwargs)
-
-
-    df_returned_data.columns = rename_columns(df_returned_data.columns)
-    add_theta_deg_columns(df_returned_data)
-
-    name, extension = os.path.splitext(output_path)
-    if extension not in ['.json', '.h5', '.hdf5', '.hdf', '.csv']:
-        logger.warning("Did not recognize file extension {}. Writing to JSON".format(extension))
-        df_returned_data.to_json(output_path, orient='records', date_format='epoch', **kwargs)
-    elif extension == '.json':
-        logger.info("Writing JSON to {}".format(output_path))
-        df_returned_data.to_json(output_path, orient='records', date_format='epoch', **kwargs)
-    elif extension in ['.h5', '.hdf', '.hdf5']:
-        logger.info("Writing HDF5 to {}".format(output_path))
-        to_h5py(df_returned_data, output_path, key='events', mode='w', **kwargs)
-    elif extension == '.csv':
-        logger.info("Writing CSV to {}".format(output_path))
-        df_returned_data.to_csv(output_path, **kwargs)
-
-
-def load(
-    earliest_night,
-    latest_night,
-    path_to_data,
-    factdb,
-    source_name="Crab",
-    timedelta_in_minutes=30,
-    data_conditions=dcc.conditions["standard"]
-):
-    '''
-    Given the earliest and latest night to fetch as a factnight string (as in 20141024),
-    this method returns a DataFrame containing the paths to data files
-    and their corresponding .drs files.
-    The maximum time difference between the data and drs files is
-    specified by the timedelta_in_minutes parameter.
-
-    Returns None if no files can be found.
-    '''
-
-    logger.debug("Table names in DB: ")
-    logger.debug(factdb.table_names())
-
-    if len(factdb.table_names()) > 0:
-        logger.info("Connected to Database.")
-
-    logger.info("Reading data from {} to {} for source: {}".format(
-        earliest_night, latest_night, source_name
-    ))
-
-    conditions = [
-        'fNight >= {}'.format(earliest_night),
-        'fNight <= {}'.format(latest_night),
-        'fSourceName = "{}"'.format(source_name),
-    ]
-    conditions.extend(data_conditions)
-    logger.info('Querying data with conditions: {}'.format(' AND '.join(conditions)))
-    data = get_runs(
-        factdb,
-        conditions=conditions,
-        columns=(
-            'fNight AS night', 'fRunID AS run_id',
-            'fRunStart', 'fRunStop',
-            'fOnTime', 'fEffectiveOn',
-        ),
-    )
-
-    # now let's get all drs runs
-    drs_conditions = [
-        'fNight >= {}'.format(earliest_night),
-        'fNight <= {}'.format(latest_night),
-    ]
-
-    drs_data = get_drs_runs(
-        factdb, conditions=drs_conditions,
-        columns=('fNight AS night', 'fRunID AS run_id', 'fRunStart', 'fRunStop'),
-    )
-
-    if len(data) == 0 or len(drs_data) == 0:
-        logger.error('No data or drs files found that adhere to the specified query.')
-        return None
-
-    logger.info("Got {} data runs and {} drs runs".format(len(data), len(drs_data)))
-
-    # the timestamp should be unique for each observation.
-    # No two observations start at the same time
-    data.set_index("fRunStart", inplace=True)
-    drs_data.set_index("fRunStart", inplace=True)
-    # sorting data by their timestamp.
-    data = data.sort_index()
-    drs_data = drs_data.sort_index()
-
-    # write filenames
-    data["filename"] = build_filename(data.night, data.run_id)
-    drs_data["filename"] = build_filename(drs_data.night, drs_data.run_id)
-
-    # write path
-    data["path"] = data.apply(build_path, axis=1, path_to_data=path_to_data, extension='.fits.fz')
-    drs_data["path"] = drs_data.apply(build_path, axis=1, path_to_data=path_to_data, extension='.drs.fits.gz')
-
-    # remove all non-existing drs files
-    drs_data = test_drs_path(drs_data, "path")
-    drs_data = drs_data[drs_data['drs_file_exists']]
-
-    # reindex the drs table using the index of the data table.
-    # There are always more data runs than drs runs in the db,
-    # hence missing rows have to be filled either forward or backwards.
-    earlier_drs_entries = drs_data.reindex(data.index, method="ffill")
-    earlier_drs_entries = earlier_drs_entries.fillna(axis="index", method="ffill")
-    later_drs_entries = drs_data.reindex(data.index, method="backfill")
-    later_drs_entries = later_drs_entries.fillna(axis="index", method="ffill")
-
-    # when backfilling the drs observations the last rows might be invalid and contain nans.
-    # We cannot drop them because the tables have to have the same length.
-    # In that case simply fill them up.
-    earlier_drs_entries["deltaT"] = np.abs(earlier_drs_entries.fRunStop - data.fRunStop)
-    later_drs_entries["deltaT"] = np.abs(later_drs_entries.fRunStop - data.fRunStop).fillna(axis='index', method='ffill')
-    d_later = later_drs_entries[later_drs_entries.deltaT < earlier_drs_entries.deltaT]
-    d_early = earlier_drs_entries[later_drs_entries.deltaT >= earlier_drs_entries.deltaT]
-
-    closest_drs_entries = pd.concat([d_early, d_later])
-    closest_drs_entries = closest_drs_entries[closest_drs_entries.deltaT < timedelta(minutes=timedelta_in_minutes)]
-
-    mapping = pd.concat([
-        closest_drs_entries.filename,
-        closest_drs_entries.path,
-        data.path,
-        closest_drs_entries.deltaT,
-        data.fOnTime, data.fEffectiveOn,
-        data.night,
-        data.run_id,
-    ], axis=1, keys=[
-        "filename",
-        "drs_path",
-        "data_path",
-        "delta_t",
-        "ontime",
-        "effective_on",
-        "night",
-        "run_id",
-    ])
-
-    mapping = mapping.dropna(how='any')
-
-    logger.info("Fetched {} data runs and approx {} drs entries from database where time delta is less than {} minutes".format(len(mapping), mapping['drs_path'].nunique(), timedelta_in_minutes))
-    # effective_ontime = (mapping['ontime'] * mapping['effective_on']).sum()
-    # logger.info("Effective on time: {}. Thats {} hours.".format(datetime.timedelta(seconds=effective_ontime), effective_ontime/3600))
-
-    return mapping
-
-
-def ft_json_to_df(json_path):
-    with open(json_path, 'r') as text:
-        try:
-            logger.info("Reading fact-tools output.")
-            y = json.loads(text.read())
-            df_out = pd.DataFrame(y)
-            logger.info("Returning data frame with {} entries".format(len(df_out)))
-            return df_out
-        except ValueError:
-            logger.exception("Fact-tools output could not be read.")
-            return "error reading json"
-        except Exception:
-            logger.exception("Fact-tools output could not be gathered.")
-            return "error gathering output"
+mc_drs_file = pkg_resources.resource_filename(
+    __name__, 'resources/mc_drs_constants.drs.fits.gz'
+)
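
With this commit, `mc_drs_file` changes from a helper function to a module-level string constant, so the `pkg_resources` lookup now runs once at import time and call sites drop the parentheses. A minimal before/after sketch of a hypothetical call site (the `erna` import path is an assumption, not shown in this diff):

```python
# Hypothetical call site; the module path 'erna' is an assumption.
from erna import mc_drs_file

# before this commit, mc_drs_file was a function:
#     drs_path = mc_drs_file()

# after this commit, it is already the resolved path string:
drs_path = mc_drs_file
print(drs_path)  # .../resources/mc_drs_constants.drs.fits.gz
```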
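For reference, the removed `build_filename` and `build_path` helpers encoded FACT's night-based directory layout. A minimal standalone sketch of the convention they implemented (the `/fact/raw` root is an illustrative assumption):

```python
import os

night, run_id = 20141024, 3
# build_filename: '<night>_<zero-padded run id>'
filename = '{}_{:03d}'.format(night, run_id)  # '20141024_003'

# build_path: '<root>/<year>/<month>/<day>/<filename><extension>'
night_str = str(night)
path = os.path.join(
    '/fact/raw',                                     # assumed data root
    night_str[0:4], night_str[4:6], night_str[6:8],  # '2014', '10', '24'
    filename + '.fits.fz',
)
print(path)  # /fact/raw/2014/10/24/20141024_003.fits.fz
```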