Commit 0900f9a

Merge pull request #80 from fact-project/dask
WIP: Dask, fixes #64
2 parents: 43b090d + 62c34a1

32 files changed: +1074 -1414 lines changed

README.md

Lines changed: 3 additions & 29 deletions
````diff
@@ -1,4 +1,5 @@
 # Easy RuN Access (ERNA)
+
 A collection of tools to handle FACT data and to execute jobs on a SGE/TORQUE cluster.
 
 ![http://www.itbusiness.ca/wp-content/uploads/2012/10/Old-women-on-laptop.jpg](http://www.itbusiness.ca/wp-content/uploads/2012/10/Old-women-on-laptop.jpg)
@@ -10,35 +11,8 @@ Dates are given in the usual FACT convention: YYYYMMDD.
 
 ## Requirements
 - FACT-Tools
-- Java 1.8+
-- Python 3.5+ (srsly. 3.5 please)
-
-Install my fork of pygridmap
-
-pip install https://github.com/mackaiver/gridmap/archive/master.tar.gz
-
-Then install this via
-
-pip install https://github.com/fact-project/erna/archive/master.tar.gz
-
-## Config for using gridmap on PhiDo
-
-You need to put this into your `.bashrc`, so erna is configured correctly
-
-```bash
-export ERROR_MAIL_RECIPIENT=<your email address>
-export DRMAA_LIBRARY_PATH="/sl6/sw/projects/fact/pbs-drmaa-1.0.19/pbs_drmaa/libs/libdrmaa.so"
-export DEFAULT_TEMP_DIR="/local/$USER/$PBS_JOBID"
-export USE_MEM_FREE=TRUE
-export SMTP_SERVER="unimail.tu-dortmund.de"
-export ERROR_MAIL_RECIPIENT="[email protected]"
-export ERROR_MAIL_SENDER="[email protected]"
-export SEND_ERROR_MAIL=TRUE
-```
-
-## How to for using gridmap on LiDo2
-A brief example of how to setup LiDo2 for erna can be found at:
-https://github.com/fact-project/erna/wiki/How-to-setup-Lido2-for-erna
+- Java 1.8 (module add java on lido3)
+- Python 3.5+
 
 
 ## execute_data_processing.py
````

erna/__init__.py

Lines changed: 3 additions & 300 deletions
```diff
@@ -1,303 +1,6 @@
-import logging
-import pandas as pd
-import os
-import numpy as np
-import datetime
-import json
 import pkg_resources
-from datetime import timedelta
 
-from fact.io import to_h5py
-from fact.instrument import camera_distance_mm_to_deg
 
-from . import datacheck_conditions as dcc
-from .datacheck import get_runs, get_drs_runs
-from .hdf_utils import rename_columns
-
-logger = logging.getLogger(__name__)
-
-
-def add_theta_deg_columns(df):
-    for i in range(6):
-        incol = 'theta' if i == 0 else 'theta_off_{}'.format(i)
-        outcol = 'theta_deg' if i == 0 else 'theta_deg_off_{}'.format(i)
-        if incol in df.columns:
-            df[outcol] = camera_distance_mm_to_deg(df[incol])
-
-
-
-def build_path(row, path_to_data, extension):
-    """
-    builds a path to the fact data given the night, extension and filename
-    """
-    night = str(row.night)
-    year = night[0:4]
-    month = night[4:6]
-    day = night[6:8]
-    res = os.path.join(path_to_data, year, month, day, row.filename + extension)
-    return res
-
-def test_drs_path(df, key):
-    """
-    Test if the given drs paths in the key are present
-    """
-    mask = df[key].apply(os.path.exists)
-    df['drs_file_exists'] = mask
-
-    return df
-
-
-def test_data_path(df, key):
-    """
-    Test the given data paths in key if they exists. It tests for
-    both possible fileextensions [.fz, .gz] and corrects if necessary.
-    """
-    mask = df[key].apply(os.path.exists)
-    df['data_file_exists'] = mask
-    df.loc[~mask, key] = df.loc[~mask, key].str.replace('.fz', '.gz')
-    df.loc[~mask, 'data_file_exists'] = df.loc[~mask, key].apply(os.path.exists)
-
-    return df
-
-def build_filename(night, run_id):
-    return night.astype(str) + '_' + run_id.map('{:03d}'.format)
-
-
-def mc_drs_file():
-    '''
-    return path to the drs file used for monte carlo files
-    '''
-    drs_path = pkg_resources.resource_filename(
-        __name__, 'resources/mc_drs_constants.drs.fits.gz'
-    )
-    return drs_path
-
-
-def ensure_output(output_path):
-    '''
-    Make sure the output file does not exist yet.
-    Create directorie to new output file if necessary
-    '''
-    if os.path.exists(output_path):
-        raise FileExistsError('The output file already exists.')
-    directory = os.path.dirname(output_path)
-    if directory:
-        os.makedirs(directory, exist_ok=True)
-
-
-def collect_output(job_outputs, output_path, df_started_runs=None, **kwargs):
-    '''
-    Collects the output from the list of job_outputs and merges them into a dataframe.
-    The Dataframe will then be written to a file as specified by the output_path.
-    The datatframe df_started_runs is joined with the job outputs to get the real ontime.
-    '''
-    logger.info("Concatenating results from each job and writing result to {}".format(output_path))
-    frames = [f for f in job_outputs if isinstance(f, type(pd.DataFrame()))]
-
-    if len(frames) != len(job_outputs):
-        logger.warn("Only {} jobs returned a proper DataFrame.".format(len(frames)))
-
-    if len(frames) == 0:
-        return
-
-    df_returned_data = pd.concat(frames, ignore_index=True)
-    logger.info("There are a total of {} events in the result".format(len(df_returned_data)))
-
-    if len(df_returned_data)==0:
-        logger.info("No events in the result were returned, something must have gone bad, better go fix it.")
-        return
-
-    logger.info("Number of started runs {}".format(len(df_started_runs)))
-
-    if df_started_runs is not None:
-        if (set(['night','run_id']).issubset(df_started_runs.columns) and set(['night','run_id']).issubset(df_returned_data.columns)):
-            df_merged = pd.merge(df_started_runs, df_returned_data, on=['night','run_id'], how='outer', indicator=True)
-        elif (set(['data_path','bunch_index']).issubset(df_started_runs.columns) and set(['data_path','bunch_index']).issubset(df_returned_data.columns)):
-            df_merged = pd.merge(df_started_runs, df_returned_data, on=['data_path','bunch_index'], how='outer', indicator=True)
-        else:
-            df_merged = df_started_runs
-            df_merged["_merge"] = "both"
-
-        df_merged["failed"] = (df_merged["_merge"] != "both")
-        df_merged.drop("_merge", axis=1, inplace=True)
-
-        df_successfull = df_merged.query("failed == False")
-        df_failed = df_merged.query("failed == True")
-
-        if 'ontime' in df_successfull.columns:
-            total_on_time_in_seconds = df_successfull.ontime.sum()
-            logger.info("Effective on time: {}. Thats {} hours.".format(datetime.timedelta(seconds=total_on_time_in_seconds), total_on_time_in_seconds/3600))
-
-            df_returned_data["total_on_time_in_seconds"] = total_on_time_in_seconds
-
-        logger.info("Number of failed runs: {}".format(len(df_failed)))
-        if len(df_failed) > 0:
-            name, extension = os.path.splitext(output_path)
-            failed_file_list_path = name+"_failed_runs.csv"
-
-            logger.info("Writing list of failed runs to: {}".format(failed_file_list_path))
-            df_failed.to_csv(failed_file_list_path, columns=df_started_runs.columns, **kwargs)
-
-
-    df_returned_data.columns = rename_columns(df_returned_data.columns)
-    add_theta_deg_columns(df_returned_data)
-
-    name, extension = os.path.splitext(output_path)
-    if extension not in ['.json', '.h5', '.hdf5', '.hdf' , '.csv']:
-        logger.warn("Did not recognize file extension {}. Writing to JSON".format(extension))
-        df_returned_data.to_json(output_path, orient='records', date_format='epoch', **kwargs )
-    elif extension == '.json':
-        logger.info("Writing JSON to {}".format(output_path))
-        df_returned_data.to_json(output_path, orient='records', date_format='epoch', **kwargs )
-    elif extension in ['.h5', '.hdf','.hdf5']:
-        logger.info("Writing HDF5 to {}".format(output_path))
-        to_h5py(df_returned_data, output_path, key='events', mode='w', **kwargs)
-    elif extension == '.csv':
-        logger.info("Writing CSV to {}".format(output_path))
-        df_returned_data.to_csv(output_path, **kwargs)
-
-
-def load(
-    earliest_night,
-    latest_night,
-    path_to_data,
-    factdb,
-    source_name="Crab",
-    timedelta_in_minutes=30,
-    data_conditions=dcc.conditions["standard"]
-):
-    '''
-    Given the earliest and latest night to fetch as a factnight string (as in 20141024)
-    this method returns a DataFrame containing the paths to data files
-    and their correpsonding .drs files.
-    The maximum time difference between the data and drs files is
-    specified by the timedelta_in_minutes parameter.
-
-    Returns None if no files can be found.
-    '''
-
-    logger.debug("Table names in DB: ")
-    logger.debug(factdb.table_names())
-
-    if len(factdb.table_names()) > 0:
-        logger.info("Connected to Database.")
-
-    logger.info("Reading Data from DataBase from {} to {} for source: {}".format(
-        earliest_night, latest_night, source_name
-    ))
-
-    conditions = [
-        'fNight >= {}'.format(earliest_night),
-        'fNight <= {}'.format(latest_night),
-        'fSourceName = "{}"'.format(source_name),
-    ]
-    conditions.extend(data_conditions)
-    logger.info('Querying data with conditions: {}'.format(' AND '.join(conditions)))
-    data = get_runs(
-        factdb,
-        conditions=conditions,
-        columns=(
-            'fNight AS night', 'fRunID AS run_id',
-            'fRunStart', 'fRunStop',
-            'fOnTime', 'fEffectiveOn',
-        ),
-    )
-
-    # now lets get all drs runs
-    drs_conditions = [
-        'fNight >= {}'.format(earliest_night),
-        'fNight <= {}'.format(latest_night),
-    ]
-
-    drs_data = get_drs_runs(
-        factdb, conditions=drs_conditions,
-        columns=('fNight AS night', 'fRunID AS run_id', 'fRunStart', 'fRunStop'),
-    )
-
-    if len(data) == 0 or len(drs_data) == 0:
-        logger.error('No data or drs files found that adhere to the specified query.')
-        return None
-
-    logger.info("Got {} data runs and {} runs".format(len(data), len(drs_data)))
-
-    # the timestamp should be unique for each observation.
-    # No two observations start at the same time
-    data.set_index("fRunStart", inplace=True)
-    drs_data.set_index("fRunStart", inplace=True)
-    # sorting data by their timestamp.
-    data = data.sort_index()
-    drs_data = drs_data.sort_index()
-
-    # write filenames
-    data["filename"] = build_filename(data.night, data.run_id)
-    drs_data["filename"] = build_filename(drs_data.night, drs_data.run_id)
-
-    # write path
-    data["path"] = data.apply(build_path, axis=1, path_to_data=path_to_data, extension='.fits.fz')
-    drs_data["path"] = drs_data.apply(build_path, axis=1, path_to_data=path_to_data, extension='.drs.fits.gz')
-
-    #remove all none existing drs files
-    drs_data = test_drs_path(drs_data, "path")
-    drs_data = drs_data[drs_data['drs_file_exists']]
-
-    # reindex the drs table using the index of the data table.
-    # There are always more data runs than drs run in the db.
-    # hence missing rows have to be filled either forward or backwards
-    earlier_drs_entries = drs_data.reindex(data.index, method="ffill")
-    earlier_drs_entries = earlier_drs_entries.fillna(axis="index", method="ffill")
-    later_drs_entries = drs_data.reindex(data.index, method="backfill")
-    later_drs_entries = later_drs_entries.fillna(axis="index", method="ffill")
-
-    # when backfilling the drs obeservations the last rows might be invalid and contain nans.
-    # We cannot drop them becasue the tables have to have the same length.
-    # in that case simply fill them up.
-    earlier_drs_entries["deltaT"] = np.abs(earlier_drs_entries.fRunStop - data.fRunStop)
-    later_drs_entries["deltaT"] = np.abs(later_drs_entries.fRunStop - data.fRunStop).fillna(axis='index', method='ffill')
-    d_later = later_drs_entries[later_drs_entries.deltaT < earlier_drs_entries.deltaT]
-    d_early = earlier_drs_entries[later_drs_entries.deltaT >= earlier_drs_entries.deltaT]
-
-    closest_drs_entries = pd.concat([d_early, d_later])
-    closest_drs_entries = closest_drs_entries[closest_drs_entries.deltaT < timedelta(minutes = timedelta_in_minutes)]
-
-    mapping = pd.concat([
-        closest_drs_entries.filename,
-        closest_drs_entries.path,
-        data.path,
-        closest_drs_entries.deltaT,
-        data.fOnTime, data.fEffectiveOn,
-        data.night,
-        data.run_id,
-    ], axis=1, keys=[
-        "filename",
-        "drs_path",
-        "data_path",
-        "delta_t",
-        "ontime",
-        "effective_on",
-        "night",
-        "run_id",
-    ])
-
-    mapping = mapping.dropna(how='any')
-
-    logger.info("Fetched {} data runs and approx {} drs entries from database where time delta is less than {} minutes".format(len(mapping), mapping['drs_path'].nunique(), timedelta_in_minutes))
-    # effective_ontime = (mapping['ontime'] * mapping['effective_on']).sum()
-    # logger.info("Effective on time: {}. Thats {} hours.".format(datetime.timedelta(seconds=effective_ontime), effective_ontime/3600))
-
-    return mapping
-
-
-def ft_json_to_df(json_path):
-    with open(json_path,'r') as text:
-        try:
-            logger.info("Reading fact-tools output.")
-            y=json.loads(text.read())
-            df_out=pd.DataFrame(y)
-            logger.info("Returning data frame with {} entries".format(len(df_out)))
-            return df_out
-        except ValueError:
-            logger.exception("Fact-tools output could not be read.")
-            return "error reading json"
-        except Exception:
-            logger.exception("Fact-tools output could not be gathered.")
-            return "error gathering output"
+mc_drs_file = pkg_resources.resource_filename(
+    __name__, 'resources/mc_drs_constants.drs.fits.gz'
+)
```
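The net effect of this file's change is that `mc_drs_file` is no longer a function returning the bundled DRS constants path but a module-level string resolved at import time. A minimal before/after usage sketch, assuming an installed erna package (everything around the import is illustrative, not code from the commit):

```python
# Illustrative sketch only; not code from this commit.
from erna import mc_drs_file

# Before this commit, mc_drs_file was a function and had to be called:
#     drs_path = mc_drs_file()
# After this commit it is already the resolved path string:
drs_path = mc_drs_file
print(drs_path)  # .../erna/resources/mc_drs_constants.drs.fits.gz
```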

erna/automatic_processing/__init__.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -2,4 +2,8 @@
 from .database_utils import fill_data_runs, fill_drs_runs
 
 
-__all__ = ['database', 'fill_data_runs', 'fill_drs_runs']
+__all__ = [
+    'database',
+    'fill_drs_runs',
+    'fill_data_runs',
+]
```
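For context, `__all__` only affects what a star import exposes; the reformatting above keeps the same three names. A small, purely illustrative check, assuming the package is importable:

```python
# Illustrative only: __all__ controls what 'import *' pulls in.
from erna.automatic_processing import *  # noqa: F403

# With the __all__ above, exactly these three names are exported:
exported = ['database', 'fill_drs_runs', 'fill_data_runs']
assert all(name in globals() for name in exported)
```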

erna/automatic_processing/job_submitter.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -79,7 +79,7 @@ def terminate(self):
     def process_pending_jobs(self):
        '''
        Fetches pending runs from the processing database
-       and submits them using qsub if not to many jobs are running already.
+       and submits them if not to many jobs are running already.
        '''
        current_jobs = get_current_jobs()
        running_jobs = current_jobs.query('state == "running"')
```
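The docstring edit drops the reference to qsub, consistent with the move away from gridmap/qsub elsewhere in this PR, while the throttling logic visible in the context lines stays the same: count running jobs and only submit more when there is room. A rough sketch of that pattern; `max_queued_jobs`, `pending_jobs`, and `submit` are hypothetical placeholders, not names taken from this file:

```python
# Sketch of the throttling pattern the docstring describes. get_current_jobs()
# is assumed to return a pandas DataFrame with a 'state' column, as the
# context lines suggest; max_queued_jobs, pending_jobs and submit() are
# hypothetical placeholders.
def process_pending_jobs(get_current_jobs, pending_jobs, submit, max_queued_jobs=20):
    current_jobs = get_current_jobs()
    running_jobs = current_jobs.query('state == "running"')

    # submit only as many pending jobs as there are free slots
    free_slots = max(max_queued_jobs - len(running_jobs), 0)
    for job in pending_jobs[:free_slots]:
        submit(job)
```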

0 commit comments
