-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsummary.py
64 lines (48 loc) · 2.12 KB
/
summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
Calculate summary statistics from a predicted time series.
Run inference.py first to generate the required input files (.parquet and _info.csv).
Arguments:
A DataFrame in parquet format, named {eid}.parquet
An actipy info dict should exist alongside the input file, named {eid}_info.csv
Example usage:
python summary.py /data/ukb/outputs/group3/1001366.parquet
Output:
An {eid}_summary.csv file in the same folder as the input .parquet file
"""
import os
import argparse
import pandas as pd
from pathlib import Path
import utils.utils as utils
from utils.summarisation import getActivitySummary
log = utils.get_logger()


def main():
    """Compute summary statistics for one predicted time series file.

    Reads the .parquet file given on the command line and the
    ``{pid}_info.csv`` file located in the same folder, then writes a
    one-row ``{pid}_summary.csv`` next to the input. ``pid`` is the part of
    the input file stem before the first underscore.
    """
    # Free text belongs in `description`: when passed as `usage` it replaces
    # the auto-generated usage line and hides the real argument syntax
    # in the --help output.
    parser = argparse.ArgumentParser(
        prog='SSL UKB Summary stats',
        description='Compute summary statistics from an input .parquet file',
    )
    parser.add_argument('input_file', type=str, help='input file')
    args = parser.parse_args()

    input_file = Path(args.input_file)
    root = input_file.parent

    log.info('Working on %s', input_file)

    # load prediction dataframe and convert to time series format
    df_ukb = pd.read_parquet(input_file, engine='pyarrow')
    df = utils.ukb_df_to_series(df_ukb, 'label_hmm')

    # extract pid from filename
    pid = input_file.stem.split('_')[0]

    # read device info file, expected alongside the input file
    info_file = root / (pid + '_info.csv')
    df_info = pd.read_csv(info_file)

    # prepare summary data dict; getActivitySummary is expected to mutate it
    # in place, adding the summary stats (NOTE(review): confirm against
    # utils.summarisation)
    summary = {
        'eid': pd.Series(pid, dtype='string'),
        'file-name': pd.Series(df_info['Filename'][0], dtype='string')
    }

    log.info('Calculating summary stats')
    # The returned values are not needed here; only the side effect on
    # `summary` is used.
    getActivitySummary(df, df_info, summary, utils.labels, imputation=True)

    # output is a dataframe with 1 row
    df_summary = pd.DataFrame(summary, index=[0])

    # write {pid}_summary csv in the same path as the input .parquet file
    output_file = root / (pid + '_summary.csv')
    df_summary.to_csv(output_file, index=False)

    log.info('Summary saved to %s', output_file)


if __name__ == '__main__':
    main()