Commit 4e5c3f1

Author: Andrew Yang (committed)
Make loadmetadata more robust

1 parent 8af9b7b · commit 4e5c3f1

File tree: 7 files changed (+134, −103 lines)


CHANGELOG.md

Lines changed: 1 addition & 2 deletions
@@ -1,6 +1,6 @@
 # Release notes
 
-## Version 3.2.0 – 2023-8-**
+## Version 3.2.0 – 2023-08-**
 
 ### Added
 
@@ -11,7 +11,6 @@
 
 - loadData function now toggleable. Can return either (a) data read from data blocks or (b) header
 information stored above the data block.
-- Exclude wx from tests.
 
 ### Removed
 

src/diffpy/utils/parsers/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 """
 
 from .loaddata import loadData
-from .loadmetafile import load_PDF_into_db, load_from_db, markup_PDF, apply_schema_to_file, markup_oneline
+from .loadmetafile import serialize_data, deserialize_data, apply_schema_to_file, serial_oneline
 from .resample import resample
 
 # silence the pyflakes syntax checker

src/diffpy/utils/parsers/loaddata.py

Lines changed: 5 additions & 5 deletions
@@ -51,8 +51,8 @@ def loadData(filename, minrows=10, headers=False, hdel='=', hignore=None, **kwargs):
     Note transposing the loaded array as loadData(FILENAME).T
     has the same effect.
 
-    Return a numpy array of the data. If headers enabled, instead returns a
-    dictionary of parameters read from the header.
+    Return a numpy array of the data (data_block). If headers enabled, instead returns a
+    dictionary of parameters read from the header (hddata).
     """
     from numpy import array, loadtxt
     # for storing header data
@@ -145,14 +145,14 @@ def countcolumnsvalues(line):
     # Return an empty array when no data found.
     # loadtxt would otherwise raise an exception on loading from EOF.
     if start is None:
-        rv = array([], dtype=float)
+        data_block = array([], dtype=float)
     else:
         fid.seek(start)
         # always use usecols argument so that loadtxt does not crash
         # in case of trailing delimiters.
         kwargs.setdefault('usecols', list(range(ncvblock[0])))
-        rv = loadtxt(fid, **kwargs)
-    return rv
+        data_block = loadtxt(fid, **kwargs)
+    return data_block
 
 
 class TextDataLoader(object):
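
To make the renaming concrete, here is a minimal usage sketch of loadData's two return modes (a hedged example: "sample.gr" is a hypothetical text file with "key = value" header lines above a whitespace-delimited numeric block, not a file from this commit):

    from diffpy.utils.parsers import loadData

    # default mode: parse the numeric block into a numpy array (the data_block)
    data_block = loadData("sample.gr")

    # headers mode: parse the "key = value" lines above the block into a dict (the hddata)
    hddata = loadData("sample.gr", headers=True, hdel='=')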

src/diffpy/utils/parsers/loadmetafile.py

Lines changed: 86 additions & 77 deletions
@@ -20,68 +20,110 @@
 supported_formats = ['.json']
 
 
-def load_PDF_into_db(dbname, pdfname, hddata: dict, rv: list, show_path=True):
-    """Load an entry consisting of PDF header and base data into a database file.
-
-    Requires hdata and rv generated from loadData.
-
-    dbname -- name of the database file to load an entry into.
-    pdfname -- name of the PDF file.
-    hddata -- Dictionary of PDF metadata generated by loadData.
-    rv -- List of PDF (r, gr) pairs generated by loadData.
-    show_path -- include a PDF_path element in the database entry (default True).
+def serialize_data(filename, hdata: dict, data_table: list, show_path=True, dt_colnames=None, serial_file=None):
+    """Serialize file data into a dictionary. Can also save the dictionary into a serial language file.
+    The dictionary is formatted as {filename: data}.
+
+    Requires hdata and data_table generated from loadData.
+
+    filename -- name of the file whose data is being serialized.
+    hdata -- Dictionary of PDF metadata generated by loadData.
+    data_table -- List storing data parsed by loadData.
+    dt_colnames -- List containing names of each column in data_table. Every name in
+                   dt_colnames will be put into the dictionary as a key with a value
+                   of that column in data_table (stored as a List). Put None for
+                   columns without names. If dt_colnames has fewer non-None entries
+                   than columns in data_table, the pair {'data table': data_table} will be put
+                   in the dictionary. (Default None: only the entry {'data table': data_table}
+                   is added to the dictionary.)
+    show_path -- include a path element in the dictionary entry (default True).
+                 If 'path' is not included in hdata, extract the path from filename.
+    serial_file -- serial language file to dump the dictionary into.
 
     Returns the dictionary loaded from/into the updated database file.
     """
-    # new file or update
-    existing = False
-    if pathlib.Path.is_file(dbname):
-        existing = True
-
-    # collect entry
-    with open(pdfname, 'r') as grfile:
-        data = {}
-
-        # add path
-        grpath = grfile.name
-        if show_path:
-            data.update({'PDF_path': grpath})
 
-        # add r, gr, and header metadata
-        data.update({'r': list(rv[:, 0]), 'gr': list(rv[:, 1])})
-        data.update(hddata)
-
-    # parse name using pathlib and generate json entry
-    name = pathlib.Path(grpath).name
-    entry = {name: data}
+    # compile data_table and hdata together
+    data = {}
 
+    # handle getting the name of the file for a variety of filename types
+    with open(filename, 'r') as file_path:
+        abs_path = pathlib.Path(file_path.name).resolve()
+    # add path to start of data if requested
+    if show_path and 'path' not in hdata.keys():
+        data.update({'path': abs_path.name})
+    # title the entry with the name of the file (taken from the end of the path)
+    title = abs_path.name
+
+    # first add named columns in dt_colnames
+    num_columns = [len(row) for row in data_table]
+    max_columns = max(num_columns)
+    num_col_names = len(dt_colnames)
+    if max_columns < num_col_names:  # assume numpy.loadtxt gives a non-irregular array
+        raise Exception("More entries in dt_colnames than columns in data_table.")
+    named_columns = 0
+    for idx in range(num_col_names):
+        colname = dt_colnames[idx]
+        if colname is not None:
+            data.update({colname: list(data_table[:, idx])})
+            named_columns += 1
+
+    # second add data in the hdata dict
+    data.update(hdata)
+
+    # finally add data_table as an entry named 'data table' if not all columns were parsed
+    if named_columns < max_columns:
+        if 'data table' not in data.keys():
+            data.update({'data table': data_table})
+        else:  # if 'data table' is already a key, keep adding primes to the end
+            dt_name = 'data table'
+            while dt_name in data.keys():
+                dt_name += " prime"
+            data.update({dt_name: data_table})
+
+    # parse the name using pathlib and generate the dictionary entry
+    entry = {title: data}
+
+    # no save
+    if serial_file is None:
+        return entry
+
+    # saving/updating file
     # check if supported type
-    extension = pathlib.Path(dbname).suffix
+    extension = pathlib.Path(serial_file).suffix
     if extension not in supported_formats:
-        raise Exception(f"Format of {dbname} is not supported.")
+        raise Exception(f"Format of {serial_file} is not supported.")
+
+    # new file or update
+    existing = False
+    try:
+        open(serial_file)
+        existing = True
+    except FileNotFoundError:
+        pass
 
     # json
     if extension == '.json':
         # dump if non-existing
         if not existing:
-            with open(dbname, 'w') as jsonfile:
-                pdfs = entry  # for return
-                json.dump(pdfs, jsonfile, indent=2)
+            with open(serial_file, 'w') as jsonfile:
+                file_data = entry  # for return
+                json.dump(file_data, jsonfile, indent=2)
 
         # update if existing
         else:
-            with open(dbname, 'r') as json_read:
-                pdfs = json.load(json_read)
-            pdfs.update(entry)
-            with open(dbname, 'w') as json_write:
+            with open(serial_file, 'r') as json_read:
+                file_data = json.load(json_read)
+            file_data.update(entry)
+            with open(serial_file, 'w') as json_write:
                 # dump to string first for formatting
-                json.dump(pdfs, json_write, indent=2)
+                json.dump(file_data, json_write, indent=2)
 
-    return pdfs
+    return file_data
 
 
-def load_from_db(filename):
-    """Load a dictionary from a database file.
+def deserialize_data(filename):
+    """Load a dictionary from a serial file.
 
     filename -- database file to load from.
 
@@ -101,40 +143,7 @@ def load_from_db(filename):
     return j_dict
 
 
-def markup_PDF(hddata: dict, rv: list, muname=None):
-    # FIXME: may be better suited for REST API package, not diffpy.utils
-    """Put PDF file information into a dictionary.
-
-    hddata -- Dictionary of metadata.
-    rv -- List of (r, gr) pairs.
-    muname -- file to save into (default None, no saving occurs).
-
-    Returns the dictionary loaded from/into markup file.
-    """
-
-    # gather data
-    data = {}
-    data.update({'r': list(rv[:, 0]), 'gr': list(rv[:, 1])})
-    data.update(hddata)
-
-    # return directly
-    if muname is None:
-        return data
-
-    # save to disk when enabled
-    extension = pathlib.Path(muname).suffix
-    if extension not in supported_formats:
-        raise Exception(f"Format of {muname} is not supported.")
-
-    # dumps into file, automatically overwrites
-    if extension == '.json':
-        with open(muname, 'w') as json_write:
-            json.dump(data, json_write, indent=2)
-
-    return data
-
-
-def markup_oneline(filename):
+def serial_oneline(filename):
     """Reformat lists in markup languages to take up only one line.
 
     Works well when only lists are surrounded by square brackets and no other data is comma and newline separated.
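
For orientation, a hedged sketch of how the renamed pair is meant to be used together, assuming "sample.gr" holds a two-column (r, gr) block under a metadata header (the file names are illustrative, not from this commit):

    from diffpy.utils.parsers import loadData, serialize_data, deserialize_data

    hdata = loadData("sample.gr", headers=True)   # header metadata dict
    data_table = loadData("sample.gr")            # numpy data block

    # naming both columns stores them as 'r' and 'gr' lists; passing
    # serial_file also dumps the {filename: data} dictionary to JSON
    entry = serialize_data("sample.gr", hdata, data_table,
                           dt_colnames=['r', 'gr'], serial_file="sample.json")

    # read the JSON back into a plain dictionary
    restored = deserialize_data("sample.json")

Per the docstring, leaving dt_colnames at None instead keeps the whole block under a single 'data table' key, with " prime" appended on key collisions.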
Lines changed: 15 additions & 18 deletions
@@ -1,4 +1,4 @@
-from diffpy.utils.parsers import load_PDF_into_db, load_from_db, markup_PDF, apply_schema_to_file, markup_oneline
+from diffpy.utils.parsers import serialize_data, deserialize_data, apply_schema_to_file, serial_oneline
 from diffpy.utils.parsers import loadData
 from diffpy.utils.tests.testhelpers import datafile
 
@@ -7,47 +7,44 @@
 
 tests_dir = os.path.dirname(os.path.abspath(locals().get('__file__', 'file.py')))
 
-targetjson = datafile('targetdb.json')
-
+targetjson = datafile('targetjson.json')
 schemaname = datafile('strumining.json')
+
 muload = datafile('loadmu.txt')
 targetmu = datafile('targetmu.json')
 
 
 def test_load_gr(tmp_path):
     # generate json and apply schema
-    generatedjson = tmp_path / "generated_db.json"
+    generatedjson = tmp_path / "generated_serialization.json"
     tddbload_list = os.listdir(os.path.join(tests_dir, "testdata", "dbload"))
     tddbload_list.sort()
     for headerfile in tddbload_list:
         headerfile = os.path.join(tests_dir, "testdata", "dbload", headerfile)
         hdata = loadData(headerfile, headers=True)
-        rv = loadData(headerfile)
-        db_data = load_PDF_into_db(generatedjson, headerfile, hdata, rv, show_path=False)
+        data_table = loadData(headerfile)
+        db_data = serialize_data(headerfile, hdata, data_table, dt_colnames=['r', 'gr'],
+                                 show_path=False, serial_file=generatedjson)
     apply_schema_to_file(generatedjson, schemaname, multiple_entries=True)
-    markup_oneline(generatedjson)
+    serial_oneline(generatedjson)
 
     # compare to target
     # first compare if base data is same
     import json
-    target_db_data = load_from_db(targetjson)
+    target_db_data = deserialize_data(targetjson)
     assert target_db_data == db_data
     # then compare file structure/organization
     assert filecmp.cmp(generatedjson, targetjson)
 
 
+# FIXME: tests for REST API, remove after merge
 def test_markup_gr(tmp_path):
     # put into json and apply schema
     generatedmu = tmp_path / "generated_markup.json"
     hdata = loadData(muload, headers=True)
-    rv = loadData(muload)
-    data = markup_PDF(hdata, rv, generatedmu)
-    apply_schema_to_file(generatedmu, schemaname)
-    markup_oneline(generatedmu)
-
-    # check against target
-    # first compare data is same
-    target_data = load_from_db(targetmu)
+    data_table = loadData(muload)
+    data = serialize_data(muload, hdata, data_table, dt_colnames=['r', 'gr'], show_path=False).get('loadmu.txt')
+
+    # compare data is same
+    target_data = deserialize_data(targetmu)
     assert target_data == data
-    # then compare structure
-    assert filecmp.cmp(generatedmu, targetmu)
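
Note how the updated test_markup_gr exercises the in-memory path: without serial_file, serialize_data only returns the {title: data} dictionary, keyed by the file's base name. A self-contained sketch of that lookup (file name illustrative, mirroring the test above):

    from diffpy.utils.parsers import loadData, serialize_data

    hdata = loadData("loadmu.txt", headers=True)
    data_table = loadData("loadmu.txt")

    # no serial_file: nothing is written to disk, the dictionary is returned
    entry = serialize_data("loadmu.txt", hdata, data_table,
                           dt_colnames=['r', 'gr'], show_path=False)
    data = entry.get('loadmu.txt')   # the entry is keyed by the file's base name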
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+{
+  "e1.gr": {
+    "r": [0.0, 1.0, 2.0, 3.0],
+    "gr": [0.0, 0.0, 0.0, 0.0],
+    "qmax": 10.0,
+    "qmin": 0.0,
+    "rmax": 10.0,
+    "rmin": 0.0
+  },
+  "e2.gr": {
+    "r": [0.0, 1.0, 2.0, 3.0],
+    "gr": [1.0, 2.0, 3.0, 4.0],
+    "qmax": 11.0,
+    "qmin": 1.0,
+    "rmax": 11.0,
+    "rmin": 1.0
+  },
+  "e3.gr": {
+    "r": [0.0, 1.0, 2.0, 3.0],
+    "gr": [0.0, 5.0, 4.0, 3.0],
+    "qmax": 12.0,
+    "qmin": 2.0,
+    "rmax": 12.0,
+    "rmin": 2.0
+  }
+}
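
Once a serialized target like the one above exists on disk, deserialize_data reads it back as a plain dictionary. A small hedged example ("target.json" is a placeholder path, since this new file's name is not shown on the page):

    from diffpy.utils.parsers import deserialize_data

    target = deserialize_data("target.json")   # placeholder path
    print(sorted(target))                      # ['e1.gr', 'e2.gr', 'e3.gr']
    print(target['e2.gr']['qmax'])             # 11.0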
