Skip to content

Commit 7190375

Browse files
authored
Merge pull request #24 from Sparks29032/loadmetadata
Allow conversion of PDF data to json
2 parents 86f7fc5 + 6116cf3 commit 7190375

15 files changed

+423
-11
lines changed

Diff for: CHANGELOG.md

+1-5
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
11
# Release notes
22

3-
## Version 3.2.0 – 2023-8-**
3+
## Version 3.2.0 – 2023-08-**
44

55
### Added
66

77
- CI Coverage.
88
- New tests for loadData function.
9-
10-
### Changed
11-
129
- loadData function now toggleable. Can return either (a) data read from data blocks or (b) header
1310
information stored above the data block.
14-
- Exclude wx from tests.
1511

1612
### Removed
1713

Diff for: conda-recipe/run_test.py

+4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
#!/usr/bin/env python
22

3+
import sys
4+
import pathlib
5+
sys.path.append((pathlib.Path.cwd().parent.absolute() / "src").as_posix())
6+
37
import diffpy.utils.tests
48
assert diffpy.utils.tests.test().wasSuccessful()

Diff for: src/diffpy/utils/parsers/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717
"""
1818

1919
from .loaddata import loadData
20+
from .serialization import serialize_data, deserialize_data
2021
from .resample import resample
2122

2223
# silence the pyflakes syntax checker
2324
assert loadData or resample or True
25+
assert serialize_data or deserialize_data or True
2426

2527
# End of file

Diff for: src/diffpy/utils/parsers/custom_exceptions.py

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/usr/bin/env python
2+
##############################################################################
3+
#
4+
# diffpy.utils by DANSE Diffraction group
5+
# Simon J. L. Billinge
6+
# (c) 2010 The Trustees of Columbia University
7+
# in the City of New York. All rights reserved.
8+
#
9+
# File coded by:
10+
#
11+
# See AUTHORS.txt for a list of people who contributed.
12+
# See LICENSE_DANSE.txt for license information.
13+
#
14+
##############################################################################
15+
16+
class UnsupportedTypeError(Exception):
    """Raised when a file type is not supported by our parsers.

    file -- file triggering the error.
    supported_types -- list of supported file extensions (e.g. ['.json']).
    message -- optional custom message; overrides the generated one.
    """

    def __init__(self, file, supported_types=None, message=None):
        if message is None:
            message = f"The file {file} is not supported."
            if supported_types is not None:
                # e.g. " Supported file types include: .json, .yaml."
                message += " Supported file types include: " + ", ".join(supported_types) + "."
        # Bug fix: previously self.message was only assigned on the
        # message-is-None branch, so passing a custom message made
        # super().__init__(self.message) raise AttributeError.
        # Always store the final message before delegating.
        self.message = message
        super().__init__(self.message)
33+
34+
35+
class ImproperSizeError(Exception):
    """Raised when the size of an object does not match expectations.

    bad_object -- object with improper size.
    message -- optional custom message; overrides the generated one.
    """

    def __init__(self, bad_object, message=None):
        if message is None:
            message = f"The size of {bad_object} is different than expected."
        # Bug fix: previously self.message was unset when a custom message
        # was supplied, so super().__init__(self.message) raised
        # AttributeError. Always store the final message.
        self.message = message
        super().__init__(self.message)

Diff for: src/diffpy/utils/parsers/loaddata.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def loadData(filename, minrows=10, headers=False, hdel='=', hignore=None, **kwar
2121
2222
The data block is identified as the first matrix block of at least minrows rows
2323
and constant number of columns. This seems to work for most of the datafiles including
24-
those generated by PDFGetX2.
24+
those generated by diffpy programs.
2525
2626
filename -- name of the file we want to load data from.
2727
minrows -- minimum number of rows in the first data block.
@@ -51,8 +51,8 @@ def loadData(filename, minrows=10, headers=False, hdel='=', hignore=None, **kwar
5151
Note transposing the loaded array as loadData(FILENAME).T
5252
has the same effect.
5353
54-
Return a numpy array of the data. If headers enabled, instead returns a
55-
dictionary of parameters read from the header.
54+
Return a numpy array of the data (data_block). If headers enabled, instead returns a
55+
dictionary of parameters read from the header (hddata).
5656
"""
5757
from numpy import array, loadtxt
5858
# for storing header data
@@ -145,14 +145,14 @@ def countcolumnsvalues(line):
145145
# Return an empty array when no data found.
146146
# loadtxt would otherwise raise an exception on loading from EOF.
147147
if start is None:
148-
rv = array([], dtype=float)
148+
data_block = array([], dtype=float)
149149
else:
150150
fid.seek(start)
151151
# always use usecols argument so that loadtxt does not crash
152152
# in case of trailing delimiters.
153153
kwargs.setdefault('usecols', list(range(ncvblock[0])))
154-
rv = loadtxt(fid, **kwargs)
155-
return rv
154+
data_block = loadtxt(fid, **kwargs)
155+
return data_block
156156

157157

158158
class TextDataLoader(object):

Diff for: src/diffpy/utils/parsers/serialization.py

+151
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#!/usr/bin/env python
2+
##############################################################################
3+
#
4+
# diffpy.utils by DANSE Diffraction group
5+
# Simon J. L. Billinge
6+
# (c) 2010 The Trustees of Columbia University
7+
# in the City of New York. All rights reserved.
8+
#
9+
# File coded by:
10+
#
11+
# See AUTHORS.txt for a list of people who contributed.
12+
# See LICENSE_DANSE.txt for license information.
13+
#
14+
##############################################################################
15+
16+
import pathlib
17+
import json
18+
19+
from .custom_exceptions import UnsupportedTypeError, ImproperSizeError
20+
21+
# FIXME: add support for yaml, xml
22+
supported_formats = ['.json']
23+
24+
25+
def serialize_data(filename, hdata: dict, data_table: list, show_path=True, dt_colnames=None, serial_file=None):
    """Serialize file data into a dictionary. Can also save the dictionary into a serial language file.
    Dictionary is formatted as {filename: data}.

    Requires hdata and data_table generated from loadData.

    filename -- name of the file whose data is being serialized.
    hdata -- dictionary of PDF metadata generated by loadData.
    data_table -- list storing data parsed by loadData.
                  NOTE(review): column extraction below uses data_table[:, idx],
                  which requires a numpy-style 2D array despite the `list` hint
                  -- confirm against loadData's return type.
    dt_colnames -- list containing names of each column in data_table. Every name
                   will be put into the dictionary as a key whose value is that
                   column of data_table (stored as a list). Put None for columns
                   without names. If dt_colnames has fewer non-None entries than
                   data_table has columns, the pair {'data table': data_table} is
                   also put in the dictionary. (Default None: only the entry
                   {'data table': data_table} is added.)
    show_path -- include a path element in the database entry (default True).
                 If 'path' is not included in hdata, extract path from filename.
    serial_file -- serial language file to dump the dictionary into.

    Raises ImproperSizeError when dt_colnames has more entries than data_table
    has columns, and UnsupportedTypeError when serial_file has an extension not
    in supported_formats.

    Returns the dictionary loaded from/into the updated database file.
    """

    # compile data_table and hdata together
    data = {}

    # handle getting name of file for variety of filename types
    abs_path = pathlib.Path(filename).resolve()
    # add path to start of data if requested
    if show_path and 'path' not in hdata.keys():
        data.update({'path': abs_path.as_posix()})
    # title the entry with name of file (taken from end of path)
    title = abs_path.name

    # first add named columns in dt_colnames
    named_columns = 0
    max_columns = 1  # higher than named_columns to trigger 'data table' entry
    if dt_colnames is not None:
        num_columns = [len(row) for row in data_table]
        max_columns = max(num_columns)
        num_col_names = len(dt_colnames)
        if max_columns < num_col_names:  # assume numpy.loadtxt gives non-irregular array
            raise ImproperSizeError("More entries in dt_colnames than columns in data_table.")
        for idx in range(num_col_names):
            colname = dt_colnames[idx]
            if colname is not None:
                data.update({colname: list(data_table[:, idx])})
                named_columns += 1

    # second add data in hdata dict
    data.update(hdata)

    # finally add data_table as an entry named 'data table' if not all columns were parsed
    if named_columns < max_columns:
        if 'data table' not in data.keys():
            data.update({'data table': data_table})
        else:  # if 'data table' is already a key, keep adding primes to the end
            dt_name = 'data table'
            while dt_name in data.keys():
                dt_name += " prime"
            data.update({dt_name: data_table})

    # generate the dictionary entry keyed by file name
    entry = {title: data}

    # no save requested: return the single entry
    if serial_file is None:
        return entry

    # saving/updating file: check if supported type first
    sf = pathlib.Path(serial_file)
    sf_name = sf.name
    extension = sf.suffix
    if extension not in supported_formats:
        raise UnsupportedTypeError(sf_name, supported_formats)

    # Bug fix: the original probed existence with a bare open() whose handle
    # was never closed, leaking a file descriptor. pathlib answers the same
    # question without opening the file.
    existing = sf.is_file()

    # json
    if extension == '.json':
        if not existing:
            # dump into a fresh file
            file_data = entry  # for return
            with open(serial_file, 'w') as jsonfile:
                json.dump(file_data, jsonfile, indent=2)
        else:
            # merge the new entry into the existing database, then rewrite
            with open(serial_file, 'r') as json_read:
                file_data = json.load(json_read)
            file_data.update(entry)
            with open(serial_file, 'w') as json_write:
                json.dump(file_data, json_write, indent=2)

    return file_data
129+
130+
131+
def deserialize_data(filename):
    """Load a dictionary from a serial file.

    filename -- database file to load from.

    Raises UnsupportedTypeError when the file extension is not among the
    supported serialization formats.

    Returns a dictionary of database information.
    """

    # reject files we have no parser for
    path = pathlib.Path(filename)
    if path.suffix not in supported_formats:
        raise UnsupportedTypeError(path.name, supported_formats)

    # json
    if path.suffix == '.json':
        with open(filename, 'r') as json_file:
            j_dict = json.load(json_file)

    return j_dict

Diff for: src/diffpy/utils/tests/test_serialization.py

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
from diffpy.utils.parsers import serialize_data, deserialize_data
2+
from diffpy.utils.parsers import loadData
3+
from diffpy.utils.tests.testhelpers import datafile
4+
5+
from diffpy.utils.parsers.custom_exceptions import UnsupportedTypeError, ImproperSizeError
6+
7+
import os
8+
import pytest
9+
import numpy
10+
11+
tests_dir = os.path.dirname(os.path.abspath(locals().get('__file__', 'file.py')))
12+
13+
targetjson = datafile('targetjson.json')
14+
schemaname = datafile('strumining.json')
15+
wrongtype = datafile('wrong.type')
16+
loadfile = datafile('loadfile.txt')
17+
nodt = datafile('loaddatawithheaders.txt')
18+
19+
20+
def test_load_multiple(tmp_path):
    """Serialize every file in testdata/dbload into one json and compare to a stored target.

    Files are processed in sorted order so the generated json accumulates
    entries deterministically; the returned dictionary and the file written
    incrementally on each pass must both match the target json.
    """
    # generate json and apply schema
    generatedjson = tmp_path / "generated_serialization.json"
    tlm_list = os.listdir(os.path.join(tests_dir, "testdata", "dbload"))
    tlm_list.sort()
    generated_data = None
    for hfname in tlm_list:
        # gather data using loadData
        headerfile = os.path.normpath(os.path.join(tests_dir, "testdata", "dbload", hfname))
        hdata = loadData(headerfile, headers=True)
        data_table = loadData(headerfile)

        # check path extraction: with show_path=True the entry carries an
        # absolute 'path' that must round-trip back to headerfile
        generated_data = serialize_data(headerfile, hdata, data_table, dt_colnames=['r', 'gr'], show_path=True)
        assert headerfile == os.path.normpath(generated_data[hfname].pop('path'))

        # rerun without path information and save to file
        # (serialize_data returns the whole accumulated database each pass)
        generated_data = serialize_data(headerfile, hdata, data_table, dt_colnames=['r', 'gr'],
                                        show_path=False, serial_file=generatedjson)

    # compare to target
    target_data = deserialize_data(targetjson)
    assert target_data == generated_data
    # ensure file saved properly
    assert target_data == deserialize_data(generatedjson)
45+
46+
47+
def test_exceptions():
    """Exercise the error paths and the dt_colnames variants of serialize_data."""
    header_info = loadData(loadfile, headers=True)
    table = loadData(loadfile)

    # unsupported file types are rejected on both the save and load sides
    with pytest.raises(UnsupportedTypeError):
        serialize_data(loadfile, header_info, table, serial_file=wrongtype)
    with pytest.raises(UnsupportedTypeError):
        deserialize_data(wrongtype)

    # more column names than data columns is an error
    with pytest.raises(ImproperSizeError):
        serialize_data(loadfile, header_info, table, dt_colnames=["one", "two", "three is too many"])

    # baseline output with both columns named
    normal = serialize_data(loadfile, header_info, table, dt_colnames=['r', 'gr'])
    data_name = list(normal.keys())[0]
    r_list = normal[data_name]['r']
    gr_list = normal[data_name]['gr']

    # three equivalent ways to denote no column names
    missing_parameter = serialize_data(loadfile, header_info, table, show_path=False)
    empty_parameter = serialize_data(loadfile, header_info, table, show_path=False, dt_colnames=[])
    none_entry_parameter = serialize_data(loadfile, header_info, table, show_path=False, dt_colnames=[None, None])
    # all three must produce identical entries
    assert missing_parameter == empty_parameter
    assert missing_parameter == none_entry_parameter
    print(table)
    print(missing_parameter[data_name]['data table prime'])
    assert numpy.allclose(missing_parameter[data_name]['data table prime'], table)

    # extracting a single column, by position
    r_extract = serialize_data(loadfile, header_info, table, show_path=False, dt_colnames=['r'])
    gr_extract = serialize_data(loadfile, header_info, table, show_path=False, dt_colnames=[None, 'gr'])
    incorrect_r_extract = serialize_data(loadfile, header_info, table, show_path=False, dt_colnames=[None, 'r'])
    # the second column is extracted in both cases, whatever it was named
    assert numpy.allclose(gr_extract[data_name]['gr'], incorrect_r_extract[data_name]['r'])
    assert 'r' not in gr_extract[data_name]
    assert 'gr' not in r_extract[data_name] and 'gr' not in incorrect_r_extract[data_name]
    # extracted values agree with the baseline columns
    assert numpy.allclose(r_extract[data_name]['r'], r_list)
    assert numpy.allclose(gr_extract[data_name]['gr'], gr_list)

    # a file with no named columns still serializes its data table
    nodt_hdata = loadData(nodt, headers=True)
    nodt_dt = loadData(nodt)
    no_dt = serialize_data(nodt, nodt_hdata, nodt_dt, show_path=False)
    nodt_data_name = list(no_dt.keys())[0]
    assert numpy.allclose(no_dt[nodt_data_name]['data table'], nodt_dt)

Diff for: src/diffpy/utils/tests/testdata/dbload/e1.gr

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
rmin = 0
2+
rmax = 10
3+
qmin = 0
4+
qmax = 10
5+
6+
0 0
7+
1 0
8+
2 0
9+
3 0

Diff for: src/diffpy/utils/tests/testdata/dbload/e2.gr

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
rmin = 1
2+
rmax = 11
3+
qmin = 1
4+
qmax = 11
5+
6+
0 1
7+
1 2
8+
2 3
9+
3 4

0 commit comments

Comments
 (0)