
Commit 86f7fc5

Merge pull request #20 from Sparks29032/loaddata_headers
Add header metadata parsing to loaddata
2 parents 8bafe53 + 699e604 commit 86f7fc5

4 files changed: 10145 additions, 27 deletions


CHANGELOG.md

Lines changed: 17 additions & 5 deletions
@@ -1,20 +1,32 @@
 # Release notes
 
-## Version 3.1.0 – 2022-12-09
+## Version 3.2.0 – 2023-8-**
 
 ### Added
 
-- Compatibility with Python 3.10, 3.9, 3.8
+- CI Coverage.
+- New tests for loadData function.
 
 ### Changed
 
-### Deprecated
+- loadData function now toggleable. Can return either (a) data read from data blocks or (b) header
+  information stored above the data block.
+- Exclude wx from tests.
 
 ### Removed
 
-- Remove the support for Python 3.5, 3.6.
+- Remove use of pkg_resources (deprecated).
+- No longer use Travis.
 
-### Fixed
+## Version 3.1.0 – 2022-12-09
+
+### Added
+
+- Compatibility with Python 3.10, 3.9, 3.8
+
+### Removed
+
+- Remove the support for Python 3.5, 3.6.
 
 ## Version 3.0.0 -- 2019-03-12
 

src/diffpy/utils/parsers/loaddata.py

Lines changed: 75 additions & 21 deletions
@@ -16,29 +16,47 @@
 import numpy
 
 
-def loadData(filename, minrows=10, **kwargs):
+def loadData(filename, minrows=10, headers=False, hdel='=', hignore=None, **kwargs):
     """Find and load data from a text file.
 
-    The data reading starts at the first matrix block of at least minrows rows
-    and constant number of columns. This seems to work for most of the
-    datafiles including those generated by PDFGetX2.
-
-    filename -- name of the file we want to load data from.
-    minrows  -- minimum number of rows in the first data block.
-                All rows must have the same number of floating point values.
-    usecols  -- zero-based index of columns to be loaded, by default use
-                all detected columns. The reading skips data blocks that
-                do not have the usecols-specified columns.
-    unpack   -- return data as a sequence of columns that allows tuple
-                unpacking such as x, y = loadData(FILENAME, unpack=True).
-                Note transposing the loaded array as loadData(FILENAME).T
-                has the same effect.
-    kwargs   -- keyword arguments that are passed to numpy.loadtxt
-
-    Return a numpy array of the data.
-    See also numpy.loadtxt for more details.
+    The data block is identified as the first matrix block of at least minrows rows
+    and constant number of columns. This seems to work for most of the datafiles including
+    those generated by PDFGetX2.
+
+    filename -- name of the file we want to load data from.
+    minrows  -- minimum number of rows in the first data block.
+                All rows must have the same number of floating point values.
+    headers  -- when False (default), the function returns a numpy array of the
+                data in the data block. When True, the function instead returns a
+                dictionary of parameters and their corresponding values parsed from
+                the header (information prior to the data block). See hdel and hignore
+                for options to help with parsing header information.
+    hdel     -- (only used when headers enabled) delimiter for parsing header
+                information (default '='). e.g. using default hdel, the line
+                'parameter = p_value' is put into the dictionary as
+                {parameter: p_value}.
+    hignore  -- (only used when headers enabled) ignore header rows beginning
+                with any elements in the hignore list. e.g. hignore=['# ', '[']
+                means the following lines are skipped: '# qmax=10', '[defaults]'.
+    kwargs   -- keyword arguments that are passed to numpy.loadtxt, including
+                the arguments listed below. (See also numpy.loadtxt for more
+                details.)
+    delimiter -- delimiter for the data in the block (default use whitespace).
+                 For comma-separated data blocks, set delimiter to ','.
+    usecols  -- zero-based index of columns to be loaded, by default use
+                all detected columns. The reading skips data blocks that
+                do not have the usecols-specified columns.
+    unpack   -- return data as a sequence of columns that allows tuple
+                unpacking such as x, y = loadData(FILENAME, unpack=True).
+                Note transposing the loaded array as loadData(FILENAME).T
+                has the same effect.
+
+    Return a numpy array of the data. If headers enabled, instead returns a
+    dictionary of parameters read from the header.
     """
     from numpy import array, loadtxt
+    # for storing header data
+    hdata = {}
     # determine the arguments
     delimiter = kwargs.get('delimiter')
     usecols = kwargs.get('usecols')
@@ -72,8 +90,39 @@ def countcolumnsvalues(line):
     fpos = (0, 0)
     nrows = 0
     for line in fid:
+        # decode line
+        dline = line.decode()
+        # find header information if requested
+        if headers:
+            hpair = dline.split(hdel)
+            flag = True
+            # ensure number of non-blank arguments is two
+            if len(hpair) != 2:
+                flag = False
+            else:
+                # ignore if an argument is blank
+                hpair[0] = hpair[0].strip()  # name of data entry
+                hpair[1] = hpair[1].strip()  # value of entry
+                if not hpair[0] or not hpair[1]:
+                    flag = False
+                else:
+                    # check if row has an ignore tag
+                    if hignore is not None:
+                        for tag in hignore:
+                            taglen = len(tag)
+                            if len(hpair[0]) >= taglen and hpair[0][:taglen] == tag:
+                                flag = False
+            # add header data
+            if flag:
+                name = hpair[0]
+                value = hpair[1]
+                # check if data value should be stored as float
+                if isfloat(hpair[1]):
+                    value = float(hpair[1])
+                hdata.update({name: value})
+        # continue search for the start of datablock
         fpos = (fpos[1], fpos[1] + len(line))
-        line = line.decode()
+        line = dline
         ncv = countcolumnsvalues(line)
         if ncv < mincv:
             start = None
@@ -88,6 +137,11 @@ def countcolumnsvalues(line):
         # block was found here!
         if nrows >= minrows:
             break
+
+    # Return header data if requested
+    if headers:
+        return hdata  # Return, so do not proceed to reading datablock
+
     # Return an empty array when no data found.
     # loadtxt would otherwise raise an exception on loading from EOF.
     if start is None:
@@ -247,4 +301,4 @@ def isfloat(s):
         pass
     return False
 
-# End of file
+# End of file
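
The nested conditionals added in the loop above reduce to one rule per header line: split on hdel, accept only lines that yield exactly two non-blank fields, drop lines whose key starts with an hignore prefix, and convert the value to a float when it parses as one. A minimal standalone sketch of that rule follows; the helper name parse_header_line and the try/except float conversion (used here in place of the module's isfloat helper) are ours, not part of the commit.

def parse_header_line(line, hdel="=", hignore=None):
    """Return (name, value) for a usable header line, or None to skip it.

    Sketch of the rule in the loop above; helper name is ours, and the float
    conversion uses try/except instead of the module's isfloat helper.
    """
    hpair = line.split(hdel)
    # exactly two fields around the delimiter, both non-blank
    if len(hpair) != 2:
        return None
    name, value = hpair[0].strip(), hpair[1].strip()
    if not name or not value:
        return None
    # skip rows whose key starts with any ignore tag
    if hignore and any(name.startswith(tag) for tag in hignore):
        return None
    # store numeric-looking values as floats
    try:
        return name, float(value)
    except ValueError:
        return name, value

print(parse_header_line("qmax = 25.0"))                    # ('qmax', 25.0)
print(parse_header_line("# qmax = 25.0", hignore=["# "]))  # None
print(parse_header_line("composition = CaTiO3"))           # ('composition', 'CaTiO3')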

src/diffpy/utils/tests/test_loaddata.py

Lines changed: 21 additions & 1 deletion
@@ -9,6 +9,7 @@
 from diffpy.utils.tests.testhelpers import datafile
 
 loaddata01 = datafile('loaddata01.txt')
+loaddatawithheaders = datafile('loaddatawithheaders.txt')
 
 ##############################################################################
 class TestLoadData(unittest.TestCase):
@@ -44,9 +45,28 @@ def test_loadData_1column(self):
         self.assertFalse(numpy.array_equal(d1c, d))
         return
 
+
+    def test_loadData_headers(self):
+        """check loadData() with headers options enabled
+        """
+        hignore = ["# ", "// ", "["]  # ignore lines beginning with these strings
+        delimiter = ": "  # what our data should be separated by
+        hdata = loadData(loaddatawithheaders, headers=True, hdel=delimiter, hignore=hignore)
+        # only fourteen lines of data are formatted properly
+        assert len(hdata) == 14
+        # check the following are floats
+        vfloats = ["wavelength", "qmaxinst", "qmin", "qmax", "bgscale"]
+        for name in vfloats:
+            assert isinstance(hdata.get(name), float)
+        # check the following are NOT floats
+        vnfloats = ["composition", "rmax", "rmin", "rstep", "rpoly"]
+        for name in vnfloats:
+            assert not isinstance(hdata.get(name), float)
+
+
 # End of class TestRoutines
 
 if __name__ == '__main__':
     unittest.main()
 
-# End of file
+# End of file
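
For readers without the loaddatawithheaders.txt fixture, here is a hedged usage sketch (not part of the commit): it writes a small temporary file with ':'-delimited header lines followed by a numeric data block, then reads it back both ways. The file contents and header values are made up for illustration.

import os
import tempfile

from diffpy.utils.parsers.loaddata import loadData

# made-up file: a comment, two header lines, then a 12-row two-column data block
lines = ["# generated example", "wavelength: 0.1819", "composition: CaTiO3"]
lines += ["%.1f %.1f" % (i, 2.0 * i) for i in range(12)]
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as fp:
    fp.write("\n".join(lines) + "\n")
    path = fp.name

# headers=True returns the parsed header dictionary instead of the data block
hdata = loadData(path, headers=True, hdel=":", hignore=["# "])
print(hdata)             # {'wavelength': 0.1819, 'composition': 'CaTiO3'}

# the default call still returns the numeric block as a numpy array
x, y = loadData(path, unpack=True)
print(len(x), len(y))    # 12 12

os.remove(path)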
