forked from diffpy/diffpy.utils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathloaddata.py
343 lines (314 loc) · 12.5 KB
/
loaddata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
#!/usr/bin/env python
##############################################################################
#
# diffpy.utils by DANSE Diffraction group
# Simon J. L. Billinge
# (c) 2010 The Trustees of Columbia University
# in the City of New York. All rights reserved.
#
# File coded by: Timur Davis, Chris Farrow, Pavol Juhas
#
# See AUTHORS.txt for a list of people who contributed.
# See LICENSE_DANSE.txt for license information.
#
##############################################################################
import numpy
def loadData(filename, minrows=10, headers=False, hdel="=", hignore=None, **kwargs):
    """Find and load data from a text file.

    The data block is identified as the first matrix block of at least minrows rows and constant number of
    columns. This seems to work for most of the datafiles including those generated by diffpy programs.
    If the file ends before minrows matching rows are collected, the last candidate block (if any) is
    still loaded.

    Parameters
    ----------
    filename
        Name of the file we want to load data from.
    minrows: int
        Minimum number of rows in the first data block. All rows must have the same number of floating
        point values.
    headers: bool
        When False (default), the function returns a numpy array of the data in the data block.
        When True, the function instead returns a dictionary of parameters and their corresponding
        values parsed from the header (information prior to the data block). See hdel and hignore for
        options to help with parsing header information.
    hdel: str
        (Only used when headers enabled.) Delimiter for parsing header information (default '='). e.g. using
        default hdel, the line 'parameter = p_value' is put into the dictionary as {parameter: p_value}.
        Lines that do not split into exactly two non-blank parts are skipped.
    hignore: list
        (Only used when headers enabled.) Ignore header rows beginning with any elements in hignore.
        e.g. hignore=['# ', '['] causes the following lines to be skipped: '# qmax=10', '[defaults]'.
    kwargs:
        Keyword arguments that are passed to numpy.loadtxt including the following arguments below. (See
        numpy.loadtxt for more details.) Only pass kwargs used by numpy.loadtxt.

    Useful kwargs
    =============
    comments: str, sequence of str
        The characters or list of characters used to indicate the start of a comment (default '#').
        Comment lines are ignored.
    delimiter: str
        Delimiter for the data in the block (default use whitespace). For comma-separated data blocks,
        set delimiter to ','.
    unpack: bool
        Return data as a sequence of columns that allows tuple unpacking such as x, y =
        loadData(FILENAME, unpack=True). Note transposing the loaded array as loadData(FILENAME).T has the
        same effect.
    usecols: int or sequence of int
        Zero-based index (or indices) of columns to be loaded, by default use all detected columns. The
        reading skips data blocks that do not have the usecols-specified columns.

    Returns
    -------
    data_block: ndarray
        A numpy array containing the found data block, or an empty float array when no data block was
        found. (This is not returned if headers is enabled.)
    hdata: dict
        If headers are enabled, return a dictionary of parameters read from the header. Values that
        are convertible to float are stored as floats, all others as stripped strings.
    """
    from numpy import array, integer, loadtxt

    # for storing header data
    hdata = {}
    # determine the arguments
    delimiter = kwargs.get("delimiter")
    usecols = kwargs.get("usecols")
    # numpy.loadtxt accepts a single integer for usecols; wrap it so that the
    # block detection below can always treat usecols as a sequence.
    # kwargs itself is left untouched and is passed to loadtxt as given.
    if isinstance(usecols, (int, integer)):
        usecols = (usecols,)
    # header names starting with any of these prefixes are skipped
    ignored_prefixes = tuple(hignore) if hignore is not None else ()
    # required at least one column of floating point values
    mincv = (1, 1)
    # but if usecols is specified, require sufficient number of columns
    # where the used columns contain floats
    if usecols is not None:
        hiidx = max(-min(usecols), max(usecols) + 1)
        mincv = (hiidx, len(set(usecols)))

    def countcolumnsvalues(line):
        """Return (number of columns, number of float values) for a line.

        Both counts are zero when a required word cannot be converted
        to float or a usecols index is out of range.
        """
        try:
            words = line.split(delimiter)
            # remove trailing blank columns
            while words and not words[-1].strip():
                words.pop(-1)
            nc = len(words)
            if usecols is not None:
                nv = len([float(words[i]) for i in usecols])
            else:
                nv = len([float(w) for w in words])
        except (IndexError, ValueError):
            nc = nv = 0
        return nc, nv

    def parseheader(text):
        """Store a 'name <hdel> value' pair from one line in hdata."""
        parts = text.split(hdel)
        # require exactly two arguments ...
        if len(parts) != 2:
            return
        name, value = (part.strip() for part in parts)
        # ... none of which is blank
        if not name or not value:
            return
        # skip rows that carry an ignore tag
        if name.startswith(ignored_prefixes):
            return
        # store the value as float whenever possible
        hdata[name] = float(value) if isfloat(value) else value

    # make sure fid gets cleaned up
    with open(filename, "rb") as fid:
        # search for the start of datablock
        start = ncvblock = None
        # byte offsets of the beginning and end of the current line
        fpos = (0, 0)
        nrows = 0
        for rawline in fid:
            dline = rawline.decode()
            # find header information if requested
            if headers:
                parseheader(dline)
            # continue search for the start of datablock
            fpos = (fpos[1], fpos[1] + len(rawline))
            ncv = countcolumnsvalues(dline)
            if ncv < mincv:
                start = None
                continue
            # ncv is acceptable here, require the same number of columns
            # throughout the datablock
            if start is None or ncv != ncvblock:
                ncvblock = ncv
                nrows = 0
                start = fpos[0]
            nrows += 1
            # block was found here!
            if nrows >= minrows:
                break

        # Return header data if requested
        if headers:
            return hdata  # Return, so do not proceed to reading datablock

        # Return an empty array when no data found.
        # loadtxt would otherwise raise an exception on loading from EOF.
        if start is None:
            data_block = array([], dtype=float)
        else:
            fid.seek(start)
            # always use usecols argument so that loadtxt does not crash
            # in case of trailing delimiters.
            kwargs.setdefault("usecols", list(range(ncvblock[0])))
            data_block = loadtxt(fid, **kwargs)
    return data_block
class TextDataLoader(object):
    """Smart loading of a text data with possibly multiple datasets.

    A data block is a run of consecutive lines that all have the same number
    of whitespace-separated fields, have enough fields for usecols, and hold
    floating point values in the used columns. Any other line (including
    a blank line) ends the block. Runs shorter than minrows are treated as
    header text.

    Parameters
    ----------
    minrows: int
        Minimum number of rows in a data block. (Default 10.)
    usecols: tuple
        Which columns in our dataset to use. Ignores all other columns. Negative indices count
        from the end of the row. If None (default), use all columns.
    skiprows
        Rows in dataset to skip. (Currently not functional.)

    Attributes
    ----------
    filename: str
        The ``name`` attribute of the last file object read, or "" when it has none.
    headers: list of str
        Text preceding each entry of datasets.
    datasets: list of ndarray
        Two-dimensional float arrays, one per data block. When text remains after the last
        data block, it is stored as a final header paired with an empty array.
    """

    # Class-level defaults keep these attributes defined even when the
    # corresponding constructor argument is None.
    minrows = 10
    usecols = None
    skiprows = None

    def __init__(self, minrows=10, usecols=None, skiprows=None):
        if minrows is not None:
            self.minrows = minrows
        if usecols is not None:
            self.usecols = tuple(usecols)
        # FIXME: implement usage in _findDataBlocks
        if skiprows is not None:
            self.skiprows = skiprows
        # data items
        self._reset()

    def _reset(self):
        """Forget all results and working data of a previous read."""
        self.filename = ""
        self.headers = []
        self.datasets = []
        self._resetvars()

    def _resetvars(self):
        """Clear the intermediate parsing state."""
        self._filename = ""
        self._lines = None
        self._splitlines = None
        self._words = None
        self._linerecs = None
        self._wordrecs = None

    def read(self, filename):
        """Open a file and run readfp.

        Use if file is not already open for read byte.
        """
        with open(filename, "rb") as fp:
            self.readfp(fp)

    def readfp(self, fp, append=False):
        """Read all lines from an open file object and extract its data blocks.

        The results are stored in the filename, headers and datasets
        attributes:

        * File name.
        * All data blocks findable by loadData.
        * Headers (if present) for each data block. (Generally the headers contain column name information).

        Parameters
        ----------
        fp
            File-like object providing ``readlines()``, opened in either
            text or binary mode.
        append: bool
            Currently ignored; earlier results are always discarded.
        """
        self._reset()
        # try to read lines from fp first
        rawlines = fp.readlines()
        # files opened in binary mode yield bytes, all parsing is done on str
        self._lines = [ln.decode() if isinstance(ln, bytes) else ln for ln in rawlines]
        # and if good, assign filename
        self.filename = getattr(fp, "name", "")
        self._words = "".join(self._lines).split()
        self._splitlines = [line.split() for line in self._lines]
        self._findDataBlocks()

    def _findDataBlocks(self):
        """Locate data blocks in the loaded lines and fill headers and datasets."""
        usecols = self.usecols
        mincols = 1
        if usecols is not None and len(usecols):
            mincols = max(mincols, max(usecols) + 1, abs(min(usecols)))
        nlines = len(self._lines)
        nwords = len(self._words)
        # idx - line index, nw0, nw1 - index of the first and last word,
        # nf - number of words, ok - has data
        lr = self._linerecs = numpy.recarray(
            (nlines,),
            dtype=[("idx", int), ("nw0", int), ("nw1", int), ("nf", int), ("ok", bool)],
        )
        nf = numpy.array([len(sl) for sl in self._splitlines], dtype=int)
        nw1 = nf.cumsum()
        nw0 = nw1 - nf
        lr.idx = numpy.arange(nlines)
        lr.nf = nf
        lr.nw1 = nw1
        lr.nw0 = nw0
        # word records
        lw = self._wordrecs = numpy.recarray(
            (nwords,),
            dtype=[
                ("idx", int),
                ("line", int),
                ("col", int),
                ("ok", bool),
                ("value", float),
            ],
        )
        # repeating each line index nf times stays correct for blank lines
        wline = numpy.repeat(numpy.arange(nlines), nf)
        wcol = numpy.arange(nwords) - nw0[wline]
        values = numpy.zeros(nwords, dtype=float)
        wok = numpy.ones(nwords, dtype=bool)
        for i, w in enumerate(self._words):
            try:
                values[i] = float(w)
            except ValueError:
                wok[i] = False
        lw.idx = numpy.arange(nwords)
        lw.line = wline
        lw.col = wcol
        lw.ok = wok
        lw.value = values
        # prune lines that have non-float values in the used columns
        lineok = numpy.ones(nlines, dtype=bool)
        if usecols is None:
            lineok[wline[~wok]] = False
        else:
            # field count of the line each word belongs to, always >= 1
            wnf = nf[wline]
            for col in usecols:
                # the modulo maps negative column indices to positions
                hit = (wcol == col % wnf) & ~wok
                lineok[wline[hit]] = False
        lr.ok = lineok
        # split the lines into runs of equal status and equal field count
        good = lineok & (nf >= mincols)
        if nlines:
            change = numpy.r_[True, (good[1:] != good[:-1]) | (nf[1:] != nf[:-1])]
            beg = numpy.nonzero(change)[0]
            end = numpy.r_[beg[1:], nlines]
        else:
            beg = end = numpy.zeros(0, dtype=int)
        keep = good[beg] & ((end - beg) >= self.minrows)
        hbeg = 0
        # dbeg is the first line of a data block, dend is one past its last
        for dbeg, dend in zip(beg[keep].tolist(), end[keep].tolist()):
            header = "".join(self._lines[hbeg:dbeg])
            hbeg = dend
            ncols = int(nf[dbeg])
            table = values[nw0[dbeg] : nw1[dend - 1]].reshape(-1, ncols)
            if usecols is None:
                data = table.copy()
            else:
                data = table[:, [c % ncols for c in usecols]]
            self.headers.append(header)
            self.datasets.append(data)
        # finish reading to a last header and empty dataset
        if hbeg < nlines:
            header = "".join(self._lines[hbeg:])
            data = numpy.empty(0, dtype=float)
            self.headers.append(header)
            self.datasets.append(data)
# End of class TextDataLoader
def isfloat(s):
    """Return True if s is convertible to float, otherwise False.

    Objects that float() rejects with TypeError (e.g. None) or
    OverflowError (integers too large for a float) count as not
    convertible instead of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
# End of file