
Commit 86f7fc5

Merge pull request #20 from Sparks29032/loaddata_headers
Add header metadata parsing to loaddata
2 parents 8bafe53 + 699e604 commit 86f7fc5

4 files changed: 10145 additions, 27 deletions


CHANGELOG.md

Lines changed: 17 additions & 5 deletions
@@ -1,20 +1,32 @@
 # Release notes
 
-## Version 3.1.0 – 2022-12-09
+## Version 3.2.0 – 2023-8-**
 
 ### Added
 
-- Compatibility with Python 3.10, 3.9, 3.8
+- CI Coverage.
+- New tests for loadData function.
 
 ### Changed
 
-### Deprecated
+- loadData function now toggleable. Can return either (a) data read from data blocks or (b) header
+  information stored above the data block.
+- Exclude wx from tests.
 
 ### Removed
 
-- Remove the support for Python 3.5, 3.6.
+- Remove use of pkg_resources (deprecated).
+- No longer use Travis.
 
-### Fixed
+## Version 3.1.0 – 2022-12-09
+
+### Added
+
+- Compatibility with Python 3.10, 3.9, 3.8
+
+### Removed
+
+- Remove the support for Python 3.5, 3.6.
 
 ## Version 3.0.0 -- 2019-03-12
 

src/diffpy/utils/parsers/loaddata.py

Lines changed: 75 additions & 21 deletions
@@ -16,29 +16,47 @@
 import numpy
 
 
-def loadData(filename, minrows=10, **kwargs):
+def loadData(filename, minrows=10, headers=False, hdel='=', hignore=None, **kwargs):
     """Find and load data from a text file.
 
-    The data reading starts at the first matrix block of at least minrows rows
-    and constant number of columns. This seems to work for most of the
-    datafiles including those generated by PDFGetX2.
-
-    filename -- name of the file we want to load data from.
-    minrows  -- minimum number of rows in the first data block.
-                All rows must have the same number of floating point values.
-    usecols  -- zero-based index of columns to be loaded, by default use
-                all detected columns. The reading skips data blocks that
-                do not have the usecols-specified columns.
-    unpack   -- return data as a sequence of columns that allows tuple
-                unpacking such as x, y = loadData(FILENAME, unpack=True).
-                Note transposing the loaded array as loadData(FILENAME).T
-                has the same effect.
-    kwargs   -- keyword arguments that are passed to numpy.loadtxt
-
-    Return a numpy array of the data.
-    See also numpy.loadtxt for more details.
+    The data block is identified as the first matrix block of at least minrows rows
+    and constant number of columns. This seems to work for most of the datafiles including
+    those generated by PDFGetX2.
+
+    filename -- name of the file we want to load data from.
+    minrows  -- minimum number of rows in the first data block.
+                All rows must have the same number of floating point values.
+    headers  -- when False (default), the function returns a numpy array of the
+                data in the data block. When True, the function instead returns a
+                dictionary of parameters and their corresponding values parsed from
+                the header (information prior to the data block). See hdel and hignore
+                for options to help with parsing header information.
+    hdel     -- (only used when headers enabled) delimiter for parsing header
+                information (default '='). e.g. using default hdel, the line
+                'parameter = p_value' is put into the dictionary as
+                {parameter: p_value}.
+    hignore  -- (only used when headers enabled) ignore header rows beginning
+                with any elements in the hignore list. e.g. hignore=['# ', '[']
+                means the following lines are skipped: '# qmax=10', '[defaults]'.
+    kwargs   -- keyword arguments that are passed to numpy.loadtxt, including
+                the arguments listed below. (See also numpy.loadtxt for more
+                details.)
+    delimiter -- delimiter for the data in the block (default use whitespace).
+                 For comma-separated data blocks, set delimiter to ','.
+    usecols  -- zero-based index of columns to be loaded, by default use
+                all detected columns. The reading skips data blocks that
+                do not have the usecols-specified columns.
+    unpack   -- return data as a sequence of columns that allows tuple
+                unpacking such as x, y = loadData(FILENAME, unpack=True).
+                Note transposing the loaded array as loadData(FILENAME).T
+                has the same effect.
+
+    Return a numpy array of the data. If headers enabled, instead returns a
+    dictionary of parameters read from the header.
     """
     from numpy import array, loadtxt
+    # for storing header data
+    hdata = {}
     # determine the arguments
     delimiter = kwargs.get('delimiter')
     usecols = kwargs.get('usecols')
@@ -72,8 +90,39 @@ def countcolumnsvalues(line):
     fpos = (0, 0)
     nrows = 0
     for line in fid:
+        # decode line
+        dline = line.decode()
+        # find header information if requested
+        if headers:
+            hpair = dline.split(hdel)
+            flag = True
+            # ensure number of non-blank arguments is two
+            if len(hpair) != 2:
+                flag = False
+            else:
+                # ignore if an argument is blank
+                hpair[0] = hpair[0].strip()  # name of data entry
+                hpair[1] = hpair[1].strip()  # value of entry
+                if not hpair[0] or not hpair[1]:
+                    flag = False
+                else:
+                    # check if row has an ignore tag
+                    if hignore is not None:
+                        for tag in hignore:
+                            taglen = len(tag)
+                            if len(hpair[0]) >= taglen and hpair[0][:taglen] == tag:
+                                flag = False
+            # add header data
+            if flag:
+                name = hpair[0]
+                value = hpair[1]
+                # check if data value should be stored as float
+                if isfloat(hpair[1]):
+                    value = float(hpair[1])
+                hdata.update({name: value})
+        # continue search for the start of datablock
         fpos = (fpos[1], fpos[1] + len(line))
-        line = line.decode()
+        line = dline
         ncv = countcolumnsvalues(line)
         if ncv < mincv:
             start = None
@@ -88,6 +137,11 @@ def countcolumnsvalues(line):
         # block was found here!
         if nrows >= minrows:
             break
+
+    # Return header data if requested
+    if headers:
+        return hdata  # Return, so do not proceed to reading datablock
+
     # Return an empty array when no data found.
     # loadtxt would otherwise raise an exception on loading from EOF.
     if start is None:
@@ -247,4 +301,4 @@ def isfloat(s):
         pass
     return False
 
-# End of file
+# End of file
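
The nested conditionals added in the loop above reduce to one rule per header line: split on hdel, accept only lines that yield exactly two non-blank fields, drop lines whose key starts with an hignore prefix, and convert the value to a float when it parses as one. A minimal standalone sketch of that rule follows; the helper name parse_header_line and the try/except float conversion (used here in place of the module's isfloat helper) are ours, not part of the commit.

def parse_header_line(line, hdel="=", hignore=None):
    """Return (name, value) for a usable header line, or None to skip it.

    Sketch of the rule in the loop above; helper name is ours, and the float
    conversion uses try/except instead of the module's isfloat helper.
    """
    hpair = line.split(hdel)
    # exactly two fields around the delimiter, both non-blank
    if len(hpair) != 2:
        return None
    name, value = hpair[0].strip(), hpair[1].strip()
    if not name or not value:
        return None
    # skip rows whose key starts with any ignore tag
    if hignore and any(name.startswith(tag) for tag in hignore):
        return None
    # store numeric-looking values as floats
    try:
        return name, float(value)
    except ValueError:
        return name, value

print(parse_header_line("qmax = 25.0"))                    # ('qmax', 25.0)
print(parse_header_line("# qmax = 25.0", hignore=["# "]))  # None
print(parse_header_line("composition = CaTiO3"))           # ('composition', 'CaTiO3')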

src/diffpy/utils/tests/test_loaddata.py

Lines changed: 21 additions & 1 deletion
@@ -9,6 +9,7 @@
 from diffpy.utils.tests.testhelpers import datafile
 
 loaddata01 = datafile('loaddata01.txt')
+loaddatawithheaders = datafile('loaddatawithheaders.txt')
 
 ##############################################################################
 class TestLoadData(unittest.TestCase):
@@ -44,9 +45,28 @@ def test_loadData_1column(self):
         self.assertFalse(numpy.array_equal(d1c, d))
         return
 
+
+    def test_loadData_headers(self):
+        """check loadData() with headers options enabled
+        """
+        hignore = ["# ", "// ", "["]  # ignore lines beginning with these strings
+        delimiter = ": "  # what our data should be separated by
+        hdata = loadData(loaddatawithheaders, headers=True, hdel=delimiter, hignore=hignore)
+        # only fourteen lines of data are formatted properly
+        assert len(hdata) == 14
+        # check the following are floats
+        vfloats = ["wavelength", "qmaxinst", "qmin", "qmax", "bgscale"]
+        for name in vfloats:
+            assert isinstance(hdata.get(name), float)
+        # check the following are NOT floats
+        vnfloats = ["composition", "rmax", "rmin", "rstep", "rpoly"]
+        for name in vnfloats:
+            assert not isinstance(hdata.get(name), float)
+
+
 # End of class TestRoutines
 
 if __name__ == '__main__':
     unittest.main()
 
-# End of file
+# End of file
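
For readers without the loaddatawithheaders.txt fixture, here is a hedged usage sketch (not part of the commit): it writes a small temporary file with ':'-delimited header lines followed by a numeric data block, then reads it back both ways. The file contents and header values are made up for illustration.

import os
import tempfile

from diffpy.utils.parsers.loaddata import loadData

# made-up file: a comment, two header lines, then a 12-row two-column data block
lines = ["# generated example", "wavelength: 0.1819", "composition: CaTiO3"]
lines += ["%.1f %.1f" % (i, 2.0 * i) for i in range(12)]
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as fp:
    fp.write("\n".join(lines) + "\n")
    path = fp.name

# headers=True returns the parsed header dictionary instead of the data block
hdata = loadData(path, headers=True, hdel=":", hignore=["# "])
print(hdata)             # {'wavelength': 0.1819, 'composition': 'CaTiO3'}

# the default call still returns the numeric block as a numpy array
x, y = loadData(path, unpack=True)
print(len(x), len(y))    # 12 12

os.remove(path)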
