Commit 4e5c3f1

Author: Andrew Yang (committed)
Make loadmetadata more robust

1 parent 8af9b7b · commit 4e5c3f1

File tree: 7 files changed (+134, −103 lines)


CHANGELOG.md

Lines changed: 1 addition & 2 deletions
@@ -1,6 +1,6 @@
 # Release notes
 
-## Version 3.2.0 – 2023-8-**
+## Version 3.2.0 – 2023-08-**
 
 ### Added
 
@@ -11,7 +11,6 @@
 
 - loadData function now toggleable. Can return either (a) data read from data blocks or (b) header
 information stored above the data block.
-- Exclude wx from tests.
 
 ### Removed
 

src/diffpy/utils/parsers/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 """
 
 from .loaddata import loadData
-from .loadmetafile import load_PDF_into_db, load_from_db, markup_PDF, apply_schema_to_file, markup_oneline
+from .loadmetafile import serialize_data, deserialize_data, apply_schema_to_file, serial_oneline
 from .resample import resample
 
 # silence the pyflakes syntax checker

src/diffpy/utils/parsers/loaddata.py

Lines changed: 5 additions & 5 deletions
@@ -51,8 +51,8 @@ def loadData(filename, minrows=10, headers=False, hdel='=', hignore=None, **kwargs):
     Note transposing the loaded array as loadData(FILENAME).T
     has the same effect.
 
-    Return a numpy array of the data. If headers enabled, instead returns a
-    dictionary of parameters read from the header.
+    Return a numpy array of the data (data_block). If headers enabled, instead returns a
+    dictionary of parameters read from the header (hddata).
     """
     from numpy import array, loadtxt
     # for storing header data
@@ -145,14 +145,14 @@ def countcolumnsvalues(line):
     # Return an empty array when no data found.
     # loadtxt would otherwise raise an exception on loading from EOF.
     if start is None:
-        rv = array([], dtype=float)
+        data_block = array([], dtype=float)
     else:
         fid.seek(start)
         # always use usecols argument so that loadtxt does not crash
         # in case of trailing delimiters.
         kwargs.setdefault('usecols', list(range(ncvblock[0])))
-        rv = loadtxt(fid, **kwargs)
-    return rv
+        data_block = loadtxt(fid, **kwargs)
+    return data_block
 
 
 class TextDataLoader(object):
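
To make the renaming concrete, here is a minimal usage sketch of loadData's two return modes (a hedged example: "sample.gr" is a hypothetical text file with "key = value" header lines above a whitespace-delimited numeric block, not a file from this commit):

    from diffpy.utils.parsers import loadData

    # default mode: parse the numeric block into a numpy array (the data_block)
    data_block = loadData("sample.gr")

    # headers mode: parse the "key = value" lines above the block into a dict (the hddata)
    hddata = loadData("sample.gr", headers=True, hdel='=')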

src/diffpy/utils/parsers/loadmetafile.py

Lines changed: 86 additions & 77 deletions
@@ -20,68 +20,110 @@
 supported_formats = ['.json']
 
 
-def load_PDF_into_db(dbname, pdfname, hddata: dict, rv: list, show_path=True):
-    """Load an entry consisting of PDF header and base data into a database file.
-
-    Requires hdata and rv generated from loadData.
-
-    dbname -- name of the database file to load an entry into.
-    pdfname -- name of the PDF file.
-    hddata -- Dictionary of PDF metadata generated by loadData.
-    rv -- List of PDF (r, gr) pairs generated by loadData.
-    show_path -- include a PDF_path element in the database entry (default True).
+def serialize_data(filename, hdata: dict, data_table: list, show_path=True, dt_colnames=None, serial_file=None):
+    """Serialize file data into a dictionary. Can also save the dictionary into a serial language file.
+    The dictionary is formatted as {filename: data}.
+
+    Requires hdata and data_table generated from loadData.
+
+    filename -- name of the file whose data is being serialized.
+    hdata -- Dictionary of PDF metadata generated by loadData.
+    data_table -- List storing data parsed by loadData.
+    dt_colnames -- List containing names of each column in data_table. Every name in
+                   dt_colnames will be put into the dictionary as a key with a value
+                   of that column in data_table (stored as a List). Put None for
+                   columns without names. If dt_colnames has fewer non-None entries
+                   than columns in data_table, the pair {'data table': data_table} will be put
+                   in the dictionary. (Default None: only the entry {'data table': data_table}
+                   is added to the dictionary.)
+    show_path -- include a path element in the dictionary entry (default True).
+                 If 'path' is not included in hdata, extract the path from filename.
+    serial_file -- serial language file to dump the dictionary into.
 
     Returns the dictionary loaded from/into the updated database file.
     """
-    # new file or update
-    existing = False
-    if pathlib.Path.is_file(dbname):
-        existing = True
-
-    # collect entry
-    with open(pdfname, 'r') as grfile:
-        data = {}
-
-        # add path
-        grpath = grfile.name
-        if show_path:
-            data.update({'PDF_path': grpath})
 
-        # add r, gr, and header metadata
-        data.update({'r': list(rv[:, 0]), 'gr': list(rv[:, 1])})
-        data.update(hddata)
-
-    # parse name using pathlib and generate json entry
-    name = pathlib.Path(grpath).name
-    entry = {name: data}
+    # compile data_table and hdata together
+    data = {}
 
+    # handle getting the name of the file for a variety of filename types
+    with open(filename, 'r') as file_path:
+        abs_path = pathlib.Path(file_path.name).resolve()
+    # add path to start of data if requested
+    if show_path and 'path' not in hdata.keys():
+        data.update({'path': abs_path.name})
+    # title the entry with the name of the file (taken from the end of the path)
+    title = abs_path.name
+
+    # first add named columns in dt_colnames
+    num_columns = [len(row) for row in data_table]
+    max_columns = max(num_columns)
+    num_col_names = len(dt_colnames)
+    if max_columns < num_col_names:  # assume numpy.loadtxt gives a non-irregular array
+        raise Exception("More entries in dt_colnames than columns in data_table.")
+    named_columns = 0
+    for idx in range(num_col_names):
+        colname = dt_colnames[idx]
+        if colname is not None:
+            data.update({colname: list(data_table[:, idx])})
+            named_columns += 1
+
+    # second add data in the hdata dict
+    data.update(hdata)
+
+    # finally add data_table as an entry named 'data table' if not all columns were parsed
+    if named_columns < max_columns:
+        if 'data table' not in data.keys():
+            data.update({'data table': data_table})
+        else:  # if 'data table' is already a key, keep adding primes to the end
+            dt_name = 'data table'
+            while dt_name in data.keys():
+                dt_name += " prime"
+            data.update({dt_name: data_table})
+
+    # parse the name using pathlib and generate the dictionary entry
+    entry = {title: data}
+
+    # no save
+    if serial_file is None:
+        return entry
+
+    # saving/updating file
     # check if supported type
-    extension = pathlib.Path(dbname).suffix
+    extension = pathlib.Path(serial_file).suffix
     if extension not in supported_formats:
-        raise Exception(f"Format of {dbname} is not supported.")
+        raise Exception(f"Format of {serial_file} is not supported.")
+
+    # new file or update
+    existing = False
+    try:
+        open(serial_file)
+        existing = True
+    except FileNotFoundError:
+        pass
 
     # json
     if extension == '.json':
         # dump if non-existing
         if not existing:
-            with open(dbname, 'w') as jsonfile:
-                pdfs = entry  # for return
-                json.dump(pdfs, jsonfile, indent=2)
+            with open(serial_file, 'w') as jsonfile:
+                file_data = entry  # for return
+                json.dump(file_data, jsonfile, indent=2)
 
         # update if existing
         else:
-            with open(dbname, 'r') as json_read:
-                pdfs = json.load(json_read)
-            pdfs.update(entry)
-            with open(dbname, 'w') as json_write:
+            with open(serial_file, 'r') as json_read:
+                file_data = json.load(json_read)
+            file_data.update(entry)
+            with open(serial_file, 'w') as json_write:
                 # dump to string first for formatting
-                json.dump(pdfs, json_write, indent=2)
+                json.dump(file_data, json_write, indent=2)
 
-    return pdfs
+    return file_data
 
 
-def load_from_db(filename):
-    """Load a dictionary from a database file.
+def deserialize_data(filename):
+    """Load a dictionary from a serial file.
 
     filename -- database file to load from.
 
@@ -101,40 +143,7 @@ def load_from_db(filename):
     return j_dict
 
 
-def markup_PDF(hddata: dict, rv: list, muname=None):
-    # FIXME: may be better suited for REST API package, not diffpy.utils
-    """Put PDF file information into a dictionary.
-
-    hddata -- Dictionary of metadata.
-    rv -- List of (r, gr) pairs.
-    muname -- file to save into (default None, no saving occurs).
-
-    Returns the dictionary loaded from/into markup file.
-    """
-
-    # gather data
-    data = {}
-    data.update({'r': list(rv[:, 0]), 'gr': list(rv[:, 1])})
-    data.update(hddata)
-
-    # return directly
-    if muname is None:
-        return data
-
-    # save to disk when enabled
-    extension = pathlib.Path(muname).suffix
-    if extension not in supported_formats:
-        raise Exception(f"Format of {muname} is not supported.")
-
-    # dumps into file, automatically overwrites
-    if extension == '.json':
-        with open(muname, 'w') as json_write:
-            json.dump(data, json_write, indent=2)
-
-    return data
-
-
-def markup_oneline(filename):
+def serial_oneline(filename):
     """Reformat lists in markup languages to take up only one line.
 
     Works well when only lists are surrounded by square brackets and no other data is comma and newline separated.
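
For orientation, a hedged sketch of how the renamed pair is meant to be used together, assuming "sample.gr" holds a two-column (r, gr) block under a metadata header (the file names are illustrative, not from this commit):

    from diffpy.utils.parsers import loadData, serialize_data, deserialize_data

    hdata = loadData("sample.gr", headers=True)   # header metadata dict
    data_table = loadData("sample.gr")            # numpy data block

    # naming both columns stores them as 'r' and 'gr' lists; passing
    # serial_file also dumps the {filename: data} dictionary to JSON
    entry = serialize_data("sample.gr", hdata, data_table,
                           dt_colnames=['r', 'gr'], serial_file="sample.json")

    # read the JSON back into a plain dictionary
    restored = deserialize_data("sample.json")

Per the docstring, leaving dt_colnames at None instead keeps the whole block under a single 'data table' key, with " prime" appended on key collisions.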
Lines changed: 15 additions & 18 deletions
@@ -1,4 +1,4 @@
-from diffpy.utils.parsers import load_PDF_into_db, load_from_db, markup_PDF, apply_schema_to_file, markup_oneline
+from diffpy.utils.parsers import serialize_data, deserialize_data, apply_schema_to_file, serial_oneline
 from diffpy.utils.parsers import loadData
 from diffpy.utils.tests.testhelpers import datafile
 
@@ -7,47 +7,44 @@
 
 tests_dir = os.path.dirname(os.path.abspath(locals().get('__file__', 'file.py')))
 
-targetjson = datafile('targetdb.json')
-
+targetjson = datafile('targetjson.json')
 schemaname = datafile('strumining.json')
+
 muload = datafile('loadmu.txt')
 targetmu = datafile('targetmu.json')
 
 
 def test_load_gr(tmp_path):
     # generate json and apply schema
-    generatedjson = tmp_path / "generated_db.json"
+    generatedjson = tmp_path / "generated_serialization.json"
     tddbload_list = os.listdir(os.path.join(tests_dir, "testdata", "dbload"))
     tddbload_list.sort()
     for headerfile in tddbload_list:
         headerfile = os.path.join(tests_dir, "testdata", "dbload", headerfile)
         hdata = loadData(headerfile, headers=True)
-        rv = loadData(headerfile)
-        db_data = load_PDF_into_db(generatedjson, headerfile, hdata, rv, show_path=False)
+        data_table = loadData(headerfile)
+        db_data = serialize_data(headerfile, hdata, data_table, dt_colnames=['r', 'gr'],
+                                 show_path=False, serial_file=generatedjson)
     apply_schema_to_file(generatedjson, schemaname, multiple_entries=True)
-    markup_oneline(generatedjson)
+    serial_oneline(generatedjson)
 
     # compare to target
     # first compare if base data is same
     import json
-    target_db_data = load_from_db(targetjson)
+    target_db_data = deserialize_data(targetjson)
     assert target_db_data == db_data
     # then compare file structure/organization
     assert filecmp.cmp(generatedjson, targetjson)
 
 
+# FIXME: tests for REST API, remove after merge
 def test_markup_gr(tmp_path):
     # put into json and apply schema
     generatedmu = tmp_path / "generated_markup.json"
     hdata = loadData(muload, headers=True)
-    rv = loadData(muload)
-    data = markup_PDF(hdata, rv, generatedmu)
-    apply_schema_to_file(generatedmu, schemaname)
-    markup_oneline(generatedmu)
-
-    # check against target
-    # first compare data is same
-    target_data = load_from_db(targetmu)
+    data_table = loadData(muload)
+    data = serialize_data(muload, hdata, data_table, dt_colnames=['r', 'gr'], show_path=False).get('loadmu.txt')
+
+    # compare data is same
+    target_data = deserialize_data(targetmu)
     assert target_data == data
-    # then compare structure
-    assert filecmp.cmp(generatedmu, targetmu)
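
Note how the updated test_markup_gr exercises the in-memory path: without serial_file, serialize_data only returns the {title: data} dictionary, keyed by the file's base name. A self-contained sketch of that lookup (file name illustrative, mirroring the test above):

    from diffpy.utils.parsers import loadData, serialize_data

    hdata = loadData("loadmu.txt", headers=True)
    data_table = loadData("loadmu.txt")

    # no serial_file: nothing is written to disk, the dictionary is returned
    entry = serialize_data("loadmu.txt", hdata, data_table,
                           dt_colnames=['r', 'gr'], show_path=False)
    data = entry.get('loadmu.txt')   # the entry is keyed by the file's base name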
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+{
+  "e1.gr": {
+    "r": [0.0, 1.0, 2.0, 3.0],
+    "gr": [0.0, 0.0, 0.0, 0.0],
+    "qmax": 10.0,
+    "qmin": 0.0,
+    "rmax": 10.0,
+    "rmin": 0.0
+  },
+  "e2.gr": {
+    "r": [0.0, 1.0, 2.0, 3.0],
+    "gr": [1.0, 2.0, 3.0, 4.0],
+    "qmax": 11.0,
+    "qmin": 1.0,
+    "rmax": 11.0,
+    "rmin": 1.0
+  },
+  "e3.gr": {
+    "r": [0.0, 1.0, 2.0, 3.0],
+    "gr": [0.0, 5.0, 4.0, 3.0],
+    "qmax": 12.0,
+    "qmin": 2.0,
+    "rmax": 12.0,
+    "rmin": 2.0
+  }
+}
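
Once a serialized target like the one above exists on disk, deserialize_data reads it back as a plain dictionary. A small hedged example ("target.json" is a placeholder path, since this new file's name is not shown on the page):

    from diffpy.utils.parsers import deserialize_data

    target = deserialize_data("target.json")   # placeholder path
    print(sorted(target))                      # ['e1.gr', 'e2.gr', 'e3.gr']
    print(target['e2.gr']['qmax'])             # 11.0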
