Extend lammpsdump to accept arbitrary columns (#3608)

pstaerk · Philipp Stärk · hejamu · web-flow · commit 3c83b8fea911 · 2024-03-05T06:58:08.000+11:00
* Extended mdanalysis to accept other attributes as well.

* Able to parse arbitrary columns now.

* Tried to fix most of the pep8 problems.

* First try at testing the additional column part.

* Testing multi read columns as well.

* Fix the no additional columns case.

* Implemented requested changes to docs.

* Implemented the requested changes to the tests.

* Incorporated the PEP8 comments.

* Third round of PEP...

* PEP8 ...

* Authors and changelog.

* Variable renaming issue.

* Hopefully fixed documentation.

* Sphinx

* Hopefully fixed the tests

* Implement input from UGM23

* refine tests

* Small typo

Co-authored-by: Hugo MacDermott-Opeskin &lt;hugomacdermott@gmail.com&gt;

* Removed comment

* Addressed hmacdope's comments regarding issue link

* Addressed hmacdope's comments regarding file paths.

* Added warning if keys are not in lammpsdump file.

* Tested the formatting error of additional_columns.

* Make changed lines comply with pep8

* 80 is indeed longer than 79...

* Fix test

* Fix test

* Don't format `datafiles.py`

* Added test of warning.

---------

Co-authored-by: Philipp Stärk &lt;philipp.staerk@t-online.de&gt;
Co-authored-by: hejamu &lt;info@jaegerhenrik.de&gt;
Co-authored-by: Hugo MacDermott-Opeskin &lt;hugomacdermott@gmail.com&gt;
diff --git a/package/AUTHORS b/package/AUTHORS
@@ -235,6 +235,7 @@ Chronological list of authors
   - Jenna M. Swarthout Goddard
   2024
   - Aditya Keshari
+  - Philipp Stärk
 
 External code
 -------------
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -15,7 +15,7 @@ The rules for this file:
 
 -------------------------------------------------------------------------------
 ??/??/?? IAlibay, HeetVekariya, marinegor, lilyminium, RMeli,
-         ljwoods2, aditya292002
+         ljwoods2, aditya292002, pstaerk
 
  * 2.8.0
 
@@ -30,6 +30,7 @@ Fixes
  * Fix deploy action to use the correct version of the pypi upload action.
 
 Enhancements
+ * Added parsing of arbitrary columns of the LAMMPS dump parser. (Issue #3504)
  * Documented the r0 attribute in the `Contacts` class and added the 
    `n_initial_contacts` attribute, with documentation. (Issue #2604, PR #4415)
 
diff --git a/package/MDAnalysis/coordinates/LAMMPS.py b/package/MDAnalysis/coordinates/LAMMPS.py
@@ -135,6 +135,7 @@
 from ..topology.LAMMPSParser import DATAParser
 from ..exceptions import NoDataError
 from . import base
+import warnings
 
 btype_sections = {'bond':'Bonds', 'angle':'Angles',
                   'dihedral':'Dihedrals', 'improper':'Impropers'}
@@ -458,12 +459,50 @@ class DumpReader(base.ReaderBase):
     """Reads the default `LAMMPS dump format 
     <https://docs.lammps.org/dump.html>`__
 
-    Supports coordinates in various LAMMPS coordinate conventions and both 
-    orthogonal and triclinic simulation box dimensions (for more details see 
-    `documentation <https://docs.lammps.org/Howto_triclinic.html>`__). In 
-    either case, MDAnalysis will always use ``(*A*, *B*, *C*, *alpha*, *beta*,
-    *gamma*)`` to represent the unit cell. Lengths *A*, *B*, *C* are in the 
-    MDAnalysis length unit (Å), and angles are in degrees.
+    Supports coordinates in the LAMMPS "unscaled" (x,y,z), "scaled" (xs,ys,zs),
+    "unwrapped" (xu,yu,zu) and "scaled_unwrapped" (xsu,ysu,zsu) coordinate
+    conventions (see https://docs.lammps.org/dump.html for more details).
+    If `lammps_coordinate_convention='auto'` (default),
+    one will be guessed. Guessing checks whether the coordinates fit each
+    convention in the order "unscaled", "scaled", "unwrapped",
+    "scaled_unwrapped" and whichever set of coordinates is detected first will
+    be used. If coordinates are given in the scaled coordinate convention
+    (xs,ys,zs) or scaled unwrapped coordinate convention (xsu,ysu,zsu) they
+    will automatically be converted from their scaled/fractional representation
+    to their real values.
+
+    Supports both orthogonal and triclinic simulation box dimensions (for more
+    details see https://docs.lammps.org/Howto_triclinic.html). In either case,
+    MDAnalysis will always use ``(*A*, *B*, *C*, *alpha*, *beta*, *gamma*)``
+    to represent the unit cell. Lengths *A*, *B*, *C* are in the MDAnalysis
+    length unit (Å), and angles are in degrees.
+
+    By using the keyword `additional_columns`, you can specify arbitrary data
+    to be read. The keyword expects a list of the names of the columns or
+    `True` to read all additional columns. The results are saved to
+    :attr:`Timestep.data`. For example, if your LAMMPS dump looks like this
+
+    .. code-block::
+
+        ITEM: ATOMS id x y z q l
+        1 2.84 8.17 -25 0.00258855 1.1
+        2 7.1 8.17 -25 6.91952e-05 1.2
+
+    Then you may parse the additional columns `q` and `l` via:
+
+    .. code-block:: python
+
+        u = mda.Universe('structure.data', 'traj.lammpsdump',
+                         additional_columns=['q', 'l'])
+
+    The additional data is then available for each time step via:
+
+    .. code-block:: python
+
+        for ts in u.trajectory:
+            charges = ts.data['q'] # Access additional data, sorted by the id
+            ls = ts.data['l']
+        ...
 
     Parameters
     ----------
@@ -497,6 +536,9 @@ class DumpReader(base.ReaderBase):
     **kwargs
        Other keyword arguments used in :class:`~MDAnalysis.coordinates.base.ReaderBase`
 
+    .. versionchanged:: 2.7.0
+       Reading of arbitrary, additional columns is now supported.
+       (Issue #3608)
     .. versionchanged:: 2.4.0
        Now imports velocities and forces, translates the box to the origin,
        and optionally unwraps trajectories with image flags upon loading.
@@ -510,18 +552,23 @@ class DumpReader(base.ReaderBase):
     format = 'LAMMPSDUMP'
     _conventions = ["auto", "unscaled", "scaled", "unwrapped",
                     "scaled_unwrapped"]
+
     _coordtype_column_names = {
         "unscaled": ["x", "y", "z"],
         "scaled": ["xs", "ys", "zs"],
         "unwrapped": ["xu", "yu", "zu"],
         "scaled_unwrapped": ["xsu", "ysu", "zsu"]
     }
 
+    _parsable_columns = ["id", "vx", "vy", "vz", "fx", "fy", "fz"]
+    for key in _coordtype_column_names.keys():
+        _parsable_columns += _coordtype_column_names[key]
+
     @store_init_arguments
-    def __init__(self, filename, 
+    def __init__(self, filename,
                  lammps_coordinate_convention="auto",
                  unwrap_images=False,
-                 **kwargs):
+                 additional_columns=None, **kwargs):
         super(DumpReader, self).__init__(filename, **kwargs)
 
         root, ext = os.path.splitext(self.filename)
@@ -536,6 +583,16 @@ def __init__(self, filename,
 
         self._unwrap = unwrap_images
 
+        if (util.iterable(additional_columns)
+                or additional_columns is None
+                or additional_columns is True):
+            self._additional_columns = additional_columns
+        else:
+            raise ValueError(f"additional_columns={additional_columns} "
+                             "is not a valid option. Please provide an "
+                             "iterable containing the additional"
+                             "column headers.")
+
         self._cache = {}
 
         self._reopen()
@@ -681,6 +738,25 @@ def _read_next_timestep(self):
             coord_cols.extend(image_cols)
 
         ids = "id" in attr_to_col_ix
+
+        # Create the data arrays for additional attributes which will be saved
+        # under ts.data
+        if self._additional_columns is True:
+            # Parse every column that is not already parsed
+            # elsewhere (total \ parsable)
+            additional_keys = set(attrs).difference(self._parsable_columns)
+        elif self._additional_columns:
+            if not all([key in attrs for key in self._additional_columns]):
+                warnings.warn("Some of the additional columns are not present "
+                              "in the file, they will be ignored")
+            additional_keys = \
+                [key for key in self._additional_columns if key in attrs]
+        else:
+            additional_keys = []
+        for key in additional_keys:
+            ts.data[key] = np.empty(self.n_atoms)
+
+        # Parse all the atoms
         for i in range(self.n_atoms):
             fields = f.readline().split()
             if ids:
@@ -701,12 +777,22 @@ def _read_next_timestep(self):
             if self._has_forces:
                 ts.forces[i] = [fields[dim] for dim in force_cols]
 
+            # Collect additional cols
+            for attribute_key in additional_keys:
+                ts.data[attribute_key][i] = \
+                    fields[attr_to_col_ix[attribute_key]]
+
         order = np.argsort(indices)
         ts.positions = ts.positions[order]
         if self._has_vels:
             ts.velocities = ts.velocities[order]
         if self._has_forces:
             ts.forces = ts.forces[order]
+
+        # Also need to sort the additional keys
+        for attribute_key in additional_keys:
+            ts.data[attribute_key] = ts.data[attribute_key][order]
+
         if (self.lammps_coordinate_convention.startswith("scaled")):
             # if coordinates are given in scaled format, undo that
             ts.positions = distances.transform_StoR(ts.positions,
diff --git a/testsuite/MDAnalysisTests/coordinates/reference.py b/testsuite/MDAnalysisTests/coordinates/reference.py
@@ -25,9 +25,10 @@
 from MDAnalysisTests import datafiles
 from MDAnalysisTests.datafiles import (PDB_small, PDB, LAMMPSdata,
                                        LAMMPSdata2, LAMMPSdcd2,
-                                       LAMMPSdata_mini, PSF_TRICLINIC,
-                                       DCD_TRICLINIC, PSF_NAMD_TRICLINIC,
-                                       DCD_NAMD_TRICLINIC)
+                                       LAMMPSdata_mini,
+                                       LAMMPSdata_additional_columns,
+                                       PSF_TRICLINIC, DCD_TRICLINIC,
+                                       PSF_NAMD_TRICLINIC, DCD_NAMD_TRICLINIC)
 
 
 class RefAdKSmall(object):
@@ -227,3 +228,9 @@ class RefLAMMPSDataMini(object):
                          dtype=np.float32)
     dimensions = np.array([60., 50., 30., 90., 90., 90.], dtype=np.float32)
 
+
+class RefLAMMPSDataAdditionalColumns(object):
+    q = np.array([2.58855e-03, 6.91952e-05, 1.05548e-02, 4.20319e-03,
+                  9.19172e-03, 4.79777e-03, 6.36864e-04, 5.87125e-03,
+                  -2.18125e-03, 6.88910e-03])
+    p = np.array(5 * [1.1, 1.2])
diff --git a/testsuite/MDAnalysisTests/coordinates/test_lammps.py b/testsuite/MDAnalysisTests/coordinates/test_lammps.py
@@ -29,16 +29,18 @@
 import MDAnalysis as mda
 from MDAnalysis import NoDataError
 
-from numpy.testing import (assert_equal, assert_allclose, assert_allclose)
+from numpy.testing import (assert_equal, assert_allclose)
 
 from MDAnalysisTests import make_Universe
 from MDAnalysisTests.coordinates.reference import (
     RefLAMMPSData, RefLAMMPSDataMini, RefLAMMPSDataDCD,
+    RefLAMMPSDataAdditionalColumns
 )
 from MDAnalysisTests.datafiles import (
     LAMMPScnt, LAMMPShyd, LAMMPSdata, LAMMPSdata_mini, LAMMPSdata_triclinic,
     LAMMPSDUMP, LAMMPSDUMP_allcoords, LAMMPSDUMP_nocoords, LAMMPSDUMP_triclinic,
-    LAMMPSDUMP_image_vf, LAMMPS_image_vf
+    LAMMPSDUMP_image_vf, LAMMPS_image_vf, LAMMPSdata_additional_columns,
+    LAMMPSDUMP_additional_columns
 )
 
 
@@ -511,6 +513,46 @@ def u(self, tmpdir, request):
         yield mda.Universe(f, format='LAMMPSDUMP',
                            lammps_coordinate_convention="auto")
 
+    @pytest.fixture()
+    def u_additional_columns_true(self):
+        f = LAMMPSDUMP_additional_columns
+        top = LAMMPSdata_additional_columns
+        return mda.Universe(top, f, format='LAMMPSDUMP',
+                            lammps_coordinate_convention="auto",
+                            additional_columns=True)
+
+    @pytest.fixture()
+    def u_additional_columns_single(self):
+        f = LAMMPSDUMP_additional_columns
+        top = LAMMPSdata_additional_columns
+        return mda.Universe(top, f, format='LAMMPSDUMP',
+                            lammps_coordinate_convention="auto",
+                            additional_columns=['q'])
+
+    @pytest.fixture()
+    def u_additional_columns_multiple(self):
+        f = LAMMPSDUMP_additional_columns
+        top = LAMMPSdata_additional_columns
+        return mda.Universe(top, f, format='LAMMPSDUMP',
+                            lammps_coordinate_convention="auto",
+                            additional_columns=['q', 'p'])
+
+    @pytest.fixture()
+    def u_additional_columns_wrong_format(self):
+        f = LAMMPSDUMP_additional_columns
+        top = LAMMPSdata_additional_columns
+        return mda.Universe(top, f, format='LAMMPSDUMP',
+                            lammps_coordinate_convention="auto",
+                            additional_columns='q')
+
+    @pytest.fixture()
+    def u_additional_columns_not_present(self):
+        f = LAMMPSDUMP_additional_columns
+        top = LAMMPSdata_additional_columns
+        return mda.Universe(top, f, format='LAMMPSDUMP',
+                            lammps_coordinate_convention="auto",
+                            additional_columns=['q', 'w'])
+
     @pytest.fixture()
     def reference_positions(self):
         # manually copied from traj file
@@ -592,6 +634,32 @@ def test_atom_reordering(self, u, reference_positions):
             assert_allclose(atom1.position, atom1_pos-bmin, atol=1e-5)
             assert_allclose(atom13.position, atom13_pos-bmin, atol=1e-5)
 
+    @pytest.mark.parametrize("system, fields", [
+        ('u_additional_columns_true', ['q', 'p']),
+        ('u_additional_columns_single', ['q']),
+        ('u_additional_columns_multiple', ['q', 'p']),
+    ])
+    def test_additional_columns(self, system, fields, request):
+        u = request.getfixturevalue(system)
+        for field in fields:
+            data = u.trajectory[0].data[field]
+            assert_allclose(data,
+                            getattr(RefLAMMPSDataAdditionalColumns, field))
+
+    @pytest.mark.parametrize("system", [
+        ('u_additional_columns_wrong_format'),
+    ])
+    def test_wrong_format_additional_colums(self, system, request):
+        with pytest.raises(ValueError,
+                           match="Please provide an iterable containing"):
+            request.getfixturevalue(system)
+
+    @pytest.mark.parametrize("system", [
+        ('u_additional_columns_not_present'),
+    ])
+    def test_warning(self, system, request):
+        with pytest.warns(match="Some of the additional"):
+            request.getfixturevalue(system)
 
 @pytest.mark.parametrize("convention",
                          ["unscaled", "unwrapped", "scaled_unwrapped"])
diff --git a/testsuite/MDAnalysisTests/data/lammps/additional_columns.data b/testsuite/MDAnalysisTests/data/lammps/additional_columns.data
@@ -0,0 +1,29 @@
+LAMMPS data file via write_data, version 24 Mar 2022, timestep = 500
+
+10 atoms
+1 atom types
+
+0 42.6 xlo xhi
+0 44.2712 ylo yhi
+-25.1 25.1 zlo zhi
+
+Masses
+
+1 12.011
+
+Pair Coeffs # lj/cut/coul/long/omp
+
+1 0.0663 3.5812
+
+Atoms # full
+
+1 2 1 -0.00706800004577013 2.84 8.17 -25 0 0 0
+2 2 1 0.004078816788554217 7.1 8.17 -25 0 0 0
+3 2 1 -0.005824512619752745 2.13 6.94 -25 0 0 0
+4 2 1 0.002812345167059992 6.39 6.94 -25 0 0 0
+5 2 1 -0.004070019151543417 2.84 5.71 -25 0 0 0
+6 2 1 0.004796116641855679 7.1 5.71 -25 0 0 0
+7 2 1 -0.003217742434809291 2.13 4.48 -25 0 0 0
+8 2 1 0.0008273956785370801 6.39 4.48 -25 0 0 0
+9 2 1 -0.0003942558636157474 2.84 3.25 -25 0 0 0
+10 2 1 0.001288716009147968 7.1 3.25 -25 0 0 0
diff --git a/testsuite/MDAnalysisTests/data/lammps/additional_columns.lammpstrj b/testsuite/MDAnalysisTests/data/lammps/additional_columns.lammpstrj
@@ -0,0 +1,19 @@
+ITEM: TIMESTEP
+0
+ITEM: NUMBER OF ATOMS
+10
+ITEM: BOX BOUNDS pp pp ff
+0.0000000000000000e+00 4.2600000000000001e+01
+0.0000000000000000e+00 4.4271200000000000e+01
+-2.5100000000000001e+01 2.5100000000000001e+01
+ITEM: ATOMS id x y z q p
+1 2.84 8.17 -25 0.00258855 1.1
+2 7.1 8.17 -25 6.91952e-05 1.2
+3 2.13 6.94 -25 0.0105548  1.1
+4 6.39 6.94 -25 0.00420319 1.2
+5 2.84 5.71 -25 0.00919172 1.1
+6 7.1 5.71 -25 0.00479777 1.2
+7 2.13 4.48 -25 0.000636864 1.1
+8 6.39 4.48 -25 0.00587125 1.2
+9 2.84 3.25 -25 -0.00218125 1.1
+10 7.1 3.25 -25 0.0068891 1.2
diff --git a/testsuite/MDAnalysisTests/datafiles.py b/testsuite/MDAnalysisTests/datafiles.py
@@ -156,6 +156,7 @@
     "LAMMPSdata_deletedatoms",  # with deleted atoms
     "LAMMPSdata_triclinic",  # lammpsdata file to test triclinic dimension parsing, albite with most atoms deleted
     "LAMMPSdata_PairIJ",  # lammps datafile with a PairIJ Coeffs section
+    "LAMMPSdata_additional_columns",  # structure for the additional column lammpstrj
     "LAMMPSDUMP",
     "LAMMPSDUMP_long",  # lammpsdump file with a few zeros sprinkled in the first column first frame
     "LAMMPSDUMP_allcoords",  # lammpsdump file with all coordinate conventions (x,xs,xu,xsu) present, from LAMMPS rdf example
@@ -166,6 +167,7 @@
     "LAMMPSDUMP_chain1", # Lammps dump file with chain reader
     "LAMMPSDUMP_chain2", # Lammps dump file with chain reader
     "LAMMPS_chain", # Lammps data file with chain reader
+    "LAMMPSDUMP_additional_columns",  # lammpsdump file with additional data (an additional charge column)
     "unordered_res",  # pdb file with resids non sequential
     "GMS_ASYMOPT",  # GAMESS C1  optimization
     "GMS_SYMOPT",   # GAMESS D4h optimization
@@ -552,6 +554,8 @@
 LAMMPSDUMP_chain2 = (_data_ref / "lammps/chain_dump_2.lammpstrj").as_posix()
 LAMMPS_chain = (_data_ref / "lammps/chain_initial.data").as_posix()
 LAMMPSdata_many_bonds = (_data_ref / "lammps/a_lot_of_bond_types.data").as_posix()
+LAMMPSdata_additional_columns = (_data_ref / "lammps/additional_columns.data").as_posix()
+LAMMPSDUMP_additional_columns = (_data_ref / "lammps/additional_columns.lammpstrj").as_posix()
 
 unordered_res = (_data_ref / "unordered_res.pdb").as_posix()
 

Original file line number	Diff line number	Diff line change
`@@ -235,6 +235,7 @@ Chronological list of authors`
`235`	`235`	`- Jenna M. Swarthout Goddard`
`236`	`236`	`2024`
`237`	`237`	`- Aditya Keshari`
	`238`	`+ - Philipp Stärk`
`238`	`239`
`239`	`240`	`External code`
`240`	`241`	`-------------`