Skip to content
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
105 commits
Select commit Hold shift + click to select a range
aa2a88f
Start working on MMCIF parser
marinegor May 22, 2024
218cf43
Add first (not working) version of MMCIFReader and MMCIF topology parser
marinegor May 22, 2024
7f78e02
Do some squashing
marinegor May 22, 2024
6682d6e
Remove inherited docs
marinegor May 22, 2024
817f3a0
Try improving the parsing
marinegor May 22, 2024
3cc8c80
Try three independent loops over the model
marinegor May 30, 2024
f1bf325
Merge remote-tracking branch 'upstream/develop' into feature/mmcif
marinegor Jul 25, 2024
d21c220
Add gemmi dependency
marinegor Sep 13, 2024
2a1be15
necessary params
marinegor Sep 20, 2024
77645e6
finished sorting atom attrs
marinegor Sep 20, 2024
91e6942
add function for transformation into *idx
marinegor Sep 20, 2024
9a0c086
oh damn seems to finally be working
marinegor Sep 20, 2024
9c731df
remove TODOs
marinegor Sep 20, 2024
8b40ec7
Remove debug prints
marinegor Sep 20, 2024
bdcbd73
Merge branch 'develop' into feature/mmcif
marinegor Sep 22, 2024
401a4d3
try to pack things into separate class in utils?
marinegor Sep 22, 2024
9c336bd
remove unnecessary functions
marinegor Sep 22, 2024
def88e4
copy all loops into separate functions
marinegor Sep 23, 2024
cabfd37
Move loops over structures into functions
marinegor Sep 23, 2024
4c9d930
Move coordinate fetching into function for the coordinate reader as well
marinegor Sep 23, 2024
184491a
Fix imports
marinegor Sep 23, 2024
3de8565
Start adding documentation
marinegor Sep 30, 2024
ca6ebbb
Reference MMCIFParser in PDBParser
marinegor Oct 1, 2024
45077ad
Add documentation for trajectory and topology parsers
marinegor Oct 1, 2024
9a1a59a
Add mmcif tests
marinegor Oct 2, 2024
27c10d6
Update format specifications
marinegor Oct 2, 2024
950cfcf
Write simple tests
marinegor Oct 2, 2024
8d1a8b5
Merge remote-tracking branch 'upstream/develop' into feature/mmcif
marinegor Oct 24, 2024
ef29338
update github action with gemmi
marinegor Oct 24, 2024
caca17e
fix gemmi import errors
marinegor Oct 24, 2024
f0e49cc
add mmcif testfiles
marinegor Oct 24, 2024
b7ada7c
add mmcif to __all__
marinegor Oct 24, 2024
e80632c
add black instead of ruff
marinegor Oct 25, 2024
10f3124
Merge remote-tracking branch 'origin/feature/mmcif' into feature/mmcif
marinegor Feb 7, 2025
98353fe
fix function signature
marinegor Feb 10, 2025
35fa187
Merge remote-tracking branch 'upstream/develop' into feature/mmcif
marinegor Feb 18, 2025
e68fcce
Add documentation for mmcif coords
marinegor Feb 19, 2025
263e9f1
expand documentation and type annotations
marinegor Feb 20, 2025
ba47d53
add invalid cif and MMCIF rst files
marinegor Feb 20, 2025
9ffb6f2
add mmcif with invalid atom type
marinegor Feb 20, 2025
fcfc6c0
add biopython cif and fix invalid cif formatting
marinegor Feb 20, 2025
0de720e
remove weird docs part
marinegor Feb 20, 2025
236b286
fix fstring
marinegor Feb 20, 2025
b562115
replace version to 2.9.0
marinegor Feb 20, 2025
816b23f
Merge remote-tracking branch 'upstream/develop' into feature/mmcif
marinegor Feb 20, 2025
92ae164
update changelog
marinegor Feb 20, 2025
88c64a3
move gemmi to optional deps
marinegor Feb 20, 2025
59b7e29
fix issue with accidentally updated datafiles
marinegor Feb 20, 2025
f2c23c8
add mmcif to all
marinegor Feb 21, 2025
776676e
Start working on MMCIF parser
marinegor May 22, 2024
71e60f4
Add first (not working) version of MMCIFReader and MMCIF topology parser
marinegor May 22, 2024
36b7125
Do some squashing
marinegor May 22, 2024
b058941
Remove inherited docs
marinegor May 22, 2024
ef30fa7
Try improving the parsing
marinegor May 22, 2024
95572c1
Try three independent loops over the model
marinegor May 30, 2024
a8a9436
Add gemmi dependency
marinegor Sep 13, 2024
6706bbe
necessary params
marinegor Sep 20, 2024
8cf9da4
finished sorting atom attrs
marinegor Sep 20, 2024
f13156b
add function for transformation into *idx
marinegor Sep 20, 2024
dda981c
oh damn seems to finally be working
marinegor Sep 20, 2024
ebdf849
remove TODOs
marinegor Sep 20, 2024
47043f6
Remove debug prints
marinegor Sep 20, 2024
9770d7b
try to pack things into separate class in utils?
marinegor Sep 22, 2024
fd7f70d
remove unnecessary functions
marinegor Sep 22, 2024
1493056
copy all loops into separate functions
marinegor Sep 23, 2024
3d7fbb9
Move loops over structures into functions
marinegor Sep 23, 2024
9b9286e
Move coordinate fetching into function for the coordinate reader as well
marinegor Sep 23, 2024
b8f3c04
Fix imports
marinegor Sep 23, 2024
0f38a2d
Start adding documentation
marinegor Sep 30, 2024
b915aab
Reference MMCIFParser in PDBParser
marinegor Oct 1, 2024
0d61248
Add documentation for trajectory and topology parsers
marinegor Oct 1, 2024
34d76ca
Add mmcif tests
marinegor Oct 2, 2024
b242aa5
Update format specifications
marinegor Oct 2, 2024
4fc3a78
Write simple tests
marinegor Oct 2, 2024
14fa756
fix actions
marinegor Feb 22, 2025
e3a9a1f
fix gemmi import errors
marinegor Oct 24, 2024
d492b4e
add mmcif testfiles
marinegor Oct 24, 2024
1880e4a
add mmcif to __all__
marinegor Oct 24, 2024
927d7a0
add black instead of ruff
marinegor Oct 25, 2024
ad0f0be
fix function signature
marinegor Feb 10, 2025
e03c3e5
Add documentation for mmcif coords
marinegor Feb 19, 2025
4d79205
expand documentation and type annotations
marinegor Feb 20, 2025
32d7cf9
add invalid cif and MMCIF rst files
marinegor Feb 20, 2025
0df8c3a
add mmcif with invalid atom type
marinegor Feb 20, 2025
05c6ea1
add biopython cif and fix invalid cif formatting
marinegor Feb 20, 2025
88dab79
remove weird docs part
marinegor Feb 20, 2025
a82fe52
fix fstring
marinegor Feb 20, 2025
e3f1714
replace version to 2.9.0
marinegor Feb 20, 2025
db46016
fix actions
marinegor Feb 20, 2025
32cd103
fix datafiles
marinegor Feb 22, 2025
805089e
add mmcif to all
marinegor Feb 21, 2025
55c3dbb
add mmcif to coordinates and topology modules
marinegor Feb 22, 2025
cd201d0
update docs following yuxuanzhuang comments
marinegor Feb 22, 2025
81f0b5b
merge remote
marinegor Feb 22, 2025
22d1cca
add linked issues and prs to changelog
marinegor Feb 22, 2025
d1ba434
remove mmcif files from black ignore
marinegor Feb 23, 2025
a03b56f
add tests for multimodel file warnings
marinegor Feb 23, 2025
bd4c255
add tests for cryst1 warnings
marinegor Feb 23, 2025
3d61dc5
black
marinegor Feb 23, 2025
aed9b54
add invalid cif file itself
marinegor Feb 23, 2025
53c51f4
format datafiles with black
marinegor Feb 23, 2025
3e0324c
merge develop
marinegor Oct 22, 2025
205c910
add tests for 1BD2 and other files mentioned in discussion
marinegor Oct 23, 2025
f9f7912
enable linting of mmcif-related files
marinegor Oct 23, 2025
9a90316
add test files
marinegor Oct 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions package/MDAnalysis/coordinates/MMCIF.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import numpy as np
import gemmi
import logging
from . import base

logger = logging.getLogger("MDAnalysis.coordinates.MMCIF")


class MMCIFReader(base.SingleFrameReaderBase):
    """Read the atom coordinates of an MMCIF file as a single frame.

    The file is loaded with ``gemmi.read_structure`` and only the first
    model of the structure is read, so that the number of coordinates
    matches a topology that is built from the first model only.
    """

    format = "MMCIF"
    units = {"time": None, "length": "Angstrom"}

    def _read_first_frame(self):
        """Load the coordinates of the first model into ``self.ts``.

        Sets ``self.n_atoms`` to the number of atoms found and stores the
        coordinates in a new timestep whose frame number is 0.
        """
        structure = gemmi.read_structure(self.filename)
        # Only the first model is read: iterating over every model would
        # concatenate the atoms of all models into one oversized frame.
        model = structure[0]
        coords = np.array(
            [at.pos.tolist() for chain in model for res in chain for at in res]
        )
        self.n_atoms = len(coords)
        self.ts = self._Timestep.from_coordinates(coords, **self._ts_kwargs)
        self.ts.frame = 0

    def Writer(self, filename, n_atoms=None, **kwargs):
        """Always raise ``NotImplementedError``; no writer is provided here."""
        raise NotImplementedError

    def close(self):
        """Do nothing; this method holds no resource to release."""
        pass


class MMCIFWriter(base.WriterBase):
    """Placeholder MMCIF writer that adds nothing to ``base.WriterBase``."""

    # NOTE(review): a reviewer suggested not including this class at this
    # stage, since providing a Writer is optional.
1 change: 1 addition & 0 deletions package/MDAnalysis/coordinates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,3 +791,4 @@ class can choose an appropriate reader automatically.
from . import NAMDBIN
from . import FHIAIMS
from . import TNG
from . import MMCIF
178 changes: 178 additions & 0 deletions package/MDAnalysis/topology/MMCIFParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
"""
MMCIF Topology Parser
=====================
"""

import gemmi
import numpy as np
import warnings
import itertools

from ..core.topology import Topology
from ..core.topologyattrs import (
AltLocs,
Atomids,
Atomnames,
Atomtypes,
ChainIDs,
Elements,
FormalCharges,
ICodes,
Masses,
Occupancies,
RecordTypes,
Resids,
Resnames,
Resnums,
Segids,
Tempfactors,
)
from .base import TopologyReaderBase


def _into_idx(arr: list[int]) -> list[int]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Document what this does, ideally with an example

return [idx for idx, (_, group) in enumerate(itertools.groupby(arr)) for _ in group]


class MMCIFParser(TopologyReaderBase):
    """Build a topology from an MMCIF file loaded with gemmi.

    Only the first model of the file is used. Atoms, residues and segments
    are listed in the chain -> residue -> atom iteration order of that model.
    """

    format = "MMCIF"

    def parse(self, **kwargs):
        """Read the file and return the structure.

        Returns
        -------
        MDAnalysis Topology object

        Raises
        ------
        ValueError
            If the first model contains no atoms, or if a residue is flagged
            as neither ATOM nor HETATM.
        """
        # NOTE(review): reviewers report that a StringIO / NamedStream or a
        # pathlib.Path cannot be used here in place of a path string, unlike
        # with the PDB parser.
        structure = gemmi.read_structure(self.filename)

        if len(structure) > 1:
            warnings.warn(
                f"MMCIF model {self.filename} contains {len(structure)} "
                "different models, but only the first one will be used to "
                "assign the topology"
            )
        model = structure[0]

        # Walk the model exactly once, collecting one row per atom and one
        # row per residue. Each atom row ends with the index of its residue
        # and each residue row ends with the index of its chain; the indices
        # are taken from the iteration itself so that residues sharing a
        # number (or chains sharing a name) are never merged.
        #
        # NOTE(review): a reviewer asked whether all of these fields are
        # guaranteed in a valid PDBx/mmCIF file; reading them in one pass
        # leaves no room for optional columns. TODO confirm (RCSB entry 4AKE
        # was suggested as an example file).
        atom_rows = []
        residue_rows = []
        segids = []
        for segindex, chain in enumerate(model):
            segids.append(chain.name)
            for res in chain:
                resindex = len(residue_rows)
                residue_rows.append(
                    (
                        res.seqid.icode,  # icodes
                        res.seqid.num,  # resids (and resnums)
                        res.name,  # resnames
                        segindex,  # segidx
                    )
                )
                for at in res:
                    atom_rows.append(
                        (
                            at.altloc,  # altlocs
                            at.serial,  # serials
                            at.name,  # names (and atomtypes)
                            chain.name,  # chainids
                            at.element.name,  # elements
                            at.charge,  # formalcharges
                            at.element.weight,  # weights
                            at.occ,  # occupancies
                            res.het_flag,  # record_types
                            at.b_iso,  # tempfactors
                            resindex,  # residx
                        )
                    )

        if not atom_rows:
            raise ValueError(
                f"MMCIF file {self.filename} contains no atoms in its first model"
            )

        # Transpose the rows into one numpy array per column.
        (
            altlocs,
            serials,
            names,
            chainids,
            elements,
            formalcharges,
            weights,
            occupancies,
            record_types,
            tempfactors,
            residx,
        ) = map(np.array, zip(*atom_rows))
        icodes, resids, resnames, segidx = map(np.array, zip(*residue_rows))

        # Atom types are taken from the atom names and residue numbers from
        # the residue ids; copies keep the attributes independent.
        atomtypes = names.copy()
        resnums = resids.copy()

        # fill in altlocs: falsy (empty) entries are replaced by "A"
        altlocs = ["A" if not elem else elem for elem in altlocs]

        # translate the het flag ("A" / "H") into the record type name
        record_names = {"A": "ATOM", "H": "HETATM"}
        record_types = [record_names.get(str(flag)) for flag in record_types]
        if any(elem is None for elem in record_types):
            raise ValueError("Found an atom that is neither ATOM or HETATM")

        attrs = [
            # AtomAttr subclasses
            AltLocs(altlocs),  # at.altloc
            Atomids(serials),  # at.serial
            Atomnames(names),  # at.name
            Atomtypes(atomtypes),  # at.name
            # ---------------------------------------
            ChainIDs(chainids),  # chain.name
            Elements(elements),  # at.element.name
            FormalCharges(formalcharges),  # at.charge
            Masses(weights),  # at.element.weight
            # ---------------------------------------
            Occupancies(occupancies),  # at.occ
            RecordTypes(record_types),  # res.het_flag
            Tempfactors(tempfactors),  # at.b_iso
            #
            # ResidueAttr subclasses
            ICodes(icodes),  # res.seqid.icode
            Resids(resids),  # res.seqid.num
            Resnames(resnames),  # res.name
            Resnums(resnums),  # res.seqid.num
            #
            # SegmentAttr subclasses
            Segids(segids),  # chain.name
        ]

        n_atoms = len(names)
        n_residues = len(resids)
        n_segments = len(segids)

        return Topology(
            n_atoms,
            n_residues,
            n_segments,
            attrs=attrs,
            atom_resindex=residx,
            residue_segindex=segidx,
        )
1 change: 1 addition & 0 deletions package/MDAnalysis/topology/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,4 @@
from . import MinimalParser
from . import ITPParser
from . import FHIAIMSParser
from . import MMCIFParser
1 change: 1 addition & 0 deletions package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ extra_formats = [
"pytng>=0.2.3",
"gsd>3.0.0",
"rdkit>=2020.03.1",
"gemmi", # for mmcif format
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will probably be optional, so other imports will have to respect that too

]
analysis = [
"biopython>=1.80",
Expand Down
Loading