Source code for MDAnalysis.topology.MMTFParser

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#

"""
MMTF Topology Parser
====================

Reads topology data from the `Macromolecular Transmission Format
(MMTF) format`_.  This should generally be a quicker alternative to PDB.

Makes individual models within the MMTF file available via the `models`
attribute on Universe.

.. versionadded:: 0.16.0
.. versionchanged:: 0.20.0
   Can now read files with optional fields missing/empty

Reads the following topology attributes:

Atoms:
 - altLoc
 - atom ID
 - bfactor
 - bonds
 - charge
 - masses (guessed)
 - name
 - occupancy
 - type

Residues:
 - icode
 - resname
 - resid
 - resnum

Segments:
 - segid
 - model

Classes
-------

.. autoclass:: MMTFParser
   :members:

.. _Macromolecular Transmission Format (MMTF) format: https://mmtf.rcsb.org/
"""
from __future__ import absolute_import
from six.moves import zip

from collections import defaultdict
import mmtf
import numpy as np


from . import base
from . import guessers
from ..core.topology import Topology
from ..core.topologyattrs import (
    AltLocs,
    Atomids,
    Atomnames,
    Atomtypes,
    Bfactors,
    Bonds,
    Charges,
    ICodes,
    Masses,
    Occupancies,
    Resids,
    Resnames,
    Resnums,
    Segids,
    SegmentAttr,  # for model
)
from ..core.selection import RangeSelection
from ..due import due, Doi


def _parse_mmtf(fn):
    if fn.endswith('gz'):
        return mmtf.parse_gzip(fn)
    else:
        return mmtf.parse(fn)


class Models(SegmentAttr):
    attrname = 'models'
    singular = 'model'
    transplants = defaultdict(list)

    def models(self):
        """Models in this Universe.

        The MMTF format can define various models for a given structure. The
        topology (eg residue identity) can change between different models,
        resulting in a different number of atoms in each model.

        Returns
        -------
        A list of AtomGroups, each representing a single model.
        """
        model_ids = np.unique(self.segments.models)

        return [self.select_atoms('model {}'.format(i))
                for i in model_ids]

    transplants['Universe'].append(
        ('models', property(models, None, None, models.__doc__)))


class ModelSelection(RangeSelection):
    token = 'model'
    field = 'models'

    def apply(self, group):
        mask = np.zeros(len(group), dtype=np.bool)
        vals = group.models

        for upper, lower in zip(self.uppers, self.lowers):
            if upper is not None:
                thismask = vals >= lower
                thismask &= vals <= upper
            else:
                thismask = vals == lower

            mask |= thismask
        return group[mask].unique


[docs]class MMTFParser(base.TopologyReaderBase):
    format = 'MMTF'

    @staticmethod
    def _format_hint(thing):
        """Can parser read *thing*?

        .. versionadded:: 1.0.0
        """
        return isinstance(thing, mmtf.MMTFDecoder)

    @due.dcite(
        Doi('10.1371/journal.pcbi.1005575'),
        description="MMTF Parser",
        path='MDAnalysis.topology.MMTFParser',
    )
    def parse(self, **kwargs):
        if isinstance(self.filename, mmtf.MMTFDecoder):
            mtop = self.filename
        else:
            mtop = _parse_mmtf(self.filename)

        def iter_atoms(field):
            # iterate through atoms in groups
            for i in mtop.group_type_list:
                g = mtop.group_list[i]
                for val in g[field]:
                    yield val

        natoms = mtop.num_atoms
        nresidues = mtop.num_groups
        nsegments = mtop.num_chains
        attrs = []

        # required
        charges = Charges(list(iter_atoms('formalChargeList')))
        names = Atomnames(list(iter_atoms('atomNameList')))
        types = Atomtypes(list(iter_atoms('elementList')))
        masses = Masses(guessers.guess_masses(types.values), guessed=True)
        attrs.extend([charges, names, types, masses])

        #optional are empty list if empty, sometimes arrays
        if len(mtop.atom_id_list):
            attrs.append(Atomids(mtop.atom_id_list))
        else:
            # must have this attribute for MDA
            attrs.append(Atomids(np.arange(natoms), guessed=True))
        if mtop.alt_loc_list:
            attrs.append(AltLocs([val.replace('\x00', '').strip()
                                  for val in mtop.alt_loc_list]))
        else:
            attrs.append(AltLocs(['']*natoms))
        if len(mtop.b_factor_list):
            attrs.append(Bfactors(mtop.b_factor_list))
        else:
            attrs.append(Bfactors([0]*natoms))
        if len(mtop.occupancy_list):
            attrs.append(Occupancies(mtop.occupancy_list))
        else:
            attrs.append(Occupancies([1]*natoms))

        # Residue things
        # required
        resids = Resids(mtop.group_id_list)
        resnums = Resnums(resids.values.copy())
        resnames = Resnames([mtop.group_list[i]['groupName']
                             for i in mtop.group_type_list])
        attrs.extend([resids, resnums, resnames])
        # optional
        # mmtf empty icode is '\x00' rather than ''
        if mtop.ins_code_list:
            attrs.append(ICodes([val.replace('\x00', '').strip()
                                 for val in mtop.ins_code_list]))
        else:
            attrs.append(ICodes(['']*nresidues))

        # Segment things
        # optional
        if mtop.chain_name_list:
            attrs.append(Segids(mtop.chain_name_list))
        else:
            # required for MDAnalysis
            attrs.append(Segids(['SYSTEM'] * nsegments, guessed=True))

        mods = np.repeat(np.arange(mtop.num_models), mtop.chains_per_model)
        attrs.append(Models(mods))
        #attrs.append(chainids)

        # number of atoms in a given group id
        groupID_2_natoms = {i:len(g['atomNameList'])
                            for i, g in enumerate(mtop.group_list)}
        # mapping of atoms to residues
        resindex = np.repeat(np.arange(nresidues),
                             [groupID_2_natoms[i] for i in mtop.group_type_list])

        # mapping of residues to segments
        segindex = np.repeat(np.arange(nsegments), mtop.groups_per_chain)

        # Bonds
        # bonds are listed as indices within a group,
        # offset pulls out 'global' index
        offset = 0
        bonds = []
        for gtype in mtop.group_type_list:
            g = mtop.group_list[gtype]
            bondlist = g['bondAtomList']

            for x, y in zip(bondlist[1::2], bondlist[::2]):
                if x > y:
                    x, y = y, x  # always have x < y
                bonds.append((x + offset, y + offset))

            offset += groupID_2_natoms[gtype]
        # add inter group bonds
        if not mtop.bond_atom_list is None:  # optional field
            for x, y in zip(mtop.bond_atom_list[1::2],
                            mtop.bond_atom_list[::2]):
                if x > y:
                    x, y = y, x
                bonds.append((x, y))
        attrs.append(Bonds(bonds))

        top = Topology(natoms, nresidues, nsegments,
                       atom_resindex=resindex,
                       residue_segindex=segindex,
                       attrs=attrs)

        return top