Source code for MDAnalysis.topology.TOPParser

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the Lesser GNU Public Licence, v2.1 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#

"""
AMBER PRMTOP topology parser
============================

Reads an AMBER top file to build the system.

Amber keywords are turned into the following attributes:

+----------------------------+----------------------+
| AMBER flag                 | MDAnalysis attribute |
+============================+======================+
| ATOM_NAME                  | names                |
+----------------------------+----------------------+
| CHARGE                     | charges              |
+----------------------------+----------------------+
| ATOMIC_NUMBER              | elements             |
+----------------------------+----------------------+
| MASS                       | masses               |
+----------------------------+----------------------+
| BONDS_INC_HYDROGEN         | bonds                |
| BONDS_WITHOUT_HYDROGEN     |                      |
+----------------------------+----------------------+
| ANGLES_INC_HYDROGEN        | angles               |
| ANGLES_WITHOUT_HYDROGEN    |                      |
+----------------------------+----------------------+
| DIHEDRALS_INC_HYDROGEN     | dihedrals / improper |
| DIHEDRALS_WITHOUT_HYDROGEN |                      |
+----------------------------+----------------------+
| ATOM_TYPE_INDEX            | type_indices         |
+----------------------------+----------------------+
| AMBER_ATOM_TYPE            | types                |
+----------------------------+----------------------+
| RESIDUE_LABEL              | resnames             |
+----------------------------+----------------------+
| RESIDUE_POINTER            | residues             |
+----------------------------+----------------------+

TODO:
  Add support for Chamber-style topologies
  More stringent tests

.. Note::

   The Amber charge is converted to electron charges as used in
   MDAnalysis and other packages. To get back Amber charges, multiply
   by 18.2223.

   Chamber-style Amber topologies (i.e. topologies generated via parmed
   conversion of a CHARMM topology to an AMBER one) are not currently
   supported. Support will likely be added in future MDAnalysis releases.

   As of version 2.0.0, elements are no longer guessed if ATOMIC_NUMBER records
   are missing. In those scenarios, if elements are necessary, users will have
   to invoke the element guessers after parsing the topology file. Please see
   :mod:`MDAnalysis.guessers` for more details.

.. _`PARM parameter/topology file specification`:
   https://ambermd.org/FileFormats.php#topo.cntrl

Classes
-------

.. autoclass:: TOPParser
   :members:
   :inherited-members:

"""
import numpy as np
import itertools

from ..guesser.tables import Z2SYMB
from ..lib.util import openany, FORTRANReader
from .base import TopologyReaderBase, change_squash
from ..core.topology import Topology
from ..core.topologyattrs import (
    Atomnames,
    Atomtypes,
    Atomids,
    Charges,
    Elements,
    Masses,
    Resnames,
    Resids,
    Resnums,
    Segids,
    ChainIDs,
    AtomAttr,
    Bonds,
    Angles,
    Dihedrals,
    Impropers
)

import warnings
import logging

logger = logging.getLogger('MDAnalysis.topology.TOPParser')


class TypeIndices(AtomAttr):
    """Numerical type of each Atom"""
    attrname = 'type_indices'
    singular = 'type_index'
    level = 'atom'



[docs]
class TOPParser(TopologyReaderBase):
    """Reads topology information from an AMBER top file.

    Reads the following Attributes if in topology:
    - Atomnames
    - Charges
    - Masses
    - Elements
    - Atomtypes
    - Resnames
    - Type_indices
    - Bonds
    - Angles
    - Dihedrals (inc. impropers)
    - ChainIDs (from %RESIDUE_CHAINID)
    - Segids (from %RESIDUE_CHAINID)

    The format is defined in `PARM parameter/topology file
    specification`_.  The reader tries to detect if it is a newer
    (AMBER 12?) file format by looking for the flag "ATOMIC_NUMBER".

    .. _`PARM parameter/topology file specification`:
       https://ambermd.org/FileFormats.php#topo.cntrl

    Additionally, the RESIDUE_CHAINID non-standard flag is supported. This
    can be added with the `addPDB`_ command from parmed:

    .. _`addPDB`: https://parmed.github.io/ParmEd/html/parmed.html#addpdb

    Notes
    -----
    Elements are obtained from the atomic numbers (if present). If a given
    input atomic number does not belong to an element (usually either -1 or 0),
    the element will be assigned an empty record.

    .. versionchanged:: 0.7.6
      parses both amber10 and amber12 formats
    .. versionchanged:: 0.19.0
      parses bonds, angles, dihedrals, and impropers
    .. versionchanged:: 1.0.0
      warns users that chamber-style topologies are not currently supported
    .. versionchanged:: 2.0.0
      no longer guesses elements if missing
    .. versionchanged:: 2.7.0
      gets Segments and chainIDs from flag RESIDUE_CHAINID, when present
    """
    format = ['TOP', 'PRMTOP', 'PARM7']


[docs]
    def parse(self, **kwargs):
        """Parse Amber PRMTOP topology file *filename*.

        Returns
        -------
        A MDAnalysis Topology object
        """
        # Sections that we grab as we parse the file
        sections = {
            "ATOM_NAME": (1, 20, self.parse_names, "name", 0),
            "CHARGE": (1, 5, self.parse_charges, "charge", 0),
            "ATOMIC_NUMBER": (1, 10, self.parse_elements, "elements", 0),
            "MASS": (1, 5, self.parse_masses, "mass", 0),
            "ATOM_TYPE_INDEX": (1, 10, self.parse_type_indices, "type_indices",
                                0),
            "AMBER_ATOM_TYPE": (1, 20, self.parse_types, "types", 0),
            "RESIDUE_LABEL": (1, 20, self.parse_resnames, "resname", 11),
            "RESIDUE_POINTER": (1, 10, self.parse_residx, "respoint", 11),
            "BONDS_INC_HYDROGEN": (3, 10, self.parse_bonded, "bondh", 2),
            "BONDS_WITHOUT_HYDROGEN": (3, 10, self.parse_bonded, "bonda", 3),
            "ANGLES_INC_HYDROGEN": (4, 10, self.parse_bonded, "angh", 4),
            "ANGLES_WITHOUT_HYDROGEN": (4, 10, self.parse_bonded, "anga", 5),
            "DIHEDRALS_INC_HYDROGEN": (5, 10, self.parse_bonded, "dihh", 6),
            "DIHEDRALS_WITHOUT_HYDROGEN": (5, 10, self.parse_bonded, "diha", 7),
            "RESIDUE_CHAINID": (1, 20, self.parse_chainids, "segids", 11),
        }

        attrs = {}  # empty dict for attrs that we'll fill

        # Open and check top validity
        # Reading header info POINTERS
        with openany(self.filename) as self.topfile:
            header = next(self.topfile)
            if not header.startswith("%VE"):
                raise ValueError(
                    "{0} is not a valid TOP file. %VE Missing in header"
                    "".format(self.filename))
            title = next(self.topfile).split()

            if not (title[1] == "TITLE"):
                # Raise a separate warning if Chamber-style TOP is detected
                if title[1] == "CTITLE":
                    emsg = ("{0} is detected as a Chamber-style TOP file. "
                            "At this time MDAnalysis does not support such "
                            "topologies".format(self.filename))
                else:
                    emsg = ("{0} is not a valid TOP file. "
                            "'TITLE' missing in header".format(self.filename))
                raise ValueError(emsg)

            while not header.startswith('%FLAG POINTERS'):
                header = next(self.topfile)
            next(self.topfile)

            topremarks = [next(self.topfile).strip() for i in range(4)]
            sys_info = [int(k) for i in topremarks for k in i.split()]

            header = next(self.topfile)
            # grab the next section title
            next_section = header.split("%FLAG")[1].strip()

            while next_section is not None:
                try:
                    (num_per_record, per_line,
                     func, name, sect_num) = sections[next_section]
                except KeyError:
                    def next_getter():
                        return self.skipper()
                else:
                    num = sys_info[sect_num] * num_per_record
                    numlines = (num // per_line)
                    if num % per_line != 0:
                        numlines += 1

                    attrs[name] = func(num_per_record, numlines)

                    def next_getter():
                        return next(self.topfile)

                try:
                    line = next_getter()
                    # Capture case where section is empty w/ 1 empty line
                    if numlines == 0 and not line.strip():
                        line = next_getter()
                except StopIteration:
                    next_section = None
                else:
                    try:
                        next_section = line.split("%FLAG")[1].strip()
                    except IndexError:
                        errmsg = (f"%FLAG section not found, formatting error "
                                  f"for PARM7 file {self.filename} ")
                        raise IndexError(errmsg) from None

        # strip out a few values to play with them
        n_atoms = len(attrs['name'])

        resptrs = attrs.pop('respoint')
        resptrs.append(n_atoms)
        residx = np.zeros(n_atoms, dtype=np.int32)
        for i, (x, y) in enumerate(zip(resptrs[:-1], resptrs[1:])):
            residx[x:y] = i

        n_res = len(attrs['resname'])

        # Deal with recreating bonds and angle records here
        attrs['bonds'] = Bonds([i for i in itertools.chain(
                                attrs.pop('bonda'), attrs.pop('bondh'))])

        attrs['angles'] = Angles([i for i in itertools.chain(
                                attrs.pop('anga'), attrs.pop('angh'))])

        attrs['dihedrals'], attrs['impropers'] = self.parse_dihedrals(
                              attrs.pop('diha'), attrs.pop('dihh'))

        # Warn user if elements not in topology
        if 'elements' not in attrs:
            msg = ("ATOMIC_NUMBER record not found, elements attribute will "
                   "not be populated. If needed these can be guessed using "
                   "universe.guess_TopologyAttrs(to_guess=['elements']).")
            logger.warning(msg)
            warnings.warn(msg)
        elif np.any(attrs['elements'].values == ""):
            # only send out one warning that some elements are unknown
            msg = ("Unknown ATOMIC_NUMBER value found for some atoms, these "
                   "have been given an empty element record. If needed these "
                   "can be guessed using "
                   "universe.guess_TopologyAttrs(to_guess=['elements']).")
            logger.warning(msg)
            warnings.warn(msg)

        # atom ids are mandatory
        attrs['atomids'] = Atomids(np.arange(n_atoms) + 1)
        attrs['resids'] = Resids(np.arange(n_res) + 1)
        attrs['resnums'] = Resnums(np.arange(n_res) + 1)

        # Amber's 'RESIDUE_CHAINID' is a by-residue attribute, turn it into
        # a by-atom attribute when present. See PR #4007.
        if "segids" in attrs and len(attrs["segids"]) == n_res:
            segidx, (segids,) = change_squash((attrs["segids"],), (attrs["segids"],))
            chainids = [attrs["segids"][r] for r in residx]

            attrs["segids"] = Segids(segids)
            attrs["ChainIDs"] = ChainIDs(chainids)
            n_segs = len(segids)
        else:
            if "segids" in attrs:
                msg = (
                    f"Number of residues ({n_res}) does not match number of "
                    f"%RESIDUE_CHAINID ({len(attrs['segids'])}). Skipping section."
                )
                logger.warning(msg)
                warnings.warn(msg)
            attrs["segids"] = Segids(np.array(["SYSTEM"], dtype=object))
            segidx = None
            n_segs = 1

        top = Topology(
            n_atoms,
            n_res,
            n_segs,
            attrs=list(attrs.values()),
            atom_resindex=residx,
            residue_segindex=segidx,
        )

        return top



[docs]
    def skipper(self):
        """TOPParser :class: helper function, skips lines of input parm7 file
        until we find the next %FLAG entry and return that

        Returns
        -------
        line : string
            String containing the current line of the parm7 file
        """
        line = next(self.topfile)
        while not line.startswith("%FLAG"):
            line = next(self.topfile)
        return line



[docs]
    def parse_names(self, num_per_record, numlines):
        """Extracts atoms names from parm7 file

        Parameters
        ----------
        num_per_record : int
            The number of entries for each record in the section (unused input)
        numlines : int
            The number of lines to be parsed in current section

        Returns
        -------
        attr : :class:`Atomnames`
            A :class:`Atomnames` instance containing the names of each atom as
            defined in the parm7 file
        """
        vals = self.parsesection_mapper(numlines, lambda x: x)
        attr = Atomnames(np.array(vals, dtype=object))
        return attr



[docs]
    def parse_resnames(self, num_per_record, numlines):
        """Extracts the names of each residue

        Parameters
        ----------
        num_per_record : int
            The number of entries for each recrod in section (unused input)
        numlines : int
            The number of lines to be parsed in current section

        Returns
        -------
        attr : :class:`Resnames`
            A :class:`Resnames` instance containing the names of each residue
            as defined in the parm7 file
        """
        vals = self.parsesection_mapper(numlines, lambda x: x)
        attr = Resnames(np.array(vals, dtype=object))
        return attr



[docs]
    def parse_charges(self, num_per_record, numlines):
        """Extracts the partial charges for each atom

        Parameters
        ----------
        num_per_record : int
            The number of entries for each record in section (unused input)
        numlines : int
            The number of lines to be parsed in current section

        Returns
        -------
        attr : :class:`Charges`
            A :class:`Charges` instance containing the partial charges of each
            atom as defined in the parm7 file
        """
        vals = self.parsesection_mapper(numlines, lambda x: float(x))
        charges = np.array(vals, dtype=np.float32)
        charges /= 18.2223  # to electron charge units
        attr = Charges(charges)
        return attr



[docs]
    def parse_masses(self, num_per_record, numlines):
        """Extracts the mass of each atom

        Parameters
        ----------
        num_per_record : int
            The number of entries for each record in section (unused input)
        numlines : int
            The number of lines to be parsed in current section

        Returns
        -------
        attr : :class:`Masses`
            A :class:`Masses` instance containing the mass of each atom as
            defined in the parm7 file
        """
        vals = self.parsesection_mapper(numlines, lambda x: float(x))
        attr = Masses(vals)
        return attr



[docs]
    def parse_elements(self, num_per_record, numlines):
        """Extracts the atomic numbers of each atom and converts to element type

        Parameters
        ----------
        num_per_record : int
            The number of entries for each record in section(unused input)
        numlines : int
            The number of lines to be pasred in current section

        Returns
        -------
        attr : :class:`Elements`
            A :class:`Elements` instance containing the element of each atom
            as defined in the parm7 file

        Note
        ----
        If the record contains unknown atomic numbers (e.g. <= 0), these will
        be treated as unknown elements and assigned an empty string value. See
        issues #2306 and #2651 for more details.

        .. versionchanged:: 2.0.0
           Unrecognised elements will now return a empty string. The parser
           will no longer attempt to guess the element by default.
        """

        vals = self.parsesection_mapper(
                numlines,
                lambda x: Z2SYMB[int(x)] if int(x) > 0 else "")
        attr = Elements(np.array(vals, dtype=object))
        return attr



[docs]
    def parse_types(self, num_per_record, numlines):
        """Extracts the force field atom types of each atom

        Parameters
        ----------
        num_per_record : int
            The number of entries for each record in section (unused input)
        numlines : int
            The number of lines to be parsed in current section

        Returns
        -------
        attr : :class:`Atomtypes`
            A :class:`Atomtypes` instance containing the atom types for each
            atom as defined in the parm7 file
        """
        vals = self.parsesection_mapper(numlines, lambda x: x)
        attr = Atomtypes(np.array(vals, dtype=object))
        return attr



[docs]
    def parse_type_indices(self, num_per_record, numlines):
        """Extracts the index of atom types of the each atom involved in Lennard
        Jones (6-12) interactions.

        Parameters
        ----------
        num_per_record : int
            The number of entries for each record in section (unused input)
        numlines : int
            The number of lines to be parsed in current section

        Returns
        -------
        attr :class:`TypeIndices`
            A :class:`TypeIndices` instance containing the LJ 6-12 atom type
            index for each atom
        """
        vals = self.parsesection_mapper(numlines, lambda x: int(x))
        attr = TypeIndices(np.array(vals, dtype=np.int32))
        return attr



[docs]
    def parse_residx(self, num_per_record, numlines):
        """Extracts the residue pointers for each atom

        Parameters
        ----------
        num_per_record : int
            The number of entries for each record in section (unused input)
        numlines : int
            The number of lines to be parsed in current section

        Returns
        -------
        vals : list of int
            A list of zero-formatted residue pointers for each atom
        """
        vals = self.parsesection_mapper(numlines, lambda x: int(x) - 1)
        return vals



[docs]
    def parse_chunks(self, data, chunksize):
        """Helper function to parse AMBER PRMTOP bonds/angles.

        Parameters
        ----------
        data : list of int
            Input list of the parm7 bond/angle section, zero-indexed
        num_per_record : int
            The number of entries for each record in the input list

        Returns
        -------
        vals : list of int tuples
            A list of tuples containing the atoms involved in a given bonded
            interaction

        Note
        ----
        In the parm7 format this information is structured in the following
        format: [ atoms 1:n, internal index ]
        Where 1:n represent the ids of the n atoms involved in the bond/angle
        and the internal index links to a given set of FF parameters.
        Therefore, to extract the required information, we split out the list
        into chunks of size num_per_record, and only extract the atom ids.
        """
        vals = [tuple(data[x:x+chunksize-1])
                for x in range(0, len(data), chunksize)]
        return vals



[docs]
    def parse_bonded(self, num_per_record, numlines):
        """Extracts bond information from PARM7 format files

        Parameters
        ----------
        num_per_record : int
            The number of entries for each record in section
        numlines : int
            The number of lines to be parsed for this section

        Note
        ----
        For the bond/angle sections of parm7 files, the atom numbers are set to
        coordinate array index values. As detailed in `the specification`_,
        to recover the actual atom number, one
        should divide the values by 3 and add 1. Here, since we want to satisfy
        zero-indexing, we only divide by 3.

        .. _`the specification`: https://ambermd.org/FileFormats.php#topo.cntrl
        """
        fields = self.parsesection_mapper(numlines, lambda x: int(x) // 3)
        section = self.parse_chunks(fields, num_per_record)
        return section



[docs]
    def parsesection_mapper(self, numlines, mapper):
        """Parses FORTRAN formatted section, and returns a list of all entries
        in each line

        Parameters
        ----------
        numlines : int
            The number of lines to be parsed in this section
        mapper : lambda operator
            Operator to format entries in current section

        Returns
        -------
        section : list
            A list of all entries in a given parm7 section
        """
        section = []

        def get_fmt(file):
            """ Skips '%COMMENT' lines until it gets the FORMAT specification
            for the section."""
            line = next(file)
            if line[:7] == "%FORMAT":
                return line[8:].split(")")[0]
            elif line[:8] == "%COMMENT":
                return get_fmt(file)
            else:
                raise ValueError(
                    "Invalid header line. Does not begin with either %FLAG, %FORMAT "
                    f"nor %COMMENT:\n{line}"
                )

        # There may be %COMMENT lines between %FLAG and %FORMAT statements. Skip them.
        fmt = get_fmt(self.topfile)
        x = FORTRANReader(fmt)
        for i in range(numlines):
            l = next(self.topfile)
            for j in range(len(x.entries)):
                val = l[x.entries[j].start:x.entries[j].stop].strip()
                if val:
                    section.append(mapper(val))
        return section



[docs]
    def parse_dihedrals(self, diha, dihh):
        """Combines hydrogen and non-hydrogen containing AMBER dihedral lists
        and extracts sublists for conventional dihedrals and improper angles

        Parameters
        ----------
        diha : list of tuples
            The atom ids of dihedrals not involving hydrogens
        dihh : list of tuples
            The atom ids of dihedrals involving hydrogens

        Returns
        -------
        dihedrals : :class:`Dihedrals`
            A :class:`Dihedrals` instance containing a list of all unique
            dihedrals as defined by the parm7 file
        impropers : :class:`Impropers`
            A :class:`Impropers` instance containing a list of all unique
            improper dihedrals as defined by the parm7 file

        Note
        ----
        As detailed in `the specification`_, the dihedral sections
        of parm7 files contain information about both conventional dihedrals
        and impropers. The following must be accounted for:
        1) If the fourth atom in a dihedral entry is given a negative value,
        this indicates that it is an improper.
        2) If the third atom in a dihedral entry is given a negative value,
        this indicates that it 1-4 NB interactions are ignored for this
        dihedrals. This could be due to the dihedral within a ring, or if it is
        part of a multi-term dihedral definition or if it is an improper.

        .. _`the specification`: https://ambermd.org/FileFormats.php#topo.cntrl
        """
        improp = []
        dihed = []
        for i in itertools.chain(diha, dihh):
            if i[3] < 0:
                improp.append(i[:2]+(abs(i[2]),)+(abs(i[3]),))
            elif i[2] < 0:
                vals = i[:2] + (abs(i[2]),) + i[3:]
                dihed.append(vals)
            else:
                dihed.append(i)
        dihed = sorted(set(dihed))
        dihedrals = Dihedrals(dihed)
        impropers = Impropers(improp)
        return dihedrals, impropers



[docs]
    def parse_chainids(self, num_per_record: int, numlines: int):
        """Extracts the chainID of each residue

        Parameters
        ----------
        num_per_record : int
            The number of entries for each record in section (unused input)
        numlines : int
            The number of lines to be parsed in current section

        Returns
        -------
        attr : numpy array
            A numpy array containing the chainID of each residue as defined in
            the parm7 file
        """
        vals = self.parsesection_mapper(numlines, lambda x: x)
        attr = np.array(vals)
        return attr