Source code for MDAnalysis.topology.ITPParser

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#

r"""
ITP topology parser
===================

Reads a GROMACS ITP_ or TOP_ file to build the system. The topology will
contain atom IDs, segids, residue IDs, residue names, atom names, atom types,
charges, chargegroups, masses (guessed if not found), moltypes, and molnums. 
Bonds, angles, dihedrals and impropers are also read from the file.

If an ITP file is passed without a ``[ molecules ]`` directive, passing 
``infer_system=True`` (the default option) will create a Universe with 
1 molecule of each defined ``moleculetype``. 
If a ``[ molecules ]`` section is present, ``infer_system`` is ignored.

If files are included with the `#include` directive, they will also be read.
If they are not in the working directory, ITPParser will look for them in the
``include_dir`` directory. By default, this is set to
``include_dir="/usr/local/gromacs/share/gromacs/top/"``.
Variables can be defined with the `#define` directive in files, or by passing
in :ref:`keyword arguments <itp-define-kwargs>`.

Examples
--------

    ::
        import MDAnalysis as mda
        from MDAnalysis.tests.datafiles import ITP_tip5p

        #  override charge of HW2 atom (defined in file as HW2_CHARGE)
        u = mda.Universe(ITP_tip5p, HW2_CHARGE=2, infer_system=True)

    .. note::

        AMBER also uses topology files with the .top extension. To use ITPParser
        to read GROMACS top files, pass ``topology_format='ITP'``.

        ::

            import MDAnalysis as mda
            u = mda.Universe('topol.top', topology_format='ITP')


.. _itp-define-kwargs:

Preprocessor variables
----------------------

ITP files are often defined with lines that depend on 
whether a keyword flag is given. For example, this modified TIP5P water file:

    .. code-block:: none

        [ moleculetype ]
        ; molname       nrexcl
        SOL             2

        #ifndef HW1_CHARGE
            #define HW1_CHARGE 0.241
        #endif

        #define HW2_CHARGE 0.241

        [ atoms ]
        ; id    at type res nr  residu name     at name         cg nr   charge
        1       opls_118     1       SOL              OW             1       0
        2       opls_119     1       SOL             HW1             1       HW1_CHARGE
        3       opls_119     1       SOL             HW2             1       HW2_CHARGE
        4       opls_120     1       SOL             LP1             1      -0.241
        5       opls_120     1       SOL             LP2             1      -0.241
        #ifdef EXTRA_ATOMS  ; added for keyword tests
        6       opls_120     1       SOL             LP3             1      -0.241
        7       opls_120     1       SOL             LP4             1       0.241
        #endif


Define these preprocessor variables by passing keyword arguments. Any arguments that you 
pass in *override* any variables defined in the file. For example, the universe below 
will have charges of 3 for the HW1 and HW2 atoms::

    import MDAnalysis as mda
    from MDAnalysis.tests.datafiles import ITP_tip5p

    u = mda.Universe(ITP_tip5p, EXTRA_ATOMS=True, HW1_CHARGE=3, HW2_CHARGE=3)

These keyword variables are **case-sensitive**. Note that if you set keywords to 
``False`` or ``None``, they will be treated as if they are not defined in #ifdef conditions.

For example, the universe below will only have 5 atoms. ::

    u = mda.Universe(ITP_tip5p, EXTRA_ATOMS=False)


.. _ITP: http://manual.gromacs.org/current/reference-manual/topologies/topology-file-formats.html#molecule-itp-file

.. _TOP: http://manual.gromacs.org/current/reference-manual/file-formats.html#top

Classes
-------

.. autoclass:: ITPParser
   :members:
   :inherited-members:

"""
from collections import defaultdict
import os

import logging
import numpy as np

from ..lib.util import openany
from . import guessers
from .base import TopologyReaderBase, change_squash, reduce_singular
from ..core.topologyattrs import (
    Atomids,
    Atomnames,
    Atomtypes,
    Masses,
    Moltypes,
    Molnums,
    Charges,
    Resids,
    Resnums,
    Resnames,
    Segids,
    Bonds,
    Angles,
    Dihedrals,
    Impropers,
    AtomAttr,
)
from ..core.topology import Topology


class Chargegroups(AtomAttr):
    """The charge group for each Atom"""
    attrname = 'chargegroups'
    singular = 'chargegroup'


class GmxTopIterator:
    """
    Iterate over the lines of a TOP/ITP file and its included files

    The iterator strips comments, deals with #ifdef and #ifndef conditions,
    and substitutes defined variables.

    Defined variables passed into ``__init__`` *override* anything defined in the file.
    """

    def __init__(self, path, include_dir, defines):
        self._original_defines = defines
        self.defines = dict(**defines)  # copy
        self.include_dir = include_dir
        self.file_stack = [path]
        self.starting_file = path

    @property
    def current_file(self):
        return self.file_stack[-1]

    def __iter__(self):
        for line in self.iter_from_file(self.starting_file):
            yield line

    def iter_from_file(self, path):
        found_path = self.find_path(path)
        with openany(found_path) as infile:
            self.file_stack.append(infile)

            for line in self.clean_file_lines(infile):
                if line.startswith('#include'):
                    inc = line.split(None, 1)[1][1:-1]
                    for line in self.iter_from_file(inc):
                        yield line
                elif line.startswith('#define'):
                    self.define(line)
                elif line.startswith('#if'):
                    self.do_if(line, infile)
                elif line.startswith('#else'):
                    self.skip_until_endif(infile)
                elif line.startswith('#'):  # ignore #if and others
                    pass
                elif line:
                    line = self.substitute_defined(line)
                    yield line
            self.file_stack.pop()

    def define(self, line):
        try:
            _, variable, value = line.split(None, 2)
        except ValueError:
            _, variable = line.split()
            value = True
        
        # kwargs overrides files
        if variable not in self._original_defines:
            self.defines[variable] = value

    def substitute_defined(self, line):
        split = line.split()
        for k, v in self.defines.items():
            if k in split:
                split[split.index(k)] = str(v)
        line = ' '.join(split)
        return line

    def clean_file_lines(self, infile):
        for line in infile:
            line = line.split(';')[0].strip()  # ; is for comments
            yield line

    def do_if(self, line, infile):
        ifdef, variable = line.split()
        if ifdef == '#ifdef':
            if self.defines.get(variable) in (False, None):
                self.skip_until_else(infile)
        elif ifdef == '#ifndef':
            if self.defines.get(variable) not in (False, None):
                self.skip_until_else(infile)

    def skip_until_else(self, infile):
        """Skip lines until #if condition ends"""
        for line in self.clean_file_lines(infile):
            if line.startswith('#if'):
                self.skip_until_endif(infile)
            elif line.startswith('#endif') or line.startswith('#else'):
                break
        else:
            raise IOError('Missing #endif in {}'.format(self.current_file))
    
    def skip_until_endif(self, infile):
        """Skip lines until #endif"""
        for line in self.clean_file_lines(infile):
            if line.startswith('#if'):
                self.skip_until_endif(infile)
            elif line.startswith('#endif'):
                break
        else:
            raise IOError('Missing #endif in {}'.format(self.current_file))

    def find_path(self, path):
        try:
            # in case of TextIOWrapper
            current_file = self.current_file.name
        except AttributeError:
            current_file = self.current_file

        try:
            path = os.path.abspath(path.name)
        except AttributeError:
            pass
        current_dir = os.path.dirname(current_file)
        dir_path = os.path.join(current_dir, path)
        if os.path.exists(dir_path):
            return dir_path
        include_path = os.path.join(self.include_dir, path)
        if os.path.exists(include_path):
            return include_path
        raise IOError('Could not find {}'.format(path))


class Molecule:
    """Store moleculetype-specific attributes"""

    def __init__(self, name):
        self.name = name
        self.ids = []
        self.types = []
        self.resids = []
        self.resnames = []
        self.names = []
        self.chargegroups = []
        self.charges = []
        self.masses = []

        self.bonds = defaultdict(list)
        self.angles = defaultdict(list)
        self.dihedrals = defaultdict(list)
        self.impropers = defaultdict(list)

        self.parsers = {
            'atoms': self.parse_atoms,
            'bonds': self.parse_bonds,
            'angles': self.parse_angles,
            'dihedrals': self.parse_dihedrals,
            'constraints': self.parse_constraints,
            'settles': self.parse_settles
        }

        self.resolved_residue_attrs = False

    @property
    def atom_order(self):
        return [self.ids, self.types, self.resids, self.resnames, 
                self.names, self.chargegroups, self.charges, 
                self.masses]

    @property
    def params(self):
        return [self.bonds, self.angles, self.dihedrals, self.impropers]
    
    def parse_atoms(self, line):
        values = line.split()
        for lst in self.atom_order:
            try:
                lst.append(values.pop(0))
            except IndexError:  # ran out of values
                lst.append('')
    
    def parse_bonds(self, line):
        self.add_param(line, self.bonds, n_funct=2, 
                       funct_values=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    
    def parse_angles(self, line):
        self.add_param(line, self.angles, n_funct=3, 
                       funct_values=(1, 2, 3, 4, 5, 6, 8, 10))
    
    def parse_dihedrals(self, line):
        dih = self.add_param(line, self.dihedrals, n_funct=4, 
                             funct_values=(1, 3, 5, 8, 9, 10, 11))
        if not dih:
            self.add_param(line, self.impropers, n_funct=4,
                           funct_values=(2, 4))

    def parse_constraints(self, line):
        self.add_param(line, self.bonds, n_funct=2, funct_values=(1, 2))

    def parse_settles(self, line):
        # [ settles ] is a triangular constraint for 
        # water molecules.
        # In ITP files this is defined with only the 
        # oxygen atom index. The next two atoms are 
        # assumed to be hydrogens. Unlike TPRParser,  
        # the manual only lists this format (as of 2019).
        # These are treated as 2 bonds and 1 angle.
        oxygen, funct, doh, dhh = line.split()
        try:
            base = self.index_ids([oxygen])[0]
        except ValueError:
            pass
        else:
            self.bonds[(base, base+1)].append("settles")
            self.bonds[(base, base+2)].append("settles")
            self.angles[(base+1, base, base+2)].append("settles")

    def resolve_residue_attrs(self):
        """Figure out residue borders and assign moltypes and molnums"""
        resids = np.array(self.resids, dtype=np.int32)
        resnames = np.array(self.resnames, dtype=object)
        self.residx, (self.resids, resnames) = change_squash((resids,), (resids, resnames))
        self.resnames = list(resnames)
        self.moltypes = [self.name] * len(self.resids)
        self.molnums = np.array([1] * len(self.resids))

        self.resolved_residue_attrs = True

    def shift_indices(self, atomid=0, resid=0, molnum=0, cgnr=0, n_res=0, n_atoms=0):
        """
        Get attributes ready for adding onto a larger topology.

        Shifts atom indices, residue indices, molnums, and chargegroup numbers.

        Returns
        -------
        atom_attrs: list of lists
            attributes in the [ atoms ] section

        new_params: list of dicts
            Bonds, angles, dihedrals, impropers as dicts of shape {indices: parameters}

        molnums: list
        moltypes: list
        residx: list
        """
        if not self.resolved_residue_attrs:
            self.resolve_residue_attrs()

        resids = list(np.array(self.resids)+resid)
        residx = list(np.array(self.residx)+n_res)
        molnums = list(np.array(self.molnums) + molnum)
        ids = list(np.array(self.ids, dtype=int) + atomid)

        try:
            cg = np.array(self.chargegroups, dtype=int)
        except ValueError:
            cg = np.arange(1, len(self.chargegroups)+1)
        chargegroups = list(cg+cgnr)

        atom_order = [ids, self.types, resids, self.resnames, 
                      self.names, chargegroups, self.charges, 
                      self.masses]

        new_params = []
        for p in self.params:
            new = {}
            for indices, values in p.items():
                new[tuple(np.array(indices)+n_atoms)] = values
            new_params.append(new)

        return atom_order, new_params, molnums, self.moltypes, residx

    def add_param(self, line, container, n_funct=2, funct_values=[]):
        """Add defined GROMACS directive lines, only if the funct is in ``funct_values``"""
        values = line.split()
        funct = int(values[n_funct])
        if funct in funct_values:
            try:
                ids = self.index_ids(values[:n_funct])
                container[ids].append(funct)
            except ValueError:
                pass
            return True
        else:
            return False

    def index_ids(self, values):
        """
        Get indices of atom ids (list of strings)
        """
        return tuple(map(self.ids.index, values))


[docs]class ITPParser(TopologyReaderBase):
    """Read topology information from a GROMACS ITP_ or TOP_ file.

    Creates a Topology with the following Attributes:
    - ids
    - names
    - types
    - masses
    - charges
    - chargegroups
    - resids
    - resnames
    - segids
    - moltypes
    - molnums
    - bonds
    - angles
    - dihedrals
    - impropers

    .. _ITP: http://manual.gromacs.org/current/reference-manual/topologies/topology-file-formats.html#molecule-itp-file
    .. _TOP: http://manual.gromacs.org/current/reference-manual/file-formats.html#top
    """
    format = 'ITP'

[docs]    def parse(self, include_dir='/usr/local/gromacs/share/gromacs/top/',
              infer_system=True,
              **kwargs):
        """Parse ITP file into Topology

        Parameters
        ----------
        include_dir: str, optional
            A directory in which to look for other files included
            from the original file, if the files are not first found
            in the current directory. 
            Default: "/usr/local/gromacs/share/gromacs/top/"

        infer_system: bool, optional (default True)
            If a ``[ molecules ]`` directive is not found within the the
            topology file, create a Topology with one of every 
            ``[ moleculetype ]`` defined. If a ``[ molecules ]`` directive is 
            found, this keyword is ignored.

        Returns
        -------
        MDAnalysis *Topology* object
        """

        self.atomtypes = {}
        self.molecules = {}
        self._molecules = []  # for order
        self.current_mol = None
        self.parser = self._pass
        self.system_molecules = []        

        # Open and check itp validity
        with openany(self.filename) as itpfile:
            self.lines = GmxTopIterator(itpfile, include_dir, kwargs)
            for line in self.lines:
                if '[' in line and ']' in line:
                    section = line.split('[')[1].split(']')[0].strip()

                    if section == 'atomtypes':
                        self.parser = self.parse_atomtypes

                    elif section == 'moleculetype':
                        self.parser = self.parse_moleculetype

                    elif section == 'molecules':
                        self.parser = self.parse_molecules
                    
                    elif self.current_mol:
                        self.parser = self.current_mol.parsers.get(section, self._pass)
                    
                    else:
                        self.parser = self._pass
                
                else:
                    self.parser(line)

        if not self.system_molecules and infer_system:
            self.system_molecules = [x.name for x in self._molecules]

        self.build_system()

        self.types = np.array(self.types)
        self.charges = np.array(self.charges)
        self.masses = np.array(self.masses)

        if not all(self.charges):
            empty = self.charges == ''
            self.charges[empty] = [
                (
                    self.atomtypes.get(x)["charge"]
                    if x in self.atomtypes.keys()
                    else ''
                )
                for x in self.types[empty]
            ]

        if not all(self.masses):
            empty = self.masses == ''
            self.masses[empty] = [
                (
                    self.atomtypes.get(x)["mass"]
                    if x in self.atomtypes.keys()
                    else ''
                )
                for x in self.types[empty]
            ]

        attrs = []
        # atom stuff
        for vals, Attr, dtype in (
            (self.ids, Atomids, np.int32),
            (self.types, Atomtypes, object),
            (self.names, Atomnames, object),
            (self.chargegroups, Chargegroups, np.int32),
            (self.charges, Charges, np.float32),
        ):
            if all(vals):
                attrs.append(Attr(np.array(vals, dtype=dtype)))

        if not all(self.masses):
            empty = self.masses == ''
            self.masses[empty] = guessers.guess_masses(
                guessers.guess_types(self.types)[empty])
            attrs.append(Masses(np.array(self.masses, dtype=np.float64),
                                guessed=True))
        else:
            attrs.append(Masses(np.array(self.masses, dtype=np.float64),
                                guessed=False))

        # residue stuff
        resids = np.array(self.resids, dtype=np.int32)
        resnames = np.array(self.resnames, dtype=object)
        molnums = np.array(self.molnums, dtype=np.int32)
        attrs.append(Resids(resids))
        attrs.append(Resnums(resids.copy()))
        attrs.append(Resnames(resnames))
        attrs.append(Moltypes(np.array(self.moltypes, dtype=object)))
        attrs.append(Molnums(molnums))
        
        n_atoms = len(self.ids)
        n_residues = len(self.resids)
        n_segments = len(self.system_molecules)
        attrs.append(Segids(np.array(self.system_molecules, dtype=object)))
        segidx = molnums-1

        top = Topology(n_atoms, n_residues, n_segments,
                       attrs=attrs,
                       atom_resindex=self.residx,
                       residue_segindex=segidx)

        # connectivity stuff
        for dct, Attr, attrname in (
            (self.bonds, Bonds, 'bonds'),
            (self.angles, Angles, 'angles'),
            (self.dihedrals, Dihedrals, 'dihedrals'),
            (self.impropers, Impropers, 'impropers')
        ):
            if dct:
                indices, types = zip(*list(dct.items()))
            else:
                indices, types = [], []

            types = [reduce_singular(t) for t in types]
            
            tattr = Attr(indices, types=types)
            top.add_TopologyAttr(tattr)

        return top

    def _pass(self, line):
        pass

    def parse_atomtypes(self, line):
        keys = ['type_bonded', 'atomic_number', 'mass', 'charge', 'p_type']
        fields = line.split()
        if len(fields[5]) == 1 and fields[5].isalpha():
            values = fields[1:6]
        elif len(fields[3]) == 1 and fields[3].isalpha():
            values = '', '', fields[1], fields[2], fields[3]
        elif len(fields[4]) == 1 and fields[4].isalpha():
            if fields[1][0].isalpha():
                values = fields[1], '', fields[2], fields[3], fields[4]
            else:
                values = '', fields[1], fields[2], fields[3], fields[4]
        self.atomtypes[fields[0]] = dict(zip(keys, values))

    def parse_moleculetype(self, line):
        name = line.split()[0]
        self.current_mol = self.molecules[name] = Molecule(name)
        self._molecules.append(self.current_mol)

    def parse_molecules(self, line):
        name, n_mol = line.split()
        self.system_molecules.extend([name]*int(n_mol))

    def build_system(self):
        self.ids = []
        self.types = []
        self.resids = []
        self.resnames = []
        self.names = []
        self.chargegroups = []
        self.charges = []
        self.masses = []
        self.moltypes = []
        self.molnums = []
        self.residx = []

        self.atom_order = [self.ids, self.types, self.resids, self.resnames, 
                           self.names, self.chargegroups, self.charges, 
                           self.masses]

        self.bonds = defaultdict(list)
        self.angles = defaultdict(list)
        self.dihedrals = defaultdict(list)
        self.impropers = defaultdict(list)

        self.params = [self.bonds, self.angles, self.dihedrals, self.impropers]

        for i, moltype in enumerate(self.system_molecules):
            mol = self.molecules[moltype]

            atomid = self.ids[-1] if self.ids else 0
            resid = self.resids[-1] if self.resids else 0
            cgnr = self.chargegroups[-1] if self.chargegroups else 0
            n_res = len(self.resids)
            n_atoms = len(self.ids)

            shifted = mol.shift_indices(atomid=atomid, resid=resid, 
                                        n_res=n_res, cgnr=cgnr, molnum=i,
                                        n_atoms=n_atoms)
            atom_order, params, molnums, moltypes, residx = shifted

            for system_attr, mol_attr in zip(self.atom_order, atom_order):
                system_attr.extend(mol_attr)

            self.moltypes.extend(moltypes)
            self.molnums.extend(molnums)
            self.residx.extend(residx)

            for system_param, mol_param in zip(self.params, params):
                system_param.update(mol_param)