# Source code for MDAnalysis.analysis.leaflet

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#


"""
Leaflet identification --- :mod:`MDAnalysis.analysis.leaflet`
==============================================================

This module implements the *LeafletFinder* algorithm, described in
[Michaud-Agrawal2011]_. It can identify the lipids in a bilayer of
arbitrary shape and topology, including planar and undulating bilayers
under periodic boundary conditions or vesicles.

One can use this information to identify

* the upper and lower leaflet of a *planar membrane* by comparing
  the :meth:`~MDAnalysis.core.groups.AtomGroup.center_of_geometry` of
  the leaflet groups, or

* the outer and inner leaflet of a *vesicle* by comparing histograms
  of distances from the centre of geometry (or possibly simply the
  :meth:`~MDAnalysis.core.groups.AtomGroup.radius_of_gyration`).

See example scripts in the MDAnalysisCookbook_ on how to use
:class:`LeafletFinder`. The function :func:`optimize_cutoff` implements a
(slow) heuristic method to find the best cut off for the LeafletFinder
algorithm.

.. _MDAnalysisCookbook: https://github.com/MDAnalysis/MDAnalysisCookbook/tree/master/examples


Algorithm
---------

1. build a graph of all phosphate distances < cutoff
2. identify the largest connected subgraphs
3. analyse first and second largest graph, which correspond to the leaflets

For further details see [Michaud-Agrawal2011]_.


Classes and Functions
---------------------

.. autoclass:: LeafletFinder
   :members:

.. autofunction:: optimize_cutoff

"""
from __future__ import division, absolute_import

from six.moves import range

import warnings

import numpy as np
import networkx as NX

from .. import core
from . import distances
from .. import selections

from ..due import due, Doi

# Register the LeafletFinder reference (DOI 10.1002/jcc.21787, the
# J. Comput. Chem. 2011 paper listed in the file header) with the ``due``
# object for the module path "MDAnalysis.analysis.leaflet".
due.cite(Doi("10.1002/jcc.21787"),
         description="LeafletFinder algorithm",
         path="MDAnalysis.analysis.leaflet",
         cite_module=True)
# Doi was only needed for the call above; remove it from the module namespace.
del Doi


class LeafletFinder(object):
    """Identify atoms in the same leaflet of a lipid bilayer.

    This class implements the *LeafletFinder* algorithm [Michaud-Agrawal2011]_.

    Parameters
    ----------
    universe : Universe or str
        :class:`MDAnalysis.Universe` or a file name (e.g., in PDB or
        GRO format)
    select : AtomGroup or str
        An AtomGroup instance or a
        :meth:`Universe.select_atoms` selection string
        for atoms that define the lipid head groups, e.g.
        universe.atoms.PO4 or "name PO4" or "name P*"
    cutoff : float (optional)
        head group-defining atoms within a distance of `cutoff`
        Angstroms are deemed to be in the same leaflet [15.0]
    pbc : bool (optional)
        take periodic boundary conditions into account [``False``]
    sparse : bool (optional)
        ``None``: use fastest possible routine; ``True``: use slow
        sparse matrix implementation (for large systems); ``False``:
        use fast :func:`~MDAnalysis.lib.distances.distance_array`
        implementation [``None``].

    Example
    -------
    The components of the graph are stored in the list
    :attr:`LeafletFinder.components`, ordered from the largest to the
    smallest component; the atoms in each component are numbered
    consecutively, starting at 0. To obtain the atoms in the input structure
    use :meth:`LeafletFinder.groups`::

       L = LeafletFinder(PDB, 'name P*')
       leaflet0 = L.groups(0)
       leaflet1 = L.groups(1)

    The residues can be accessed through the standard MDAnalysis mechanism::

       leaflet0.residues

    provides a :class:`~MDAnalysis.core.groups.ResidueGroup`
    instance. Similarly, all atoms in the first leaflet are then ::

       leaflet0.residues.atoms

    .. versionchanged:: 1.0.0
       Changed `selection` keyword to `select`
    """

    def __init__(self, universe, select, cutoff=15.0, pbc=False, sparse=None):
        universe = core.universe.as_Universe(universe)
        self.universe = universe
        # Kept under its historical name; holds whatever was passed as
        # `select` (a selection string or an AtomGroup) and is used in
        # __repr__ and in the preamble written by write_selection().
        self.selectionstring = select
        if isinstance(select, core.groups.AtomGroup):
            self.selection = select
        else:
            self.selection = universe.select_atoms(select)
        self.pbc = pbc
        self.sparse = sparse
        self._init_graph(cutoff)

    def _init_graph(self, cutoff):
        """Store *cutoff* and (re)build the contact graph and its components."""
        self.cutoff = cutoff
        self.graph = self._get_graph()
        self.components = self._get_components()

    # The leaflet detection itself is just: contact matrix -> graph
    # (_get_graph) -> connected components, largest first (_get_components).

    def _get_graph(self):
        """Build graph from adjacency matrix at the given cutoff.

        Automatically select between high and low memory usage versions of
        contact_matrix, depending on :attr:`sparse`:

        * ``True``: only the sparse matrix implementation is used;
        * ``False``: only the dense (numpy) implementation is used; a
          :exc:`ValueError` from it is re-raised after a warning;
        * anything else: dense is tried first, falling back to sparse (with
          a warning) if it raises :exc:`ValueError`.
        """
        # could use self_distance_array to speed up but then need to deal with the sparse indexing
        if self.pbc:
            box = self.universe.trajectory.ts.dimensions
        else:
            box = None
        coord = self.selection.positions
        cutoff = self.cutoff

        def contacts(returntype):
            return distances.contact_matrix(coord, cutoff=cutoff, returntype=returntype, box=box)

        if self.sparse is True:
            # only try sparse
            return NX.Graph(contacts("sparse"))

        try:
            # this works for small-ish systems and depends on system memory
            adj = contacts("numpy")
        except ValueError:      # pragma: no cover
            if self.sparse is False:
                # dense was explicitly requested: report and give up
                warnings.warn('N x N matrix too big, use sparse=True or sparse=None', category=UserWarning,
                              stacklevel=2)
                raise
            # but use a sparse matrix method for larger systems for memory reasons
            warnings.warn(
                'N x N matrix too big - switching to sparse matrix method (works fine, but is currently rather '
                'slow)',
                category=UserWarning, stacklevel=2)
            adj = contacts("sparse")
        return NX.Graph(adj)

    @staticmethod
    def _sorted_by_size(components):
        """Return *components* as sorted numpy arrays, largest component first.

        *components* is any iterable of node collections. The sort is stable,
        so components of equal size keep their incoming relative order.
        """
        arrays = [np.sort(list(component)) for component in components]
        arrays.sort(key=len, reverse=True)
        return arrays

    def _get_components(self):
        """Return connected components (as sorted numpy arrays), sorted by size.

        The ordering is imposed explicitly (largest first) instead of relying
        on the order in which the graph library yields the components.
        """
        return self._sorted_by_size(NX.connected_components(self.graph))

    def update(self, cutoff=None):
        """Update components, possibly with a different *cutoff*"""
        if cutoff is None:
            cutoff = self.cutoff
        self._init_graph(cutoff)

    def sizes(self):
        """Dict of component index with size of component."""
        return {idx: len(component) for idx, component in enumerate(self.components)}

    def groups(self, component_index=None):
        """Return a :class:`MDAnalysis.core.groups.AtomGroup` for *component_index*.

        If no argument is supplied, then a list of all leaflet groups is returned.

        See Also
        --------
        :meth:`LeafletFinder.group`
        :meth:`LeafletFinder.groups_iter`
        """
        if component_index is None:
            return list(self.groups_iter())
        return self.group(component_index)

    def group(self, component_index):
        """Return a :class:`MDAnalysis.core.groups.AtomGroup` for *component_index*."""
        # maybe cache this?
        # Each component is an array of indices into self.selection.
        return self.selection[self.components[component_index]]

    def groups_iter(self):
        """Iterator over all leaflet :meth:`groups`"""
        for component_index in range(len(self.components)):
            yield self.group(component_index)

    def write_selection(self, filename, **kwargs):
        """Write selections for the leaflets to *filename*.

        The format is typically determined by the extension of *filename*
        (e.g. "vmd", "pml", or "ndx" for VMD, PyMol, or Gromacs).

        The keyword arguments `format` and `mode` (default ``'w'``) are
        consumed here; all remaining keyword arguments are passed on to the
        writer. Leaflets are written as ``leaflet_1``, ``leaflet_2``, ...

        See :class:`MDAnalysis.selections.base.SelectionWriter` for all
        options.
        """
        sw = selections.get_writer(filename, kwargs.pop('format', None))
        mode = kwargs.pop('mode', 'w')
        preamble = "leaflets based on select={selectionstring!r} cutoff={cutoff:f}\n".format(
            selectionstring=self.selectionstring, cutoff=self.cutoff)
        with sw(filename, mode=mode, preamble=preamble, **kwargs) as writer:
            for number, ag in enumerate(self.groups_iter(), 1):
                writer.write(ag, name="leaflet_{0:d}".format(number))

    def __repr__(self):
        return "<LeafletFinder({0!r}, cutoff={1:.1f} A) with {2:d} atoms in {3:d} groups>".format(
            self.selectionstring, self.cutoff, self.selection.n_atoms,
            len(self.components))


def optimize_cutoff(universe, select, dmin=10.0, dmax=20.0, step=0.5,
                    max_imbalance=0.2, **kwargs):
    r"""Find cutoff that minimizes number of disconnected groups.

    Applies heuristics to find best groups:

    1. at least two groups (assumes that there are at least 2 leaflets)
    2. reject any solutions for which:

       .. math::

              \frac{|N_0 - N_1|}{|N_0 + N_1|} > \mathrm{max\_imbalance}

       with :math:`N_0` and :math:`N_1` being the number of lipids in the
       two largest groups. This heuristic picks groups with balanced
       numbers of lipids.

    Parameters
    ----------
    universe : Universe
        :class:`MDAnalysis.Universe` instance
    select : AtomGroup or str
        AtomGroup or selection string as used for :class:`LeafletFinder`
    dmin : float (optional)
    dmax : float (optional)
    step : float (optional)
        scan cutoffs from `dmin` to `dmax` at stepsize `step` (in Angstroms)
    max_imbalance : float (optional)
        tuning parameter for the balancing heuristic [0.2]
    kwargs : other keyword arguments
        other arguments for  :class:`LeafletFinder`

    Returns
    -------
    (cutoff, N)
         optimum cutoff and number of groups found

    Raises
    ------
    ValueError
         if no scanned cutoff satisfies the heuristics (including the case
         where the scan range is empty)


    .. versionchanged:: 1.0.0
       Changed `selection` keyword to `select`
    """
    kwargs.pop('cutoff', None)  # not used, so we filter it
    candidates = []
    for cutoff in np.arange(dmin, dmax, step):
        LF = LeafletFinder(universe, select, cutoff=cutoff, **kwargs)
        # heuristic:
        #  1) N > 1
        #  2) no imbalance between large groups:
        sizes = LF.sizes()
        if len(sizes) < 2:
            continue
        # The two biggest groups are assumed to be the leaflets. Select them
        # explicitly so the heuristic does not depend on the order in which
        # LeafletFinder happens to list its components.
        n0, n1 = sorted(sizes.values(), reverse=True)[:2]
        imbalance = abs(n0 - n1) / float(n0 + n1)
        if imbalance > max_imbalance:
            continue
        candidates.append((cutoff, len(sizes)))
    if not candidates:
        # np.rec.fromrecords() fails obscurely on an empty list; say what
        # actually went wrong instead.
        raise ValueError(
            "No cutoff between dmin={0} and dmax={1} (step={2}) produced at "
            "least two groups with imbalance <= {3}".format(
                dmin, dmax, step, max_imbalance))
    results = np.rec.fromrecords(candidates, names="cutoff,N")
    results.sort(order=["N", "cutoff"])  # sort ascending by N, then cutoff
    return results[0]  # (cutoff,N) with N>1 and shortest cutoff