Source code for MDAnalysis.analysis.dssp.pydssp_numpy

"""
A re-implementation of DSSP algorithm :footcite:p:`Kabsch1983`, taken from
*pydssp* v.0.9.0 (https://github.com/ShintaroMinami/PyDSSP) by Shintaro Minami,
distributed under MIT license.

Current implementation doesn't use `einops` as a dependency, instead directly
using `numpy` operations for axis rearrangement. However, this implementation
does not allow for batch computation, in contrast with `pydssp`, since it's
designed to be used in per-frame manner in protein trajectories.
"""

import numpy as np

CONST_Q1Q2 = 0.084
CONST_F = 332
DEFAULT_CUTOFF = -0.5
DEFAULT_MARGIN = 1.0


def _upsample(a: np.ndarray, window: int) -> np.ndarray:
    """Performs array upsampling with given window along given axis.

    Example
    -------
    .. code-block:: python
        hbmap = np.arange(4*4).reshape(4,4)
        print(hbmap)
        # [[ 0  1  2  3]
        #  [ 4  5  6  7]
        #  [ 8  9 10 11]
        #  [12 13 14 15]]

        print(_upsample(hbmap))
        # [[[[ 0  1  2]
        #    [ 4  5  6]
        #    [ 8  9 10]]

        #   [[ 1  2  3]
        #    [ 5  6  7]
        #    [ 9 10 11]]]


        #  [[[ 4  5  6]
        #    [ 8  9 10]
        #    [12 13 14]]

        #   [[ 5  6  7]
        #    [ 9 10 11]
        #    [13 14 15]]]]

    Parameters
    ----------
    a : np.ndarray
        input array
    window : int
        upsample window

    Returns
    -------
    np.ndarray
        unfolded array
    """
    return _unfold(_unfold(a, window, -2), window, -2)


def _unfold(a: np.ndarray, window: int, axis: int):
    "Helper function for 2D array upsampling"
    idx = np.arange(window)[:, None] + np.arange(a.shape[axis] - window + 1)[None, :]
    unfolded = np.take(a, idx, axis=axis)
    return np.moveaxis(unfolded, axis - 1, -1)


def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray:
    """Fills in hydrogen atoms positions if they are abscent, under the
    assumption that C-N-H and H-N-CA angles are perfect 120 degrees,
    and N-H bond length is 1.01 A.

    Parameters
    ----------
    coord : np.ndarray
        input coordinates in Angstrom, shape (n_atoms, 4, 3),
        where second axes corresponds to (N, CA, C, O) atom coordinates

    Returns
    -------
    np.ndarray
        coordinates of additional hydrogens, shape (n_atoms-1, 3)

    .. versionadded:: 2.8.0
    """
    # C_i, N_i, H_i and CA_{i+1} are all in the peptide bond plane
    # we wanna get C_{i+1} - N_{i} vectors and normalize them
    # ---------
    # v1 = vec(C_i, N_i)
    # v2 = vec(CA_{i+1}, N_i)
    # v3 = vec(N_i, H_i) = ?
    # we use the assumption that all the angles are 120 degrees,
    # and |v3| = 1.01, hence
    # we can derive v3 = (v1/|v1| + v2/|v2|)*|v3|

    # get v1 = vec(C_i, N_i)
    vec_cn = coord[1:, 0] - coord[:-1, 2]
    vec_cn = vec_cn / np.linalg.norm(vec_cn, axis=-1, keepdims=True)

    # get v2 = vec(CA_{i+1}, N_{i})
    vec_can = coord[1:, 0] - coord[1:, 1]
    vec_can = vec_can / np.linalg.norm(vec_can, axis=-1, keepdims=True)

    vec_nh = vec_cn + vec_can
    vec_nh = vec_nh / np.linalg.norm(vec_nh, axis=-1, keepdims=True)

    # vec_(0, H) = vec(0, N) + vec_nh
    return coord[1:, 0] + 1.01 * vec_nh


def get_hbond_map(
    coord: np.ndarray,
    cutoff: float = DEFAULT_CUTOFF,
    margin: float = DEFAULT_MARGIN,
    return_e: bool = False,
) -> np.ndarray:
    """Returns hydrogen bond map

    Parameters
    ----------
    coord : np.ndarray
        input coordinates in either (n, 4, 3) or (n, 5, 3) shape
        (without or with hydrogens). If hydrogens are not present, then
        ideal positions (see :func:_get_hydrogen_atom_positions) are used.
    cutoff : float, optional
        cutoff, by default DEFAULT_CUTOFF
    margin : float, optional
        margin, by default DEFAULT_MARGIN
    return_e : bool, optional
        if to return energy instead of hbond map, by default False

    Returns
    -------
    np.ndarray
        output hbond map or energy depending on return_e param


    .. versionadded:: 2.8.0
    """
    n_atoms, n_atom_types, _ = coord.shape
    assert n_atom_types in (
        4,
        5,
    ), "Number of atoms should be 4 (N,CA,C,O) or 5 (N,CA,C,O,H)"

    if n_atom_types == 4:
        h_1 = _get_hydrogen_atom_position(coord)
    elif n_atom_types == 5:
        h_1 = coord[1:, 4]
        coord = coord[:, :4]
    else:  # pragma: no cover
        raise ValueError("Number of atoms should be 4 (N,CA,C,O) or 5 (N,CA,C,O,H)")
    # after this:
    # h.shape == (n_atoms, 3)
    # coord.shape == (n_atoms, 4, 3)

    # distance matrix
    n_1, c_0, o_0 = coord[1:, 0], coord[0:-1, 2], coord[0:-1, 3]

    n = n_atoms - 1
    cmap = np.tile(c_0, (n, 1, 1))
    omap = np.tile(o_0, (n, 1, 1))
    nmap = np.tile(n_1, (1, 1, n)).reshape(n, n, 3)
    hmap = np.tile(h_1, (1, 1, n)).reshape(n, n, 3)

    d_on = np.linalg.norm(omap - nmap, axis=-1)
    d_ch = np.linalg.norm(cmap - hmap, axis=-1)
    d_oh = np.linalg.norm(omap - hmap, axis=-1)
    d_cn = np.linalg.norm(cmap - nmap, axis=-1)

    # electrostatic interaction energy
    # e[i, j] = e(CO_i) - e(NH_j)
    e = np.pad(
        CONST_Q1Q2 * (1.0 / d_on + 1.0 / d_ch - 1.0 / d_oh - 1.0 / d_cn) * CONST_F,
        [[1, 0], [0, 1]],
    )

    if return_e:  # pragma: no cover
        return e

    # mask for local pairs (i,i), (i,i+1), (i,i+2)
    local_mask = ~np.eye(n_atoms, dtype=bool)
    local_mask *= ~np.diag(np.ones(n_atoms - 1, dtype=bool), k=-1)
    local_mask *= ~np.diag(np.ones(n_atoms - 2, dtype=bool), k=-2)
    # hydrogen bond map (continuous value extension of original definition)
    hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin)
    hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2
    hbond_map = hbond_map * local_mask

    return hbond_map



[docs]
def assign(coord: np.ndarray) -> np.ndarray:
    """Assigns secondary structure for a given coordinate array,
    either with or without assigned hydrogens

    Parameters
    ----------
    coord : np.ndarray
        input coordinates in either (n, 4, 3) or (n, 5, 3) shape,
        without or with hydrogens, respectively. Second dimension `k` represents
        (N, CA, C, O) atoms coordinates (if k=4), or (N, CA, C, O, H) coordinates
        (when k=5).

    Returns
    -------
    np.ndarray
        output (n,) array with one-hot labels in C3 notation ('-', 'H', 'E'),
        representing loop, helix and sheet, respectively.


    .. versionadded:: 2.8.0
    """
    # get hydrogen bond map
    hbmap = get_hbond_map(coord)
    hbmap = np.swapaxes(hbmap, -1, -2)  # convert into "i:C=O, j:N-H" form

    # identify turn 3, 4, 5
    turn3 = np.diagonal(hbmap, offset=3) > 0.0
    turn4 = np.diagonal(hbmap, offset=4) > 0.0
    turn5 = np.diagonal(hbmap, offset=5) > 0.0

    # assignment of helical secondary structures
    h3 = np.pad(turn3[:-1] * turn3[1:], [[1, 3]])
    h4 = np.pad(turn4[:-1] * turn4[1:], [[1, 4]])
    h5 = np.pad(turn5[:-1] * turn5[1:], [[1, 5]])

    # helix4 first, as alpha helix
    helix4 = h4 + np.roll(h4, 1, 0) + np.roll(h4, 2, 0) + np.roll(h4, 3, 0)
    h3 = h3 * ~np.roll(helix4, -1, 0) * ~helix4  # helix4 is higher prioritized
    h5 = h5 * ~np.roll(helix4, -1, 0) * ~helix4  # helix4 is higher prioritized
    helix3 = h3 + np.roll(h3, 1, 0) + np.roll(h3, 2, 0)
    helix5 = (
        h5
        + np.roll(h5, 1, 0)
        + np.roll(h5, 2, 0)
        + np.roll(h5, 3, 0)
        + np.roll(h5, 4, 0)
    )

    # identify bridge
    unfoldmap = _upsample(hbmap, 3) > 0.0
    unfoldmap_rev = np.swapaxes(unfoldmap, 0, 1)

    p_bridge = (unfoldmap[:, :, 0, 1] * unfoldmap_rev[:, :, 1, 2]) + (
        unfoldmap_rev[:, :, 0, 1] * unfoldmap[:, :, 1, 2]
    )
    p_bridge = np.pad(p_bridge, [[1, 1], [1, 1]])

    a_bridge = (unfoldmap[:, :, 1, 1] * unfoldmap_rev[:, :, 1, 1]) + (
        unfoldmap[:, :, 0, 2] * unfoldmap_rev[:, :, 0, 2]
    )
    a_bridge = np.pad(a_bridge, [[1, 1], [1, 1]])

    # ladder
    ladder = (p_bridge + a_bridge).sum(-1) > 0.0

    # H, E, L of C3
    helix = (helix3 + helix4 + helix5) > 0.0
    strand = ladder
    loop = ~helix * ~strand

    onehot = np.stack([loop, helix, strand], axis=-1)
    return onehot