Source code for MDAnalysis.visualization.streamlines_3D

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#

"""Streamplots (3D) --- :mod:`MDAnalysis.visualization.streamlines_3D`
===================================================================

:Authors: Tyler Reddy and Matthieu Chavent
:Year: 2014
:Copyright: GNU Public License v3


The :func:`generate_streamlines_3d` function can generate a 3D flow field from
a MD trajectory, for instance, lipid molecules in a virus capsid. It can make
use of multiple cores to perform the analyis in parallel (using
:mod:`multiprocessing`).

.. rubric: References

.. bibliography::
    :filter: False
    :style: MDA
    :keyprefix: b-
    :labelprefix: ᵇ

    Chavent2014

See Also
--------
MDAnalysis.visualization.streamlines : streamplots in 2D


.. autofunction:: generate_streamlines_3d

"""
import multiprocessing

import numpy as np
import numpy.testing
import scipy
import scipy.spatial.distance

import MDAnalysis


def determine_container_limits(topology_file_path, trajectory_file_path,
                               buffer_value):
    """Calculate the extent of the atom coordinates + buffer.

    A function for the parent process which should take the input trajectory
    and calculate the limits of the container for the system and return these
    limits.

    Parameters
    ----------
    topology_file_path : str
        topology file name
    trajectory_file_path : str
        trajectory file name
    buffer_value : float
        buffer value (padding) in +/- {x, y, z}
    """
    universe_object = MDAnalysis.Universe(topology_file_path, trajectory_file_path)
    all_atom_selection = universe_object.select_atoms('all')  # select all particles
    all_atom_coordinate_array = all_atom_selection.positions
    x_min, x_max, y_min, y_max, z_min, z_max = [
        all_atom_coordinate_array[..., 0].min(),
        all_atom_coordinate_array[..., 0].max(), all_atom_coordinate_array[..., 1].min(),
        all_atom_coordinate_array[..., 1].max(), all_atom_coordinate_array[..., 2].min(),
        all_atom_coordinate_array[..., 2].max()]
    tuple_of_limits = \
        (
            x_min - buffer_value,
            x_max + buffer_value, y_min - buffer_value, y_max + buffer_value, z_min - buffer_value,
            z_max + buffer_value)  # using buffer_value to catch particles near edges
    return tuple_of_limits


def produce_grid(tuple_of_limits, grid_spacing):
    """Produce a 3D grid for the simulation system.

    The partitioning is based on the tuple of Cartesian Coordinate limits
    calculated in an earlier step.

    Parameters
    ----------
    tuple_of_limits : tuple
        ``x_min, x_max, y_min, y_max, z_min, z_max``
    grid_spacing : float
        grid size in all directions in ångström

    Returns
    -------
    grid : array
        ``numpy.mgrid[x_min:x_max:grid_spacing, y_min:y_max:grid_spacing, z_min:z_max:grid_spacing]``

    """
    x_min, x_max, y_min, y_max, z_min, z_max = tuple_of_limits
    grid = np.mgrid[x_min:x_max:grid_spacing, y_min:y_max:grid_spacing, z_min:z_max:grid_spacing]
    return grid


def split_grid(grid, num_cores):
    """Split the grid into blocks of vertices.

    Take the overall `grid` for the system and split it into lists of cube
    vertices that can be distributed to each core.

    Parameters
    ----------
    grid : numpy.array
        3D array
    num_cores : int
        number of partitions to generate

    Returns
    -------
    list_dictionaries_for_cores : list of dict
    total_cubes : int
    num_sheets : int
    delta_array_shape : tuple

    """
    # unpack the x,y,z mgrid arrays
    x, y, z = grid
    num_z_values = z.shape[-1]
    num_sheets = z.shape[0]
    delta_array_shape = tuple(
        [n - 1 for n in x.shape])  # the final target shape for return delta arrays is n-1 in each dimension

    ordered_list_per_sheet_x_values = []
    for x_sheet in x:  # each x_sheet should have shape (25,23) and the same x value in each element
        array_all_x_values_current_sheet = x_sheet.flatten()
        ordered_list_per_sheet_x_values.append(array_all_x_values_current_sheet)
    ordered_list_per_sheet_y_values = []
    for y_columns in y:
        array_all_y_values_current_sheet = y_columns.flatten()
        ordered_list_per_sheet_y_values.append(array_all_y_values_current_sheet)
    ordered_list_per_sheet_z_values = []
    for z_slices in z:
        array_all_z_values_current_sheet = z_slices.flatten()
        ordered_list_per_sheet_z_values.append(array_all_z_values_current_sheet)

    ordered_list_cartesian_coordinates_per_sheet = []
    for x_sheet_coords, y_sheet_coords, z_sheet_coords in zip(ordered_list_per_sheet_x_values,
                                                              ordered_list_per_sheet_y_values,
                                                              ordered_list_per_sheet_z_values):
        ordered_list_cartesian_coordinates_per_sheet.append(list(zip(x_sheet_coords, y_sheet_coords, z_sheet_coords)))
    array_ordered_cartesian_coords_per_sheet = np.array(ordered_list_cartesian_coordinates_per_sheet)
    #now I'm going to want to build cubes in an ordered fashion, and in such a way that I can track the index /
    # centroid of each cube for domain decomposition / reconstruction and mayavi mlab.flow() input
    #cubes will be formed from N - 1 base sheets combined with subsequent sheets
    current_base_sheet = 0
    dictionary_cubes_centroids_indices = {}
    cube_counter = 0
    while current_base_sheet < num_sheets - 1:
        current_base_sheet_array = array_ordered_cartesian_coords_per_sheet[current_base_sheet]
        current_top_sheet_array = array_ordered_cartesian_coords_per_sheet[
            current_base_sheet + 1]  # the points of the sheet 'to the right' in the grid
        current_index = 0
        while current_index < current_base_sheet_array.shape[0] - num_z_values:
            # iterate through all the indices in each of the sheet arrays (careful to avoid extra
            # points not needed for cubes)
            column_z_level = 0  # start at the bottom of a given 4-point column and work up
            while column_z_level < num_z_values - 1:
                current_list_cube_vertices = []
                first_two_vertices_base_sheet = current_base_sheet_array[current_index:current_index + 2, ...].tolist()
                first_two_vertices_top_sheet = current_top_sheet_array[current_index:current_index + 2, ...].tolist()
                next_two_vertices_base_sheet = current_base_sheet_array[current_index +
                                                                        num_z_values: 2 +
                                                                        num_z_values + current_index, ...].tolist()
                next_two_vertices_top_sheet = current_top_sheet_array[current_index +
                                                                      num_z_values: 2 +
                                                                      num_z_values + current_index, ...].tolist()
                for vertex_set in [
                    first_two_vertices_base_sheet, first_two_vertices_top_sheet,
                    next_two_vertices_base_sheet, next_two_vertices_top_sheet
                ]:
                    current_list_cube_vertices.extend(vertex_set)
                vertex_array = np.array(current_list_cube_vertices)
                assert vertex_array.shape == (8, 3), "vertex_array has incorrect shape"
                cube_centroid = np.average(np.array(current_list_cube_vertices), axis=0)
                dictionary_cubes_centroids_indices[cube_counter] = {
                    'centroid': cube_centroid,
                    'vertex_list': current_list_cube_vertices}
                cube_counter += 1
                current_index += 1
                column_z_level += 1
                if column_z_level == num_z_values - 1:  # the loop will break but I should also increment the
                    # current_index
                    current_index += 1
        current_base_sheet += 1
    total_cubes = len(dictionary_cubes_centroids_indices)

    #produce an array of pseudo cube indices (actually the dictionary keys which are cube numbers in string format):
    pseudo_cube_indices = np.arange(0, total_cubes)
    sublist_of_cube_indices_per_core = np.array_split(pseudo_cube_indices, num_cores)
    #now, the split of pseudoindices seems to work well, and the above sublist_of_cube_indices_per_core is a list of
    # arrays of cube numbers / keys in the original dictionary
    #now I think I'll try to produce a list of dictionaries that each contain their assigned cubes based on the above
    #  per core split
    list_dictionaries_for_cores = []
    subdictionary_counter = 0
    for array_cube_indices in sublist_of_cube_indices_per_core:
        current_core_dictionary = {}
        items_to_pop = len(array_cube_indices)
        items_popped = 0
        while items_popped < items_to_pop:
            key, value = dictionary_cubes_centroids_indices.popitem()
            current_core_dictionary.update({key: value})
            items_popped += 1
        list_dictionaries_for_cores.append(current_core_dictionary)
        subdictionary_counter += 1
    return list_dictionaries_for_cores, total_cubes, num_sheets, delta_array_shape


def per_core_work(start_frame_coord_array, end_frame_coord_array, dictionary_cube_data_this_core, MDA_selection,
                  start_frame, end_frame):
    """Run the analysis on one core.

    The code to perform on a given core given the dictionary of cube data.
    """
    list_previous_frame_centroids = []
    list_previous_frame_indices = []
    # define some utility functions for trajectory iteration:

    def point_in_cube(array_point_coordinates, list_cube_vertices, cube_centroid):
        """Determine if an array of coordinates are within a cube."""
        #the simulation particle point can't be more than half the cube side length away from the cube centroid in
        # any given dimension:
        array_cube_vertices = np.array(list_cube_vertices)
        cube_half_side_length = scipy.spatial.distance.pdist(array_cube_vertices, 'euclidean').min() / 2.0
        array_cube_vertex_distances_from_centroid = scipy.spatial.distance.cdist(array_cube_vertices,
                                                                                 cube_centroid[np.newaxis, :])
        np.testing.assert_almost_equal(array_cube_vertex_distances_from_centroid.min(),
                                          array_cube_vertex_distances_from_centroid.max(), decimal=4,
                                          err_msg="not all cube vertex to centroid distances are the same, "
                                                  "so not a true cube")
        absolute_delta_coords = np.absolute(np.subtract(array_point_coordinates, cube_centroid))
        absolute_delta_x_coords = absolute_delta_coords[..., 0]
        indices_delta_x_acceptable = np.where(absolute_delta_x_coords <= cube_half_side_length)
        absolute_delta_y_coords = absolute_delta_coords[..., 1]
        indices_delta_y_acceptable = np.where(absolute_delta_y_coords <= cube_half_side_length)
        absolute_delta_z_coords = absolute_delta_coords[..., 2]
        indices_delta_z_acceptable = np.where(absolute_delta_z_coords <= cube_half_side_length)
        intersection_xy_acceptable_arrays = np.intersect1d(indices_delta_x_acceptable[0],
                                                              indices_delta_y_acceptable[0])
        overall_indices_points_in_current_cube = np.intersect1d(intersection_xy_acceptable_arrays,
                                                                   indices_delta_z_acceptable[0])
        return overall_indices_points_in_current_cube

    def update_dictionary_point_in_cube_start_frame(array_simulation_particle_coordinates,
                                                    dictionary_cube_data_this_core):
        """Basically update the cube dictionary objects assigned to this core to contain a new key/value pair
        corresponding to the indices of the relevant particles that fall within a given cube. Also, for a given cube,
        store a key/value pair for the centroid of the particles that fall within the cube."""
        cube_counter = 0
        for key, cube in dictionary_cube_data_this_core.items():
            index_list_in_cube = point_in_cube(array_simulation_particle_coordinates, cube['vertex_list'],
                                               cube['centroid'])
            cube['start_frame_index_list_in_cube'] = index_list_in_cube
            if len(index_list_in_cube) > 0:  # if there's at least one particle in this cube
                centroid_particles_in_cube = np.average(array_simulation_particle_coordinates[index_list_in_cube],
                                                           axis=0)
                cube['centroid_of_particles_first_frame'] = centroid_particles_in_cube
            else:  # empty cube
                cube['centroid_of_particles_first_frame'] = None
            cube_counter += 1

    def update_dictionary_end_frame(array_simulation_particle_coordinates, dictionary_cube_data_this_core):
        """Update the cube dictionary objects again as appropriate for the second and final frame."""
        cube_counter = 0
        for key, cube in dictionary_cube_data_this_core.items():
            # if there were no particles in the cube in the first frame, then set dx,dy,dz each to 0
            if cube['centroid_of_particles_first_frame'] is None:
                cube['dx'] = 0
                cube['dy'] = 0
                cube['dz'] = 0
            else:  # there was at least one particle in the starting cube so we can get dx,dy,dz centroid values
                new_coordinate_array_for_particles_starting_in_this_cube = array_simulation_particle_coordinates[
                    cube['start_frame_index_list_in_cube']]
                new_centroid_for_particles_starting_in_this_cube = np.average(
                    new_coordinate_array_for_particles_starting_in_this_cube, axis=0)
                cube['centroid_of_paticles_final_frame'] = new_centroid_for_particles_starting_in_this_cube
                delta_centroid_array_this_cube = new_centroid_for_particles_starting_in_this_cube - cube[
                    'centroid_of_particles_first_frame']
                cube['dx'] = delta_centroid_array_this_cube[0]
                cube['dy'] = delta_centroid_array_this_cube[1]
                cube['dz'] = delta_centroid_array_this_cube[2]
            cube_counter += 1

    #now that the parent process is dealing with the universe object & grabbing required coordinates, each child
    # process only needs to take the coordinate arrays & perform the operations with its assigned cubes (no more file
    #  opening and trajectory iteration on each core--which I'm hoping will substantially reduce the physical memory
    # footprint of my 3D streamplot code)
    update_dictionary_point_in_cube_start_frame(start_frame_coord_array, dictionary_cube_data_this_core)
    update_dictionary_end_frame(end_frame_coord_array, dictionary_cube_data_this_core)
    return dictionary_cube_data_this_core


def produce_coordinate_arrays_single_process(topology_file_path, trajectory_file_path, MDA_selection, start_frame,
                                             end_frame):
    """Generate coordinate arrays.

    To reduce memory footprint produce only a single MDA selection and get
    desired coordinate arrays; can later send these coordinate arrays to all
    child processes rather than having each child process open a trajectory and
    waste memory.

    """
    universe_object = MDAnalysis.Universe(topology_file_path, trajectory_file_path)
    relevant_particles = universe_object.select_atoms(MDA_selection)
    # pull out coordinate arrays from desired frames:
    for ts in universe_object.trajectory:
        if ts.frame > end_frame:
            break  # stop here
        if ts.frame == start_frame:
            start_frame_relevant_particle_coordinate_array_xyz = relevant_particles.positions
        elif ts.frame == end_frame:
            end_frame_relevant_particle_coordinate_array_xyz = relevant_particles.positions
        else:
            continue
    return (start_frame_relevant_particle_coordinate_array_xyz, end_frame_relevant_particle_coordinate_array_xyz)


[docs]def generate_streamlines_3d(topology_file_path, trajectory_file_path, grid_spacing, MDA_selection, start_frame,
                            end_frame, xmin, xmax, ymin, ymax, zmin, zmax, maximum_delta_magnitude=2.0,
                            num_cores='maximum'):
    r"""Produce the x, y and z components of a 3D streamplot data set.

    Parameters
    ----------
    topology_file_path : str
            Absolute path to the topology file
    trajectory_file_path : str
            Absolute path to the trajectory file. It will normally be desirable
            to filter the trajectory with a tool such as GROMACS
            :program:`g_filter` (see :cite:p:`b-Chavent2014`)
    grid_spacing : float
            The spacing between grid lines (angstroms)
    MDA_selection : str
            MDAnalysis selection string
    start_frame : int
            First frame number to parse
    end_frame : int
            Last frame number to parse
    xmin : float
            Minimum coordinate boundary for x-axis (angstroms)
    xmax : float
            Maximum coordinate boundary for x-axis (angstroms)
    ymin : float
            Minimum coordinate boundary for y-axis (angstroms)
    ymax : float
            Maximum coordinate boundary for y-axis (angstroms)
    maximum_delta_magnitude : float
            Absolute value of the largest displacement tolerated for the
            centroid of a group of particles ( angstroms). Values above this
            displacement will not count in the streamplot (treated as
            excessively large displacements crossing the periodic boundary)
    num_cores : int or 'maximum' (optional)
            The number of cores to use. (Default 'maximum' uses all available
            cores)

    Returns
    -------
    dx_array : array of floats
            An array object containing the displacements in the x direction
    dy_array : array of floats
            An array object containing the displacements in the y direction
    dz_array : array of floats
            An array object containing the displacements in the z direction

    Examples
    --------
    Generate 3D streamlines and visualize in `mayavi`_::

        import numpy as np

        import MDAnalysis
        import MDAnalysis.visualization.streamlines_3D

        import mayavi, mayavi.mlab

        # assign coordinate system limits and grid spacing:
        x_lower,x_upper = -8.73, 1225.96
        y_lower,y_upper = -12.58, 1224.34
        z_lower,z_upper = -300, 300
        grid_spacing_value = 20

        x1, y1, z1 = MDAnalysis.visualization.streamlines_3D.generate_streamlines_3d(
                        'testing.gro', 'testing_filtered.xtc',
                         xmin=x_lower, xmax=x_upper,
                         ymin=y_lower, ymax=y_upper,
                         zmin=z_lower, zmax=z_upper,
                         grid_spacing=grid_spacing_value, MDA_selection = 'name PO4',
                         start_frame=2, end_frame=3, num_cores='maximum')

        x, y, z = np.mgrid[x_lower:x_upper:x1.shape[0]*1j,
                          y_lower:y_upper:y1.shape[1]*1j,
                          z_lower:z_upper:z1.shape[2]*1j]

        # plot with mayavi:
        fig = mayavi.mlab.figure(bgcolor=(1.0, 1.0, 1.0), size=(800, 800), fgcolor=(0, 0, 0))
        for z_value in np.arange(z_lower, z_upper, grid_spacing_value):
            st = mayavi.mlab.flow(x, y, z, x1, y1, z1, line_width=1,
                                  seedtype='plane', integration_direction='both')
            st.streamline_type = 'tube'
            st.tube_filter.radius = 2
            st.seed.widget.origin = np.array([ x_lower,  y_upper,   z_value])
            st.seed.widget.point1 = np.array([ x_upper, y_upper,  z_value])
            st.seed.widget.point2 = np.array([ x_lower, y_lower,  z_value])
            st.seed.widget.resolution = int(x1.shape[0])
            st.seed.widget.enabled = False
        mayavi.mlab.axes(extent = [0, 1200, 0, 1200, -300, 300])
        fig.scene.z_plus_view()
        mayavi.mlab.savefig('test_streamplot_3D.png')
        # more compelling examples can be produced for vesicles and other spherical systems

    .. image:: test_streamplot_3D.png

    See Also
    --------
    MDAnalysis.visualization.streamlines.generate_streamlines


    .. _mayavi: http://docs.enthought.com/mayavi/mayavi/
    """
    # work out the number of cores to use:
    if num_cores == 'maximum':
        num_cores = multiprocessing.cpu_count()  # use all available cores
    else:
        num_cores = num_cores  # use the value specified by the user
        # assert isinstance(num_cores,(int,long)), "The number of specified cores must (of course) be an integer."
    np.seterr(all='warn', over='raise')
    parent_cube_dictionary = {}  # collect all data from child processes here

    def log_result_to_parent(process_dict):
        parent_cube_dictionary.update(process_dict)

    #step 1: produce tuple of cartesian coordinate limits for the first frame
    #tuple_of_limits = determine_container_limits(topology_file_path = topology_file_path,trajectory_file_path =
    # trajectory_file_path,buffer_value=buffer_value)
    tuple_of_limits = (xmin, xmax, ymin, ymax, zmin, zmax)
    #step 2: produce a suitable grid (will assume that grid size / container size does not vary during simulation--or
    #  at least not beyond the buffer limit, such that this grid can be used for all subsequent frames)
    grid = produce_grid(tuple_of_limits=tuple_of_limits, grid_spacing=grid_spacing)
    #step 3: split the grid into a dictionary of cube information that can be sent to each core for processing:
    list_dictionaries_for_cores, total_cubes, num_sheets, delta_array_shape = split_grid(grid=grid, num_cores=num_cores)
    #step 3b: produce required coordinate arrays on a single core to avoid making a universe object on each core:
    start_frame_coord_array, end_frame_coord_array = produce_coordinate_arrays_single_process(topology_file_path,
                                                                                              trajectory_file_path,
                                                                                              MDA_selection,
                                                                                              start_frame, end_frame)
    #step 4: per process work using the above grid data split
    pool = multiprocessing.Pool(num_cores)
    for sub_dictionary_of_cube_data in list_dictionaries_for_cores:
        pool.apply_async(per_core_work, args=(
            start_frame_coord_array, end_frame_coord_array, sub_dictionary_of_cube_data, MDA_selection, start_frame,
            end_frame), callback=log_result_to_parent)
    pool.close()
    pool.join()
    #so, at this stage the parent process now has a single dictionary with all the cube objects updated from all
    # available cores
    #the 3D streamplot (i.e, mayavi flow() function) will require separate 3D np arrays for dx,dy,dz
    #the shape of each 3D array will unfortunately have to match the mgrid data structure (bit of a pain): (
    # num_sheets - 1, num_sheets - 1, cubes_per_column)
    cubes_per_sheet = int(float(total_cubes) / float(num_sheets - 1))
    #produce dummy zero arrays for dx,dy,dz of the appropriate shape:
    dx_array = np.zeros(delta_array_shape)
    dy_array = np.zeros(delta_array_shape)
    dz_array = np.zeros(delta_array_shape)
    #now use the parent cube dictionary to correctly substitute in dx,dy,dz values
    current_sheet = 0  # which is also the current row
    y_index_current_sheet = 0  # sub row
    z_index_current_column = 0  # column
    total_cubes_current_sheet = 0
    for cube_number in range(0, total_cubes):
        dx_array[current_sheet, y_index_current_sheet, z_index_current_column] = parent_cube_dictionary[cube_number][
            'dx']
        dy_array[current_sheet, y_index_current_sheet, z_index_current_column] = parent_cube_dictionary[cube_number][
            'dy']
        dz_array[current_sheet, y_index_current_sheet, z_index_current_column] = parent_cube_dictionary[cube_number][
            'dz']
        z_index_current_column += 1
        total_cubes_current_sheet += 1
        if z_index_current_column == delta_array_shape[2]:
            # done building current y-column so iterate y value and reset z
            z_index_current_column = 0
            y_index_current_sheet += 1
            if y_index_current_sheet == delta_array_shape[1]:  # current sheet is complete
                current_sheet += 1
                y_index_current_sheet = 0  # restart for new sheet
                z_index_current_column = 0
                total_cubes_current_sheet = 0
    # now set velocity component values greater than a certain cutoff to 0,
    # because they tend to reflect spurious values (i.e., PBC jumping)
    dx_array[abs(dx_array) >= maximum_delta_magnitude] = 1.0
    dy_array[abs(dy_array) >= maximum_delta_magnitude] = 1.0
    dz_array[abs(dz_array) >= maximum_delta_magnitude] = 1.0
    return (dx_array, dy_array, dz_array)