# Source code for impactlab_tools.utils.binning



import xarray as xr
import numpy as np
import scipy.stats

from pandas import CategoricalIndex



# [docs]
def binned_statistic_1d(da, dim, bins=10, statistic='count', value_range=None):
    '''
    Bin a data array by values and summarize along a dimension

    Each 1-D slice of ``da`` taken along ``dim`` is handed to
    :py:func:`scipy.stats.binned_statistic`, with the slice serving both as
    the variable that is binned and as the values that are summarized. The
    ``dim`` dimension of the result is replaced by a ``groups`` dimension
    with one entry per bin. The same bin edges are used for every slice, so
    each ``groups`` label describes the same interval everywhere in the
    result.

    Parameters
    ----------

    da : xr.DataArray
        DataArray to be binned

    dim : str
        Dimension along which to summarize the binned values

    statistic : string or callable, optional
        The statistic to compute (default is 'count'). The following statistics
        are available:

        * 'mean' : compute the mean of values for points within each bin. Empty
            bins will be represented by NaN.

        * 'median' : compute the median of values for points within each bin.
            Empty bins will be represented by NaN.

        * 'count' : compute the count of points within each bin. This is
            identical to an unweighted histogram. values array is not
            referenced.

        * 'sum' : compute the sum of values for points within each bin. This is
            identical to a weighted histogram.

        * 'min' : compute the minimum of values for points within each bin.
            Empty bins will be represented by NaN.

        * 'max' : compute the maximum of values for points within each bin.
            Empty bins will be represented by NaN.

        * function : a user-defined function which takes a 1D array of values,
            and outputs a single numerical statistic. This function will be
            called on the values in each bin. Empty bins will be represented by
            function([]), or NaN if this returns an error.

    bins : int or sequence of scalars, optional
        If bins is an int, it defines the number of equal-width bins in the
        given range (10 by default). If bins is a sequence, it defines the bin
        edges, including the rightmost edge, allowing for non-uniform bin
        widths. If the bin edges are specified, the number of bins will be
        ``len(bins) - 1``.

    value_range : (float, float) or [(float, float)], optional
        The lower and upper range of the bins. If not provided and ``bins``
        is an int, the range is ``(da.min(), da.max())`` taken over the whole
        array. Values outside the range are ignored. If the lower and upper
        bounds are equal, the range is widened by 0.5 on each side so that
        the bins have a non-zero width.

    Returns
    -------

    binned : xr.DataArray
        A data array with bins along the summary dimension. The ``groups``
        coordinate holds one string label per bin, in bin order.

    Notes
    -----

    The ``groups`` labels are formatted as ``'(lower, upper]'``, but the
    intervals follow the :py:func:`scipy.stats.binned_statistic` convention:
    every bin is half-open, ``[lower, upper)``, except the last one, which
    also includes its upper edge.

    Examples
    --------

    .. code-block:: python

        >>> da = xr.DataArray(
        ...     np.arange(16).reshape(4,4),
        ...     dims=('a', 'b'),
        ...     coords={'a': list('abcd'), 'b': list('wxyz')})
        ...
        >>> da # doctest: +SKIP
        <xarray.DataArray (a: 4, b: 4)>
        array([[  0,  1,  2,  3],
               [  4,  5,  6,  7],
               [  8,  9, 10, 11],
               [ 12, 13, 14, 15]])
        Coordinates:
          * a        (a) <U1 'a' 'b' 'c' 'd'
          * b        (b) <U1 'w' 'x' 'y' 'z'

        >>> binned_statistic_1d(
        ...     da, 'b', [0, 2, 5, 20])
        ...     # doctest: +SKIP
        <xarray.DataArray (a: 4, groups: 3)>
        array([[ 2., 2., 0.],
               [ 0., 1., 3.],
               [ 0., 0., 4.],
               [ 0., 0., 4.]])
        Coordinates:
          * a        (a) <U1 'a' 'b' 'c' 'd'
          * groups   (groups) object '(0, 2]' '(2, 5]' '(5, 20]'

        >>> binned_statistic_1d(da, 'a', statistic='sum') # doctest: +SKIP
        <xarray.DataArray (groups: 10, b: 4)>
        array([[  0.,  1.,  0.,  0.],
               [  0.,  0.,  2.,  0.],
               [  4.,  0.,  0.,  3.],
               [  0.,  5.,  0.,  0.],
               [  0.,  0.,  6.,  7.],
               [  8.,  0.,  0.,  0.],
               [  0.,  9., 10.,  0.],
               [  0.,  0.,  0., 11.],
               [ 12., 13.,  0.,  0.],
               [  0.,  0., 14., 15.]])
        Coordinates:
          * groups   (groups) object '(0.0, 1.5]' '(1.5, 3.0]' '(3.0, 4.5]' ...
          * b        (b) <U1 'w' 'x' 'y' 'z'
    '''

    if isinstance(bins, (int, np.integer)):
        # Resolve the edges once, from the whole array, *before* binning.
        # Letting scipy derive the range separately for every 1-D slice
        # would give each slice its own bins, which a single shared
        # ``groups`` coordinate cannot describe.
        bins = _uniform_bin_edges(da, bins, value_range)
        # the explicit edges already encode the requested range
        value_range = None

    def _summarize(values_1d):
        # the slice is both the binning variable and the summarized values
        return scipy.stats.binned_statistic(
            values_1d,
            values_1d,
            statistic=statistic,
            bins=bins,
            range=value_range)[0]

    # apply binned_statistic along dim
    bnd = np.apply_along_axis(_summarize, da.get_axis_num(dim), da.values)

    # build index for new array
    labels = _bin_labels(bins)
    bindex = CategoricalIndex(
        labels,
        # Without explicit categories pandas sorts the label strings, which
        # orders e.g. '(10.5, ...' before '(3.0, ...'. dict.fromkeys keeps
        # bin order and drops repeats (categories must be unique).
        categories=list(dict.fromkeys(labels)),
        ordered=True)

    # same dimension order as the input, with ``dim`` swapped for 'groups'
    out_dims = tuple('groups' if d == dim else d for d in da.dims)
    out_coords = {
        ('groups' if d == dim else d): (bindex if d == dim else da.coords[d])
        for d in da.dims}

    return xr.DataArray(bnd, dims=out_dims, coords=out_coords)


def _uniform_bin_edges(da, n_bins, value_range):
    '''Return ``n_bins + 1`` equally spaced bin edges spanning the range.'''
    if value_range is None:
        lower, upper = float(da.min()), float(da.max())
    else:
        # accept both documented spellings: (lo, hi) and [(lo, hi)]
        if len(value_range) == 1:
            value_range = value_range[0]
        lower, upper = value_range

    if lower == upper:
        # A zero-width range cannot be split into bins; widen it the same
        # way scipy/numpy histogram routines do.
        lower, upper = lower - 0.5, upper + 0.5

    return np.linspace(lower, upper, n_bins + 1)


def _bin_labels(edges):
    '''Return one ``'(lower, upper]'`` label per pair of adjacent edges.'''
    return [f'({lo}, {hi}]' for lo, hi in zip(edges[:-1], edges[1:])]