Source code for fgivenx.mass

""" Utilities for computing the probability mass function. """
import scipy.stats
import scipy.interpolate
import numpy
from fgivenx.parallel import parallel_apply
from fgivenx.io import CacheException, Cache


[docs]def PMF(samples, y):
    """ Compute the probability mass function.

        The set of samples defines a probability density P(y),
        which is computed using a kernel density estimator.

        From :math:`P(y)` we define:

        :math:`\mathrm{pmf}(p) = \int_{P(y)<p} P(y) dy`

        This is the cumulative distribution function expressed as a
        function of the probability

        We aim to compute :math:`M(y)`, which indicates the amount of
        probability contained outside the iso-probability contour
        passing through :math:`y`::


             ^ P(y)                   ...
             |                     | .   .
             |                     |       .
            p|- - - - - - - - - - .+- - - - . - - - - - - - - - - -
             |                   .#|        #.
             |                  .##|        ##.
             |                  .##|        ##.
             |                 .###|        ###.     M(p)
             |                 .###|        ###.     is the
             |                 .###|        ###.     shaded area
             |                .####|        ####.
             |                .####|        ####.
             |              ..#####|        #####..
             |          ....#######|        #######....
             |         .###########|        ###########.
             +---------------------+-------------------------------> y
                                  t

             ^ M(p)                        ^ M(y)
             |                             |
            1|                +++         1|         +
             |               +             |        + +
             |       ++++++++              |       +   +
             |     ++                      |     ++     ++
             |   ++                        |   ++         ++
             |+++                          |+++             +++
             +---------------------> p     +---------------------> y
            0

        Parameters
        ----------
        samples: array-like
            Array of samples from a probability density P(y).

        y: array-like (optional)
            Array to evaluate the PDF at

        Returns
        -------
        1D numpy.array:
            PMF evaluated at each y value

    """
    # Remove any nans from the samples
    samples = numpy.array(samples)
    samples = samples[~numpy.isnan(samples)]
    try:
        # Compute the kernel density estimate
        kernel = scipy.stats.gaussian_kde(samples)

        # Add two more samples definitely outside the range and sort them
        mn = min(samples) - 10*numpy.sqrt(kernel.covariance[0, 0])
        mx = max(samples) + 10*numpy.sqrt(kernel.covariance[0, 0])
        y_ = numpy.linspace(mn, mx, len(y)*10)

        # Compute the probabilities at each of the extended samples
        ps_ = kernel(y_)

        # Compute the masses
        ms = []
        for yi in y:
            # compute the probability at this y value
            p = kernel(yi)
            if p <= max(ps_)*1e-5:
                m = 0.
            else:
                # Find out which samples have greater probability than P(y)
                bools = ps_ > p

                # Compute indices where to start and stop the integration
                stops = numpy.where(numpy.logical_and(~bools[:-1], bools[1:]))
                starts = numpy.where(numpy.logical_and(bools[:-1], ~bools[1:]))

                # Compute locations
                starts = [scipy.optimize.brentq(lambda u: kernel(u)-p,
                                                y_[i], y_[i+1])
                          for i in starts[0]]
                starts = [-numpy.inf] + starts
                stops = [scipy.optimize.brentq(lambda u: kernel(u)-p,
                                               y_[i], y_[i+1])
                         for i in stops[0]]
                stops = stops + [numpy.inf]

                # Sum up the masses
                m = sum(kernel.integrate_box_1d(a, b)
                        for a, b in zip(starts, stops))
            ms.append(m)
        return numpy.array(ms)

    except numpy.linalg.LinAlgError:
        return numpy.zeros_like(y)


[docs]def compute_pmf(fsamps, y, **kwargs):
    """ Compute the pmf defined by fsamps at each x for each y.

    Parameters
    ----------
    fsamps: 2D array-like
        array of function samples, as returned by
        :func:`fgivenx.compute_samples`

    y: 1D array-like
        y values to evaluate the PMF at

    parallel, tqdm_kwargs: optional
        see docstring for :func:`fgivenx.parallel.parallel_apply`.

    Returns
    -------
    2D numpy.array
        probability mass function at each x for each y
        `shape=(len(fsamps),len(y)`
    """
    parallel = kwargs.pop('parallel', False)
    cache = kwargs.pop('cache', '')
    tqdm_kwargs = kwargs.pop('tqdm_kwargs', {})
    if kwargs:
        raise TypeError('Unexpected **kwargs: %r' % kwargs)

    if cache:
        cache = Cache(cache + '_masses')
        try:
            return cache.check(fsamps, y)
        except CacheException as e:
            print(e)

    masses = parallel_apply(PMF, fsamps, postcurry=(y,), parallel=parallel,
                            tqdm_kwargs=tqdm_kwargs)
    masses = numpy.array(masses).transpose().copy()

    if cache:
        cache.save(fsamps, y, masses)

    return masses