Source code for fgivenx.mass

""" Utilities for computing the probability mass function. """
import scipy.stats
import scipy.interpolate
import scipy.optimize
import numpy
from fgivenx.parallel import parallel_apply
from fgivenx.io import CacheException, Cache


def PMF(samples, y):
    """ Compute the probability mass function.

    The set of samples defines a probability density P(y), which is
    computed using a kernel density estimator.

    From :math:`P(y)` we define:

    :math:`\mathrm{pmf}(p) = \int_{P(y)<p} P(y) dy`

    This is the cumulative distribution function expressed as a function
    of the probability.

    We aim to compute :math:`M(y)`, which indicates the amount of
    probability contained outside the iso-probability contour passing
    through :math:`y`.

    [The original docstring contains an ASCII sketch of P(y) against y with
    the level p marked, M(p) shown as the shaded area where P(y) < p, and
    companion plots of M(p) against p and M(y) against y.]

    Parameters
    ----------
    samples: array-like
        Array of samples from a probability density P(y).

    y: array-like
        Array of y values to evaluate the PMF at.

    Returns
    -------
    1D numpy.array:
        PMF evaluated at each y value.
    """
    # Remove any nans from the samples
    samples = numpy.array(samples)
    samples = samples[~numpy.isnan(samples)]

    try:
        # Compute the kernel density estimate
        kernel = scipy.stats.gaussian_kde(samples)

        # Evaluate the kernel on a fine grid extending well beyond the samples
        mn = min(samples) - 10*numpy.sqrt(kernel.covariance[0, 0])
        mx = max(samples) + 10*numpy.sqrt(kernel.covariance[0, 0])
        y_ = numpy.linspace(mn, mx, len(y)*10)

        # Compute the probabilities at each of the grid points
        ps_ = kernel(y_)

        # Compute the masses
        ms = []
        for yi in y:
            # compute the probability at this y value
            p = kernel(yi)

            if p <= max(ps_)*1e-5:
                m = 0.
            else:
                # Find out which grid points have greater probability than P(yi)
                bools = ps_ > p

                # Compute indices where to start and stop the integration
                stops = numpy.where(numpy.logical_and(~bools[:-1], bools[1:]))
                starts = numpy.where(numpy.logical_and(bools[:-1], ~bools[1:]))

                # Refine the crossing locations by solving P(u) = p
                starts = [scipy.optimize.brentq(lambda u: kernel(u)-p,
                                                y_[i], y_[i+1])
                          for i in starts[0]]
                starts = [-numpy.inf] + starts

                stops = [scipy.optimize.brentq(lambda u: kernel(u)-p,
                                               y_[i], y_[i+1])
                         for i in stops[0]]
                stops = stops + [numpy.inf]

                # Sum up the masses in the regions where P(y) < p
                m = sum(kernel.integrate_box_1d(a, b)
                        for a, b in zip(starts, stops))

            ms.append(m)

        return numpy.array(ms)

    except numpy.linalg.LinAlgError:
        return numpy.zeros_like(y)
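
# A minimal usage sketch (illustrative, not part of the fgivenx source): the
# sample set, grid and variable names below are assumptions for demonstration.
# For samples drawn from a unit Gaussian, PMF returns values close to 1 near
# the mode (y = 0) and values falling towards 0 in the tails:
#
#     samples = numpy.random.randn(1000)   # samples defining P(y)
#     y = numpy.linspace(-4, 4, 81)        # grid on which to evaluate M(y)
#     m = PMF(samples, y)                  # 1D array of length 81, values in [0, 1]
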
def compute_pmf(fsamps, y, **kwargs):
    """ Compute the pmf defined by fsamps at each x for each y.

    Parameters
    ----------
    fsamps: 2D array-like
        Array of function samples, as returned by
        :func:`fgivenx.compute_samples`.

    y: 1D array-like
        y values to evaluate the PMF at.

    cache: str, optional
        File root for saving and re-using previous calculations.

    parallel, tqdm_kwargs: optional
        see docstring for :func:`fgivenx.parallel.parallel_apply`.

    Returns
    -------
    2D numpy.array
        Probability mass function at each x for each y,
        `shape=(len(y), len(fsamps))`.
    """
    parallel = kwargs.pop('parallel', False)
    cache = kwargs.pop('cache', '')
    tqdm_kwargs = kwargs.pop('tqdm_kwargs', {})
    if kwargs:
        raise TypeError('Unexpected **kwargs: %r' % kwargs)

    if cache:
        cache = Cache(cache + '_masses')
        try:
            return cache.check(fsamps, y)
        except CacheException as e:
            print(e)

    masses = parallel_apply(PMF, fsamps, postcurry=(y,), parallel=parallel,
                            tqdm_kwargs=tqdm_kwargs)
    masses = numpy.array(masses).transpose().copy()

    if cache:
        cache.save(fsamps, y, masses)

    return masses
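
# A minimal usage sketch (illustrative, not part of the fgivenx source): the
# toy posterior, function samples and variable names below are assumptions.
# Each row of fsamps holds the function samples at one x value, as produced by
# fgivenx.compute_samples:
#
#     x = numpy.linspace(0, 1, 5)
#     m_, c_ = numpy.random.randn(2, 1000)           # toy posterior over (m, c)
#     fsamps = numpy.outer(x, m_) + c_               # shape (len(x), n_samples)
#     y = numpy.linspace(-4, 4, 81)
#     masses = compute_pmf(fsamps, y, parallel=False)
#     masses.shape                                   # (len(y), len(fsamps)) == (81, 5)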