Source code for Pipeline.PAk

# The pointwise-adaptive k-NN density estimator.
#
# Author: Maria d'Errico <mariaderr@gmail.com>
#
# Licence: BSD 3 clause

import numpy as np
from sklearn.base import BaseEstimator, DensityMixin, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import kneighbors_graph
from math import log, sqrt, exp, lgamma, pi, pow
from Pipeline.twoNN import twoNearestNeighbors

VALID_METRIC = ['precomputed', 'euclidean','cosine']
VALID_DIM = ['auto', 'twoNN']

[docs]def _PointAdaptive_kNN(distances, indices, k_max=1000, D_thr=23.92812698, dim=None):
    r"""Main function implementing the Pointwise Adaptive k-NN density estimator.

    Parameters
    ----------
    distances: array [n_samples, k_max+1]
        Distances to the k_max neighbors of each points. The point is included. 
  
    indices : array [n_samples, k_max+1]
        Indices of the k_max neighbors of each points. The point is included.

    k_max : int, default=1000
        The maximum number of nearest-neighbors considered by the procedure that returns the
        largest number of neighbors ``k_hat`` for which the condition of constant density 
        holds, within a given level of confidence. If the number of points in the sample `N` is 
        less than the default value, k_max will be set automatically to the value ``N/2``.
    
    D_thr : float, default=23.92812698
        Set the level of confidence. The default value corresponds to a p-value of 
        :math:`10^{-6}` for a :math:`\chiˆ2` distribution with one degree of freedom.

    dim : int
        Intrinsic dimensionality of the sample.

    Attributes
    ----------

    densities : array [n_samples]
        The logarithm of the density at each point.
    
    err_densities : array [n_samples]
        The uncertainty in the density estimation, obtained by computing 
        the inverse of the Fisher information matrix.

    k_hat : array [n_samples]
        The optimal number of neighbors for which the condition of constant density holds.

    dc : array [n_sample]
        The radius of the optimal neighborhood for each point.
    

    """
    from Pipeline import _PAk

    # The adaptive k-Nearest Neighbor density estimator
    k_hat, dc, densities, err_densities = _PAk.get_densities(dim, distances, k_max, D_thr, indices)

    return densities, err_densities, k_hat, dc


[docs]class PointAdaptive_kNN(BaseEstimator): 
    """Class definition for the Pointwise Adaptive k-NN density estimator.

    Parameters
    ----------
    k_max : int, default=1000
        The maximum number of nearest-neighbors considered by the procedure that returns the
        largest number of neighbors ``k_hat`` for which the condition of constant density 
        holds, within a given level of confidence. If the number of points in the sample `N` is 
        less than the default value, k_max will be set automatically to the value ``N/2``.
    
    D_thr : float, default=23.92812698
        Set the level of confidence. The default value corresponds to a p-value of 
        :math:`10^{-6}` for a :math:`\chiˆ2` distribution with one degree of freedom.

    metric : string, or callable
        The distance metric to use. 
        If metric is a string, it must be one of the options allowed by 
        scipy.spatial.distance.pdist for its metric parameter, or a metric listed in 
        :obj:`VALID_METRIC = [precomputed, euclidean,cosine]`. If metric is ``precomputed``, X is assumed to
        be a distance matrix. Alternatively, if metric is a callable function, it is 
        called on each pair of instances (rows) and the resulting value recorded. The 
        callable should take two arrays from X as input and return a value indicating 
        the distance between them. Default is ``euclidean``.

    dim_algo : string, or callable
        Method for intrinsic dimensionality calculation. If dim_algo is ``auto``, dim is assumed to be
        equal to n_samples. If dim_algo is a string, it must be one of the options allowed by :obj:`VALID_DIM = [auto, twoNN]`. 

    nn_distances  : array [n_samples, k_max+1]
        Distances to the k_max neighbors of each points.

    nn_indices : array [n_samples, k_max+1]
        Indices of the k_max neighbors of each points.
 
    blockAn : bool, default=True
        This parameter is considered if dim_algo is ``twoNN``, it is ignored otherwise.
        If blockAn is True the algorithm perform a block analysis that allows discriminating the relevant dimensions 
        as a function of the block size. This allows to study the stability of the estimation with respect to
        changes in the neighborhood size, which is crucial for ID estimations when the data lie on a manifold perturbed 
        by a high-dimensional noise. 

    block_ratio : int, default=20
        This parameter is considered if dim_algo is ``twoNN``, it is ignored otherwise.
        Set the minimum size of the blocks as `n_samples/block_ratio`. If ``blockAn=False``, block_ratio is ignored.

    frac : float, default=1
        This parameter is considered if dim_algo is ``twoNN``, it is ignored otherwise.
        Define the fraction of points in the data set used for ID calculation. By default the full data set is used.   

    dim : int
        Intrinsic dimensionality of the sample. If dim is provided, dim_algo is ignored.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------

    dim_ : int, 
        Intrinsic dimensionality of the sample. If ``dim`` is not provided, ``dim_`` is set 
        to the number of features in the input file.

    k_max_ : int
        The maximum number of nearest-neighbors considered by the procedure that returns the
        largest number of neighbors ``k_hat`` for which the condition of constant density
        holds, within a given level of confidence. If the number of points in the sample `N` is
        less than the default value, k_max_ will be set automatically to the value ``N/2``.

    distances_ : array [n_samples, k_max+1]
        Distances to the k_max neighbors of each points.

    indices_ : array [n_samples, k_max+1]
        Indices of the k_max neighbors of each points.
 
    densities_ : array [n_samples]
        The logarithm of the density at each point.
    
    err_densities_ : array [n_samples]
        The uncertainty in the density estimation, obtained by computing 
        the inverse of the Fisher information matrix.

    k_hat_ : array [n_samples]
        The optimal number of neighbors for which the condition of constant density holds.

    dc_ : array [n_sample]
        The radius of the optimal neighborhood for each point.


    Examples
    --------
    >>> from DPA import PAk
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> est = PAk.PointAdaptive_kNN().fit(X)
    >>> est.distances_
    array([[0.        , 1.        , 1.        ],
           [0.        , 1.        , 1.41421356],
           [0.        , 1.        , 1.41421356],
           [0.        , 1.41421356, 2.23606798],
           [0.        , 1.        , 2.23606798],
           [0.        , 1.        , 1.41421356]])
    >>> est.indices_
    array([[0, 1, 2],
           [1, 0, 2],
           [2, 0, 1],
           [3, 5, 4],
           [4, 5, 3],
           [5, 4, 3]])    
    
    References
    ----------

    A. Rodriguez, M. d’Errico, E. Facco and A. Laio, Computing the free energy without collective variables. `J. chemical theory computation` 14, 1206–1215 (2018).
    
        
    """
    def __init__(self, k_max=1000, D_thr=23.92812698, metric="euclidean", dim_algo="auto",
                       nn_distances=None, nn_indices=None, 
                       blockAn=True, block_ratio=20, frac=1, dim=None, n_jobs=None):
        self.k_max = k_max
        self.D_thr = D_thr
        self.metric = metric
        self.dim_algo = dim_algo
        self.nn_distances = nn_distances
        self.nn_indices = nn_indices
        self.blockAn = blockAn
        self.block_ratio = block_ratio
        self.frac = frac
        self.dim = dim
        self.n_jobs = n_jobs

        if metric not in VALID_METRIC and not callable(metric):
            raise ValueError("invalid metric: '{0}'".format(metric))

        if dim_algo not in VALID_DIM:
            raise ValueError("invalid dim_algo: '{0}'".format(dim_algo))

        if self.dim_algo == "twoNN" and self.frac > 1:
            raise ValueError("frac should be between 0 and 1.")


[docs]    def fit(self, X, y=None):
        """Fit the PAk estimator on the data.

        Parameters
        ----------
        X : array [n_samples, n_samples] if metric == ``precomputed``, or, 
            [n_samples, n_features] otherwise
            The input samples.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Returns self.
        """
        # Input validation
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                        dtype=np.float64, ensure_min_samples=2)

        self.dim_ = self.dim
        if not self.dim:
            if self.dim_algo == "auto":
                self.dim_ = X.shape[1]
            elif self.dim_algo == "twoNN":
                if self.block_ratio >= X.shape[0]:
                    raise ValueError("block_ratio is larger than the sample size, the minimum size for block analysis \
                                would be zero. Please set a lower value.")
                self.dim_ = twoNearestNeighbors(blockAn=self.blockAn, block_ratio=self.block_ratio, metric=self.metric,
                                               frac=self.frac, n_jobs=self.n_jobs).fit(X).dim_
            else:
                pass

        self.k_max_ = self.k_max
        if self.k_max > X.shape[0]:
            # the following value is chosen to better address very small data set
            self.k_max_ = int(X.shape[0]*0.4)
        if self.k_max < 3:
            raise ValueError("k_max is below 3, the minimum value required for \
                        statistical significance. Please use a larger datasets.")

        # check if NN matrix is precomputed:
        if self.nn_distances is not None and self.nn_indices is not None:
            # overwrite the self.k_max_
            self.k_max_ = self.nn_distances.shape[1]-1
            self.distances_ = self.nn_distances
            self.indices_ = self.nn_indices
        elif self.metric == "precomputed":
            nbrs = NearestNeighbors(n_neighbors=self.k_max_+1, # The point i is counted in its neighborhood 
                                          algorithm="brute", 
                                        metric=self.metric,
                                        n_jobs=self.n_jobs).fit(X)
            self.distances_, self.indices_ = nbrs.kneighbors(X)
        else:
            nbrs = NearestNeighbors(n_neighbors=self.k_max_+1, # The point i is counted in its neighborhood 
                                         algorithm="auto", 
                                        metric=self.metric, 
                                        n_jobs=self.n_jobs).fit(X)
            self.distances_, self.indices_ = nbrs.kneighbors(X) 

        self.densities_, self.err_densities_, self.k_hat_, self.dc_ = _PointAdaptive_kNN(self.distances_, 
                                                                                 self.indices_,
                                                                              k_max=self.k_max_, 
                                                                              D_thr=self.D_thr, 
                                                                                  dim=self.dim_)
        self.is_fitted_ = True

        return self