Source code for Pipeline.PAk

# The pointwise-adaptive k-NN density estimator.
#
# Author: Maria d'Errico <mariaderr@gmail.com>
#
# Licence: BSD 3 clause

import numpy as np
from sklearn.base import BaseEstimator, DensityMixin, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import kneighbors_graph
from math import log, sqrt, exp, lgamma, pi, pow
from Pipeline.twoNN import twoNearestNeighbors

VALID_METRIC = ['precomputed', 'euclidean','cosine']
VALID_DIM = ['auto', 'twoNN']

[docs]def _PointAdaptive_kNN(distances, indices, k_max=1000, D_thr=23.92812698, dim=None): r"""Main function implementing the Pointwise Adaptive k-NN density estimator. Parameters ---------- distances: array [n_samples, k_max+1] Distances to the k_max neighbors of each points. The point is included. indices : array [n_samples, k_max+1] Indices of the k_max neighbors of each points. The point is included. k_max : int, default=1000 The maximum number of nearest-neighbors considered by the procedure that returns the largest number of neighbors ``k_hat`` for which the condition of constant density holds, within a given level of confidence. If the number of points in the sample `N` is less than the default value, k_max will be set automatically to the value ``N/2``. D_thr : float, default=23.92812698 Set the level of confidence. The default value corresponds to a p-value of :math:`10^{-6}` for a :math:`\chiˆ2` distribution with one degree of freedom. dim : int Intrinsic dimensionality of the sample. Attributes ---------- densities : array [n_samples] The logarithm of the density at each point. err_densities : array [n_samples] The uncertainty in the density estimation, obtained by computing the inverse of the Fisher information matrix. k_hat : array [n_samples] The optimal number of neighbors for which the condition of constant density holds. dc : array [n_sample] The radius of the optimal neighborhood for each point. """ from Pipeline import _PAk # The adaptive k-Nearest Neighbor density estimator k_hat, dc, densities, err_densities = _PAk.get_densities(dim, distances, k_max, D_thr, indices) return densities, err_densities, k_hat, dc
[docs]class PointAdaptive_kNN(BaseEstimator): """Class definition for the Pointwise Adaptive k-NN density estimator. Parameters ---------- k_max : int, default=1000 The maximum number of nearest-neighbors considered by the procedure that returns the largest number of neighbors ``k_hat`` for which the condition of constant density holds, within a given level of confidence. If the number of points in the sample `N` is less than the default value, k_max will be set automatically to the value ``N/2``. D_thr : float, default=23.92812698 Set the level of confidence. The default value corresponds to a p-value of :math:`10^{-6}` for a :math:`\chiˆ2` distribution with one degree of freedom. metric : string, or callable The distance metric to use. If metric is a string, it must be one of the options allowed by scipy.spatial.distance.pdist for its metric parameter, or a metric listed in :obj:`VALID_METRIC = [precomputed, euclidean,cosine]`. If metric is ``precomputed``, X is assumed to be a distance matrix. Alternatively, if metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable should take two arrays from X as input and return a value indicating the distance between them. Default is ``euclidean``. dim_algo : string, or callable Method for intrinsic dimensionality calculation. If dim_algo is ``auto``, dim is assumed to be equal to n_samples. If dim_algo is a string, it must be one of the options allowed by :obj:`VALID_DIM = [auto, twoNN]`. nn_distances : array [n_samples, k_max+1] Distances to the k_max neighbors of each points. nn_indices : array [n_samples, k_max+1] Indices of the k_max neighbors of each points. blockAn : bool, default=True This parameter is considered if dim_algo is ``twoNN``, it is ignored otherwise. If blockAn is True the algorithm perform a block analysis that allows discriminating the relevant dimensions as a function of the block size. This allows to study the stability of the estimation with respect to changes in the neighborhood size, which is crucial for ID estimations when the data lie on a manifold perturbed by a high-dimensional noise. block_ratio : int, default=20 This parameter is considered if dim_algo is ``twoNN``, it is ignored otherwise. Set the minimum size of the blocks as `n_samples/block_ratio`. If ``blockAn=False``, block_ratio is ignored. frac : float, default=1 This parameter is considered if dim_algo is ``twoNN``, it is ignored otherwise. Define the fraction of points in the data set used for ID calculation. By default the full data set is used. dim : int Intrinsic dimensionality of the sample. If dim is provided, dim_algo is ignored. n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary <n_jobs>` for more details. Attributes ---------- dim_ : int, Intrinsic dimensionality of the sample. If ``dim`` is not provided, ``dim_`` is set to the number of features in the input file. k_max_ : int The maximum number of nearest-neighbors considered by the procedure that returns the largest number of neighbors ``k_hat`` for which the condition of constant density holds, within a given level of confidence. If the number of points in the sample `N` is less than the default value, k_max_ will be set automatically to the value ``N/2``. distances_ : array [n_samples, k_max+1] Distances to the k_max neighbors of each points. indices_ : array [n_samples, k_max+1] Indices of the k_max neighbors of each points. densities_ : array [n_samples] The logarithm of the density at each point. err_densities_ : array [n_samples] The uncertainty in the density estimation, obtained by computing the inverse of the Fisher information matrix. k_hat_ : array [n_samples] The optimal number of neighbors for which the condition of constant density holds. dc_ : array [n_sample] The radius of the optimal neighborhood for each point. Examples -------- >>> from DPA import PAk >>> import numpy as np >>> X = np.array([[1, 1], [2, 1], [1, 0], ... [4, 7], [3, 5], [3, 6]]) >>> est = PAk.PointAdaptive_kNN().fit(X) >>> est.distances_ array([[0. , 1. , 1. ], [0. , 1. , 1.41421356], [0. , 1. , 1.41421356], [0. , 1.41421356, 2.23606798], [0. , 1. , 2.23606798], [0. , 1. , 1.41421356]]) >>> est.indices_ array([[0, 1, 2], [1, 0, 2], [2, 0, 1], [3, 5, 4], [4, 5, 3], [5, 4, 3]]) References ---------- A. Rodriguez, M. d’Errico, E. Facco and A. Laio, Computing the free energy without collective variables. `J. chemical theory computation` 14, 1206–1215 (2018). """ def __init__(self, k_max=1000, D_thr=23.92812698, metric="euclidean", dim_algo="auto", nn_distances=None, nn_indices=None, blockAn=True, block_ratio=20, frac=1, dim=None, n_jobs=None): self.k_max = k_max self.D_thr = D_thr self.metric = metric self.dim_algo = dim_algo self.nn_distances = nn_distances self.nn_indices = nn_indices self.blockAn = blockAn self.block_ratio = block_ratio self.frac = frac self.dim = dim self.n_jobs = n_jobs if metric not in VALID_METRIC and not callable(metric): raise ValueError("invalid metric: '{0}'".format(metric)) if dim_algo not in VALID_DIM: raise ValueError("invalid dim_algo: '{0}'".format(dim_algo)) if self.dim_algo == "twoNN" and self.frac > 1: raise ValueError("frac should be between 0 and 1.")
[docs] def fit(self, X, y=None): """Fit the PAk estimator on the data. Parameters ---------- X : array [n_samples, n_samples] if metric == ``precomputed``, or, [n_samples, n_features] otherwise The input samples. y : Ignored Not used, present here for API consistency by convention. Returns ------- self : object Returns self. """ # Input validation X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64, ensure_min_samples=2) self.dim_ = self.dim if not self.dim: if self.dim_algo == "auto": self.dim_ = X.shape[1] elif self.dim_algo == "twoNN": if self.block_ratio >= X.shape[0]: raise ValueError("block_ratio is larger than the sample size, the minimum size for block analysis \ would be zero. Please set a lower value.") self.dim_ = twoNearestNeighbors(blockAn=self.blockAn, block_ratio=self.block_ratio, metric=self.metric, frac=self.frac, n_jobs=self.n_jobs).fit(X).dim_ else: pass self.k_max_ = self.k_max if self.k_max > X.shape[0]: # the following value is chosen to better address very small data set self.k_max_ = int(X.shape[0]*0.4) if self.k_max < 3: raise ValueError("k_max is below 3, the minimum value required for \ statistical significance. Please use a larger datasets.") # check if NN matrix is precomputed: if self.nn_distances is not None and self.nn_indices is not None: # overwrite the self.k_max_ self.k_max_ = self.nn_distances.shape[1]-1 self.distances_ = self.nn_distances self.indices_ = self.nn_indices elif self.metric == "precomputed": nbrs = NearestNeighbors(n_neighbors=self.k_max_+1, # The point i is counted in its neighborhood algorithm="brute", metric=self.metric, n_jobs=self.n_jobs).fit(X) self.distances_, self.indices_ = nbrs.kneighbors(X) else: nbrs = NearestNeighbors(n_neighbors=self.k_max_+1, # The point i is counted in its neighborhood algorithm="auto", metric=self.metric, n_jobs=self.n_jobs).fit(X) self.distances_, self.indices_ = nbrs.kneighbors(X) self.densities_, self.err_densities_, self.k_hat_, self.dc_ = _PointAdaptive_kNN(self.distances_, self.indices_, k_max=self.k_max_, D_thr=self.D_thr, dim=self.dim_) self.is_fitted_ = True return self