diff --git a/.circleci/config.yml b/.circleci/config.yml index 5c59d81..845014e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -205,22 +205,10 @@ workflows: name: test-linux-<< matrix.python-version >> | << matrix.pip-constraints >> matrix: parameters: - python-version: &python-versions ["3.7.9", "3.8.6", "3.9.0", "3.10.0", "3.11.0"] + python-version: &python-versions ["3.8.6", "3.9.0", "3.10.0", "3.11.0"] pip-constraints: - - "scikit-learn==1.0.2" # lowest supported by package - # - "scikit-learn~=1.0.0" # 1.0.2 is the highest in ~=1.0.0 - - "scikit-learn~=1.1.0" - - "scikit-learn~=1.2.0" # latest in current minor as of March 2023 + - "scikit-learn==1.2.1" # lowest supported by package - "scikit-learn~=1.0" # latest in current major - exclude: - # sklearn < 1.1.3 does not support Python 3.11 - - python-version: "3.11.0" - pip-constraints: "scikit-learn==1.0.2" - # sklearn > 1.0.2 does not support Python 3.7 - - python-version: "3.7.9" - pip-constraints: "scikit-learn~=1.1.0" - - python-version: "3.7.9" - pip-constraints: "scikit-learn~=1.2.0" - test-macos: matrix: diff --git a/dwave/plugins/sklearn/_conditional_mutual_info.py b/dwave/plugins/sklearn/_conditional_mutual_info.py new file mode 100644 index 0000000..36eec3c --- /dev/null +++ b/dwave/plugins/sklearn/_conditional_mutual_info.py @@ -0,0 +1,765 @@ +# The following traversal code is adapted from two sources +# I. Scikit-learn implementation for mutual information computation +# II. Computation of conditional mutual information in the repo +# https://github.com/jannisteunissen/mutual_information +# +# I. Methods and algorithms from `sklearn.feature_selection._mutual_info.py` +# +# Author: Nikolay Mayorov +# License: 3-clause BSD +# +# II. Modifications in https://github.com/jannisteunissen/mutual_information +# BSD 3-Clause License +# +# Copyright (c) 2022, Jannis Teunissen +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Modifications are distributed under the Apache 2.0 Software license. +# +# Copyright 2023 D-Wave Systems Inc. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from itertools import combinations
+import typing
+
+import numpy as np
+import numpy.typing as npt
+
+from scipy.sparse import issparse
+from scipy.special import digamma
+from scipy.stats import entropy
+from sklearn.metrics.cluster import mutual_info_score
+from sklearn.neighbors import NearestNeighbors, KDTree
+from sklearn.preprocessing import scale
+from sklearn.utils import check_random_state
+from sklearn.utils.parallel import Parallel, delayed
+from sklearn.utils.validation import check_array, check_X_y
+
+
+def estimate_mi_matrix(
+    X: npt.ArrayLike,
+    y: npt.ArrayLike,
+    discrete_features: typing.Union[str, npt.ArrayLike] = "auto",
+    discrete_target: bool = False,
+    n_neighbors: int = 4,
+    conditional: bool = True,
+    copy: bool = True,
+    random_state: typing.Union[None, int] = None,
+    n_jobs: typing.Union[None, int] = None,
+) -> npt.ArrayLike:
+    """
+    For the feature array `X` and the target array `y`, computes
+    the matrix of (conditional) mutual information interactions.
+    The matrix is defined as follows:
+
+    `M_(i, i) = I(x_i; y)`
+
+    If `conditional = True`, then the off-diagonal terms are computed as:
+
+    `M_(i, j) = (I(x_i; y | x_j) + I(x_j; y | x_i)) / 2`
+
+    Otherwise
+
+    `M_(i, j) = I(x_i; x_j)`
+
+    Computation of I(x; y) uses modified scikit-learn methods.
+    The computation of I(x; y | z) is based on
+    https://github.com/jannisteunissen/mutual_information and [3]_.
+
+    The method can be computationally expensive for a large number of features (> 1000) and
+    a large number of samples (> 100000). In this case, it can be advisable to downsample the
+    dataset.
+
+    The estimation methods are linear in the number of outcomes (labels) of discrete distributions.
+    It may be beneficial to treat discrete distributions with a large number of labels as
+    continuous distributions.
+
+    Args:
+        X: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        y: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        conditional: bool, default=True
+            Whether to compute the off-diagonal terms using the conditional mutual
+            information or the joint mutual information.
+
+        discrete_features: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        discrete_target: bool, default=False
+            Whether the target variable `y` is discrete.
+
+        n_neighbors: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        copy: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        random_state: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        n_jobs: int, default=None
+            The number of parallel jobs to run for the conditional mutual information
+            computation. The parallelization is over the columns of `X`.
+            ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+            ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+            for more details.
+
+    Returns:
+        mi_matrix : ndarray, shape (n_features, n_features)
+            Interaction matrix between the features using (conditional) mutual information.
+ A negative value will be replaced by 0. + + References: + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if isinstance(discrete_features, (str, bool)): + if isinstance(discrete_features, str): + if discrete_features == "auto": + discrete_features = issparse(X) + else: + raise ValueError("Invalid string value for discrete_features.") + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = check_array(discrete_features, ensure_2d=False) + if np.issubdtype(discrete_features.dtype, bool): + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + X = X.astype(np.float64, copy=copy) + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: + y = scale(y, with_mean=False) + y += ( + 1e-10 + * np.maximum(1, np.mean(np.abs(y))) + * rng.standard_normal(size=n_samples) + ) + + mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) + # Computing the diagonal terms + diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors) + for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) + ) + np.fill_diagonal(mi_matrix, diagonal_vals) + # Computing the off-diagonal terms + off_diagonal_iter = combinations(zip(_iterate_columns(X), discrete_mask), 2) + if conditional: + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_cmi_distance) + (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + else: + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi) + (xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + + off_diagonal_val = iter(off_diagonal_vals) + for i, j in combinations(range(n_features), 2): + mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) + return mi_matrix + + +def _compute_cmi_distance( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_i: bool, + discrete_feature_j: bool, + discrete_target: bool, + n_neighbors: int = 4, +) -> float: + """ + Computes a distance `d` based on the conditional mutual information + between features `x_i` and `x_j`: `d = (I(x_i; y | x_j)+I(x_j; y | x_i))/2`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. 
+ xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + discrete_feature_i: + Whether to consider `xi` as a discrete variable. + discrete_feature_j: + Whether to consider `xj` as a discrete variable. + discrete_target: + Whether to consider `y` as a discrete variable. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Distance between features based on conditional mutual information. + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + if discrete_feature_i and discrete_feature_j and discrete_target: + ans = _compute_cmip_d(xi, xj, y) + elif discrete_target: + ans = _compute_cmip_ccd(xi, xj, y, n_neighbors) + elif discrete_feature_i and (not discrete_target): + ans = _compute_cmip_cdc(xj, xi, y, n_neighbors) + elif (not discrete_feature_i) and discrete_feature_j and (not discrete_target): + ans = _compute_cmip_cdc(xi, xj, y, n_neighbors) + else: + ans = _compute_cmip_c(xi, xj, y, n_neighbors) + return np.mean(ans) + + +def _compute_cmip_d( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` and `I(x_j; y | x_i)`. + All random variables `x_i`, `x_j` and `y` are discrete. + + Adapted from https://github.com/dwave-examples/mutual-information-feature-selection + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + # Computing joint probability distribution for the features and the target + bin_boundaries = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] + prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=bin_boundaries) + + # Computing entropy + Hijy = entropy(prob.flatten()) # H(x_i, x_j, y) + Hij = entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) + cmi_ij = ( + Hij-Hijy + + entropy(prob.sum(axis=0).flatten()) # H(x_j, y) + - entropy(prob.sum(axis=(0, 2))) # H(x_j) + ) + cmi_ji = ( + Hij-Hijy + + entropy(prob.sum(axis=1).flatten()) # H(x_i, y) + - entropy(prob.sum(axis=(1, 2))) # H(x_i) + ) + return cmi_ij, cmi_ji + + +def _compute_cmip_c( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` + are assumed to be continuous. + + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. 
+ + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius = _get_radius_k_neighbours(np.hstack((xi, xj, y)), + n_neighbors=n_neighbors) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + n_jy = _num_points_within_radius(np.hstack((y, xj)), radius) + + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((y, xi)), radius) + + return _get_cmi_pair_from_estimates([n_neighbors], n_ij, n_j, n_jy, n_i, n_iy) + + +def _compute_cmip_ccd( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, xj)), y, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + # label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + # continuous + n_i = _num_points_within_radius(xi, radius) + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + # mixed discrete/continuous estimates + n_iy = _num_points_within_radius_cd(xi, y, radius) + n_jy = _num_points_within_radius_cd(xj, y, radius) + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) + + +def _compute_cmip_cdc( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed + to be continuous, while `x_j` is assumed to be discrete. 
+ + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, y)), xj, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + # continuous + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((xi, y)), radius) + # mixed coninuous/discrete estimates + n_ij = _num_points_within_radius_cd(xi, xj, radius) + n_jy = _num_points_within_radius_cd(y, xj, radius) + # discrete estimates + n_j = label_counts + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) + + +def _get_cmi_pair_from_estimates( + k_all: npt.ArrayLike, n_ij: npt.ArrayLike, n_j: npt.ArrayLike, + n_jy: npt.ArrayLike, n_i: npt.ArrayLike, n_iy: npt.ArrayLike) -> typing.Tuple[float]: + """Get an estimate from nearest neighbors counts""" + common_terms = np.mean(digamma(k_all))-np.mean(digamma(n_ij)) + cmi_ij = common_terms+np.mean(digamma(n_j))-np.mean(digamma(n_jy)) + cmi_ji = common_terms+np.mean(digamma(n_i))-np.mean(digamma(n_iy)) + return max(0, cmi_ij), max(0, cmi_ji) + + +def _compute_mi( + x: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_x: bool, + discrete_feature_y: bool, + n_neighbors: int = 4, +): + """ + Compute mutual information between features `I(x_i; x_j)`. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. + discrete_feature_x, discrete_feature_y: + Whether to consider `x` and `y` as discrete variables. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. 
+ """ + if discrete_feature_y and discrete_feature_x: + return mutual_info_score(x, y) + elif discrete_feature_x: + return _compute_mi_cd(y, x, n_neighbors=n_neighbors) + elif discrete_feature_y: + return _compute_mi_cd(x, y, n_neighbors=n_neighbors) + else: + return _compute_mi_cc(x, y, n_neighbors=n_neighbors) + + +def _compute_mi_cc(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variables `x`, `y` are assumed + to be continuous. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + """ + n_samples = x.size + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius = _get_radius_k_neighbours(np.hstack((x, y)), n_neighbors) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + nx = _num_points_within_radius(x, radius) + ny = _num_points_within_radius(y, radius) + + mi = ( + digamma(n_samples) + + digamma(n_neighbors) + - np.mean(digamma(nx)) + - np.mean(digamma(ny)) + ) + + return max(0, mi) + + +def _compute_mi_cd(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variable `x`, is assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + """ + radius, label_counts, k_all = _get_radius_k_neighbours_d(x, y, n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + n_samples = np.sum(mask) + label_counts = label_counts[mask] + k_all = k_all[mask] + x = x[mask] + radius = radius[mask] + + m_all = _num_points_within_radius(x.reshape(-1, 1), radius) + + mi = ( + digamma(n_samples) + + np.mean(digamma(k_all)) + - np.mean(digamma(label_counts)) + - np.mean(digamma(m_all)) + ) + + return max(0, mi) + + +def _get_radius_k_neighbours(X: npt.ArrayLike, n_neighbors: int = 4) -> npt.ArrayLike: + """ + Determine the smallest radius around `X` containing `n_neighbors` neighbors + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + + Args: + X: + Array of features. + n_neighbors: + Number of nearest neighbors. + Returns: + Vector of radii defined by the k nearest neighbors. + """ + nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) + return _get_radius_from_nn(X, nn) + + +def _get_radius_k_neighbours_d( + c: npt.ArrayLike, + d: npt.ArrayLike, + n_neighbors: int = 4 +) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: + """ + Determine smallest radius around `c` and `d` containing `n_neighbors`. 
+ + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + c: + Array of feature vectors. + d: + Vector of discrete features formatted as a numerical 1D array-like. + n_neighbors: + The number of nearest neighbors. + Returns: + radius: + Vector of radii defined by the k nearest neighbors. + label_counts: + Label counts of the discrete feature vector. + k_all: + Array of the number of points within a radius. + """ + d = d.flatten() + n_samples = d.shape[0] + c = c.reshape((n_samples, -1)) + + radius = np.empty(n_samples) + label_counts = np.empty(n_samples) + k_all = np.empty(n_samples) + nn = NearestNeighbors(metric="chebyshev") + for label in np.unique(d): + mask = d == label + count = np.sum(mask) + if count > 1: + k = min(n_neighbors, count - 1) + nn.set_params(n_neighbors=k) + radius[mask] = _get_radius_from_nn(c[mask], nn) + k_all[mask] = k + label_counts[mask] = np.sum(mask) + return radius, label_counts, k_all + + +def _get_radius_from_nn(X: npt.ArrayLike, nn: NearestNeighbors) -> npt.ArrayLike: + """ + Get the radius of nearest neighbors in `nn` model with dataset `X`. + + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + X: + Array of features. + nn: + Instance of the nearest neighbors class. + Returns: + Vector of radii defined by the k nearest neighbors. + """ + nn.fit(X) + r = nn.kneighbors()[0] + return np.nextafter(r[:, -1], 0) + + +def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike) -> npt.ArrayLike: + """For each point, determine the number of other points within a given radius + Function from https://github.com/jannisteunissen/mutual_information + Args: + X: + (Continuous) feature array. + radius: + Vector of radii. + Returns: + Vector containing the number of points within a radius. + """ + kd = KDTree(x, metric="chebyshev") + nx = kd.query_radius(x, radius, count_only=True, return_distance=False) + return np.array(nx) + + +def _num_points_within_radius_cd( + c: npt.ArrayLike, + d: npt.ArrayLike, + radius: npt.ArrayLike) -> npt.ArrayLike: + """ + For each point, determine the number of other points within a given radius. + + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + c: + (Continuous) feature array. + d: + Discrete feature vector formatted as a numerical 1D array-like + radius: + Vector of radii. + Returns: + Vector containing the number of points within a radius. + """ + c = c.reshape((-1, 1)) + n_samples = c.shape[0] + m_all = np.empty(n_samples) + for label in np.unique(d): + mask = d == label + m_all[mask] = _num_points_within_radius(c[mask], radius[mask]) + return m_all + + +def _iterate_columns(X, columns=None): + """Iterate over columns of a matrix. + Copied from :func:`sklearn.feature_selection._mutual_info`. + + Parameters + ---------- + X : ndarray or csc_matrix, shape (n_samples, n_features) + Matrix over which to iterate. + + columns : iterable or None, default=None + Indices of columns to iterate over. If None, iterate over all columns. + + Yields + ------ + x : ndarray, shape (n_samples,) + Columns of `X` in dense format. 
+ """ + if columns is None: + columns = range(X.shape[1]) + + if issparse(X): + for i in columns: + x = np.zeros(X.shape[0]) + start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] + x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] + yield x + else: + for i in columns: + yield X[:, i] diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index aa186be..1cb8f50 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -14,24 +14,20 @@ from __future__ import annotations -import itertools -import logging import tempfile import typing -import warnings import dimod -import numpy as np -import numpy.typing as npt - from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError +from dwave.plugins.sklearn._conditional_mutual_info import estimate_mi_matrix +from dwave.plugins.sklearn.utilities import corrcoef from dwave.system import LeapHybridCQMSampler +import numpy as np +import numpy.typing as npt from sklearn.base import BaseEstimator from sklearn.feature_selection import SelectorMixin from sklearn.utils.validation import check_is_fitted -from dwave.plugins.sklearn.utilities import corrcoef - __all__ = ["SelectFromQuadraticModel"] @@ -50,21 +46,36 @@ class SelectFromQuadraticModel(SelectorMixin, BaseEstimator): :class:`sklearn.feature_selection.SelectKBest`. num_features: The number of features to select. + method: + If equal to ``correlation`` uses a correlation as a criterion for + choosing the features according to [1]_. If equal to ``mutual_information`` + uses mutual information criteria [2]_ or [3]_ (advances feature). time_limit: The time limit for the run on the hybrid solver. + .. [1] [Milne et al.] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature + Selection in Credit Scoring and Classification Using a Quantum Annealer." + 1QBit; White Paper. + https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer + .. [2] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria + of max-dependency, max-relevance, and min-redundancy. IEEE Transactions on pattern + analysis and machine intelligence, 27(8):1226–1238, 2005. + .. [3] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for + mutual information based feature selection. In Proceedings of the 20th ACM SIGKDD + international conference on Knowledge discovery and data mining, + pages 512–521. ACM, 2014. """ ACCEPTED_METHODS = [ "correlation", - # "mutual information", # todo - ] + "mutual_information", + ] def __init__( self, *, alpha: float = .5, - method: str = "correlation", # undocumented until there is another supported + method: str = "correlation", num_features: int = 10, time_limit: typing.Optional[float] = None, ): @@ -110,8 +121,8 @@ def _get_support_mask(self) -> np.ndarray[typing.Any, np.dtype[np.bool_]]: except AttributeError: raise RuntimeError("fit hasn't been run yet") - @staticmethod def correlation_cqm( + self, X: npt.ArrayLike, y: npt.ArrayLike, *, @@ -122,7 +133,7 @@ def correlation_cqm( """Build a constrained quadratic model for feature selection. This method is based on maximizing influence and independence as - measured by correlation [Milne et al.]_. + measured by correlation [1]_. Args: X: @@ -147,32 +158,12 @@ def correlation_cqm( Returns: A constrained quadratic model. - .. [Milne et al.] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature + .. 
[1] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature
            Selection in Credit Scoring and Classification Using a Quantum Annealer."
            1QBit; White Paper.
            https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer
        """
-
-        X = np.atleast_2d(np.asarray(X))
-        y = np.asarray(y)
-
-        if X.ndim != 2:
-            raise ValueError("X must be a 2-dimensional array-like")
-
-        if y.ndim != 1:
-            raise ValueError("y must be a 1-dimensional array-like")
-
-        if y.shape[0] != X.shape[0]:
-            raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}")
-
-        if not 0 <= alpha <= 1:
-            raise ValueError(f"alpha must be between 0 and 1, given {alpha}")
-
-        if num_features <= 0:
-            raise ValueError(f"num_features must be a positive integer, given {num_features}")
-
-        if X.shape[0] <= 1:
-            raise ValueError("X must have at least two rows")
+        self._check_params(X, y, alpha, num_features)

        cqm = dimod.ConstrainedQuadraticModel()
        cqm.add_variables(dimod.BINARY, X.shape[1])
@@ -183,7 +174,7 @@ def correlation_cqm(
            '==' if strict else '<=',
            num_features,
            label=f"{num_features}-hot",
-            )
+        )

        with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout:
            # we make a copy of X because we'll be modifying it in-place within
@@ -198,27 +189,139 @@ def correlation_cqm(
                dtype=np.result_type(X, y),
                mode="w+",
                shape=(X_copy.shape[1], X_copy.shape[1]),
-                )
-
+            )
            # main calculation. It modifies X_copy in-place
            corrcoef(X_copy, out=correlations, rowvar=False, copy=False)
-
            # we don't care about the direction of correlation in terms of
            # the penalty/quality
            np.absolute(correlations, out=correlations)
-
+            # multiply all but the last row and column by (1 - alpha)
+            np.multiply(correlations[:-1, :-1], (1 - alpha), out=correlations[:-1, :-1])
            # our objective
-            # we multiply by 2 because the matrix is symmetric
-            np.fill_diagonal(correlations, correlations[:, -1] * (-2 * alpha * num_features))
-
-            # Note: the full symmetric matrix (with both upper- and lower-diagonal
-            # entries for each correlation coefficient) is retained for consistency with
-            # the original formulation from Milne et al.
+            # we multiply by num_features to have consistent performance
+            # with the increase of the number of features
+            np.fill_diagonal(correlations, correlations[:, -1] * (- alpha * num_features))
+            # Note: we only add terms on and above the diagonal
            it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']])
            cqm.set_objective((*it.multi_index, x) for x in it if x)

        return cqm

+    def mutual_information_cqm(
+        self,
+        X: npt.ArrayLike,
+        y: npt.ArrayLike,
+        *,
+        num_features: int,
+        alpha: float = 0.5,
+        strict: bool = True,
+        conditional: bool = True,
+        discrete_features: typing.Union[str, npt.ArrayLike] = "auto",
+        discrete_target: bool = False,
+        copy: bool = True,
+        n_neighbors: int = 4,
+        n_jobs: typing.Union[None, int] = None,
+        random_state: typing.Union[None, int] = None,
+    ) -> dimod.ConstrainedQuadraticModel:
+        """Build a constrained quadratic model for feature selection.
+
+        If ``conditional`` is True then the conditional mutual information
+        criterion from [2]_ is used, and if ``conditional`` is False then
+        mutual information based criteria from [1]_ are used.
+
+        For the computation of mutual information and conditional mutual information,
+        see :func:`~dwave.plugins.sklearn._conditional_mutual_info.estimate_mi_matrix`.
+
+        Args:
+            X:
+                Feature vectors formatted as a numerical 2D array-like.
+            y:
+                Class labels formatted as a numerical 1D array-like.
+            alpha:
+                Hyperparameter between 0 and 1 that controls the relative weight of
+                the relevance and redundancy terms.
+                ``alpha=0`` places the maximum weight on the feature redundancy.
+                ``alpha=1`` places no weight on the feature redundancy,
+                and therefore will be equivalent to using
+                :class:`sklearn.feature_selection.SelectKBest`.
+                If ``conditional=True``, ``alpha = 0`` is the default value used in [2]_.
+                If ``conditional=False``, ``alpha = (num_features - 1) / num_features``
+                is the default value used in [1]_.
+            num_features:
+                The number of features to select.
+            strict:
+                If ``False`` the constraint on the number of selected features
+                is ``<=`` rather than ``==``.
+            conditional: bool, default=True
+                Whether to compute the off-diagonal terms using the conditional mutual
+                information or the joint mutual information.
+            discrete_features:
+                See :func:`sklearn.feature_selection.mutual_info_regression`
+            discrete_target: bool, default=False
+                Whether the target variable `y` is discrete.
+            n_neighbors:
+                See :func:`sklearn.feature_selection.mutual_info_regression`
+            copy:
+                See :func:`sklearn.feature_selection.mutual_info_regression`
+            random_state:
+                See :func:`sklearn.feature_selection.mutual_info_regression`
+            n_jobs: int, default=None
+                The number of parallel jobs to run for the conditional mutual information
+                computation. The parallelization is over the columns of `X`.
+                ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+                ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+                for more details.
+
+        Returns:
+            A constrained quadratic model.
+
+        References:
+            .. [1] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria
+                of max-dependency, max-relevance, and min-redundancy. IEEE Transactions on pattern
+                analysis and machine intelligence, 27(8):1226–1238, 2005.
+            .. [2] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for
+                mutual information based feature selection. In Proceedings of the 20th ACM SIGKDD
+                international conference on Knowledge discovery and data mining,
+                pages 512–521. ACM, 2014.
+        """
+
+        self._check_params(X, y, alpha, num_features)
+
+        cqm = dimod.ConstrainedQuadraticModel()
+        cqm.add_variables(dimod.BINARY, X.shape[1])
+
+        # add the k-hot constraint
+        cqm.add_constraint(
+            ((v, 1) for v in cqm.variables),
+            '==' if strict else '<=',
+            num_features,
+            label=f"{num_features}-hot",
+        )
+
+        mi = estimate_mi_matrix(
+            X, y,
+            discrete_features=discrete_features,
+            discrete_target=discrete_target,
+            n_neighbors=n_neighbors, copy=copy,
+            random_state=random_state, n_jobs=n_jobs,
+            conditional=conditional)
+
+        diagonal = -np.diag(mi).copy()
+        if conditional:
+            # method from [2]_ with another tuning dial:
+            # multiplying the off-diagonal terms by -(1 - alpha)
+            np.multiply(mi, -(1-alpha), out=mi)
+        else:
+            # method from [1]_:
+            # multiplying the off-diagonal terms by 1 - alpha
+            np.multiply(mi, 1-alpha, out=mi)
+        # keeping the diagonal
+        np.fill_diagonal(mi, diagonal)
+
+        it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']])
+        cqm.set_objective((*it.multi_index, x) for x in it if x)
+        return cqm
+
    def fit(
        self,
        X: npt.ArrayLike,
@@ -227,6 +330,7 @@ def fit(
        alpha: typing.Optional[float] = None,
        num_features: typing.Optional[int] = None,
        time_limit: typing.Optional[float] = None,
+        **kwargs
    ) -> SelectFromQuadraticModel:
        """Select the features to keep.
@@ -277,8 +381,9 @@ def fit( if self.method == "correlation": cqm = self.correlation_cqm(X, y, num_features=num_features, alpha=alpha) - # elif self.method == "mutual information": - # cqm = self.mutual_information_cqm(X, y, num_features=num_features) + elif self.method == "mutual_information": + cqm = self.mutual_information_cqm( + X, y, num_features=num_features, alpha=alpha, **kwargs) else: raise ValueError(f"only methods {self.acceptable_methods} are implemented") @@ -311,3 +416,30 @@ def fit( def unfit(self): """Undo a previously executed ``fit`` method.""" del self._mask + + @staticmethod + def _check_params(X: npt.ArrayLike, + y: npt.ArrayLike, + alpha: float, + num_features: int): + X = np.atleast_2d(np.asarray(X)) + y = np.asarray(y) + + if X.ndim != 2: + raise ValueError("X must be a 2-dimensional array-like") + + if y.ndim != 1: + raise ValueError("y must be a 1-dimensional array-like") + + if y.shape[0] != X.shape[0]: + raise ValueError( + f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") + + if not 0 <= alpha <= 1: + raise ValueError(f"alpha must be between 0 and 1, given {alpha}") + + if num_features <= 0: + raise ValueError(f"num_features must be a positive integer, given {num_features}") + + if X.shape[0] <= 1: + raise ValueError("X must have at least two rows") diff --git a/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml b/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml new file mode 100644 index 0000000..dc85f5f --- /dev/null +++ b/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml @@ -0,0 +1,3 @@ +--- +fixes: + - Multiply off-diagonal terms of the correlation matrix with 1-alpha. \ No newline at end of file diff --git a/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml new file mode 100644 index 0000000..b809dc2 --- /dev/null +++ b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml @@ -0,0 +1,9 @@ +--- +features: + - | + Add feature selection method using (conditional) mutual information +deprecations: + - | + Drop support for Python 3.7. + - | + Drop support for scikit-learn 1.2.0 diff --git a/requirements.txt b/requirements.txt index 00814cb..cc26dd2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,8 @@ numpy==1.20.0; python_version<"3.10.0" # oldest supported by dwave-system numpy==1.22.0; python_version~="3.10.0" # needed for scipy on windows oldest-supported-numpy; python_version>="3.11.0" -scikit-learn==1.2.0; python_version >= '3.8' -scikit-learn==1.0.2; python_version < '3.8' +scikit-learn==1.2.1 +scipy==1.9.0; python_version<"3.11.0" +scipy==1.9.2; python_version=="3.11.0" reno==3.5.0 diff --git a/setup.cfg b/setup.cfg index 269ee72..b036d93 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,12 +23,13 @@ install_requires = dimod>=0.12.3 dwave-system>=1.18.0 numpy>=1.20.0 - scikit-learn>=1.0.2 -packages = + scikit-learn>=1.2.1 + scipy>=1.9.0 +packages = dwave dwave.plugins dwave.plugins.sklearn -python_requires = >=3.7 +python_requires = >=3.8 [pycodestyle] max-line-length = 100 diff --git a/tests/test_mutual_info.py b/tests/test_mutual_info.py new file mode 100644 index 0000000..45000b0 --- /dev/null +++ b/tests/test_mutual_info.py @@ -0,0 +1,369 @@ + +# Tests in TestMI are adapted from sklearn tests and licenced under: +# BSD 3-Clause License +# +# Copyright (c) 2007-2024 The scikit-learn developers. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Tests in TestCMI are adapted from https://github.com/jannisteunissen/mutual_information and +# distributed under the following licence: +# +# BSD 3-Clause License +# +# Copyright (c) 2022, Jannis Teunissen +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The modifications of the test and the tests in TestDCMI are distributed +# under the Apache 2.0 Software license: +# +# Copyright 2023 D-Wave Systems Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +from dwave.plugins.sklearn._conditional_mutual_info import ( + _compute_mi, + _compute_cmip_c, _compute_cmip_d, + _compute_cmip_cdc, _compute_cmip_ccd +) + +from numpy.linalg import det +from sklearn.feature_selection import mutual_info_classif +from sklearn.preprocessing import scale +from sklearn.utils import check_random_state + + +class TestMI(unittest.TestCase): + def test_against_sklearn(self): + # the test will break if sklearn modifies its implementation + from sklearn.feature_selection._mutual_info import _compute_mi_cd, _compute_mi_cc + from sklearn.metrics.cluster import mutual_info_score + # Test discrete implementation + n_samples, n_ss, n_neighbors, random_state = 103, 51, 4, 15 + np.random.seed(random_state) + xi = np.random.randint(0, 11, (n_samples, )) + xj = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(xi) + np.random.shuffle(xj) + xi_c = scale(np.random.randn(n_samples), with_mean=False) + xj_c = scale(np.random.randn(n_samples), with_mean=False) + + mi_ij = _compute_mi( + xi, xj, True, True, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_score(xi, xj) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij), + msg="Discrete mutual information is not computed correctly") + + mi_ij = _compute_mi( + xi_c, xj, False, True, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cd(xi_c, xj, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg=f"Error in continuous features/discrete target MI estimation is larger than expected {abs(mi_ij_cl-mi_ij)}") + + mi_ij = _compute_mi(xi_c, xj_c, False, False, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cc(xi_c, xj_c, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg=f"Error in purely continuous MI is larger than expected {abs(mi_ij_cl-mi_ij)}") + + def test_compute_mi_dd(self): + # In discrete case computations are straightforward and can be done + # by hand on given vectors. + x = np.array([0, 1, 1, 0, 0]) + y = np.array([1, 0, 0, 0, 1]) + + H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5) + H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5) + I_xy = H_x + H_y - H_xy + + self.assertAlmostEqual(_compute_mi(x, y, discrete_feature_x=True, + discrete_feature_y=True), I_xy, places=4) + + def test_compute_mi_cc(self): + # For two continuous variables a good approach is to test on bivariate + # normal distribution, where mutual information is known. + + # Mean of the distribution, irrelevant for mutual information. + mean = np.zeros(2) + + # Setup covariance matrix with correlation coeff. equal 0.5. + sigma_1 = 1 + sigma_2 = 10 + corr = 0.5 + cov = np.array( + [ + [sigma_1**2, corr * sigma_1 * sigma_2], + [corr * sigma_1 * sigma_2, sigma_2**2], + ] + ) + + # True theoretical mutual information. 
+ I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov)) + + rng = check_random_state(0) + Z = rng.multivariate_normal(mean, cov, size=5001).astype(np.float64, copy=False) + + x, y = Z[:, 0], Z[:, 1] + + # Theory and computed values won't be very close + # We here check with a large relative tolerance + for n_neighbors in [4, 6, 8]: + I_computed = _compute_mi( + x, y, discrete_feature_x=False, discrete_feature_y=False, n_neighbors=n_neighbors + ) + self.assertLessEqual(abs(I_computed/I_theory-1.0), 1e-1) + + def test_compute_mi_cd(self): + # To test define a joint distribution as follows: + # p(x, y) = p(x) p(y | x) + # X ~ Bernoulli(p) + # (Y | x = 0) ~ Uniform(-1, 1) + # (Y | x = 1) ~ Uniform(0, 2) + + # Use the following formula for mutual information: + # I(X; Y) = H(Y) - H(Y | X) + # Two entropies can be computed by hand: + # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2*log(p/2) - 1/2*log(1/2) + # H(Y | X) = ln(2) + + # Now we need to implement sampling from out distribution, which is + # done easily using conditional distribution logic. + + n_samples = 5001 + rng = check_random_state(0) + + for p in [0.3, 0.5, 0.7]: + x = rng.uniform(size=n_samples) > p + + y = np.empty(n_samples, np.float64) + mask = x == 0 + y[mask] = rng.uniform(-1, 1, size=np.sum(mask)) + y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)) + + I_theory = -0.5 * ( + (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5) + ) - np.log(2) + + # Assert the same tolerance. + for n_neighbors in [4, 6, 8]: + I_computed = _compute_mi( + x, y, discrete_feature_x=True, discrete_feature_y=False, n_neighbors=n_neighbors + ) + self.assertLessEqual(abs(I_computed/I_theory-1.0), 1e-1) + + def test_compute_mi_cd_unique_label(self): + # Test that adding unique label doesn't change MI. + n_samples = 100 + x = np.random.uniform(size=n_samples) > 0.5 + + y = np.empty(n_samples, np.float64) + mask = x == 0 + y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)) + y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)) + + mi_1 = _compute_mi(x, y, discrete_feature_x=True, discrete_feature_y=False) + + x = np.hstack((x, 2)) + y = np.hstack((y, 10)) + mi_2 = _compute_mi(x, y, discrete_feature_x=True, discrete_feature_y=False) + + self.assertAlmostEqual(mi_1, mi_2, places=5) + + +class TestCMI(unittest.TestCase): + def test_discrete(self): + # Third test from "Conditional Mutual Information Estimation for Mixed Discrete + # and Continuous Variables with Nearest Neighbors" (Mesner & Shalizi). Fully + # discrete + def rescale(xi): + xi = scale(xi + 0.0, with_mean=False, copy=False) + # Add small noise to continuous features as advised in Kraskov et. al. 
+ xi = xi.astype(np.float64, copy=False) + means = np.maximum(1, np.mean(np.abs(xi), axis=0)) + xi += (1e-10 * means * rng.standard_normal(size=(xi.shape[0], ))) + return xi + N = 1001 + rng = check_random_state(0) + choices = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1]]) + p_discrete = rng.choice(4, p=[0.4, 0.4, 0.1, 0.1], size=N) + xy_discrete = choices[p_discrete] + x, y = xy_discrete[:, 0], xy_discrete[:, 1] + z = rng.poisson(2, size=N) + x_c = rescale(x) + y_c = rescale(y) + z_c = rescale(z) + cmi_analytic = 2 * 0.4 * np.log(0.4/0.5**2) + 2 * 0.1 * np.log(0.1/0.5**2) + # checking the computations with a large relative tolerance + for n_neighbors in [3, 5, 7]: + cmi_ij, _ = _compute_cmip_d(x, z, y) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_c(x_c, z_c, y_c, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_ccd(x_c, z_c, y, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_cdc(x_c, z, y_c, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + + def test_bivariate(self): + """Test with bivariate normal variables""" + N = 1001 + rng = check_random_state(0) + mu = np.zeros(2) + cov = np.array([[1., 0.8], [0.8, 1.0]]) + xy_gauss = rng.multivariate_normal(mu, cov, size=N) + x, y = xy_gauss[:, 0], xy_gauss[:, 1] + z = rng.normal(size=N) + + cmi_analytic = -0.5 * np.log(det(cov)) + # checking the computations with a large relative tolerance + for n_neighbors in [4, 6, 8]: + cmi_ij, _ = _compute_cmip_c(x, z, y, n_neighbors) + self.assertAlmostEqual(cmi_ij/cmi_analytic, 1.0, places=1) + + def test_trivariate(self): + # Test with 'trivariate' normal variables x, y, z + + mu = np.zeros(3) + + # Covariance matrix + cov_xy = 0.7 + cov_xz = 0.5 + cov_yz = 0.3 + cov = np.array([[1, cov_xy, cov_xz], + [cov_xy, 1.0, cov_yz], + [cov_xz, cov_yz, 1]]) + N = 1001 + rng = check_random_state(0) + samples = rng.multivariate_normal(mu, cov, size=N) + x, y, z = samples[:, 0], samples[:, 1], samples[:, 2] + + # Construct minor matrices for x and y + cov_x = cov[1:, 1:] + cov_y = cov[[0, 2]][:, [0, 2]] + cmi_analytic = -0.5 * np.log(det(cov) / (det(cov_x) * det(cov_y))) + # checking the computations with a large relative tolerance + for n_neighbors in [4, 6, 8]: + cmi_ij, _ = _compute_cmip_c(x, z, y, n_neighbors) + self.assertAlmostEqual(cmi_ij/cmi_analytic, 1, places=1) + + +class TestDCMI(unittest.TestCase): + def test_cmi_symmetry(self): + # The pair produced by the computations should be symmetric in the first two arguments + np.random.seed(42) + n_samples, n_neighbors = 4003, 4 + # Test continuous implementation + Xy = np.random.randn(n_samples, 3) + cmi_ij_pair = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for continuous conditional mutual information is not symmetric") + + c1 = np.random.randn(n_samples,) + c2 = np.random.randn(n_samples,) + n_ss = 1001 + d = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(d) + cmi_ij_pair = _compute_cmip_ccd(c1, c2, d, n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_ccd(c2, c1, d, n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for mixed continuous/discrete 
conditional mutual information is not symmetric") + + def test_cmi_discrete(self): + # We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. + # It is highly likely if our algorithm is correct, then formula holds. + np.random.seed(42) + # Test discrete implementation + n_samples, n_ss, n_neighbors = 103, 51, 4 + xi = np.random.randint(0, 11, (n_samples, )) + xj = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(xi) + np.random.shuffle(xj) + y = np.random.randint(-12, -10, (n_samples, )) + cmi_ij, cmi_ji = _compute_cmip_d(xi, xj, y) + # we estimate mutual information using + mi_i = mutual_info_classif( + xi.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) + mi_j = mutual_info_classif( + xj.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) + + self.assertTrue( + np.allclose(cmi_ij + mi_j - mi_i, cmi_ji, atol=1e-5), + msg="The chain rule for discrete conditional mutual information is violated") + cmi_ij2 = _compute_cmip_d(xj, xi, y) + self.assertTrue( + np.allclose(sum(cmi_ij2), cmi_ji+cmi_ij, atol=1e-5), + msg="Discrete conditional mutual information computation is not symmetric") diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 14214cc..cfe4c49 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -123,9 +123,13 @@ def test_pipeline(self): clf.predict(X) def test_alpha_0(self): - cqm = SelectFromQuadraticModel.correlation_cqm(self.X, self.y, num_features=3, alpha=0) + cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=0) self.assertTrue(not any(cqm.objective.linear.values())) + def test_alpha_1_no_quadratic(self): + cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=1) + self.assertTrue(not any(cqm.objective.quadratic.values())) + def test_alpha_1(self): rng = np.random.default_rng(42) @@ -179,7 +183,7 @@ def test_fixed_column(self): X[:, 1] = 0 X[:, 5] = 1 - cqm = SelectFromQuadraticModel.correlation_cqm(X, self.y, alpha=.5, num_features=5) + cqm = SelectFromQuadraticModel().correlation_cqm(X=X, y=self.y, alpha=.5, num_features=5) # in this case the linear bias for those two columns should be 0 self.assertEqual(cqm.objective.linear[1], 0) diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 107f4d8..5b469e5 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -45,7 +45,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import os.path import tempfile import unittest
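
Usage sketch (not part of the patch): the snippet below shows one way the mutual-information path added in this diff might be exercised without contacting a solver, by computing the interaction matrix with estimate_mi_matrix and building the CQM with SelectFromQuadraticModel.mutual_information_cqm. The synthetic dataset and printed summary are illustrative assumptions only; solving the CQM (for example through SelectFromQuadraticModel.fit) additionally requires access to a Leap hybrid CQM sampler and is not shown.

import numpy as np

from dwave.plugins.sklearn.transformers import SelectFromQuadraticModel
from dwave.plugins.sklearn._conditional_mutual_info import estimate_mi_matrix

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 6))               # six continuous features (synthetic)
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)   # discrete (binary) target

# Diagonal holds I(x_i; y); off-diagonal terms hold the symmetrized
# conditional mutual information (I(x_i; y | x_j) + I(x_j; y | x_i)) / 2.
mi = estimate_mi_matrix(X, y, discrete_target=True, conditional=True, random_state=0)
print(mi.shape)  # (6, 6)

# Build the constrained quadratic model locally; fitting with
# SelectFromQuadraticModel(method="mutual_information").fit(X, y, ...) would
# additionally submit it to a LeapHybridCQMSampler.
selector = SelectFromQuadraticModel(method="mutual_information", num_features=3)
cqm = selector.mutual_information_cqm(X, y, num_features=3, discrete_target=True)
print(len(cqm.variables), "binary variables,", len(cqm.constraints), "constraint")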