diff --git a/.circleci/config.yml b/.circleci/config.yml index 5c59d81..845014e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -205,22 +205,10 @@ workflows: name: test-linux-<< matrix.python-version >> | << matrix.pip-constraints >> matrix: parameters: - python-version: &python-versions ["3.7.9", "3.8.6", "3.9.0", "3.10.0", "3.11.0"] + python-version: &python-versions ["3.8.6", "3.9.0", "3.10.0", "3.11.0"] pip-constraints: - - "scikit-learn==1.0.2" # lowest supported by package - # - "scikit-learn~=1.0.0" # 1.0.2 is the highest in ~=1.0.0 - - "scikit-learn~=1.1.0" - - "scikit-learn~=1.2.0" # latest in current minor as of March 2023 + - "scikit-learn==1.2.1" # lowest supported by package - "scikit-learn~=1.0" # latest in current major - exclude: - # sklearn < 1.1.3 does not support Python 3.11 - - python-version: "3.11.0" - pip-constraints: "scikit-learn==1.0.2" - # sklearn > 1.0.2 does not support Python 3.7 - - python-version: "3.7.9" - pip-constraints: "scikit-learn~=1.1.0" - - python-version: "3.7.9" - pip-constraints: "scikit-learn~=1.2.0" - test-macos: matrix: diff --git a/dwave/plugins/sklearn/_conditional_mutual_info.py b/dwave/plugins/sklearn/_conditional_mutual_info.py new file mode 100644 index 0000000..36eec3c --- /dev/null +++ b/dwave/plugins/sklearn/_conditional_mutual_info.py @@ -0,0 +1,765 @@ +# The following traversal code is adapted from two sources +# I. Scikit-learn implementation for mutual information computation +# II. Computation of conditional mutual information in the repo +# https://github.com/jannisteunissen/mutual_information +# +# I. Methods and algorithms from `sklearn.feature_selection._mutual_info.py` +# +# Author: Nikolay Mayorov +# License: 3-clause BSD +# +# II. Modifications in https://github.com/jannisteunissen/mutual_information +# BSD 3-Clause License +# +# Copyright (c) 2022, Jannis Teunissen +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Modifications are distributed under the Apache 2.0 Software license. +# +# Copyright 2023 D-Wave Systems Inc. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from itertools import combinations
+import typing
+
+import numpy as np
+import numpy.typing as npt
+
+from scipy.sparse import issparse
+from scipy.special import digamma
+from scipy.stats import entropy
+from sklearn.metrics.cluster import mutual_info_score
+from sklearn.neighbors import NearestNeighbors, KDTree
+from sklearn.preprocessing import scale
+from sklearn.utils import check_random_state
+from sklearn.utils.parallel import Parallel, delayed
+from sklearn.utils.validation import check_array, check_X_y
+
+
+def estimate_mi_matrix(
+    X: npt.ArrayLike,
+    y: npt.ArrayLike,
+    discrete_features: typing.Union[str, npt.ArrayLike] = "auto",
+    discrete_target: bool = False,
+    n_neighbors: int = 4,
+    conditional: bool = True,
+    copy: bool = True,
+    random_state: typing.Union[None, int] = None,
+    n_jobs: typing.Union[None, int] = None,
+) -> npt.ArrayLike:
+    """
+    For the feature array `X` and the target array `y`, computes
+    the matrix of (conditional) mutual information interactions.
+    The matrix is defined as follows:
+
+    `M_(i, i) = I(x_i; y)`
+
+    If `conditional = True`, then the off-diagonal terms are computed as:
+
+    `M_(i, j) = (I(x_i; y | x_j) + I(x_j; y | x_i)) / 2`
+
+    Otherwise
+
+    `M_(i, j) = I(x_i; x_j)`
+
+    Computation of I(x; y) uses modified scikit-learn methods.
+    The computation of I(x; y | z) is based on
+    https://github.com/jannisteunissen/mutual_information and [3]_.
+
+    The method can be computationally expensive for a large number of features (> 1000) and
+    a large number of samples (> 100000). In this case, it can be advisable to downsample the
+    dataset.
+
+    The estimation methods are linear in the number of outcomes (labels) of discrete distributions.
+    It may be beneficial to treat discrete distributions with a large number of labels as
+    continuous distributions.
+
+    Args:
+        X: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        y: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        conditional: bool, default=True
+            Whether to compute the off-diagonal terms using the conditional mutual
+            information or the joint mutual information.
+
+        discrete_features: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        discrete_target: bool, default=False
+            Whether the target variable `y` is discrete.
+
+        n_neighbors: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        copy: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        random_state: See :func:`sklearn.feature_selection.mutual_info_regression`
+
+        n_jobs: int, default=None
+            The number of parallel jobs to run for the conditional mutual information
+            computation. The parallelization is over the columns of `X`.
+            ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+            ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+            for more details.
+
+    Returns:
+        mi_matrix : ndarray, shape (n_features, n_features)
+            Interaction matrix between the features using (conditional) mutual information.
+ A negative value will be replaced by 0. + + References: + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if isinstance(discrete_features, (str, bool)): + if isinstance(discrete_features, str): + if discrete_features == "auto": + discrete_features = issparse(X) + else: + raise ValueError("Invalid string value for discrete_features.") + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = check_array(discrete_features, ensure_2d=False) + if np.issubdtype(discrete_features.dtype, bool): + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + X = X.astype(np.float64, copy=copy) + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: + y = scale(y, with_mean=False) + y += ( + 1e-10 + * np.maximum(1, np.mean(np.abs(y))) + * rng.standard_normal(size=n_samples) + ) + + mi_matrix = np.zeros((n_features, n_features), dtype=np.float64) + # Computing the diagonal terms + diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors) + for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) + ) + np.fill_diagonal(mi_matrix, diagonal_vals) + # Computing the off-diagonal terms + off_diagonal_iter = combinations(zip(_iterate_columns(X), discrete_mask), 2) + if conditional: + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_cmi_distance) + (xi, xj, y, discrete_feature_i, discrete_feature_j, discrete_target, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + else: + off_diagonal_vals = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi) + (xi, xj, discrete_feature_i, discrete_feature_j, n_neighbors) + for (xi, discrete_feature_i), (xj, discrete_feature_j) in off_diagonal_iter) + + off_diagonal_val = iter(off_diagonal_vals) + for i, j in combinations(range(n_features), 2): + mi_matrix[i, j] = mi_matrix[j, i] = next(off_diagonal_val) + return mi_matrix + + +def _compute_cmi_distance( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_i: bool, + discrete_feature_j: bool, + discrete_target: bool, + n_neighbors: int = 4, +) -> float: + """ + Computes a distance `d` based on the conditional mutual information + between features `x_i` and `x_j`: `d = (I(x_i; y | x_j)+I(x_j; y | x_i))/2`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. 
+ xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + discrete_feature_i: + Whether to consider `xi` as a discrete variable. + discrete_feature_j: + Whether to consider `xj` as a discrete variable. + discrete_target: + Whether to consider `y` as a discrete variable. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Distance between features based on conditional mutual information. + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [3] Mesner, Octavio César, and Cosma Rohilla Shalizi. "Conditional + mutual information estimation for mixed, discrete and continuous + data." IEEE Transactions on Information Theory 67.1 (2020): 464-484. + """ + if discrete_feature_i and discrete_feature_j and discrete_target: + ans = _compute_cmip_d(xi, xj, y) + elif discrete_target: + ans = _compute_cmip_ccd(xi, xj, y, n_neighbors) + elif discrete_feature_i and (not discrete_target): + ans = _compute_cmip_cdc(xj, xi, y, n_neighbors) + elif (not discrete_feature_i) and discrete_feature_j and (not discrete_target): + ans = _compute_cmip_cdc(xi, xj, y, n_neighbors) + else: + ans = _compute_cmip_c(xi, xj, y, n_neighbors) + return np.mean(ans) + + +def _compute_cmip_d( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` and `I(x_j; y | x_i)`. + All random variables `x_i`, `x_j` and `y` are discrete. + + Adapted from https://github.com/dwave-examples/mutual-information-feature-selection + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + # Computing joint probability distribution for the features and the target + bin_boundaries = [np.hstack((np.unique(col), np.inf)) for col in (xi, xj, y)] + prob, _ = np.histogramdd(np.vstack((xi, xj, y)).T, bins=bin_boundaries) + + # Computing entropy + Hijy = entropy(prob.flatten()) # H(x_i, x_j, y) + Hij = entropy(prob.sum(axis=2).flatten()) # H(x_i, x_j) + cmi_ij = ( + Hij-Hijy + + entropy(prob.sum(axis=0).flatten()) # H(x_j, y) + - entropy(prob.sum(axis=(0, 2))) # H(x_j) + ) + cmi_ji = ( + Hij-Hijy + + entropy(prob.sum(axis=1).flatten()) # H(x_i, y) + - entropy(prob.sum(axis=(1, 2))) # H(x_i) + ) + return cmi_ij, cmi_ji + + +def _compute_cmip_c( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. All random variables `x_i`, `x_j` and `y` + are assumed to be continuous. + + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. 
+ + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius = _get_radius_k_neighbours(np.hstack((xi, xj, y)), + n_neighbors=n_neighbors) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + n_jy = _num_points_within_radius(np.hstack((y, xj)), radius) + + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((y, xi)), radius) + + return _get_cmi_pair_from_estimates([n_neighbors], n_ij, n_j, n_jy, n_i, n_iy) + + +def _compute_cmip_ccd( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `x_j` are assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + xj = xj.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, xj)), y, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + # label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + # continuous + n_i = _num_points_within_radius(xi, radius) + n_j = _num_points_within_radius(xj, radius) + n_ij = _num_points_within_radius(np.hstack((xi, xj)), radius) + # mixed discrete/continuous estimates + n_iy = _num_points_within_radius_cd(xi, y, radius) + n_jy = _num_points_within_radius_cd(xj, y, radius) + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) + + +def _compute_cmip_cdc( + xi: npt.ArrayLike, + xj: npt.ArrayLike, + y: npt.ArrayLike, + n_neighbors: int = 4) -> typing.Tuple[float]: + """ + Computes conditional mutual information pair `I(x_i; y | x_j)` + and `I(x_j; y | x_i)`. Random variables `x_i`, `y` are assumed + to be continuous, while `x_j` is assumed to be discrete. 
+ + Adapted from https://github.com/jannisteunissen/mutual_information + :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + xi: + A feature vector formatted as a numerical 1D array-like. + xj: + A feature vector formatted as a numerical 1D array-like. + y: + Target vector formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Pair of conditional mutual information values between two features and the target vector. + """ + xi = xi.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius, label_counts, k_all = _get_radius_k_neighbours_d(np.hstack((xi, y)), xj, + n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + label_counts = label_counts[mask] + k_all = k_all[mask] + xi = xi[mask] + xj = xj[mask] + y = y[mask] + radius = radius[mask] + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + # continuous + n_i = _num_points_within_radius(xi, radius) + n_iy = _num_points_within_radius(np.hstack((xi, y)), radius) + # mixed coninuous/discrete estimates + n_ij = _num_points_within_radius_cd(xi, xj, radius) + n_jy = _num_points_within_radius_cd(y, xj, radius) + # discrete estimates + n_j = label_counts + + return _get_cmi_pair_from_estimates(k_all, n_ij, n_j, n_jy, n_i, n_iy) + + +def _get_cmi_pair_from_estimates( + k_all: npt.ArrayLike, n_ij: npt.ArrayLike, n_j: npt.ArrayLike, + n_jy: npt.ArrayLike, n_i: npt.ArrayLike, n_iy: npt.ArrayLike) -> typing.Tuple[float]: + """Get an estimate from nearest neighbors counts""" + common_terms = np.mean(digamma(k_all))-np.mean(digamma(n_ij)) + cmi_ij = common_terms+np.mean(digamma(n_j))-np.mean(digamma(n_jy)) + cmi_ji = common_terms+np.mean(digamma(n_i))-np.mean(digamma(n_iy)) + return max(0, cmi_ij), max(0, cmi_ji) + + +def _compute_mi( + x: npt.ArrayLike, + y: npt.ArrayLike, + discrete_feature_x: bool, + discrete_feature_y: bool, + n_neighbors: int = 4, +): + """ + Compute mutual information between features `I(x_i; x_j)`. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. + discrete_feature_x, discrete_feature_y: + Whether to consider `x` and `y` as discrete variables. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. 
+ """ + if discrete_feature_y and discrete_feature_x: + return mutual_info_score(x, y) + elif discrete_feature_x: + return _compute_mi_cd(y, x, n_neighbors=n_neighbors) + elif discrete_feature_y: + return _compute_mi_cd(x, y, n_neighbors=n_neighbors) + else: + return _compute_mi_cc(x, y, n_neighbors=n_neighbors) + + +def _compute_mi_cc(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variables `x`, `y` are assumed + to be continuous. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + """ + n_samples = x.size + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + radius = _get_radius_k_neighbours(np.hstack((x, y)), n_neighbors) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + nx = _num_points_within_radius(x, radius) + ny = _num_points_within_radius(y, radius) + + mi = ( + digamma(n_samples) + + digamma(n_neighbors) + - np.mean(digamma(nx)) + - np.mean(digamma(ny)) + ) + + return max(0, mi) + + +def _compute_mi_cd(x: npt.ArrayLike, y: npt.ArrayLike, n_neighbors: int) -> float: + """Computes mutual information `I(x; y)`. Random variable `x`, is assumed + to be continuous, while `y` is assumed to be discrete. + + Adapted from :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + x, y: + Feature vectors each formatted as a numerical 1D array-like. + n_neighbors: + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + Returns: + Mutual information between feature vectors x and y. + """ + radius, label_counts, k_all = _get_radius_k_neighbours_d(x, y, n_neighbors=n_neighbors) + + # Ignore points with unique labels. + mask = label_counts > 1 + n_samples = np.sum(mask) + label_counts = label_counts[mask] + k_all = k_all[mask] + x = x[mask] + radius = radius[mask] + + m_all = _num_points_within_radius(x.reshape(-1, 1), radius) + + mi = ( + digamma(n_samples) + + np.mean(digamma(k_all)) + - np.mean(digamma(label_counts)) + - np.mean(digamma(m_all)) + ) + + return max(0, mi) + + +def _get_radius_k_neighbours(X: npt.ArrayLike, n_neighbors: int = 4) -> npt.ArrayLike: + """ + Determine the smallest radius around `X` containing `n_neighbors` neighbors + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + + Args: + X: + Array of features. + n_neighbors: + Number of nearest neighbors. + Returns: + Vector of radii defined by the k nearest neighbors. + """ + nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) + return _get_radius_from_nn(X, nn) + + +def _get_radius_k_neighbours_d( + c: npt.ArrayLike, + d: npt.ArrayLike, + n_neighbors: int = 4 +) -> typing.Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]: + """ + Determine smallest radius around `c` and `d` containing `n_neighbors`. 
+ + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + c: + Array of feature vectors. + d: + Vector of discrete features formatted as a numerical 1D array-like. + n_neighbors: + The number of nearest neighbors. + Returns: + radius: + Vector of radii defined by the k nearest neighbors. + label_counts: + Label counts of the discrete feature vector. + k_all: + Array of the number of points within a radius. + """ + d = d.flatten() + n_samples = d.shape[0] + c = c.reshape((n_samples, -1)) + + radius = np.empty(n_samples) + label_counts = np.empty(n_samples) + k_all = np.empty(n_samples) + nn = NearestNeighbors(metric="chebyshev") + for label in np.unique(d): + mask = d == label + count = np.sum(mask) + if count > 1: + k = min(n_neighbors, count - 1) + nn.set_params(n_neighbors=k) + radius[mask] = _get_radius_from_nn(c[mask], nn) + k_all[mask] = k + label_counts[mask] = np.sum(mask) + return radius, label_counts, k_all + + +def _get_radius_from_nn(X: npt.ArrayLike, nn: NearestNeighbors) -> npt.ArrayLike: + """ + Get the radius of nearest neighbors in `nn` model with dataset `X`. + + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + X: + Array of features. + nn: + Instance of the nearest neighbors class. + Returns: + Vector of radii defined by the k nearest neighbors. + """ + nn.fit(X) + r = nn.kneighbors()[0] + return np.nextafter(r[:, -1], 0) + + +def _num_points_within_radius(x: npt.ArrayLike, radius: npt.ArrayLike) -> npt.ArrayLike: + """For each point, determine the number of other points within a given radius + Function from https://github.com/jannisteunissen/mutual_information + Args: + X: + (Continuous) feature array. + radius: + Vector of radii. + Returns: + Vector containing the number of points within a radius. + """ + kd = KDTree(x, metric="chebyshev") + nx = kd.query_radius(x, radius, count_only=True, return_distance=False) + return np.array(nx) + + +def _num_points_within_radius_cd( + c: npt.ArrayLike, + d: npt.ArrayLike, + radius: npt.ArrayLike) -> npt.ArrayLike: + """ + For each point, determine the number of other points within a given radius. + + Inspired by :func:`sklearn.feature_selection.mutual_info_regression` and + :func:`sklearn.feature_selection.mutual_info_classif`. + Args: + c: + (Continuous) feature array. + d: + Discrete feature vector formatted as a numerical 1D array-like + radius: + Vector of radii. + Returns: + Vector containing the number of points within a radius. + """ + c = c.reshape((-1, 1)) + n_samples = c.shape[0] + m_all = np.empty(n_samples) + for label in np.unique(d): + mask = d == label + m_all[mask] = _num_points_within_radius(c[mask], radius[mask]) + return m_all + + +def _iterate_columns(X, columns=None): + """Iterate over columns of a matrix. + Copied from :func:`sklearn.feature_selection._mutual_info`. + + Parameters + ---------- + X : ndarray or csc_matrix, shape (n_samples, n_features) + Matrix over which to iterate. + + columns : iterable or None, default=None + Indices of columns to iterate over. If None, iterate over all columns. + + Yields + ------ + x : ndarray, shape (n_samples,) + Columns of `X` in dense format. 
+ """ + if columns is None: + columns = range(X.shape[1]) + + if issparse(X): + for i in columns: + x = np.zeros(X.shape[0]) + start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] + x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] + yield x + else: + for i in columns: + yield X[:, i] diff --git a/dwave/plugins/sklearn/transformers.py b/dwave/plugins/sklearn/transformers.py index aa186be..1cb8f50 100644 --- a/dwave/plugins/sklearn/transformers.py +++ b/dwave/plugins/sklearn/transformers.py @@ -14,24 +14,20 @@ from __future__ import annotations -import itertools -import logging import tempfile import typing -import warnings import dimod -import numpy as np -import numpy.typing as npt - from dwave.cloud.exceptions import ConfigFileError, SolverAuthenticationError +from dwave.plugins.sklearn._conditional_mutual_info import estimate_mi_matrix +from dwave.plugins.sklearn.utilities import corrcoef from dwave.system import LeapHybridCQMSampler +import numpy as np +import numpy.typing as npt from sklearn.base import BaseEstimator from sklearn.feature_selection import SelectorMixin from sklearn.utils.validation import check_is_fitted -from dwave.plugins.sklearn.utilities import corrcoef - __all__ = ["SelectFromQuadraticModel"] @@ -50,21 +46,36 @@ class SelectFromQuadraticModel(SelectorMixin, BaseEstimator): :class:`sklearn.feature_selection.SelectKBest`. num_features: The number of features to select. + method: + If equal to ``correlation`` uses a correlation as a criterion for + choosing the features according to [1]_. If equal to ``mutual_information`` + uses mutual information criteria [2]_ or [3]_ (advances feature). time_limit: The time limit for the run on the hybrid solver. + .. [1] [Milne et al.] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature + Selection in Credit Scoring and Classification Using a Quantum Annealer." + 1QBit; White Paper. + https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer + .. [2] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria + of max-dependency, max-relevance, and min-redundancy. IEEE Transactions on pattern + analysis and machine intelligence, 27(8):1226–1238, 2005. + .. [3] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for + mutual information based feature selection. In Proceedings of the 20th ACM SIGKDD + international conference on Knowledge discovery and data mining, + pages 512–521. ACM, 2014. """ ACCEPTED_METHODS = [ "correlation", - # "mutual information", # todo - ] + "mutual_information", + ] def __init__( self, *, alpha: float = .5, - method: str = "correlation", # undocumented until there is another supported + method: str = "correlation", num_features: int = 10, time_limit: typing.Optional[float] = None, ): @@ -110,8 +121,8 @@ def _get_support_mask(self) -> np.ndarray[typing.Any, np.dtype[np.bool_]]: except AttributeError: raise RuntimeError("fit hasn't been run yet") - @staticmethod def correlation_cqm( + self, X: npt.ArrayLike, y: npt.ArrayLike, *, @@ -122,7 +133,7 @@ def correlation_cqm( """Build a constrained quadratic model for feature selection. This method is based on maximizing influence and independence as - measured by correlation [Milne et al.]_. + measured by correlation [1]_. Args: X: @@ -147,32 +158,12 @@ def correlation_cqm( Returns: A constrained quadratic model. - .. [Milne et al.] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature + .. 
[1] Milne, Andrew, Maxwell Rounds, and Phil Goddard. 2017. "Optimal Feature
            Selection in Credit Scoring and Classification Using a Quantum Annealer."
            1QBit; White Paper.
            https://1qbit.com/whitepaper/optimal-feature-selection-in-credit-scoring-classification-using-quantum-annealer
        """
-
-        X = np.atleast_2d(np.asarray(X))
-        y = np.asarray(y)
-
-        if X.ndim != 2:
-            raise ValueError("X must be a 2-dimensional array-like")
-
-        if y.ndim != 1:
-            raise ValueError("y must be a 1-dimensional array-like")
-
-        if y.shape[0] != X.shape[0]:
-            raise ValueError(f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}")
-
-        if not 0 <= alpha <= 1:
-            raise ValueError(f"alpha must be between 0 and 1, given {alpha}")
-
-        if num_features <= 0:
-            raise ValueError(f"num_features must be a positive integer, given {num_features}")
-
-        if X.shape[0] <= 1:
-            raise ValueError("X must have at least two rows")
+        self._check_params(X, y, alpha, num_features)

        cqm = dimod.ConstrainedQuadraticModel()
        cqm.add_variables(dimod.BINARY, X.shape[1])
@@ -183,7 +174,7 @@ def correlation_cqm(
            '==' if strict else '<=',
            num_features,
            label=f"{num_features}-hot",
-            )
+        )

        with tempfile.TemporaryFile() as fX, tempfile.TemporaryFile() as fout:
            # we make a copy of X because we'll be modifying it in-place within
@@ -198,27 +189,139 @@ def correlation_cqm(
                dtype=np.result_type(X, y),
                mode="w+",
                shape=(X_copy.shape[1], X_copy.shape[1]),
-                )
-
+            )
            # main calculation. It modifies X_copy in-place
            corrcoef(X_copy, out=correlations, rowvar=False, copy=False)
-
            # we don't care about the direction of correlation in terms of
            # the penalty/quality
            np.absolute(correlations, out=correlations)
-
+            # multiply all but the last row and column by (1 - alpha)
+            np.multiply(correlations[:-1, :-1], (1 - alpha), out=correlations[:-1, :-1])
            # our objective
-            # we multiply by 2 because the matrix is symmetric
-            np.fill_diagonal(correlations, correlations[:, -1] * (-2 * alpha * num_features))
-
-            # Note: the full symmetric matrix (with both upper- and lower-diagonal
-            # entries for each correlation coefficient) is retained for consistency with
-            # the original formulation from Milne et al.
+            # we multiply by num_features to have consistent performance
+            # with the increase of the number of features
+            np.fill_diagonal(correlations, correlations[:, -1] * (- alpha * num_features))
+            # Note: we only add terms on and above the diagonal
            it = np.nditer(correlations[:-1, :-1], flags=['multi_index'], op_flags=[['readonly']])
            cqm.set_objective((*it.multi_index, x) for x in it if x)

        return cqm

+    def mutual_information_cqm(
+        self,
+        X: npt.ArrayLike,
+        y: npt.ArrayLike,
+        *,
+        num_features: int,
+        alpha: float = 0.5,
+        strict: bool = True,
+        conditional: bool = True,
+        discrete_features: typing.Union[str, npt.ArrayLike] = "auto",
+        discrete_target: bool = False,
+        copy: bool = True,
+        n_neighbors: int = 4,
+        n_jobs: typing.Union[None, int] = None,
+        random_state: typing.Union[None, int] = None,
+    ) -> dimod.ConstrainedQuadraticModel:
+        """Build a constrained quadratic model for feature selection.
+
+        If ``conditional`` is True then the conditional mutual information
+        criterion from [2]_ is used, and if ``conditional`` is False then
+        mutual information based criteria from [1]_ are used.
+
+        For the computation of mutual information and conditional mutual information,
+        see :func:`~dwave.plugins.sklearn._conditional_mutual_info.estimate_mi_matrix`.
+
+        Args:
+            X:
+                Feature vectors formatted as a numerical 2D array-like.
+            y:
+                Class labels formatted as a numerical 1D array-like.
+            alpha:
+                Hyperparameter between 0 and 1 that controls the relative weight of
+                the relevance and redundancy terms.
+                ``alpha=0`` places the maximum weight on the feature redundancy.
+                ``alpha=1`` places no weight on the feature redundancy,
+                and therefore will be equivalent to using
+                :class:`sklearn.feature_selection.SelectKBest`.
+                If ``conditional=True``, ``alpha = 0`` is the default value used in [2]_.
+                If ``conditional=False``, ``alpha = (num_features - 1) / num_features``
+                is the default value used in [1]_.
+            num_features:
+                The number of features to select.
+            strict:
+                If ``False`` the constraint on the number of selected features
+                is ``<=`` rather than ``==``.
+            conditional: bool, default=True
+                Whether to compute the off-diagonal terms using the conditional mutual
+                information or the joint mutual information.
+            discrete_features:
+                See :func:`sklearn.feature_selection.mutual_info_regression`
+            discrete_target: bool, default=False
+                Whether the target variable `y` is discrete.
+            n_neighbors:
+                See :func:`sklearn.feature_selection.mutual_info_regression`
+            copy:
+                See :func:`sklearn.feature_selection.mutual_info_regression`
+            random_state:
+                See :func:`sklearn.feature_selection.mutual_info_regression`
+            n_jobs: int, default=None
+                The number of parallel jobs to run for the conditional mutual information
+                computation. The parallelization is over the columns of `X`.
+                ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+                ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+                for more details.
+
+        Returns:
+            A constrained quadratic model.
+
+        References:
+            .. [1] Peng, F. Long, and C. Ding. Feature selection based on mutual information criteria
+                of max-dependency, max-relevance, and min-redundancy. IEEE Transactions on pattern
+                analysis and machine intelligence, 27(8):1226–1238, 2005.
+            .. [2] X. V. Nguyen, J. Chan, S. Romano, and J. Bailey. Effective global approaches for
+                mutual information based feature selection. In Proceedings of the 20th ACM SIGKDD
+                international conference on Knowledge discovery and data mining,
+                pages 512–521. ACM, 2014.
+        """
+
+        self._check_params(X, y, alpha, num_features)
+
+        cqm = dimod.ConstrainedQuadraticModel()
+        cqm.add_variables(dimod.BINARY, X.shape[1])
+
+        # add the k-hot constraint
+        cqm.add_constraint(
+            ((v, 1) for v in cqm.variables),
+            '==' if strict else '<=',
+            num_features,
+            label=f"{num_features}-hot",
+        )
+
+        mi = estimate_mi_matrix(
+            X, y,
+            discrete_features=discrete_features,
+            discrete_target=discrete_target,
+            n_neighbors=n_neighbors, copy=copy,
+            random_state=random_state, n_jobs=n_jobs,
+            conditional=conditional)
+
+        diagonal = -np.diag(mi).copy()
+        if conditional:
+            # method from [2]_ with another tuning dial:
+            # multiplying the off-diagonal terms by -(1 - alpha)
+            np.multiply(mi, -(1-alpha), out=mi)
+        else:
+            # method from [1]_:
+            # multiplying the off-diagonal terms by 1 - alpha
+            np.multiply(mi, 1-alpha, out=mi)
+        # keeping the diagonal
+        np.fill_diagonal(mi, diagonal)
+
+        it = np.nditer(mi, flags=['multi_index'], op_flags=[['readonly']])
+        cqm.set_objective((*it.multi_index, x) for x in it if x)
+        return cqm
+
    def fit(
        self,
        X: npt.ArrayLike,
@@ -227,6 +330,7 @@ def fit(
        alpha: typing.Optional[float] = None,
        num_features: typing.Optional[int] = None,
        time_limit: typing.Optional[float] = None,
+        **kwargs
    ) -> SelectFromQuadraticModel:
        """Select the features to keep.
@@ -277,8 +381,9 @@ def fit( if self.method == "correlation": cqm = self.correlation_cqm(X, y, num_features=num_features, alpha=alpha) - # elif self.method == "mutual information": - # cqm = self.mutual_information_cqm(X, y, num_features=num_features) + elif self.method == "mutual_information": + cqm = self.mutual_information_cqm( + X, y, num_features=num_features, alpha=alpha, **kwargs) else: raise ValueError(f"only methods {self.acceptable_methods} are implemented") @@ -311,3 +416,30 @@ def fit( def unfit(self): """Undo a previously executed ``fit`` method.""" del self._mask + + @staticmethod + def _check_params(X: npt.ArrayLike, + y: npt.ArrayLike, + alpha: float, + num_features: int): + X = np.atleast_2d(np.asarray(X)) + y = np.asarray(y) + + if X.ndim != 2: + raise ValueError("X must be a 2-dimensional array-like") + + if y.ndim != 1: + raise ValueError("y must be a 1-dimensional array-like") + + if y.shape[0] != X.shape[0]: + raise ValueError( + f"requires: X.shape[0] == y.shape[0] but {X.shape[0]} != {y.shape[0]}") + + if not 0 <= alpha <= 1: + raise ValueError(f"alpha must be between 0 and 1, given {alpha}") + + if num_features <= 0: + raise ValueError(f"num_features must be a positive integer, given {num_features}") + + if X.shape[0] <= 1: + raise ValueError("X must have at least two rows") diff --git a/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml b/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml new file mode 100644 index 0000000..dc85f5f --- /dev/null +++ b/releasenotes/notes/algorithm-update-d73cd4f7c854a8b3.yaml @@ -0,0 +1,3 @@ +--- +fixes: + - Multiply off-diagonal terms of the correlation matrix with 1-alpha. \ No newline at end of file diff --git a/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml new file mode 100644 index 0000000..b809dc2 --- /dev/null +++ b/releasenotes/notes/mutual-info-ddd1a23e320f88ae.yaml @@ -0,0 +1,9 @@ +--- +features: + - | + Add feature selection method using (conditional) mutual information +deprecations: + - | + Drop support for Python 3.7. + - | + Drop support for scikit-learn 1.2.0 diff --git a/requirements.txt b/requirements.txt index 00814cb..cc26dd2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,8 @@ numpy==1.20.0; python_version<"3.10.0" # oldest supported by dwave-system numpy==1.22.0; python_version~="3.10.0" # needed for scipy on windows oldest-supported-numpy; python_version>="3.11.0" -scikit-learn==1.2.0; python_version >= '3.8' -scikit-learn==1.0.2; python_version < '3.8' +scikit-learn==1.2.1 +scipy==1.9.0; python_version<"3.11.0" +scipy==1.9.2; python_version=="3.11.0" reno==3.5.0 diff --git a/setup.cfg b/setup.cfg index 269ee72..b036d93 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,12 +23,13 @@ install_requires = dimod>=0.12.3 dwave-system>=1.18.0 numpy>=1.20.0 - scikit-learn>=1.0.2 -packages = + scikit-learn>=1.2.1 + scipy>=1.9.0 +packages = dwave dwave.plugins dwave.plugins.sklearn -python_requires = >=3.7 +python_requires = >=3.8 [pycodestyle] max-line-length = 100 diff --git a/tests/test_mutual_info.py b/tests/test_mutual_info.py new file mode 100644 index 0000000..45000b0 --- /dev/null +++ b/tests/test_mutual_info.py @@ -0,0 +1,369 @@ + +# Tests in TestMI are adapted from sklearn tests and licenced under: +# BSD 3-Clause License +# +# Copyright (c) 2007-2024 The scikit-learn developers. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Tests in TestCMI are adapted from https://github.com/jannisteunissen/mutual_information and +# distributed under the following licence: +# +# BSD 3-Clause License +# +# Copyright (c) 2022, Jannis Teunissen +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The modifications of the test and the tests in TestDCMI are distributed +# under the Apache 2.0 Software license: +# +# Copyright 2023 D-Wave Systems Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +from dwave.plugins.sklearn._conditional_mutual_info import ( + _compute_mi, + _compute_cmip_c, _compute_cmip_d, + _compute_cmip_cdc, _compute_cmip_ccd +) + +from numpy.linalg import det +from sklearn.feature_selection import mutual_info_classif +from sklearn.preprocessing import scale +from sklearn.utils import check_random_state + + +class TestMI(unittest.TestCase): + def test_against_sklearn(self): + # the test will break if sklearn modifies its implementation + from sklearn.feature_selection._mutual_info import _compute_mi_cd, _compute_mi_cc + from sklearn.metrics.cluster import mutual_info_score + # Test discrete implementation + n_samples, n_ss, n_neighbors, random_state = 103, 51, 4, 15 + np.random.seed(random_state) + xi = np.random.randint(0, 11, (n_samples, )) + xj = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(xi) + np.random.shuffle(xj) + xi_c = scale(np.random.randn(n_samples), with_mean=False) + xj_c = scale(np.random.randn(n_samples), with_mean=False) + + mi_ij = _compute_mi( + xi, xj, True, True, n_neighbors=n_neighbors) + mi_ij_cl = mutual_info_score(xi, xj) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij), + msg="Discrete mutual information is not computed correctly") + + mi_ij = _compute_mi( + xi_c, xj, False, True, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cd(xi_c, xj, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg=f"Error in continuous features/discrete target MI estimation is larger than expected {abs(mi_ij_cl-mi_ij)}") + + mi_ij = _compute_mi(xi_c, xj_c, False, False, n_neighbors=n_neighbors) + mi_ij_cl = _compute_mi_cc(xi_c, xj_c, n_neighbors=n_neighbors) + self.assertTrue( + np.allclose(mi_ij_cl, mi_ij, atol=1e-5), + msg=f"Error in purely continuous MI is larger than expected {abs(mi_ij_cl-mi_ij)}") + + def test_compute_mi_dd(self): + # In discrete case computations are straightforward and can be done + # by hand on given vectors. + x = np.array([0, 1, 1, 0, 0]) + y = np.array([1, 0, 0, 0, 1]) + + H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5) + H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5) + I_xy = H_x + H_y - H_xy + + self.assertAlmostEqual(_compute_mi(x, y, discrete_feature_x=True, + discrete_feature_y=True), I_xy, places=4) + + def test_compute_mi_cc(self): + # For two continuous variables a good approach is to test on bivariate + # normal distribution, where mutual information is known. + + # Mean of the distribution, irrelevant for mutual information. + mean = np.zeros(2) + + # Setup covariance matrix with correlation coeff. equal 0.5. + sigma_1 = 1 + sigma_2 = 10 + corr = 0.5 + cov = np.array( + [ + [sigma_1**2, corr * sigma_1 * sigma_2], + [corr * sigma_1 * sigma_2, sigma_2**2], + ] + ) + + # True theoretical mutual information. 
+ I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov)) + + rng = check_random_state(0) + Z = rng.multivariate_normal(mean, cov, size=5001).astype(np.float64, copy=False) + + x, y = Z[:, 0], Z[:, 1] + + # Theory and computed values won't be very close + # We here check with a large relative tolerance + for n_neighbors in [4, 6, 8]: + I_computed = _compute_mi( + x, y, discrete_feature_x=False, discrete_feature_y=False, n_neighbors=n_neighbors + ) + self.assertLessEqual(abs(I_computed/I_theory-1.0), 1e-1) + + def test_compute_mi_cd(self): + # To test define a joint distribution as follows: + # p(x, y) = p(x) p(y | x) + # X ~ Bernoulli(p) + # (Y | x = 0) ~ Uniform(-1, 1) + # (Y | x = 1) ~ Uniform(0, 2) + + # Use the following formula for mutual information: + # I(X; Y) = H(Y) - H(Y | X) + # Two entropies can be computed by hand: + # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2*log(p/2) - 1/2*log(1/2) + # H(Y | X) = ln(2) + + # Now we need to implement sampling from out distribution, which is + # done easily using conditional distribution logic. + + n_samples = 5001 + rng = check_random_state(0) + + for p in [0.3, 0.5, 0.7]: + x = rng.uniform(size=n_samples) > p + + y = np.empty(n_samples, np.float64) + mask = x == 0 + y[mask] = rng.uniform(-1, 1, size=np.sum(mask)) + y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)) + + I_theory = -0.5 * ( + (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5) + ) - np.log(2) + + # Assert the same tolerance. + for n_neighbors in [4, 6, 8]: + I_computed = _compute_mi( + x, y, discrete_feature_x=True, discrete_feature_y=False, n_neighbors=n_neighbors + ) + self.assertLessEqual(abs(I_computed/I_theory-1.0), 1e-1) + + def test_compute_mi_cd_unique_label(self): + # Test that adding unique label doesn't change MI. + n_samples = 100 + x = np.random.uniform(size=n_samples) > 0.5 + + y = np.empty(n_samples, np.float64) + mask = x == 0 + y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)) + y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)) + + mi_1 = _compute_mi(x, y, discrete_feature_x=True, discrete_feature_y=False) + + x = np.hstack((x, 2)) + y = np.hstack((y, 10)) + mi_2 = _compute_mi(x, y, discrete_feature_x=True, discrete_feature_y=False) + + self.assertAlmostEqual(mi_1, mi_2, places=5) + + +class TestCMI(unittest.TestCase): + def test_discrete(self): + # Third test from "Conditional Mutual Information Estimation for Mixed Discrete + # and Continuous Variables with Nearest Neighbors" (Mesner & Shalizi). Fully + # discrete + def rescale(xi): + xi = scale(xi + 0.0, with_mean=False, copy=False) + # Add small noise to continuous features as advised in Kraskov et. al. 
+ xi = xi.astype(np.float64, copy=False) + means = np.maximum(1, np.mean(np.abs(xi), axis=0)) + xi += (1e-10 * means * rng.standard_normal(size=(xi.shape[0], ))) + return xi + N = 1001 + rng = check_random_state(0) + choices = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1]]) + p_discrete = rng.choice(4, p=[0.4, 0.4, 0.1, 0.1], size=N) + xy_discrete = choices[p_discrete] + x, y = xy_discrete[:, 0], xy_discrete[:, 1] + z = rng.poisson(2, size=N) + x_c = rescale(x) + y_c = rescale(y) + z_c = rescale(z) + cmi_analytic = 2 * 0.4 * np.log(0.4/0.5**2) + 2 * 0.1 * np.log(0.1/0.5**2) + # checking the computations with a large relative tolerance + for n_neighbors in [3, 5, 7]: + cmi_ij, _ = _compute_cmip_d(x, z, y) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_c(x_c, z_c, y_c, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_ccd(x_c, z_c, y, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + cmi_ij, _ = _compute_cmip_cdc(x_c, z, y_c, n_neighbors) + self.assertLessEqual(abs(cmi_ij/cmi_analytic-1.0), 1e-1) + + def test_bivariate(self): + """Test with bivariate normal variables""" + N = 1001 + rng = check_random_state(0) + mu = np.zeros(2) + cov = np.array([[1., 0.8], [0.8, 1.0]]) + xy_gauss = rng.multivariate_normal(mu, cov, size=N) + x, y = xy_gauss[:, 0], xy_gauss[:, 1] + z = rng.normal(size=N) + + cmi_analytic = -0.5 * np.log(det(cov)) + # checking the computations with a large relative tolerance + for n_neighbors in [4, 6, 8]: + cmi_ij, _ = _compute_cmip_c(x, z, y, n_neighbors) + self.assertAlmostEqual(cmi_ij/cmi_analytic, 1.0, places=1) + + def test_trivariate(self): + # Test with 'trivariate' normal variables x, y, z + + mu = np.zeros(3) + + # Covariance matrix + cov_xy = 0.7 + cov_xz = 0.5 + cov_yz = 0.3 + cov = np.array([[1, cov_xy, cov_xz], + [cov_xy, 1.0, cov_yz], + [cov_xz, cov_yz, 1]]) + N = 1001 + rng = check_random_state(0) + samples = rng.multivariate_normal(mu, cov, size=N) + x, y, z = samples[:, 0], samples[:, 1], samples[:, 2] + + # Construct minor matrices for x and y + cov_x = cov[1:, 1:] + cov_y = cov[[0, 2]][:, [0, 2]] + cmi_analytic = -0.5 * np.log(det(cov) / (det(cov_x) * det(cov_y))) + # checking the computations with a large relative tolerance + for n_neighbors in [4, 6, 8]: + cmi_ij, _ = _compute_cmip_c(x, z, y, n_neighbors) + self.assertAlmostEqual(cmi_ij/cmi_analytic, 1, places=1) + + +class TestDCMI(unittest.TestCase): + def test_cmi_symmetry(self): + # The pair produced by the computations should be symmetric in the first two arguments + np.random.seed(42) + n_samples, n_neighbors = 4003, 4 + # Test continuous implementation + Xy = np.random.randn(n_samples, 3) + cmi_ij_pair = _compute_cmip_c(Xy[:, 0], Xy[:, 1], Xy[:, 2], n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_c(Xy[:, 1], Xy[:, 0], Xy[:, 2], n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for continuous conditional mutual information is not symmetric") + + c1 = np.random.randn(n_samples,) + c2 = np.random.randn(n_samples,) + n_ss = 1001 + d = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(d) + cmi_ij_pair = _compute_cmip_ccd(c1, c2, d, n_neighbors=n_neighbors) + cmi_ji_pair = _compute_cmip_ccd(c2, c1, d, n_neighbors=n_neighbors) + self.assertAlmostEqual( + sum(cmi_ij_pair), sum(cmi_ji_pair), places=3, + msg="Computation for mixed continuous/discrete 
conditional mutual information is not symmetric") + + def test_cmi_discrete(self): + # We test the algorithm using the formula `I(x;y|z) = I(x;y) + I(z;y) - I(z;y|x)`. + # It is highly likely if our algorithm is correct, then formula holds. + np.random.seed(42) + # Test discrete implementation + n_samples, n_ss, n_neighbors = 103, 51, 4 + xi = np.random.randint(0, 11, (n_samples, )) + xj = np.hstack(( + np.random.randint(-2, 1, (n_ss, )), + np.random.randint(10, 14, (n_samples-n_ss, )))) + np.random.shuffle(xi) + np.random.shuffle(xj) + y = np.random.randint(-12, -10, (n_samples, )) + cmi_ij, cmi_ji = _compute_cmip_d(xi, xj, y) + # we estimate mutual information using + mi_i = mutual_info_classif( + xi.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) + mi_j = mutual_info_classif( + xj.reshape(-1, 1), y, discrete_features=True, n_neighbors=n_neighbors) + + self.assertTrue( + np.allclose(cmi_ij + mi_j - mi_i, cmi_ji, atol=1e-5), + msg="The chain rule for discrete conditional mutual information is violated") + cmi_ij2 = _compute_cmip_d(xj, xi, y) + self.assertTrue( + np.allclose(sum(cmi_ij2), cmi_ji+cmi_ij, atol=1e-5), + msg="Discrete conditional mutual information computation is not symmetric") diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 14214cc..cfe4c49 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -123,9 +123,13 @@ def test_pipeline(self): clf.predict(X) def test_alpha_0(self): - cqm = SelectFromQuadraticModel.correlation_cqm(self.X, self.y, num_features=3, alpha=0) + cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=0) self.assertTrue(not any(cqm.objective.linear.values())) + def test_alpha_1_no_quadratic(self): + cqm = SelectFromQuadraticModel().correlation_cqm(X=self.X, y=self.y, num_features=3, alpha=1) + self.assertTrue(not any(cqm.objective.quadratic.values())) + def test_alpha_1(self): rng = np.random.default_rng(42) @@ -179,7 +183,7 @@ def test_fixed_column(self): X[:, 1] = 0 X[:, 5] = 1 - cqm = SelectFromQuadraticModel.correlation_cqm(X, self.y, alpha=.5, num_features=5) + cqm = SelectFromQuadraticModel().correlation_cqm(X=X, y=self.y, alpha=.5, num_features=5) # in this case the linear bias for those two columns should be 0 self.assertEqual(cqm.objective.linear[1], 0) diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 107f4d8..5b469e5 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -45,7 +45,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import os.path import tempfile import unittest
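
Usage sketch (not part of the patch): the snippet below shows one way the mutual-information path added in this diff might be exercised without contacting a solver, by computing the interaction matrix with estimate_mi_matrix and building the CQM with SelectFromQuadraticModel.mutual_information_cqm. The synthetic dataset and printed summary are illustrative assumptions only; solving the CQM (for example through SelectFromQuadraticModel.fit) additionally requires access to a Leap hybrid CQM sampler and is not shown.

import numpy as np

from dwave.plugins.sklearn.transformers import SelectFromQuadraticModel
from dwave.plugins.sklearn._conditional_mutual_info import estimate_mi_matrix

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 6))               # six continuous features (synthetic)
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)   # discrete (binary) target

# Diagonal holds I(x_i; y); off-diagonal terms hold the symmetrized
# conditional mutual information (I(x_i; y | x_j) + I(x_j; y | x_i)) / 2.
mi = estimate_mi_matrix(X, y, discrete_target=True, conditional=True, random_state=0)
print(mi.shape)  # (6, 6)

# Build the constrained quadratic model locally; fitting with
# SelectFromQuadraticModel(method="mutual_information").fit(X, y, ...) would
# additionally submit it to a LeapHybridCQMSampler.
selector = SelectFromQuadraticModel(method="mutual_information", num_features=3)
cqm = selector.mutual_information_cqm(X, y, num_features=3, discrete_target=True)
print(len(cqm.variables), "binary variables,", len(cqm.constraints), "constraint")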