From c9775b370b9760b80b824a6cff8da7bb1a102ed1 Mon Sep 17 00:00:00 2001
From: Sander Vanden Hautte <sander.vandenhautte@tobania.be>
Date: Fri, 8 Apr 2022 10:48:47 +0200
Subject: [PATCH 1/2] Support tuning the model's hyperparameters, instead of
 the default ones.

---
 cobra/model_building/models.py | 47 +++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py
index 3a921c0..23529a7 100644
--- a/cobra/model_building/models.py
+++ b/cobra/model_building/models.py
@@ -25,14 +25,30 @@ class LogisticRegressionModel:
         scikit-learn logistic regression model.
     predictors : list
         List of predictors used in the model.
+    kwargs: dict, optional
+        Pass a dictionary here (optional!), to override Cobra's default
+        choice of hyperparameter values for the scikit-learn
+        LogisticRegression model that is used behind the scenes. Our defaults
+        are: fit_intercept=True, C=1e9, solver='liblinear', random_state=42.
+        See scikit-learn's documentation of the possible hyperparameters and
+        values that can be set:
+        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
     """
 
-    def __init__(self):
-        self.logit = LogisticRegression(fit_intercept=True, C=1e9,
-                                        solver='liblinear', random_state=42)
+    def __init__(self, **kwargs):
+        # Initialize a scikit-learn logistic regression model,
+        # with custom arguments passed by the data scientist (if any),
+        # supplemented with Cobra's default arguments, if a custom value was
+        # not provided by the data scientist for overriding purposes:
+        default_kwargs = dict(fit_intercept=True, C=1e9, solver='liblinear',
+                              random_state=42)
+        for kwarg, val in default_kwargs.items():
+            if kwarg not in kwargs:
+                kwargs[kwarg] = val
+        self.logit = LogisticRegression(**kwargs)
+
         self._is_fitted = False
-        # placeholder to keep track of a list of predictors
-        self.predictors = []
+        self.predictors = []  # placeholder to keep track of a list of predictors
         self._eval_metrics_by_split = {}
 
     def serialize(self) -> dict:
@@ -258,10 +274,27 @@ class LinearRegressionModel:
         scikit-learn linear regression model.
     predictors : list
         List of predictors used in the model.
+    kwargs: dict, optional
+        Pass a dictionary here (optional!), to override Cobra's default
+        choice of hyperparameter values for the scikit-learn
+        LinearRegression model that is used behind the scenes. Our default
+        setting is only fit_intercept=True.
+        See scikit-learn's documentation of the possible hyperparameters and
+        values that can be set:
+        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
     """
 
-    def __init__(self):
-        self.linear = LinearRegression(fit_intercept=True)
+    def __init__(self, **kwargs):
+        # Initialize a scikit-learn linear regression model,
+        # with custom arguments passed by the data scientist (if any),
+        # supplemented with Cobra's default arguments, if a custom value was
+        # not provided by the data scientist for overriding purposes:
+        default_kwargs = dict(fit_intercept=True)
+        for kwarg, val in default_kwargs.items():
+            if kwarg not in kwargs:
+                kwargs[kwarg] = val
+        self.linear = LinearRegression(**kwargs)
+
         self._is_fitted = False
         self.predictors = []  # placeholder to keep track of a list of predictors
         self._eval_metrics_by_split = {}

From a2486c1da7a74c675c5ac29cccf1531b529c1703 Mon Sep 17 00:00:00 2001
From: Sander Vanden Hautte <sander.vandenhautte@tobania.be>
Date: Wed, 1 Jun 2022 16:14:55 +0200
Subject: [PATCH 2/2] Small improvements based on code review + fixes in
 forward selection for crashing unit tests.

---
 cobra/model_building/forward_selection.py | 16 +++++++++--
 cobra/model_building/models.py            | 33 ++++++++++++-----------
 2 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py
index 29e06b3..50961f9 100644
--- a/cobra/model_building/forward_selection.py
+++ b/cobra/model_building/forward_selection.py
@@ -31,6 +31,12 @@ class ForwardFeatureSelection:
         selection.
     pos_only : bool
         Whether or not the model coefficients should all be positive (no sign flips).
+    model_kwargs: dict, optional
+        An optional dictionary of hyperparameters and their values to
+        override the default hyperparameters that Cobra uses when
+        constructing the model during forward selection.
+        For more info, see the description of kwargs in the docstring
+        of the model that is used (e.g. LinearRegressionModel).
     self._fitted_models : list
         List of fitted models.
     """
@@ -38,7 +44,8 @@ class ForwardFeatureSelection:
     def __init__(self,
                  model_type: str="classification",
                  max_predictors: int=50,
-                 pos_only: bool=True):
+                 pos_only: bool=True,
+                 model_kwargs: Optional[dict]=None):
 
         self.model_type = model_type
         if model_type == "classification":
@@ -49,6 +56,8 @@ def __init__(self,
         self.max_predictors = max_predictors
         self.pos_only = pos_only
 
+        self.model_kwargs = model_kwargs
+
         self._fitted_models = []
 
     def get_model_from_step(self, step: int):
@@ -347,7 +356,10 @@ def _train_model(self, train_data: pd.DataFrame, target_column_name: str,
         self.MLModel
             Trained model.
         """
-        model = self.MLModel()
+        if self.model_kwargs is None:
+            model = self.MLModel()
+        else:
+            model = self.MLModel(**self.model_kwargs)
 
         model.fit(train_data[predictors], train_data[target_column_name])
 
diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py
index 23529a7..1977b54 100644
--- a/cobra/model_building/models.py
+++ b/cobra/model_building/models.py
@@ -28,8 +28,9 @@ class LogisticRegressionModel:
     kwargs: dict, optional
         Pass a dictionary here (optional!), to override Cobra's default
         choice of hyperparameter values for the scikit-learn
-        LogisticRegression model that is used behind the scenes. Our defaults
-        are: fit_intercept=True, C=1e9, solver='liblinear', random_state=42.
+        LogisticRegression model that is used behind the scenes.
+        Cobra's defaults are: fit_intercept=True, C=1e9, solver='liblinear',
+        random_state=42.
         See scikit-learn's documentation of the possible hyperparameters and
         values that can be set:
         https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
@@ -40,12 +41,10 @@ def __init__(self, **kwargs):
         # with custom arguments passed by the data scientist (if any),
         # supplemented with Cobra's default arguments, if a custom value was
         # not provided by the data scientist for overriding purposes:
-        default_kwargs = dict(fit_intercept=True, C=1e9, solver='liblinear',
+        model_kwargs = dict(fit_intercept=True, C=1e9, solver='liblinear',
                               random_state=42)
-        for kwarg, val in default_kwargs.items():
-            if kwarg not in kwargs:
-                kwargs[kwarg] = val
-        self.logit = LogisticRegression(**kwargs)
+        model_kwargs.update(kwargs)
+        self.logit = LogisticRegression(**model_kwargs)
 
         self._is_fitted = False
         self.predictors = []  # placeholder to keep track of a list of predictors
@@ -120,7 +119,12 @@ def get_intercept(self) -> float:
         float
             Intercept of the model.
         """
-        return self.logit.intercept_[0]
+        if self.logit.fit_intercept:
+            return self.logit.intercept_[0]
+        else:
+            raise ValueError("An intercept cannot be returned: this "
+                             "LogisticRegressionModel was created with "
+                             "the hyperparameter fit_intercept set to False.")
 
     def get_coef_by_predictor(self) -> dict:
         """Returns a dictionary mapping predictor (key) to coefficient (value).
@@ -277,8 +281,9 @@ class LinearRegressionModel:
     kwargs: dict, optional
         Pass a dictionary here (optional!), to override Cobra's default
         choice of hyperparameter values for the scikit-learn
-        LinearRegression model that is used behind the scenes. Our default
-        setting is only fit_intercept=True.
+        LinearRegression model that is used behind the scenes.
+        Cobra's only default setting is fit_intercept=True, but there are
+        other hyperparameters that can be set too.
         See scikit-learn's documentation of the possible hyperparameters and
         values that can be set:
         https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
@@ -289,11 +294,9 @@ def __init__(self, **kwargs):
         # with custom arguments passed by the data scientist (if any),
         # supplemented with Cobra's default arguments, if a custom value was
         # not provided by the data scientist for overriding purposes:
-        default_kwargs = dict(fit_intercept=True)
-        for kwarg, val in default_kwargs.items():
-            if kwarg not in kwargs:
-                kwargs[kwarg] = val
-        self.linear = LinearRegression(**kwargs)
+        model_kwargs = dict(fit_intercept=True)
+        model_kwargs.update(kwargs)
+        self.linear = LinearRegression(**model_kwargs)
 
         self._is_fitted = False
         self.predictors = []  # placeholder to keep track of a list of predictors