From c9775b370b9760b80b824a6cff8da7bb1a102ed1 Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Fri, 8 Apr 2022 10:48:47 +0200 Subject: [PATCH 1/2] Support tuning the model's hyperparameters, instead of the default ones. --- cobra/model_building/models.py | 47 +++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 3a921c0..23529a7 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -25,14 +25,30 @@ class LogisticRegressionModel: scikit-learn logistic regression model. predictors : list List of predictors used in the model. + kwargs: dict, optional + Pass a dictionary here (optional!), to override Cobra's default + choice of hyperparameter values for the scikit-learn + LogisticRegression model that is used behind the scenes. Our defaults + are: fit_intercept=True, C=1e9, solver='liblinear', random_state=42. + See scikit-learn's documentation of the possible hyperparameters and + values that can be set: + https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html """ - def __init__(self): - self.logit = LogisticRegression(fit_intercept=True, C=1e9, - solver='liblinear', random_state=42) + def __init__(self, **kwargs): + # Initialize a scikit-learn logistic regression model, + # with custom arguments passed by the data scientist (if any), + # supplemented with Cobra's default arguments, if a custom value was + # not provided by the data scientist for overriding purposes: + default_kwargs = dict(fit_intercept=True, C=1e9, solver='liblinear', + random_state=42) + for kwarg, val in default_kwargs.items(): + if kwarg not in kwargs: + kwargs[kwarg] = val + self.logit = LogisticRegression(**kwargs) + self._is_fitted = False - # placeholder to keep track of a list of predictors - self.predictors = [] + self.predictors = [] # placeholder to keep track of a list of predictors 
self._eval_metrics_by_split = {} def serialize(self) -> dict: @@ -258,10 +274,27 @@ class LinearRegressionModel: scikit-learn linear regression model. predictors : list List of predictors used in the model. + kwargs: dict, optional + Pass a dictionary here (optional!), to override Cobra's default + choice of hyperparameter values for the scikit-learn + LinearRegression model that is used behind the scenes. Our default + setting is only fit_intercept=True. + See scikit-learn's documentation of the possible hyperparameters and + values that can be set: + https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html """ - def __init__(self): - self.linear = LinearRegression(fit_intercept=True) + def __init__(self, **kwargs): + # Initialize a scikit-learn linear regression model, + # with custom arguments passed by the data scientist (if any), + # supplemented with Cobra's default arguments, if a custom value was + # not provided by the data scientist for overriding purposes: + default_kwargs = dict(fit_intercept=True) + for kwarg, val in default_kwargs.items(): + if kwarg not in kwargs: + kwargs[kwarg] = val + self.linear = LinearRegression(**kwargs) + self._is_fitted = False self.predictors = [] # placeholder to keep track of a list of predictors self._eval_metrics_by_split = {} From a2486c1da7a74c675c5ac29cccf1531b529c1703 Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Wed, 1 Jun 2022 16:14:55 +0200 Subject: [PATCH 2/2] Small improvements based on code review + fixes in forward selection for crashing unit tests. 
--- cobra/model_building/forward_selection.py | 16 +++++++++-- cobra/model_building/models.py | 33 ++++++++++++----------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 29e06b3..50961f9 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -31,6 +31,12 @@ class ForwardFeatureSelection: selection. pos_only : bool Whether or not the model coefficients should all be positive (no sign flips). + model_kwargs: dict, optional + An optional dictionary of hyperparameters and their values to + override the default hyperparameters that Cobra uses when + constructing the model during forward selection. + For more info, see the documentation of kwargs in the documentation + of the model that is used (e.g. LinearRegressionModel). self._fitted_models : list List of fitted models. """ @@ -38,7 +44,8 @@ class ForwardFeatureSelection: def __init__(self, model_type: str="classification", max_predictors: int=50, - pos_only: bool=True): + pos_only: bool=True, + model_kwargs: Optional[dict]=None): self.model_type = model_type if model_type == "classification": @@ -49,6 +56,8 @@ def __init__(self, self.max_predictors = max_predictors self.pos_only = pos_only + self.model_kwargs = model_kwargs + self._fitted_models = [] def get_model_from_step(self, step: int): @@ -347,7 +356,10 @@ def _train_model(self, train_data: pd.DataFrame, target_column_name: str, self.MLModel Trained model. 
""" - model = self.MLModel() + if self.model_kwargs is None: + model = self.MLModel() + else: + model = self.MLModel(**self.model_kwargs) model.fit(train_data[predictors], train_data[target_column_name]) diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 23529a7..1977b54 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -28,8 +28,9 @@ class LogisticRegressionModel: kwargs: dict, optional Pass a dictionary here (optional!), to override Cobra's default choice of hyperparameter values for the scikit-learn - LogisticRegression model that is used behind the scenes. Our defaults - are: fit_intercept=True, C=1e9, solver='liblinear', random_state=42. + LogisticRegression model that is used behind the scenes. + Cobra's defaults are: fit_intercept=True, C=1e9, solver='liblinear', + random_state=42. See scikit-learn's documentation of the possible hyperparameters and values that can be set: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html @@ -40,12 +41,10 @@ def __init__(self, **kwargs): # with custom arguments passed by the data scientist (if any), # supplemented with Cobra's default arguments, if a custom value was # not provided by the data scientist for overriding purposes: - default_kwargs = dict(fit_intercept=True, C=1e9, solver='liblinear', + model_kwargs = dict(fit_intercept=True, C=1e9, solver='liblinear', random_state=42) - for kwarg, val in default_kwargs.items(): - if kwarg not in kwargs: - kwargs[kwarg] = val - self.logit = LogisticRegression(**kwargs) + model_kwargs.update(kwargs) + self.logit = LogisticRegression(**model_kwargs) self._is_fitted = False self.predictors = [] # placeholder to keep track of a list of predictors @@ -120,7 +119,12 @@ def get_intercept(self) -> float: float Intercept of the model. 
""" - return self.logit.intercept_[0] + if self.logit.fit_intercept: + return self.logit.intercept_[0] + else: + raise ValueError("An intercept cannot be returned: this " + "LogisticRegressionModel was created with " + "the hyperparameter fit_intercept set to False.") def get_coef_by_predictor(self) -> dict: """Returns a dictionary mapping predictor (key) to coefficient (value). @@ -277,8 +281,9 @@ class LinearRegressionModel: kwargs: dict, optional Pass a dictionary here (optional!), to override Cobra's default choice of hyperparameter values for the scikit-learn - LinearRegression model that is used behind the scenes. Our default - setting is only fit_intercept=True. + LinearRegression model that is used behind the scenes. + Cobra's only default setting is fit_intercept=True, but there are + other hyperparameters that can be set too. See scikit-learn's documentation of the possible hyperparameters and values that can be set: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html @@ -289,11 +294,9 @@ def __init__(self, **kwargs): # with custom arguments passed by the data scientist (if any), # supplemented with Cobra's default arguments, if a custom value was # not provided by the data scientist for overriding purposes: - default_kwargs = dict(fit_intercept=True) - for kwarg, val in default_kwargs.items(): - if kwarg not in kwargs: - kwargs[kwarg] = val - self.linear = LinearRegression(**kwargs) + model_kwargs = dict(fit_intercept=True) + model_kwargs.update(kwargs) + self.linear = LinearRegression(**model_kwargs) self._is_fitted = False self.predictors = [] # placeholder to keep track of a list of predictors