From 5b0812bc4b55c8efc327b2cdfea23b688a6e4691 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Sun, 19 Oct 2025 16:38:05 +0800 Subject: [PATCH 1/4] fix categorical encoding in sequential imputation --- changelog_entry.yaml | 4 + microimpute/models/qrf.py | 208 ++++++++++++++++++++++++---------- tests/test_models/test_qrf.py | 100 ++++++++++++++++ 3 files changed, 255 insertions(+), 57 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29..3f455a0 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Fixed QRF to encode categorical imputed variables that become predictors correctly. diff --git a/microimpute/models/qrf.py b/microimpute/models/qrf.py index 0f8a4c7..0c0eddd 100644 --- a/microimpute/models/qrf.py +++ b/microimpute/models/qrf.py @@ -249,6 +249,49 @@ def __init__( self.constant_targets = constant_targets or {} self.dummy_processor = dummy_processor + def _get_encoded_predictors( + self, current_predictors: List[str] + ) -> List[str]: + """Get properly encoded predictor columns for sequential imputation. + + Args: + current_predictors: List of predictor variable names + + Returns: + List of encoded predictor column names + """ + if self.dummy_processor: + return self.dummy_processor.get_sequential_predictor_columns( + current_predictors + ) + else: + return current_predictors + + def _encode_imputed_variable( + self, data: pd.DataFrame, variable: str + ) -> pd.DataFrame: + """Encode a categorical imputed variable for use as predictor in subsequent iterations. + + Args: + data: DataFrame containing the imputed variable + variable: Name of the variable that was just imputed + + Returns: + DataFrame with encoded variable (adds dummy columns if categorical) + """ + if ( + self.dummy_processor + and variable in self.dummy_processor.imputed_var_dummy_mapping + ): + data = self.dummy_processor.sequential_imputed_predictor_encoding( + data, variable + ) + self.logger.debug( + f" Encoded '{variable}' for use in sequential imputation" + ) + + return data + @validate_call(config=VALIDATE_CONFIG) def _predict( self, @@ -318,12 +361,9 @@ def _predict( ) # Get properly encoded predictor columns - if self.dummy_processor: - encoded_predictors = self.dummy_processor.get_sequential_predictor_columns( - var_predictors - ) - else: - encoded_predictors = var_predictors + encoded_predictors = self._get_encoded_predictors( + var_predictors + ) self.logger.debug( f"var_predictors for {variable}: {var_predictors}" @@ -390,15 +430,16 @@ def _predict( X_test_augmented[variable] = imputed_values # Encode categorical/boolean imputed variable for next iteration + X_test_augmented = self._encode_imputed_variable( + X_test_augmented, variable + ) + + # Track the dummy columns that were added if ( self.dummy_processor and variable in self.dummy_processor.imputed_var_dummy_mapping ): - X_test_augmented = self.dummy_processor.sequential_imputed_predictor_encoding( - X_test_augmented, variable - ) - # Track the dummy columns that were added var_info = ( self.dummy_processor.imputed_var_dummy_mapping[ variable @@ -488,6 +529,68 @@ def __init__( f"Batch processing enabled with batch_size={batch_size}" ) + def _get_encoded_predictors( + self, + current_predictors: List[str], + dummy_processor: Optional[Any] = None, + ) -> List[str]: + """Get properly encoded predictor columns for sequential imputation. 
+ + This helper ensures consistent encoding of categorical predictors across + all code paths (batch, non-batch, hyperparameter tuning, etc.). + + Args: + current_predictors: List of predictor variable names + dummy_processor: Optional DummyVariableProcessor instance + + Returns: + List of encoded predictor column names + """ + if dummy_processor is None: + dummy_processor = getattr(self, "dummy_processor", None) + + if dummy_processor: + return dummy_processor.get_sequential_predictor_columns( + current_predictors + ) + else: + return current_predictors + + def _encode_imputed_variable( + self, + data: pd.DataFrame, + variable: str, + dummy_processor: Optional[Any] = None, + ) -> pd.DataFrame: + """Encode a categorical imputed variable for use as predictor in subsequent iterations. + + This helper ensures consistent encoding of imputed categorical variables + across all code paths. + + Args: + data: DataFrame containing the imputed variable + variable: Name of the variable that was just imputed + dummy_processor: Optional DummyVariableProcessor instance + + Returns: + DataFrame with encoded variable (adds dummy columns if categorical) + """ + if dummy_processor is None: + dummy_processor = getattr(self, "dummy_processor", None) + + if ( + dummy_processor + and variable in dummy_processor.imputed_var_dummy_mapping + ): + data = dummy_processor.sequential_imputed_predictor_encoding( + data, variable + ) + self.logger.debug( + f" Encoded '{variable}' for use in sequential imputation" + ) + + return data + def _create_model_for_variable(self, variable: str, **kwargs) -> Any: """Create the appropriate model (classifier or regressor) based on variable type.""" categorical_targets = getattr(self, "categorical_targets", {}) @@ -681,12 +784,9 @@ def _fit( dummy_processor = getattr( self, "dummy_processor", None ) - if dummy_processor: - encoded_predictors = dummy_processor.get_sequential_predictor_columns( - current_predictors - ) - else: - encoded_predictors = current_predictors + encoded_predictors = self._get_encoded_predictors( + current_predictors, dummy_processor + ) # Log detailed pre-imputation information self.logger.info( @@ -734,17 +834,9 @@ def _fit( self.models[variable] = model # Encode categorical/boolean imputed variable for next iteration - if ( - dummy_processor - and variable - in dummy_processor.imputed_var_dummy_mapping - ): - X_train = dummy_processor.sequential_imputed_predictor_encoding( - X_train, variable - ) - self.logger.debug( - f" Encoded '{variable}' for use in sequential imputation" - ) + X_train = self._encode_imputed_variable( + X_train, variable, dummy_processor + ) except Exception as e: self.logger.error( @@ -858,12 +950,20 @@ def _fit( predictors, imputed_variables, i ) + # Get properly encoded predictor columns + dummy_processor = getattr( + self, "dummy_processor", None + ) + encoded_predictors = self._get_encoded_predictors( + current_predictors, dummy_processor + ) + # Log detailed pre-imputation information self.logger.info( f"[{i+1}/{len(imputed_variables)}] Starting imputation for '{variable}'" ) self.logger.info( - f" Features: {len(current_predictors)} predictors" + f" Features: {len(encoded_predictors)} predictors" ) self.logger.info( f" Memory usage: {self._get_memory_usage_info()}" @@ -875,7 +975,7 @@ def _fit( try: self._fit_model( model, - X_train[current_predictors], + X_train[encoded_predictors], X_train[variable], variable, **qrf_kwargs, @@ -903,6 +1003,11 @@ def _fit( self.models[variable] = model + # Encode categorical/boolean imputed 
variable for next iteration + X_train = self._encode_imputed_variable( + X_train, variable, dummy_processor + ) + except Exception as e: self.logger.error( f" ✗ Failed: {variable} - {str(e)}" @@ -1107,14 +1212,9 @@ def objective(trial: optuna.Trial) -> float: # Get properly encoded predictor columns dummy_processor = getattr(self, "dummy_processor", None) - if dummy_processor: - encoded_predictors = ( - dummy_processor.get_sequential_predictor_columns( - current_predictors - ) - ) - else: - encoded_predictors = current_predictors + encoded_predictors = self._get_encoded_predictors( + current_predictors, dummy_processor + ) # Only fit and evaluate numeric variables if var in numeric_vars: @@ -1159,12 +1259,12 @@ def objective(trial: optuna.Trial) -> float: var_errors.append(normalized_loss) else: # Categorical variable - encode it for use as predictor in next iterations - if dummy_processor and var in X_train_fold.columns: - X_train_augmented = dummy_processor.sequential_imputed_predictor_encoding( - X_train_augmented, var + if var in X_train_fold.columns: + X_train_augmented = self._encode_imputed_variable( + X_train_augmented, var, dummy_processor ) - X_val_augmented = dummy_processor.sequential_imputed_predictor_encoding( - X_val_augmented, var + X_val_augmented = self._encode_imputed_variable( + X_val_augmented, var, dummy_processor ) # Average across variables for this fold @@ -1277,14 +1377,9 @@ def objective(trial: optuna.Trial) -> float: # Get properly encoded predictor columns dummy_processor = getattr(self, "dummy_processor", None) - if dummy_processor: - encoded_predictors = ( - dummy_processor.get_sequential_predictor_columns( - current_predictors - ) - ) - else: - encoded_predictors = current_predictors + encoded_predictors = self._get_encoded_predictors( + current_predictors, dummy_processor + ) # Only fit and evaluate categorical/boolean variables if var in categorical_vars: @@ -1353,13 +1448,12 @@ def objective(trial: optuna.Trial) -> float: var_errors.append(log_loss_value) # Encode the categorical variable for use as predictor in next iterations - if dummy_processor: - X_train_augmented = dummy_processor.sequential_imputed_predictor_encoding( - X_train_augmented, var - ) - X_val_augmented = dummy_processor.sequential_imputed_predictor_encoding( - X_val_augmented, var - ) + X_train_augmented = self._encode_imputed_variable( + X_train_augmented, var, dummy_processor + ) + X_val_augmented = self._encode_imputed_variable( + X_val_augmented, var, dummy_processor + ) else: # Numeric variable - just add it to augmented data (already there from fold data) pass diff --git a/tests/test_models/test_qrf.py b/tests/test_models/test_qrf.py index 63563c4..23df89c 100644 --- a/tests/test_models/test_qrf.py +++ b/tests/test_models/test_qrf.py @@ -964,6 +964,106 @@ def test_qrf_hyperparameter_tuning_with_cv_folds() -> None: model.logger.removeHandler(handler) +def test_qrf_sequential_imputation_discrete_numeric_categorical() -> None: + """Test sequential imputation with discrete numeric values (0-6) treated as numeric_categorical. + + This test specifically validates the fix for GitHub issue #133 where dummy encoding + during sequential imputation was failing for discrete numeric variables. 
+ """ + np.random.seed(42) + + # Create DONOR dataset with discrete values 0-6 + n_samples_donor = 200 + + # Create three variables with discrete values from 0 to 6 + # These will be detected as numeric_categorical due to low unique count and equal spacing + base = np.random.choice([0, 1, 2, 3, 4, 5, 6], size=n_samples_donor) + + var1 = base + np.random.choice([-1, 0, 1], size=n_samples_donor) + var1 = np.clip(var1, 0, 6).astype(float) + + var2 = base + np.random.choice([-1, 0, 1], size=n_samples_donor) + var2 = np.clip(var2, 0, 6).astype(float) + + var3 = base + np.random.choice([-1, 0, 1], size=n_samples_donor) + var3 = np.clip(var3, 0, 6).astype(float) + + # Create predictor features + feature1 = np.random.randn(n_samples_donor) + feature2 = np.random.randn(n_samples_donor) * 2 + 1 + feature3 = np.random.choice([0, 1], size=n_samples_donor) + + # Create DONOR DataFrame + donor_df = pd.DataFrame( + { + "var1": var1, + "var2": var2, + "var3": var3, + "feature1": feature1, + "feature2": feature2, + "feature3": feature3, + } + ) + + # Create RECEIVER dataset (missing var1, var2, var3 entirely) + n_samples_receiver = 50 + receiver_df = pd.DataFrame( + { + "feature1": np.random.randn(n_samples_receiver), + "feature2": np.random.randn(n_samples_receiver) * 2 + 1, + "feature3": np.random.choice([0, 1], size=n_samples_receiver), + } + ) + + # Initialize and fit QRF model + model = QRF(log_level="WARNING") + + predictors = ["feature1", "feature2", "feature3"] + imputed_variables = ["var1", "var2", "var3"] + + # This should work without errors after the fix for issue #133 + fitted_model = model.fit( + X_train=donor_df, + predictors=predictors, + imputed_variables=imputed_variables, + n_estimators=30, + random_state=42, + ) + + # Predict should also work without dummy encoding errors + imputed_vars_df = fitted_model.predict(receiver_df) + + # Validate results + assert isinstance(imputed_vars_df, pd.DataFrame) + assert imputed_vars_df.shape == (n_samples_receiver, 3) + assert set(imputed_vars_df.columns) == {"var1", "var2", "var3"} + + # Check that all imputed values are within the expected range [0, 6] + for col in ["var1", "var2", "var3"]: + assert imputed_vars_df[col].min() >= 0, f"{col} has values below 0" + assert imputed_vars_df[col].max() <= 6, f"{col} has values above 6" + + # Test that sequential imputation is actually happening + # var2 and var3 should use var1 as a predictor + # Fit a model with just var1 to compare + single_model = QRF(log_level="WARNING") + single_fitted = single_model.fit( + X_train=donor_df, + predictors=predictors, + imputed_variables=["var1"], + n_estimators=30, + random_state=42, + ) + + single_pred = single_fitted.predict(receiver_df) + + # var1 predictions should be the same in both models + # (since var1 is imputed first in both cases with same predictors) + assert np.allclose( + imputed_vars_df["var1"].values, single_pred["var1"].values, rtol=1e-5 + ), "First variable in sequential should match single imputation" + + def test_qrf_hyperparameter_tuning_improves_performance() -> None: """Test that tuned hyperparameters perform better than untuned model.""" from microimpute.comparisons.metrics import compute_loss From e06ad915ffe3f4a04b28b3c407b3ef6ef94e0706 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Sun, 19 Oct 2025 17:02:59 +0800 Subject: [PATCH 2/4] add not_numeric_categorical parameter --- changelog_entry.yaml | 1 + microimpute/models/imputer.py | 35 ++++++-- microimpute/models/qrf.py | 7 +- microimpute/utils/type_handling.py | 47 +++++++++- 
tests/test_models/test_qrf.py | 133 +++++++++++++++++++++++++++++ 5 files changed, 213 insertions(+), 10 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 3f455a0..87e5f57 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -2,3 +2,4 @@ changes: fixed: - Fixed QRF to encode categorical imputed variables that become predictors correctly. + - Added `not_numeric_categorical` parameter to control whether discrete numeric variables are treated as categorical. diff --git a/microimpute/models/imputer.py b/microimpute/models/imputer.py index e59e4a2..193c57d 100644 --- a/microimpute/models/imputer.py +++ b/microimpute/models/imputer.py @@ -104,15 +104,22 @@ def _validate_data(self, data: pd.DataFrame, columns: List[str]) -> None: ) def identify_target_types( - self, data: pd.DataFrame, imputed_variables: List[str] + self, + data: pd.DataFrame, + imputed_variables: List[str], + not_numeric_categorical: Optional[List[str]] = None, ) -> None: """Identify and track variable types for imputation targets. Args: data: DataFrame containing the data. imputed_variables: List of variables to be imputed. + not_numeric_categorical: Optional list of variable names that should + be treated as numeric even if they would normally be detected as + numeric_categorical. """ detector = VariableTypeDetector() + not_numeric_categorical = not_numeric_categorical or [] for var in imputed_variables: if var not in data.columns: @@ -133,7 +140,10 @@ def identify_target_types( continue var_type, categories = detector.categorize_variable( - data[var], var, self.logger + data[var], + var, + self.logger, + force_numeric=(var in not_numeric_categorical), ) if var_type == "bool": @@ -163,6 +173,7 @@ def preprocess_data_types( data: pd.DataFrame, predictors: List[str], imputed_variables: List[str], + not_numeric_categorical: Optional[List[str]] = None, ) -> Tuple[pd.DataFrame, List[str], List[str], Dict[str, Any]]: """Preprocess predictors only - convert categorical predictors to dummies. Imputation targets remain in original form for classification. @@ -171,6 +182,9 @@ def preprocess_data_types( data: DataFrame containing the data. predictors: List of predictor column names. imputed_variables: List of variables to impute (kept in original form). + not_numeric_categorical: Optional list of variable names that should + be treated as numeric even if they would normally be detected as + numeric_categorical. Returns: Tuple of (processed_data, updated_predictors, imputed_variables, empty_dict) @@ -182,7 +196,10 @@ def preprocess_data_types( processor = DummyVariableProcessor(self.logger) processed_data, updated_predictors = ( processor.preprocess_predictors( - data, predictors, imputed_variables + data, + predictors, + imputed_variables, + not_numeric_categorical, ) ) @@ -204,6 +221,7 @@ def fit( imputed_variables: List[str], weight_col: Optional[Union[str, np.ndarray, pd.Series]] = None, skip_missing: bool = False, + not_numeric_categorical: Optional[List[str]] = None, **kwargs: Any, ) -> Any: # Returns ImputerResults """Fit the model to the training data. @@ -214,6 +232,9 @@ def fit( imputed_variables: List of column names to impute. weight_col: Optional name of the column or column array/series containing sampling weights. When provided, `X_train` will be sampled with replacement using this column as selection probabilities before fitting the model. skip_missing: If True, skip variables missing from training data with warning. If False, raise error for missing variables. 
+ not_numeric_categorical: Optional list of variable names that should + be treated as numeric even if they would normally be detected as + numeric_categorical. **kwargs: Additional model-specific parameters. Returns: @@ -261,10 +282,14 @@ def fit( raise ValueError("Weights must be positive") # Identify target types BEFORE preprocessing - self.identify_target_types(X_train, imputed_variables) + self.identify_target_types( + X_train, imputed_variables, not_numeric_categorical + ) X_train, predictors, imputed_variables, imputed_vars_dummy_info = ( - self.preprocess_data_types(X_train, predictors, imputed_variables) + self.preprocess_data_types( + X_train, predictors, imputed_variables, not_numeric_categorical + ) ) if weights is not None: diff --git a/microimpute/models/qrf.py b/microimpute/models/qrf.py index 0c0eddd..2154f51 100644 --- a/microimpute/models/qrf.py +++ b/microimpute/models/qrf.py @@ -158,9 +158,14 @@ def fit(self, X: pd.DataFrame, y: pd.Series, **qrf_kwargs: Any) -> None: """ self.output_column = y.name + # Remove random_state from kwargs if present, since we set it explicitly + qrf_kwargs_filtered = { + k: v for k, v in qrf_kwargs.items() if k != "random_state" + } + # Create and fit model self.qrf = RandomForestQuantileRegressor( - random_state=self.seed, **qrf_kwargs + random_state=self.seed, **qrf_kwargs_filtered ) self.qrf.fit(X, y.values.ravel()) diff --git a/microimpute/utils/type_handling.py b/microimpute/utils/type_handling.py index b22f3a7..243b597 100644 --- a/microimpute/utils/type_handling.py +++ b/microimpute/utils/type_handling.py @@ -57,11 +57,21 @@ def is_numeric_categorical_variable( @staticmethod def categorize_variable( - series: pd.Series, col_name: str, logger: logging.Logger + series: pd.Series, + col_name: str, + logger: logging.Logger, + force_numeric: bool = False, ) -> Tuple[str, Optional[List]]: """ Categorize a variable and return its type and categories if applicable. + Args: + series: The data series to categorize + col_name: Name of the column for logging + logger: Logger instance + force_numeric: If True, treat the variable as numeric even if it + would normally be detected as numeric_categorical + Returns: Tuple of (variable_type, categories) variable_type: 'bool', 'categorical', 'numeric_categorical', or 'numeric' @@ -73,13 +83,26 @@ def categorize_variable( if VariableTypeDetector.is_categorical_variable(series): return "categorical", series.unique().tolist() - if VariableTypeDetector.is_numeric_categorical_variable(series): + # Check if it would normally be numeric_categorical + if ( + not force_numeric + and VariableTypeDetector.is_numeric_categorical_variable(series) + ): categories = [float(i) for i in series.unique().tolist()] logger.info( f"Treating numeric variable '{col_name}' as categorical due to low unique count and equal spacing" ) return "numeric_categorical", categories + # If force_numeric is True or it's not numeric_categorical, treat as numeric + if ( + force_numeric + and VariableTypeDetector.is_numeric_categorical_variable(series) + ): + logger.info( + f"Variable '{col_name}' forced to be treated as numeric (override numeric_categorical detection)" + ) + return "numeric", None @@ -98,6 +121,7 @@ def preprocess_predictors( data: pd.DataFrame, predictors: List[str], imputed_variables: List[str], + not_numeric_categorical: Optional[List[str]] = None, ) -> Tuple[pd.DataFrame, List[str]]: """ Process predictor variables and pre-compute dummy encodings. 
@@ -105,6 +129,14 @@ def preprocess_predictors( For predictors: converts categoricals to dummies and adds to dataframe. For imputed_variables: pre-computes dummy info but keeps original form. + Args: + data: DataFrame containing the data + predictors: List of predictor column names + imputed_variables: List of variables to impute + not_numeric_categorical: Optional list of variable names that should + be treated as numeric even if they would normally be detected as + numeric_categorical. + Returns: Tuple of (processed_data, updated_predictors) """ @@ -112,6 +144,7 @@ def preprocess_predictors( all_columns = list(set(predictors + imputed_variables)) data = data[all_columns].copy() detector = VariableTypeDetector() + not_numeric_categorical = not_numeric_categorical or [] # Identify categorical predictors (not imputed targets) categorical_predictors = [] @@ -119,7 +152,10 @@ def preprocess_predictors( if col not in data.columns: continue var_type, categories = detector.categorize_variable( - data[col], col, self.logger + data[col], + col, + self.logger, + force_numeric=(col in not_numeric_categorical), ) if var_type in ["categorical", "numeric_categorical"]: categorical_predictors.append(col) @@ -132,7 +168,10 @@ def preprocess_predictors( if col not in data.columns: continue var_type, categories = detector.categorize_variable( - data[col], col, self.logger + data[col], + col, + self.logger, + force_numeric=(col in not_numeric_categorical), ) if var_type in ["categorical", "numeric_categorical"]: # Create dummy columns to determine structure diff --git a/tests/test_models/test_qrf.py b/tests/test_models/test_qrf.py index 23df89c..2f26c52 100644 --- a/tests/test_models/test_qrf.py +++ b/tests/test_models/test_qrf.py @@ -1064,6 +1064,139 @@ def test_qrf_sequential_imputation_discrete_numeric_categorical() -> None: ), "First variable in sequential should match single imputation" +def test_qrf_not_numeric_categorical_override() -> None: + """Test that not_numeric_categorical parameter correctly overrides automatic detection. + + Variables with <10 unique equally-spaced values normally get treated as categorical, + but this parameter should force them to be treated as numeric. 
+ """ + np.random.seed(42) + n_samples = 200 + + # Create data with discrete values that would normally be treated as categorical + # var1 and var2 have values 0-5 (6 unique, equally spaced -> normally categorical) + # var3 is continuous + donor_df = pd.DataFrame( + { + "predictor1": np.random.randn(n_samples), + "predictor2": np.random.randn(n_samples), + "discrete_var1": np.random.choice( + [0, 1, 2, 3, 4, 5], n_samples + ).astype(float), + "discrete_var2": np.random.choice( + [0, 1, 2, 3, 4, 5], n_samples + ).astype(float), + "continuous_var": np.random.randn(n_samples) * 5 + 10, + } + ) + + # Create receiver dataset without the imputed variables + receiver_df = pd.DataFrame( + {"predictor1": np.random.randn(50), "predictor2": np.random.randn(50)} + ) + + # Test 1: Default behavior - discrete vars should be treated as categorical + model_default = QRF(log_level="WARNING") + fitted_default = model_default.fit( + X_train=donor_df, + predictors=["predictor1", "predictor2"], + imputed_variables=["discrete_var1", "discrete_var2", "continuous_var"], + n_estimators=20, + random_state=42, + ) + + # Check that discrete vars were treated as categorical + assert ( + "discrete_var1" in model_default.categorical_targets + ), "discrete_var1 should be categorical by default" + assert ( + "discrete_var2" in model_default.categorical_targets + ), "discrete_var2 should be categorical by default" + assert ( + "continuous_var" in model_default.numeric_targets + ), "continuous_var should be numeric" + + # Test 2: Override discrete_var1 to be numeric, keep discrete_var2 as categorical + model_override = QRF(log_level="WARNING") + fitted_override = model_override.fit( + X_train=donor_df, + predictors=["predictor1", "predictor2"], + imputed_variables=["discrete_var1", "discrete_var2", "continuous_var"], + not_numeric_categorical=[ + "discrete_var1" + ], # Force discrete_var1 to be numeric + n_estimators=20, + random_state=42, + ) + + # Check that discrete_var1 is now numeric, but discrete_var2 is still categorical + assert ( + "discrete_var1" not in model_override.categorical_targets + ), "discrete_var1 should NOT be categorical with override" + assert ( + "discrete_var1" in model_override.numeric_targets + ), "discrete_var1 should be numeric with override" + assert ( + "discrete_var2" in model_override.categorical_targets + ), "discrete_var2 should still be categorical" + assert ( + "continuous_var" in model_override.numeric_targets + ), "continuous_var should still be numeric" + + # Test 3: Predictions should work with both models + predictions_default = fitted_default.predict(receiver_df) + predictions_override = fitted_override.predict(receiver_df) + + assert predictions_default.shape == (50, 3) + assert predictions_override.shape == (50, 3) + + # discrete_var1 predictions should be different between models + # (one uses classification, one uses regression) + assert not np.allclose( + predictions_default["discrete_var1"].values, + predictions_override["discrete_var1"].values, + rtol=1e-5, + ), "discrete_var1 predictions should differ between categorical and numeric treatment" + + # Test 4: Override for predictors as well + # Create data where a predictor has discrete values + donor_df_pred = pd.DataFrame( + { + "discrete_predictor": np.random.choice( + [10, 20, 30, 40], n_samples + ).astype(float), + "normal_predictor": np.random.randn(n_samples), + "target": np.random.randn(n_samples) * 2 + 5, + } + ) + + receiver_df_pred = pd.DataFrame( + { + "discrete_predictor": np.random.choice( + [10, 20, 30, 40], 30 
+ ).astype(float), + "normal_predictor": np.random.randn(30), + } + ) + + model_pred = QRF(log_level="WARNING") + fitted_pred = model_pred.fit( + X_train=donor_df_pred, + predictors=["discrete_predictor", "normal_predictor"], + imputed_variables=["target"], + not_numeric_categorical=[ + "discrete_predictor" + ], # Force predictor to stay numeric + n_estimators=20, + random_state=42, + ) + + # The model should still work and predict + predictions_pred = fitted_pred.predict(receiver_df_pred) + assert predictions_pred.shape == (30, 1) + assert "target" in predictions_pred.columns + + def test_qrf_hyperparameter_tuning_improves_performance() -> None: """Test that tuned hyperparameters perform better than untuned model.""" from microimpute.comparisons.metrics import compute_loss From 37b2037c91625c69ed39de85920de1018d7f2244 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Sun, 19 Oct 2025 17:42:46 +0800 Subject: [PATCH 3/4] implement kl-divergence --- changelog_entry.yaml | 1 + .../benchmarking-methods.ipynb | 19 ++++- microimpute/__init__.py | 3 + microimpute/comparisons/__init__.py | 2 + microimpute/comparisons/metrics.py | 68 ++++++++++------- pyproject.toml | 2 +- tests/test_metrics.py | 74 +++++++++++-------- 7 files changed, 110 insertions(+), 59 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 87e5f57..657dab7 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -3,3 +3,4 @@ fixed: - Fixed QRF to encode categorical imputed variables that become predictors correctly. - Added `not_numeric_categorical` parameter to control whether discrete numeric variables are treated as categorical. + - Replaced Total Variation Distance with KL-divergence. diff --git a/docs/imputation-benchmarking/benchmarking-methods.ipynb b/docs/imputation-benchmarking/benchmarking-methods.ipynb index 33b18c9..3ff1e54 100644 --- a/docs/imputation-benchmarking/benchmarking-methods.ipynb +++ b/docs/imputation-benchmarking/benchmarking-methods.ipynb @@ -3301,9 +3301,11 @@ "\n", "### Distribution similarity metrics\n", "\n", - "The `compare_distributions()` function evaluates how well the imputed values preserve the distributional characteristics of the original data. It uses the Wasserstein distance (also known as Earth Mover's Distance) to quantify the difference between the distribution of imputed values and the true distribution.\n", + "The `compare_distributions()` function evaluates how well the imputed values preserve the distributional characteristics of the original data. It automatically selects the appropriate metric based on the variable type: Wasserstein distance for continuous numerical variables and Kullback-Leibler (KL) divergence for discrete categorical and boolean variables.\n", "\n", - "The Wasserstein distance between two probability distributions $P$ and $Q$ is defined as:\n", + "#### Wasserstein distance for numerical variables\n", + "\n", + "For continuous numerical variables, the framework uses the Wasserstein distance (also known as Earth Mover's Distance) to quantify the difference between distributions. The Wasserstein distance between two probability distributions $P$ and $Q$ is defined as:\n", "\n", "$$W_p(P, Q) = \\left(\\inf_{\\gamma \\in \\Pi(P, Q)} \\int_{X \\times Y} d(x, y)^p d\\gamma(x, y)\\right)^{1/p}$$\n", "\n", @@ -3311,6 +3313,19 @@ "\n", "The Wasserstein distance measures the minimum \"work\" required to transform one distribution into another, where work is defined as the amount of distribution mass moved times the distance it's moved. 
Lower values indicate better preservation of the original distribution's shape. In the SCF example, QRF shows the lowest Wasserstein distance (1.2e7), indicating it best preserves the distribution of net worth values, while QuantReg shows the highest distance (2.8e7), suggesting greater distributional distortion.\n", "\n", + "#### Kullback-Leibler divergence for categorical and boolean variables\n", + "\n", + "For discrete distributions (categorical and boolean variables), the framework employs KL divergence, an information-theoretic measure that quantifies how one probability distribution diverges from a reference distribution. The KL divergence from distribution $Q$ to distribution $P$ is defined as:\n", + "\n", + "$$D_{KL}(P||Q) = \\sum_{x \\in \\mathcal{X}} P(x) \\log\\left(\\frac{P(x)}{Q(x)}\\right)$$\n", + "\n", + "where:\n", + "- $P$ is the reference distribution (original data)\n", + "- $Q$ is the approximation (imputed data)\n", + "- $\\mathcal{X}$ is the set of all possible categorical values\n", + "\n", + "In the context of imputation evaluation, KL divergence measures how much information is lost when using the imputed distribution $Q$ to approximate the true distribution $P$. Lower KL divergence values indicate better preservation of the original categorical distribution.\n", + "\n", "## Predictor analysis and sensitivity evaluation\n", "\n", "Beyond comparing imputation methods, understanding the relationship between predictors and target variables, as well as the sensitivity of imputation quality to predictor selection, provides crucial insights for model optimization and feature engineering.\n", diff --git a/microimpute/__init__.py b/microimpute/__init__.py index 0fc3f76..170ef61 100644 --- a/microimpute/__init__.py +++ b/microimpute/__init__.py @@ -27,11 +27,14 @@ # Import comparison and metric utilities from microimpute.comparisons.metrics import ( + compare_distributions, compare_metrics, compute_loss, get_metric_for_variable_type, + kl_divergence, log_loss, quantile_loss, + wasserstein_distance, ) # Import validation utilities diff --git a/microimpute/comparisons/__init__.py b/microimpute/comparisons/__init__.py index 27a1c59..c1d1bef 100644 --- a/microimpute/comparisons/__init__.py +++ b/microimpute/comparisons/__init__.py @@ -24,8 +24,10 @@ compare_metrics, compute_loss, get_metric_for_variable_type, + kl_divergence, log_loss, quantile_loss, + wasserstein_distance, ) # Import validation utilities diff --git a/microimpute/comparisons/metrics.py b/microimpute/comparisons/metrics.py index ad0bc42..15d1227 100644 --- a/microimpute/comparisons/metrics.py +++ b/microimpute/comparisons/metrics.py @@ -3,7 +3,7 @@ This module contains utilities for evaluating imputation quality using various metrics: - Quantile loss for numerical variables - Log loss for categorical variables -- Distributional similarity metrics (Wasserstein distance, Total Variation Distance) +- Distributional similarity metrics (Wasserstein distance, KL Divergence) The module automatically detects which metric to use based on variable type. 
""" @@ -13,6 +13,7 @@ import numpy as np import pandas as pd from pydantic import validate_call +from scipy.special import rel_entr from scipy.stats import wasserstein_distance from sklearn.metrics import log_loss as sklearn_log_loss @@ -495,25 +496,35 @@ def compare_metrics( raise RuntimeError(f"Failed to compare metrics: {str(e)}") from e -def total_variation_distance( +def kl_divergence( donor_values: np.ndarray, receiver_values: np.ndarray ) -> float: - """Calculate Total Variation Distance between two categorical distributions. + """Calculate Kullback-Leibler (KL) Divergence between two categorical distributions. - Total Variation Distance (TVD) measures the maximum difference between - two probability distributions. For categorical variables, it is calculated as: - TVD = 0.5 * sum(|P(x) - Q(x)|) for all categories x + KL divergence measures the difference between two probability distributions. + For categorical variables, it is calculated as: + KL(P||Q) = sum(P(x) * log(P(x) / Q(x))) for all categories x + + This implementation uses the donor distribution as P (reference) and + receiver distribution as Q (approximation), measuring how well the + receiver distribution approximates the donor distribution. Args: - donor_values: Array of categorical values from donor data. - receiver_values: Array of categorical values from receiver data. + donor_values: Array of categorical values from donor data (reference distribution P). + receiver_values: Array of categorical values from receiver data (approximation Q). Returns: - Total variation distance value between 0 and 1, where 0 indicates - identical distributions and 1 indicates completely disjoint distributions. + KL divergence value >= 0, where 0 indicates identical distributions + and larger values indicate greater divergence. Note: KL divergence is + unbounded and can be infinite if Q(x) = 0 for some x where P(x) > 0. Raises: ValueError: If inputs are empty or invalid. 
+
+    Note:
+        - KL divergence is not symmetric: KL(P||Q) != KL(Q||P)
+        - To handle zero probabilities, a small epsilon is added to avoid log(0)
+        - Uses scipy.special.rel_entr for numerical stability
     """
     if len(donor_values) == 0 or len(receiver_values) == 0:
         raise ValueError(
@@ -529,15 +540,22 @@
     donor_counts = pd.Series(donor_values).value_counts(normalize=True)
     receiver_counts = pd.Series(receiver_values).value_counts(normalize=True)
 
-    # Calculate TVD
-    tvd = 0.0
-    for category in all_categories:
-        p_donor = donor_counts.get(category, 0.0)
-        p_receiver = receiver_counts.get(category, 0.0)
-        tvd += abs(p_donor - p_receiver)
+    # Create probability arrays for all categories
+    p_donor = np.array([donor_counts.get(cat, 0.0) for cat in all_categories])
+    q_receiver = np.array(
+        [receiver_counts.get(cat, 0.0) for cat in all_categories]
+    )
 
-    # TVD is half the sum of absolute differences
-    return tvd / 2.0
+    # Add small epsilon to avoid log(0) and division by zero
+    epsilon = 1e-10
+    q_receiver = np.maximum(q_receiver, epsilon)
+
+    # Calculate KL divergence using scipy.special.rel_entr
+    # rel_entr(p, q) computes p * log(p / q) element-wise
+    kl_values = rel_entr(p_donor, q_receiver)
+
+    # Sum over all categories to get total KL divergence
+    return np.sum(kl_values)
 
 
 @validate_call(config=VALIDATE_CONFIG)
@@ -550,7 +568,7 @@ def compare_distributions(
 
     Evaluates distributional similarity using appropriate metrics:
     - Wasserstein Distance for numerical variables
-    - Total Variation Distance for categorical variables
+    - KL Divergence for categorical variables
 
     Args:
         donor_data: DataFrame containing original donor data.
@@ -575,7 +593,7 @@
         >>> print(result)
          Variable                Metric   Distance
       0   income  wasserstein_distance  66.666667
-      1   region  total_variation_distance 0.166667
+      1   region  kl_divergence 0.166667
     """
     try:
         log.info(
@@ -613,13 +631,11 @@
 
             # Choose appropriate metric
             if var_type in ["bool", "categorical", "numeric_categorical"]:
-                # Use Total Variation Distance for categorical
-                metric_name = "total_variation_distance"
-                distance = total_variation_distance(
-                    donor_values, receiver_values
-                )
+                # Use KL Divergence for categorical
+                metric_name = "kl_divergence"
+                distance = kl_divergence(donor_values, receiver_values)
                 log.debug(
-                    f"TVD for categorical variable '{var}': {distance:.6f}"
+                    f"KL divergence for categorical variable '{var}': {distance:.6f}"
                 )
             else:
                 # Use Wasserstein Distance for numerical
diff --git a/pyproject.toml b/pyproject.toml
index f8e7fb5..2e62945 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ dependencies = [
     "quantile-forest>=1.4.1,<1.5.1",
     "pydantic>=2.8.0,<3.0.0",
     "optuna>=4.3.0,<5.0.0",
-    "joblib>=1.5.1,<2.0.0",
+    "joblib>=1.5.1,<2.0.0",e
     "psutil",
 ]
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 6578aea..8b1ee62 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -16,8 +16,8 @@
     compare_distributions,
     compute_loss,
     get_metric_for_variable_type,
+    kl_divergence,
     log_loss,
-    total_variation_distance,
 )
 from microimpute.config import QUANTILES
 from microimpute.evaluations.cross_validation import cross_validate_model
@@ -1017,50 +1017,65 @@ def test_probability_ordering_with_real_model() -> None:
 
 # === Distribution Comparison Tests ===
 
-def test_total_variation_distance_identical() -> None:
-    """Test TVD between identical distributions."""
+def test_kl_divergence_identical() -> None:
+    """Test KL divergence between identical distributions."""
     values = 
np.array(["A", "B", "C", "A", "B", "C"]) - tvd = total_variation_distance(values, values) - assert tvd == 0.0, "TVD should be 0 for identical distributions" + kl = kl_divergence(values, values) + assert np.isclose( + kl, 0.0, atol=1e-10 + ), "KL divergence should be 0 for identical distributions" -def test_total_variation_distance_disjoint() -> None: - """Test TVD between completely disjoint distributions.""" +def test_kl_divergence_disjoint() -> None: + """Test KL divergence between completely disjoint distributions.""" donor = np.array(["A", "A", "A", "A"]) receiver = np.array(["B", "B", "B", "B"]) - tvd = total_variation_distance(donor, receiver) - assert tvd == 1.0, "TVD should be 1 for disjoint distributions" + kl = kl_divergence(donor, receiver) + # KL divergence should be very large (approaching infinity) for disjoint distributions + assert ( + kl > 10 + ), "KL divergence should be very large for disjoint distributions" -def test_total_variation_distance_partial_overlap() -> None: - """Test TVD with partial distribution overlap.""" +def test_kl_divergence_partial_overlap() -> None: + """Test KL divergence with partial distribution overlap.""" # Donor: 75% A, 25% B donor = np.array(["A", "A", "A", "B"]) # Receiver: 50% A, 50% B receiver = np.array(["A", "A", "B", "B"]) - tvd = total_variation_distance(donor, receiver) - # Expected: 0.5 * (|0.75-0.50| + |0.25-0.50|) = 0.5 * (0.25 + 0.25) = 0.25 - assert np.isclose(tvd, 0.25), f"Expected TVD ~0.25, got {tvd}" + kl = kl_divergence(donor, receiver) + # KL(P||Q) = 0.75*log(0.75/0.50) + 0.25*log(0.25/0.50) + # = 0.75*log(1.5) + 0.25*log(0.5) + # ≈ 0.75*0.405 + 0.25*(-0.693) + # ≈ 0.304 - 0.173 ≈ 0.131 + assert ( + kl > 0 + ), "KL divergence should be positive for different distributions" + assert ( + kl < 1 + ), f"KL divergence should be reasonable for similar distributions, got {kl}" -def test_total_variation_distance_different_categories() -> None: - """Test TVD when distributions have different category sets.""" +def test_kl_divergence_different_categories() -> None: + """Test KL divergence when distributions have different category sets.""" donor = np.array(["A", "B", "A", "B"]) receiver = np.array(["B", "C", "B", "C"]) - tvd = total_variation_distance(donor, receiver) + kl = kl_divergence(donor, receiver) # Donor: A=0.5, B=0.5, C=0.0 - # Receiver: A=0.0, B=0.5, C=0.5 - # TVD = 0.5 * (|0.5-0| + |0.5-0.5| + |0-0.5|) = 0.5 * (0.5 + 0 + 0.5) = 0.5 - assert np.isclose(tvd, 0.5), f"Expected TVD ~0.5, got {tvd}" + # Receiver: A=0.0+ε, B=0.5, C=0.5 (ε added to avoid log(0)) + # KL divergence should be large due to A having 0 probability in receiver + assert ( + kl > 5 + ), f"KL divergence should be large when categories are missing, got {kl}" -def test_total_variation_distance_empty_input() -> None: - """Test TVD raises error with empty inputs.""" +def test_kl_divergence_empty_input() -> None: + """Test KL divergence raises error with empty inputs.""" with pytest.raises(ValueError, match="non-empty"): - total_variation_distance(np.array([]), np.array(["A"])) + kl_divergence(np.array([]), np.array(["A"])) with pytest.raises(ValueError, match="non-empty"): - total_variation_distance(np.array(["A"]), np.array([])) + kl_divergence(np.array(["A"]), np.array([])) def test_compare_distributions_numerical_only() -> None: @@ -1124,12 +1139,11 @@ def test_compare_distributions_categorical_only() -> None: # Check structure assert len(results) == 2 - assert all(results["Metric"] == "total_variation_distance") + assert all(results["Metric"] == 
"kl_divergence") assert set(results["Variable"]) == {"region", "status"} - # TVD should be between 0 and 1 + # KL divergence should be non-negative (can be > 1) assert all(results["Distance"] >= 0) - assert all(results["Distance"] <= 1) def test_compare_distributions_mixed_types() -> None: @@ -1167,9 +1181,9 @@ def test_compare_distributions_mixed_types() -> None: numerical_vars = results[results["Metric"] == "wasserstein_distance"][ "Variable" ].tolist() - categorical_vars = results[ - results["Metric"] == "total_variation_distance" - ]["Variable"].tolist() + categorical_vars = results[results["Metric"] == "kl_divergence"][ + "Variable" + ].tolist() assert "income" in numerical_vars assert "age" in numerical_vars From fb5a5d6709a1c282307d4cc870341c1b57d3e45f Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Sun, 19 Oct 2025 17:45:50 +0800 Subject: [PATCH 4/4] update toml to avoid package versions being overwritten --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2e62945..a5a9f6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,10 +24,10 @@ dependencies = [ "requests>=2.32.0,<3.0.0", "tqdm>=4.65.0,<5.0.0", "statsmodels>=0.14.5,<0.16.0", - "quantile-forest>=1.4.1,<1.5.1", + "quantile-forest>=1.4.1,<1.5.0", "pydantic>=2.8.0,<3.0.0", "optuna>=4.3.0,<5.0.0", - "joblib>=1.5.1,<2.0.0",e + "joblib>=1.5.0,<2.0.0", "psutil", ]