Changes from all commits
83 commits
7ae25b5
Do experiments on large dataset (not clean)
vxbrandon Jul 28, 2022
50d03a4
Handling a few extreme cases!
vxbrandon Jul 29, 2022
f3f31ee
Run experiments on various real dataset.
vxbrandon Jul 29, 2022
6aa2e82
Large refactor. Moving sklearn and mnist datasets to data_loader in c…
Jul 29, 2022
333f0f5
Refactoring datasets in compare_budgets.py
Jul 29, 2022
a446934
Large cleanup: compare_runtimes.py and compare_budget.py now avoid du…
Jul 29, 2022
9031020
Passing dataset name to store profiles with dataset names and not ove…
Jul 29, 2022
47e84f6
Cleaning (deleting) logfiles
Jul 29, 2022
4234d4a
Changing num_seeds back to 5 for runtime and budget
Jul 29, 2022
8445992
Adding HOUSING dataset, where our approach doesnt work, and renaming …
Jul 29, 2022
be22771
Adding forest covertype dataset -- we outperform, 10s to 600s
Jul 30, 2022
ca2b07a
Trying to fix messed up merge
Jul 30, 2022
0a7d534
Adding npy file to gitignore
Jul 30, 2022
db64749
Preparing for experiments: deleting old logs, writing new runtime pro…
Jul 30, 2022
c1b1530
Storing data up to APS
Jul 30, 2022
9102cfc
Refactor codes and fix errors on scaling
vxbrandon Jul 30, 2022
f2551c7
Merge remote-tracking branch 'origin/dataset_experiments' into datase…
vxbrandon Jul 30, 2022
e1dfe41
Classification results look good -- can use these for Table 1. Unfort…
Jul 30, 2022
4dcebf8
Uncomment runtime comparision for regression and fix setting
vxbrandon Jul 30, 2022
56d54e0
results on AIR work with max_depth=3
Jul 30, 2022
b82fbdb
Running regression exps with max_depth=3 helps sklearn. Blog still do…
Jul 30, 2022
c35e903
Producing Tables 1 and 2 properly with produce_tables.py -- everythin…
Jul 30, 2022
373ddaa
Fixing errors with model names
Jul 30, 2022
6281d94
Doubling budgets and only running 1 seed to ensure baselines can trai…
Jul 30, 2022
51c36b3
Adding ability to form predictions from forests when no full trees we…
Jul 31, 2022
d42e14d
Figured out reasonable budgets for all exps. Running 5 seeds now. Als…
Jul 31, 2022
a4815ac
Addressing my own comments from review
Jul 31, 2022
4d152bc
Merge branch 'dataset_experiments' into new_scaling_exps
motiwari Jul 31, 2022
3afdeb6
Merge pull request #226 from ThrunGroup/new_scaling_exps
motiwari Jul 31, 2022
c2312f1
Add error bars on scaling exps. Add gpu dataset. Add vectorize_impuri…
vxbrandon Jul 31, 2022
f027bec
Run black
vxbrandon Jul 31, 2022
de9fea8
Delete data file to upload
vxbrandon Jul 31, 2022
df57afb
Delete data file to upload
vxbrandon Jul 31, 2022
c79f5aa
Turn of flag of preprocessing gpu file
vxbrandon Jul 31, 2022
b26514f
Adding budget experiment results
Jul 31, 2022
5bf6ff8
TAG: EXP_CLOSE. Merging Jeys data_experiments with mine. All runtime …
Jul 31, 2022
22c978a
TAG: EXP_OPEN, about to run runtime and budget exps for GPU dataset. …
Jul 31, 2022
7a1ed4d
Adding ALL experimental results including for new regression dataset.…
Jul 31, 2022
b218b06
Nits
Jul 31, 2022
57d8672
TAG: EXP_OPEN, attempting to improve Table 4 results by tweaking budg…
Jul 31, 2022
10c58ef
TAG: EXP_CLOSE for 1-seed improvements to Table 4. EXP_OPEN for 5-see…
Jul 31, 2022
a41119e
TAG: EXP_CLOSE, did 5 seeds for Table 4, looks much better now
Jul 31, 2022
5302bf6
Made necessary cosmetic changes to produce_tables.py. Use all of its …
Aug 1, 2022
bd23865
Nit
Aug 1, 2022
9f75e27
TAG: EXP_OPEN. Table 3 experiments, budget for classification, look e…
Aug 1, 2022
7111696
Adding small dataset exps for R3
Aug 1, 2022
edc32e1
Something is still wrong with Table 3. About to debug
Aug 1, 2022
3c06edc
TAG: EXP_OPEN. Fixing critical bug in classification budget to NOT ju…
Aug 1, 2022
4c7e5be
TAG: EXP_OPEN. Reducing budget by factor of 5 in FLIGHT
Aug 2, 2022
10afe2b
TAG: EXP_OPEN. Reducing budget by factor of 5 in COVTYPE
Aug 2, 2022
a569986
Rerunning exps with different max_depth with one seed to see perf diffs
Aug 2, 2022
cefa861
TAG: EXP_OPEN. Running final table 3
Aug 2, 2022
6cc0e27
Adding proper Table 3 results
Aug 2, 2022
940cf47
After an hour, get .8088 with sklearn RFC and .5964 with Hoeffding Tree
Aug 2, 2022
baed34f
Don't plot negative value for num queries
vxbrandon Aug 2, 2022
3591185
Merge remote-tracking branch 'origin/dataset_experiments' into datase…
vxbrandon Aug 2, 2022
da6745b
Recreating final Table 3
Aug 2, 2022
e8ab8ec
- Randomly rank the items that have same equal importance score
vxbrandon Sep 10, 2022
c4afa72
Clean up codes for reproducing the experiments
vxbrandon Sep 10, 2022
693319f
Merge branch 'reproducec_experiments' of github.com:ThrunGroup/FastFo…
Sep 13, 2022
db08c77
ran all experiments except gpu
Sep 14, 2022
2ea691b
ran all experiments except gpu
Sep 14, 2022
6852e88
Change hyperparams to reproduce experiments on Sklearn regression and…
vxbrandon Sep 15, 2022
b11563a
1. Change hyperparameters of budget experiments: increased max depth …
vxbrandon Sep 18, 2022
2021a91
Update log files by running bash repro_script.sh
vxbrandon Sep 18, 2022
6752c81
Uncomment comments for debugging/experiments
vxbrandon Sep 18, 2022
eacec31
Merge branch 'dataset_experiments' into reproduce_experiments
motiwari Sep 18, 2022
be8e2c7
Merge pull request #235 from ThrunGroup/reproduce_experiments
motiwari Sep 18, 2022
e7ba44f
1. Change back our algorithm to find the best arm not the 5th best arm
vxbrandon Sep 19, 2022
94cfad8
Changing mab solver to len(candidates) > 1
Oct 9, 2022
444bd83
Fixing repro script
Oct 9, 2022
aff2db8
Adding results, choosing final datasets
Oct 10, 2022
fcd420a
Fixing table-making code
Oct 10, 2022
c723f69
Adding tables
Oct 10, 2022
b4f87a5
Nit todo
Nov 8, 2022
8b232c8
Created results that show for depth=5, pull_each_arm=333, batch_size=…
Nov 8, 2022
da736ea
Need to sample randomly 1000 times for roughly equal performances, th…
Nov 9, 2022
d9723e7
Undoing some random solver changes
Dec 12, 2022
9bcba82
Merge pull request #242 from ThrunGroup/random_baseline
motiwari Dec 12, 2022
975e3f3
Small style changes to sklearn comparison
Dec 12, 2022
23e34e1
Fixing ERFR problem for sklearn by allowing it to look at all features
Dec 12, 2022
541a283
Making sure candidate conditions are fixed. About to rerun repro script
Dec 12, 2022
abac8c4
All results recreated. Table 1 multipliers are a little more modest w…
Dec 13, 2022
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,7 +1,11 @@
experiments/datasets/kdd98.npz.npy
fastforest.egg-info/
.DS_Store
.idea/
__pycache__/
experiments/.ipynb_checkpoints/*
experiments/mnist
experiments/.DS_Store
experiments/datasets/cup98*
experiments/datasets/gpu*
experiments/datasets/new_gpu*
62 changes: 42 additions & 20 deletions data_structures/forest_base.py
@@ -16,7 +16,7 @@
DEFAULT_MIN_IMPURITY_DECREASE,
BATCH_SIZE,
)
from utils.utils import data_to_discrete, set_seed, get_subset_2d
from utils.utils import data_to_discrete, set_seed, get_subset_2d, class_to_idx, counts_of_labels
from utils.boosting import get_next_targets
from data_structures.tree_classifier import TreeClassifier
from data_structures.tree_regressor import TreeRegressor
@@ -65,7 +65,7 @@ def __init__(
self.new_targets = labels

# self.curr_data and self.curr_targets are the data, targets that are used to fit the current tree.
# These attributes may be smaller than the original dataset size if self.bootstrap is true.
# These attributes may be smaller than the original datasets size if self.bootstrap is true.
self.curr_data = None
self.curr_targets = None
self.trees = []
@@ -169,12 +169,18 @@ def fit(self, data: np.ndarray = None, labels: np.ndarray = None) -> None:
self.data = data
self.org_targets = labels
self.new_targets = labels

N = len(self.data)
F = len(self.data[0])

if self.is_classification:
self.org_targets = self.org_targets.astype(np.int32)
self.new_targets = self.new_targets.astype(np.int32)
if self.classes is None:
self.classes: dict = class_to_idx(
np.unique(self.new_targets)
) # a dictionary that maps class name to class index
self.n_classes = len(self.classes)

if self.make_discrete:
self.discrete_features: DefaultDict = data_to_discrete(self.data, n=10)
@@ -289,6 +295,9 @@ def fit(self, data: np.ndarray = None, labels: np.ndarray = None) -> None:
):
self.trees.append(tree)
else:
# We have been unable to fit the tree within the budget.
# In this case, we do not add the tree to the list
# In the case of the very first tree not being able to be fit, this implies that self.trees = []
break

if self.boosting:
@@ -336,27 +345,40 @@ def predict(self, datapoint: np.ndarray) -> Union[Tuple[int, np.ndarray], float]
(Regressor) the averaged mean value of labels in each tree
"""
T = len(self.trees)
if self.is_classification:
agg_preds = np.empty((T, self.n_classes))
for tree_idx, tree in enumerate(self.trees):
# Average over predicted probabilities, not just hard labels
agg_preds[tree_idx] = tree.predict(datapoint)[1]
avg_preds = agg_preds.mean(axis=0)
label_pred = list(self.classes.keys())[avg_preds.argmax()]
return label_pred, avg_preds
if T == 0:
# We handle the case where no full trees were able to be split specially. In this case, we instantiate
# a stump (single root node) and predict the datapoint as the average (regression) or priors (classification).
N = len(self.data)
if self.is_classification:
# In the case of classification, we use the priors as the prediction
# WARNING: This may not work when the labels are not contiguous integers starting at 0. Fix after ddl.
# TODO(@motiwari): See warning above.
avg_preds = np.bincount(self.new_targets) / N
return np.argmax(avg_preds), avg_preds
else:
return np.mean(self.org_targets)
else:
if self.boosting:
if self.is_classification:
agg_preds = np.empty((T, self.n_classes))
for tree_idx, tree in enumerate(self.trees):
if tree_idx == 0:
agg_pred = tree.predict(datapoint)
else:
agg_pred += self.boosting_lr * tree.predict(datapoint)
return agg_pred
# Average over predicted probabilities, not just hard labels
agg_preds[tree_idx] = tree.predict(datapoint)[1]
avg_preds = agg_preds.mean(axis=0)
label_pred = list(self.classes.keys())[avg_preds.argmax()]
return label_pred, avg_preds
else:
agg_pred = np.empty(T)
for tree_idx, tree in enumerate(self.trees):
agg_pred[tree_idx] = tree.predict(datapoint)
return float(agg_pred.mean())
if self.boosting:
for tree_idx, tree in enumerate(self.trees):
if tree_idx == 0:
agg_pred = tree.predict(datapoint)
else:
agg_pred += self.boosting_lr * tree.predict(datapoint)
return agg_pred
else:
agg_pred = np.empty(T)
for tree_idx, tree in enumerate(self.trees):
agg_pred[tree_idx] = tree.predict(datapoint)
return float(agg_pred.mean())

def get_oob_score(self, data=None) -> float:
"""
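Note on the classification fallback above: `np.bincount(self.new_targets) / N` only yields valid class priors when the labels are contiguous integers starting at 0, as the in-code WARNING/TODO already flags. A minimal standalone sketch of the failure mode and one possible remapping fix (illustrative only, not the repository's code):

```python
import numpy as np

# Contiguous labels starting at 0: bincount length equals the number of classes.
labels = np.array([0, 1, 1, 2])
print(np.bincount(labels) / len(labels))        # [0.25 0.5  0.25]

# Non-contiguous labels: bincount pads with zeros up to max(labels) + 1,
# so the "priors" vector no longer lines up with self.classes.
labels = np.array([5, 7, 7])
print(np.bincount(labels) / len(labels))        # length 8, mostly zeros

# One possible fix, in the spirit of class_to_idx: remap labels to
# 0..n_classes-1 before counting.
classes, remapped = np.unique(labels, return_inverse=True)
priors = np.bincount(remapped, minlength=len(classes)) / len(labels)
print(dict(zip(classes.tolist(), priors)))      # {5: 0.333..., 7: 0.666...}
```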
9 changes: 2 additions & 7 deletions data_structures/forest_classifier.py
@@ -42,13 +42,8 @@ def __init__(
alpha_F: float = 1.0,
alpha_N: float = 1.0,
) -> None:
if classes is None:
self.classes: dict = class_to_idx(
np.unique(labels)
) # a dictionary that maps class name to class index
else:
self.classes = classes
self.n_classes = len(self.classes)

self.classes = classes
super().__init__(
data=data,
labels=labels,
8 changes: 4 additions & 4 deletions data_structures/histogram.py
@@ -3,7 +3,7 @@
import math
from typing import Any, Tuple

from utils.constants import LINEAR, DISCRETE, IDENTITY, RANDOM, DEFAULT_NUM_BINS, VECTORIZE
from utils.constants import LINEAR, DISCRETE, IDENTITY, RANDOM, DEFAULT_NUM_BINS, VECTORIZE_HISTOGRAM
from utils.utils_histogram import welford_variance_calc


@@ -109,8 +109,8 @@ def empty_samples(self, bin_idcs: np.ndarray, is_curr_empty: bool = True) -> None:

def add(self, X: np.ndarray, Y: np.ndarray):
"""
Given dataset X , add all the points in the dataset to the histogram.
:param X: dataset to be histogrammed (subset of original X, although could be the same size)
Given datasets X , add all the points in the datasets to the histogram.
:param X: datasets to be histogrammed (subset of original X, although could be the same size)
:return: None, but modify the histogram to include the relevant feature values
"""
feature_values = X[:, self.feature_idx]
@@ -126,7 +126,7 @@ def add(self, X: np.ndarray, Y: np.ndarray):
Y
), "Error: sample sizes and label sizes must be the same"
insert_idcs = self.get_bin(feature_values, self.bin_edges).astype("int64")
if VECTORIZE:
if VECTORIZE_HISTOGRAM:
new_Y = self.replace_array(Y, self.class_to_idx)
hist = np.zeros(
(self.left.shape[0] + 1, self.left.shape[1]), dtype=np.int64
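For context on the `VECTORIZE_HISTOGRAM` branch above: the idea is to accumulate per-bin, per-class counts in a single indexed addition rather than looping over samples. A minimal sketch of that pattern with illustrative names and shapes (the class's actual attributes and its `replace_array` helper may differ):

```python
import numpy as np

num_bins, num_classes = 4, 3
insert_idcs = np.array([0, 0, 2, 3, 3, 3])  # bin index of each sample
class_idcs = np.array([1, 1, 0, 2, 2, 1])   # integer class index of each sample

# Loop version: one increment per sample.
hist_loop = np.zeros((num_bins, num_classes), dtype=np.int64)
for b, c in zip(insert_idcs, class_idcs):
    hist_loop[b, c] += 1

# Vectorized version: accumulate all (bin, class) pairs at once.
# np.add.at is unbuffered, so repeated index pairs are counted correctly.
hist_vec = np.zeros((num_bins, num_classes), dtype=np.int64)
np.add.at(hist_vec, (insert_idcs, class_idcs), 1)

assert np.array_equal(hist_loop, hist_vec)
```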
19 changes: 17 additions & 2 deletions data_structures/node.py
@@ -7,7 +7,7 @@
from collections import defaultdict
from utils.utils import get_subset_2d

from utils.solvers import solve_mab, solve_exactly
from utils.solvers import solve_mab, solve_exactly, solve_randomly
from utils.utils import (
type_check,
counts_of_labels,
@@ -17,6 +17,7 @@
from utils.constants import (
MAB,
EXACT,
RANDOM_SOLVER,
GINI,
LINEAR,
DEFAULT_NUM_BINS,
@@ -49,9 +50,12 @@ def __init__(
# To decrease memory usage and cost for making a copy of large array, we don't pass data array to child node
# but indices
# The features aren't global to the tree, so we should be resampling the features at every node

# Note: To compare with random solver, change self.feature_subsampling to None
self.feature_idcs = choose_features(
self.tree.feature_idcs, self.feature_subsampling, self.tree.rng
)

if self.tree.discrete_features is not None:
self.discrete_features = remap_discrete_features(
self.feature_idcs, self.tree.discrete_features
@@ -149,6 +153,17 @@ def calculate_best_split(self, budget: int = None) -> Union[float, int]:
impurity_measure=self.criterion,
# NOTE: not implemented with budget yet
)
elif self.solver == RANDOM_SOLVER:
results = solve_randomly(
data=self.data,
labels=self.labels,
minmax=self.minmax,
discrete_bins_dict=self.discrete_features,
binning_type=self.bin_type,
num_bins=self.num_bins,
is_classification=self.is_classification,
impurity_measure=self.criterion,
)
else:
raise Exception("Invalid solver specified, must be MAB or EXACT")

@@ -165,7 +180,7 @@ def calculate_best_split(self, budget: int = None) -> Union[float, int]:
self.prev_split_feature = self.split_feature
self.split_feature = self.feature_idcs[
self.split_feature
] # Feature index of original dataset
] # Feature index of original datasets
self.split_reduction *= self.proportion # Normalize by number of datapoints
if self.verbose:
print("Calculated split with", self.num_queries, "queries")
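The new `RANDOM_SOLVER` branch delegates to `solve_randomly`, which is not shown in this diff. As a rough mental model only, a random-split baseline evaluates a single uniformly chosen (feature, threshold) pair instead of searching over all candidates; the sketch below assumes Gini impurity and is not the actual `utils/solvers.py` implementation:

```python
import numpy as np

def gini(labels: np.ndarray) -> float:
    """Gini impurity of a label vector."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - float(np.sum(p ** 2))

def random_split(data: np.ndarray, labels: np.ndarray, rng: np.random.Generator):
    """Evaluate one uniformly random (feature, threshold) split as a baseline."""
    n, num_features = data.shape
    feature = int(rng.integers(num_features))
    threshold = float(rng.choice(data[:, feature]))
    mask = data[:, feature] <= threshold
    left, right = labels[mask], labels[~mask]
    if len(left) == 0 or len(right) == 0:
        return None  # degenerate split; caller may retry or make the node a leaf
    reduction = (
        gini(labels)
        - (len(left) / n) * gini(left)
        - (len(right) / n) * gini(right)
    )
    return feature, threshold, reduction
```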
14 changes: 8 additions & 6 deletions data_structures/permutation.py
@@ -144,7 +144,7 @@ def get_importance_array(self) -> np.ndarray:
Trains all of its forests and computes the importance vector for each of the trained forests.
This is the main function that an object of this class will call.

:return: an array of importance vectors.
:return: an array of importance vectors. (i, j) = ith random forest, jth feature's importance score.
"""
self.is_train = True
self.train_forests()
@@ -229,11 +229,13 @@ def get_stability_freq(imp_data: np.ndarray, best_k_features: int) -> float:
), "Feature subset size should be less than feature dimension"

# preprocess data
best_idcs = np.argsort(-imp_data)[:, :best_k_features]
for i in range(N):
top_k_imps = imp_data[i][best_idcs[i]]
# zero-out the top k importances that aren't below threshold (default 0.0)
best_idcs[i] = np.where(top_k_imps >= MIN_IMPORTANCE, best_idcs[i], -1)
random_noise = np.random.random(imp_data.shape)
sort_idcs = np.lexsort((random_noise, -imp_data), axis=1) # Why use lexsort: to randomly sort the indices that
# have equal values, see https://bit.ly/3DeAnFY
best_idcs = sort_idcs[:, :best_k_features]

# zero-out the top k importances that aren't below threshold (default 0.0)
best_idcs = np.where(np.take_along_axis(imp_data, best_idcs, axis=1) >= MIN_IMPORTANCE, best_idcs, -1)

c_var = 0
for i in range(F):
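The lexsort-plus-noise change above ranks features by importance while breaking exact ties at random (per the commit message, items with equal importance scores are ranked randomly). A small self-contained illustration of the trick, with made-up importance values:

```python
import numpy as np

imp_data = np.array([[0.4, 0.4, 0.2],   # forest 0: features 0 and 1 are tied
                     [0.1, 0.6, 0.3]])  # forest 1
rng = np.random.default_rng(0)
random_noise = rng.random(imp_data.shape)

# np.lexsort sorts by the last key first, so -imp_data (descending importance)
# is the primary key and the noise only decides ties.
sort_idcs = np.lexsort((random_noise, -imp_data), axis=1)
best_idcs = sort_idcs[:, :2]  # top-2 features per forest
print(best_idcs)  # row 0: features 0 and 1 in a random order; row 1: [1, 2]
```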
@@ -52,7 +52,7 @@ def __init__(
random_state=random_state,
with_replacement=with_replacement,
verbose=verbose,
is_precomputed_minmax=True,
is_precomputed_minmax=False,
use_logarithmic_split=False,
epsilon=0.01,
)
6 changes: 4 additions & 2 deletions data_structures/wrappers/histogram_random_forest_regressor.py
@@ -1,7 +1,7 @@
import numpy as np

from data_structures.forest_regressor import ForestRegressor
from utils.constants import SQRT, LINEAR, DEFAULT_NUM_BINS, BEST, EXACT, MSE
from utils.constants import BATCH_SIZE, SQRT, LINEAR, DEFAULT_NUM_BINS, BEST, EXACT, MSE


class HistogramRandomForestRegressor(ForestRegressor):
@@ -32,6 +32,7 @@ def __init__(
random_state: int = 0,
with_replacement: bool = False,
verbose: bool = False,
batch_size: int = BATCH_SIZE,
) -> None:
super().__init__(
data=data,
@@ -52,7 +53,8 @@ def __init__(
random_state=random_state,
with_replacement=with_replacement,
verbose=verbose,
is_precomputed_minmax=True,
is_precomputed_minmax=False,
use_logarithmic_split=False,
epsilon=0.01,
batch_size=batch_size,
)
85 changes: 85 additions & 0 deletions experiments/HT_exps/hoeffding_tree_exp.py
@@ -0,0 +1,85 @@
from skmultiflow.trees import HoeffdingTreeClassifier, HoeffdingTreeRegressor

from sklearn.ensemble import RandomForestClassifier as RFC_sklearn
from sklearn.ensemble import RandomForestRegressor as RFR_sklearn

from experiments.datasets import data_loader

from utils.constants import (
FLIGHT,
AIR,
APS,
BLOG,
SKLEARN_REGRESSION,
MNIST_STR,
HOUSING,
COVTYPE,
KDD,
GPU,
BATCH_SIZE,
)


def main():
SUBSAMPLE_SIZE = 60000
X_train, y_train, X_test, y_test = data_loader.fetch_data(MNIST_STR)
X_train = X_train[:SUBSAMPLE_SIZE]
y_train = y_train[:SUBSAMPLE_SIZE]

sklearn_RFC = RFC_sklearn(
n_estimators=1,
# defaults
criterion="gini", # default
max_depth=None, # default
min_samples_split=2, # default
min_samples_leaf=1, # default
random_state=0,
min_weight_fraction_leaf=0.0,
max_features="sqrt",
max_leaf_nodes=None,
min_impurity_decrease=0.0,
bootstrap=True,
oob_score=False,
n_jobs=None,
verbose=0,
warm_start=False,
class_weight=None,
ccp_alpha=0.0,
max_samples=None,
)
sklearn_RFC.fit(X_train, y_train)
sklearn_RFC.predict(X_test)
print(sklearn_RFC.score(X_test, y_test))


# Hoeffding Tree for classification
ht_classifier = HoeffdingTreeClassifier(
max_byte_size=float("inf"),
memory_estimate_period=float("inf"),
grace_period=10,
split_criterion="gini",
split_confidence=0.1,
# defaults
tie_threshold=0.05, # default
binary_split=True, # default
stop_mem_management=False, # default
remove_poor_atts=False, # default
no_preprune=False, # default
leaf_prediction="nba", # default
nb_threshold=0, # default
)
ht_classifier.fit(X_train, y_train)
ht_classifier.predict(X_test)
print(ht_classifier.score(X_test, y_test))
print(ht_classifier.get_rules_description())

# # Hoeffding Tree for regression
# ht_regressor = HoeffdingTreeRegressor()
# ht_regressor.fit(X, y)
# ht_regressor.predict(X)
# ht_regressor.predict_proba(X)
# ht_regressor.score(X, y)


if __name__ == "__main__":
main()
1 change: 0 additions & 1 deletion experiments/budget_exps/ERFC_dict

This file was deleted.

1 change: 0 additions & 1 deletion experiments/budget_exps/ERFR_dict

This file was deleted.

1 change: 0 additions & 1 deletion experiments/budget_exps/HRFC_dict

This file was deleted.

1 change: 0 additions & 1 deletion experiments/budget_exps/HRFR_dict

This file was deleted.

1 change: 0 additions & 1 deletion experiments/budget_exps/HRPC_dict

This file was deleted.
