Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
1210 commits
Select commit Hold shift + click to select a range
a3e36fe
fix syntax
smilesun Jan 17, 2024
100c687
fix diva
smilesun Jan 17, 2024
2d70f93
Merge branch 'fbopt' into fbopt_matchdiva
smilesun Jan 17, 2024
a3f3559
code style
ntchen Jan 17, 2024
2ec560e
c_msel_val_top_k
ntchen Jan 17, 2024
96e0b44
c_msel_setpoin_delay
ntchen Jan 17, 2024
791d4b7
resolve too many attributes
ntchen Jan 17, 2024
30393a3
ignore too many local variables
ntchen Jan 17, 2024
3a077f6
code style
ntchen Jan 17, 2024
c39fb00
pylint
ntchen Jan 17, 2024
fb93437
pylint
ntchen Jan 17, 2024
b226ffb
Merge pull request #750 from marrlab/foptcodacy
smilesun Jan 18, 2024
898af0b
Merge branch 'fbopt' into fbopt_matchdiva
smilesun Jan 18, 2024
358c936
Merge branch 'master' into fbopt
smilesun Jan 18, 2024
7c92b5e
Merge pull request #746 from marrlab/fbopt_matchdiva
smilesun Jan 18, 2024
aa1d5cd
Merge branch 'master' into fbopt
smilesun Jan 19, 2024
2319105
Update pyproject.toml
smilesun Jan 19, 2024
04a6f25
Update ci.yml: revert how pytest runs: no poetry
smilesun Jan 19, 2024
1298519
Update pyproject.toml: rm tensorboard = "^2.14.0"
smilesun Jan 19, 2024
0248bb0
Merge branch 'master' into fbopt
smilesun Jan 20, 2024
5bd00d6
Update ci.yml
smilesun Jan 21, 2024
108c2e7
Update ci.yml
smilesun Jan 21, 2024
d615736
Merge branch 'master' into fbopt
smilesun Jan 21, 2024
00a30af
add tensorboard to pyproject
smilesun Jan 21, 2024
6dbe7d6
update poetry.lock file
smilesun Jan 22, 2024
486d81a
update dependencies
agisga Jan 23, 2024
6a39fb6
Merge pull request #765 from marrlab/issue_754
smilesun Jan 24, 2024
b7d21db
Merge branch 'master' into fbopt
smilesun Jan 24, 2024
2cd16d8
Merge branch 'master' into fbopt
smilesun Jan 29, 2024
1c4dfd1
reduce number of iterations
smilesun Jan 29, 2024
eb812c2
.
smilesun Jan 29, 2024
1374b33
use aug pac
smilesun Feb 1, 2024
b763838
Update pacs_diva_fbopt_alone_es1.yaml: batch 64 to 32
smilesun Feb 1, 2024
6d34ab4
Update pacs_diva_fbopt_alone_es1_autoki.yaml
smilesun Feb 1, 2024
a3fa382
es=10
smilesun Feb 6, 2024
964c892
.
smilesun Feb 6, 2024
01e03bb
merge conflict with master
smilesun Feb 6, 2024
f224039
Merge branch 'master' into fbopt
smilesun Feb 7, 2024
4c69a52
force setpoint change once
smilesun Feb 7, 2024
775e41f
Update pacs_diva_fbopt_alone_es1_autoki.yaml
smilesun Feb 8, 2024
600e39e
attempt to reproduce previous diva results on pacs data
agisga Feb 8, 2024
b58cbff
Merge pull request #778 from marrlab/diva_pacs_reproduce
smilesun Feb 8, 2024
1c3f26a
1e-5 type notation not supported for yaml file
agisga Feb 8, 2024
8b9d921
.
smilesun Feb 8, 2024
180f2b8
better yaml for benchmark
smilesun Feb 9, 2024
6541cd4
refine yaml file
smilesun Feb 9, 2024
e05081a
Rename pacs_diva_fbopt_alone_es1.yaml to pacs_diva_fbopt_alone_es1_ra…
smilesun Feb 13, 2024
5bf06e0
Merge branch 'master' into fbopt
smilesun Feb 14, 2024
b0e679f
add gamma-y sample
smilesun Feb 15, 2024
99c9ad2
minor improvements to generation of figures for the fbopt experiments
agisga Feb 15, 2024
e38e9b5
add class to put tensorboard data to text file
smilesun Feb 23, 2024
e6ffee6
make fbopt script into separate folder
smilesun Feb 23, 2024
9ecb520
improve code
smilesun Feb 23, 2024
eee82b4
Merge branch 'fbopt' into fbopt_visualizations
agisga Feb 23, 2024
7b13762
Merge pull request #782 from marrlab/fbopt_visualizations
agisga Feb 23, 2024
8cca9cf
minor adjustments to the visualizations
agisga Feb 23, 2024
6dd6b2d
fbopt figures: backup x and y data to txt files
agisga Feb 23, 2024
e538970
bug fix for the last commit
agisga Feb 23, 2024
76b73a3
fbopt plots: fixes txt saving
agisga Feb 23, 2024
f5c06a7
single run reproduce
smilesun Feb 26, 2024
ba1e19d
move benchmark submit back
smilesun Feb 26, 2024
e3ef1ab
change mode for submission script
smilesun Feb 26, 2024
98689ff
add output dir
smilesun Feb 27, 2024
eea73e3
added matplotlib from nutan
smilesun Feb 27, 2024
33c027a
logscale
smilesun Feb 27, 2024
d11da9b
complex math display not possible
smilesun Feb 27, 2024
3f1f04f
change output name
smilesun Feb 27, 2024
9907274
remove latex in filename
smilesun Feb 27, 2024
0ac873d
.
smilesun Feb 27, 2024
3edc169
todo
smilesun Feb 27, 2024
75eaaa3
.
smilesun Feb 28, 2024
c8f5f5f
skip draw
smilesun Feb 28, 2024
4e4f490
arrow width
smilesun Feb 28, 2024
cd800eb
plot len
smilesun Feb 28, 2024
1712cac
phase portrait arrow size automatic
smilesun Feb 28, 2024
d3b0a12
color bar for phase portrait
smilesun Feb 28, 2024
5603790
bounding box tight
smilesun Feb 28, 2024
9e1eb99
remove \ in filename
smilesun Feb 28, 2024
5364878
more latex
smilesun Feb 28, 2024
7385e81
.
smilesun Feb 28, 2024
0ed3a21
latex in plot
smilesun Feb 28, 2024
3b1bb09
phase portrait neg
smilesun Feb 28, 2024
b70dbd0
log scale to phase portrait plot after carla's suggestion
smilesun Feb 29, 2024
382005e
log single curve
smilesun Feb 29, 2024
f3e9d0f
comment how setpoint model selection works
smilesun Mar 1, 2024
1ae0ef7
.
smilesun Mar 8, 2024
56e06cb
.
smilesun Mar 8, 2024
5e7b9c9
Merge branch 'master' into fbopt
smilesun Mar 8, 2024
d88d0df
instructions for the reproduction of M-HOF plots as in the paper
agisga Mar 12, 2024
45ffb5d
instructions to reproduce an M-HOF run in the README.md
agisga Mar 12, 2024
969f101
minor clarification in README.md
agisga Mar 12, 2024
6457db3
Update README.md
smilesun Mar 12, 2024
605f09e
Update README.md
smilesun Mar 15, 2024
a7f022b
Update README.md
smilesun Mar 15, 2024
4e093c5
Update README.md
smilesun Mar 17, 2024
b566765
Update README.md
smilesun Mar 17, 2024
40f7093
Update README.md
smilesun Mar 19, 2024
68a8f1c
Update README.md
smilesun Mar 25, 2024
264343b
merged gamma_reg_collision
MatteoWohlrapp May 7, 2024
5a6ba4e
Added changes to the new model selection
MatteoWohlrapp May 7, 2024
d374a19
merged master
MatteoWohlrapp May 7, 2024
fa7b95f
Merged gamma_reg updated coverage
MatteoWohlrapp May 7, 2024
ed4b2d1
Merge branch 'master' into mhof_dev_merge
MatteoWohlrapp May 10, 2024
2b3573d
added new benchmark for pacs, dial, fopt, erm
MatteoWohlrapp May 10, 2024
f17e540
updated benchmark
MatteoWohlrapp May 10, 2024
2bf91a8
using diva instead of erm as it has hyper init
MatteoWohlrapp May 10, 2024
71b9d26
updated benchmark yaml
MatteoWohlrapp May 10, 2024
2d275a4
renamed the
MatteoWohlrapp May 13, 2024
0608f8d
Merge branch 'master' into mhof_dev_merge
MatteoWohlrapp May 13, 2024
e4d87d9
added hyper init and hyper update function and a new benchmark for fb…
MatteoWohlrapp May 13, 2024
5c44550
Fixed import of backpack
MatteoWohlrapp May 14, 2024
416389f
Fixed benchmark import not successful in erm
MatteoWohlrapp May 14, 2024
d59d37a
fixed indentation
MatteoWohlrapp May 14, 2024
a4fa31d
Added backpack check for fishr
MatteoWohlrapp May 14, 2024
a38c777
Debugging backpack in erm
MatteoWohlrapp May 14, 2024
870bc57
Adding more logging to erm
MatteoWohlrapp May 14, 2024
d66de08
irm benchmark
smilesun May 14, 2024
f7795a9
added list_str_multiplier
MatteoWohlrapp May 14, 2024
6c5ca27
Added directories to gitignore, adjusted fobt_fishr_erm benchmark
May 14, 2024
6738c2c
Changed batch size due to memory reasons and changed task naming
May 14, 2024
67691de
correct hyper spec
smilesun May 17, 2024
38fb5f2
Solved indexing issue in fbopt mu controller and added flag info to t…
May 22, 2024
caf32b7
removed prints
May 22, 2024
a098a68
Fixed indentation for convert4backpack
May 22, 2024
518d919
fixed codacity
May 28, 2024
e947f67
fixed codacity
May 28, 2024
94a1819
Merge branch 'master' into mhof_dev_merge
May 28, 2024
5b28c73
Added the hyperparameter naming to parts of the models novel on this …
May 28, 2024
e6a6f3e
Merge branch 'mhof_dev_merge' into erm_hyper_init
May 28, 2024
7124cc7
Fixed codacity
Jun 11, 2024
5288432
Merge branch 'mhof_dev_merge' into erm_hyper_init
MatteoWohlrapp Jun 11, 2024
d2ac388
retrigger run
Jun 11, 2024
4f1347a
fixed codacity
Jun 11, 2024
6a81b4c
Merge branch 'erm_hyper_init' of https://github.com/marrlab/DomainLab…
Jun 11, 2024
f2fa952
fixed codacity
Jun 11, 2024
279f510
Added test for erm functions
Jun 11, 2024
f8f4f0e
Disabling line too long for argument
Jun 11, 2024
c7caeb5
Merge branch 'mhof_dev_merge' into erm_hyper_init
MatteoWohlrapp Jul 2, 2024
530999b
Merge branch 'erm_hyper_init' into erm_hyper_init_updated_irm
smilesun Jul 2, 2024
d1ccf46
Update task_pacs_aug.py, update path of PACS
smilesun Jul 2, 2024
6acdfdd
Update pacs_fbopt_fishr_erm.yaml
smilesun Jul 2, 2024
d60a789
Update pacs_fbopt_fishr_erm.yaml
smilesun Jul 2, 2024
976c25a
Update task_pacs_aug.py, fix codacy
smilesun Jul 2, 2024
a99c9f5
Merge pull request #843 from marrlab/erm_hyper_init
smilesun Jul 2, 2024
33ed1cb
Merge pull request #845 from marrlab/mhof_dev_merge
smilesun Jul 2, 2024
f90e655
Merge branch 'mhof_dev' into erm_hyper_init_updated_irm
smilesun Jul 5, 2024
f4e6a0c
merge conflict
smilesun Jul 12, 2024
f1b413f
Merge branch 'mhof_dev' into erm_hyper_init_updated_irm
smilesun Jul 12, 2024
d90ffae
Merge branch 'master' into mhof_dev
smilesun Jul 14, 2024
0605ffc
Merge branch 'master' into mhof_dev
smilesun Jul 15, 2024
1c683ba
Update pacs_fbopt_dial_diva.yaml
smilesun Jul 16, 2024
dc7005a
Merge branch 'master' into mhof_dev
smilesun Jul 18, 2024
10442ba
better yaml for dial_diva
smilesun Jul 18, 2024
6f5ce78
Merge branch 'master' into mhof_dev
smilesun Jul 20, 2024
174799d
Merge branch 'master' into mhof_dev
smilesun Jul 25, 2024
580bd23
Merge branch 'master' into mhof_dev
smilesun Sep 17, 2024
4686aa4
Merge branch 'master' into mhof_dev
smilesun Oct 4, 2024
913873f
Merge branch 'mhof_dev' into erm_hyper_init_updated_irm
smilesun Oct 4, 2024
b2b204d
copy branch fbopt_vector_ki_gain to mhof_dev
smilesun Oct 4, 2024
0678950
rename yaml
smilesun Oct 5, 2024
9a01811
erm alone more hyper
smilesun Oct 5, 2024
9150621
Merge pull request #838 from marrlab/erm_hyper_init_updated_irm
smilesun Oct 5, 2024
ca9766e
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 5, 2024
cbdc505
Update aistat_irm_erm_mhof.yaml
smilesun Oct 6, 2024
731a144
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 6, 2024
144efba
Update aistat_irm_erm_mhof.yaml, fix grammar error
smilesun Oct 6, 2024
9e09396
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 6, 2024
68c1e36
Update sh_link_pacs_dataset.sh
smilesun Oct 6, 2024
b4ddafe
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 6, 2024
3a051fe
Update .gitignore, display slurm_errors,
smilesun Oct 6, 2024
d53c863
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 6, 2024
e731f27
Merge branch 'master' into mhof_dev
smilesun Oct 8, 2024
f15478c
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 8, 2024
1be1634
causalirl yaml
smilesun Oct 8, 2024
121ca55
.
smilesun Oct 8, 2024
6626475
use self.decoratee instead of self.model
smilesun Oct 8, 2024
d5d3a0b
cmd script to test mhof irm
smilesun Oct 8, 2024
e6c368b
script to test mhof_irm
smilesun Oct 8, 2024
6701cbf
enable grad for irm inside torch.no_grad for mhof
smilesun Oct 9, 2024
888d714
filter out zero reg loss in abstract trainer
smilesun Oct 9, 2024
64bcc9c
trainer behaves like model, now decoratte's cal_loss has to be changed
smilesun Oct 9, 2024
22f343c
overwrite multiplier from scheduler(default static scheduler then no …
smilesun Oct 9, 2024
23607fb
per domain irm to separate file
smilesun Oct 9, 2024
28bdfa9
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 9, 2024
63ad47b
dial mhof yaml
smilesun Oct 9, 2024
8ba7e11
number of batches to estimate ratio
smilesun Oct 9, 2024
6073403
Update aistat_irm_erm_only.yaml
smilesun Oct 9, 2024
8bd1163
Update aistat_irm_erm_mhof.yaml
smilesun Oct 9, 2024
7667cbc
.
smilesun Oct 9, 2024
5800ef1
Merge branch 'master' into mhof_dev
smilesun Oct 10, 2024
795478f
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 10, 2024
b64e5d0
Update aistat_irm_erm_only.yaml
smilesun Oct 10, 2024
88861ce
Merge branch 'master' into mhof_dev
smilesun Oct 10, 2024
6433f58
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 10, 2024
62183bc
Merge branch 'master' into mhof_dev
smilesun Oct 10, 2024
6f5bac1
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 10, 2024
ba68e27
fix num_batches for loss ratio estimate
smilesun Oct 10, 2024
c09879d
.
smilesun Oct 10, 2024
394cdd6
add back missing @property due to text insert
smilesun Oct 10, 2024
401e978
Merge branch 'mhof_dev' into mhof_dev_vector_ki_gain
smilesun Oct 10, 2024
5585b4a
use correct hyper range
smilesun Oct 10, 2024
d176e1a
Update aistat_irm_erm_mhof.yaml
smilesun Oct 10, 2024
3a4226e
add unit test
smilesun Oct 10, 2024
011cccb
Update aistat_irm_erm_mhof.yaml
smilesun Oct 10, 2024
ae03219
Update and rename pacs_fbopt_dial_diva.yaml to aistat_pacs_mhof_dial_…
smilesun Oct 10, 2024
3166eb2
dial
smilesun Oct 10, 2024
d8cee2d
fix partially issue #777
smilesun Oct 11, 2024
9884640
detailed doc for mhof args, change str args to boolean, fix issue #777
smilesun Oct 11, 2024
cd15915
new irm yaml file
smilesun Oct 11, 2024
273422b
.
smilesun Oct 11, 2024
dbbda55
take square of irm loss, copy reg loss from decoratee to fbopt
smilesun Oct 11, 2024
1cb2428
.
smilesun Oct 11, 2024
155f0df
.
smilesun Oct 11, 2024
c41e92b
.
smilesun Oct 11, 2024
b8731cb
Milestone: feedforward works now with trainers
smilesun Oct 13, 2024
f4e4773
yaml for feedforward
smilesun Oct 13, 2024
e3ed8c6
tr_with_init_mu
smilesun Oct 15, 2024
e38fa75
doc
smilesun Oct 15, 2024
044e3cc
.
smilesun Oct 15, 2024
aea6e90
no ma for setpoint
smilesun Oct 17, 2024
e5197f3
logger
smilesun Oct 17, 2024
dc58a50
setpoint ada as argument
smilesun Oct 17, 2024
9ddd037
yaml file search setpoint ada
smilesun Oct 17, 2024
7bcda06
Merge pull request #880 from marrlab/mhof_dev_vector_ki_gain
smilesun Oct 17, 2024
80d6a73
correct yaml
smilesun Oct 17, 2024
733bb0b
update yaml
smilesun Oct 18, 2024
f123280
adamw benchmark
smilesun Nov 29, 2024
893a982
Merge branch 'master' into mhof_dev
smilesun Nov 29, 2024
28b31c2
Update aistat_trainer_combo_dial_irm_erm_mhof.yaml
smilesun Nov 29, 2024
0103913
benchmark lr_scheduler
smilesun Dec 4, 2024
b0279dc
Merge branch 'lr_scheduler' into mhof_dev_lr_scheduler
smilesun Dec 4, 2024
c29877e
Update ci.yml
smilesun Dec 4, 2024
6597acb
train_causalIRL.py self.cal_loss return a tuple
smilesun Dec 4, 2024
61d7d98
Merge branch 'master' into mhof_dev
smilesun Dec 4, 2024
3eed766
fix bug of deleting multipliers, now only do for erm
smilesun Dec 5, 2024
87cbc68
not allowing naked erm be combined with fbopt
smilesun Dec 5, 2024
7d8d081
fix typo new argument type
smilesun Dec 5, 2024
b296229
Update train_ema.py
smilesun Dec 5, 2024
3cef1a2
fix test_ma
smilesun Dec 5, 2024
6f08403
no_dump in test_fbopt.py
smilesun Dec 6, 2024
ae907b2
unit test no_dump
smilesun Dec 10, 2024
924f4cc
Merge branch 'master' into mhof_dev
smilesun Dec 10, 2024
7ac297a
Merge branch 'mhof_dev' into mhof_dev_lr_scheduler
smilesun Dec 10, 2024
c4d8bfc
Update aistat_trainer_combo_dial_irm_erm_mhof.yaml
smilesun Dec 10, 2024
78aeafd
Merge pull request #898 from marrlab/mhof_dev_lr_scheduler
smilesun Dec 10, 2024
ab19a3e
Update test_irm.py
smilesun Dec 10, 2024
40327c0
Update test_irm.py, no_dump to save disk space, last try
smilesun Dec 10, 2024
598a4cc
Merge branch 'master' into mhof_dev
smilesun Dec 10, 2024
30fe2f5
split test_fbopt into two files
smilesun Dec 11, 2024
e431d73
remove dial combo with jigen in test_mk_exp_jigen.py, issue #901
smilesun Dec 11, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@ name: CI

on:
push:
branches: master
branches: mhof_dev
pull_request:
branches: master
branches: mhof_dev
workflow_dispatch:

jobs:
test:
name: Run tests
Expand Down
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ tests/__pycache__/
.vscode/
domainlab/zdata/pacs
/data/
/.snakemake/
/dist
/domainlab.egg-info
/runs
/slurm_errors.txt
69 changes: 69 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ For example, the following result (without any augmentation like flip) is for P

Source: https://arxiv.org/pdf/2403.14356.pdf

Citation:
```bibtex
@misc{sun2024domainlab,
title={DomainLab: A modular Python package for domain generalization in deep learning},
Expand All @@ -132,3 +133,71 @@ Source: https://arxiv.org/pdf/2403.14356.pdf
year={2024}
}
```

# M-HOF-Opt: Multi-Objective Hierarchical Output Feedback Optimization via Multiplier Induced Loss Landscape Scheduling
Source: https://arxiv.org/pdf/2403.13728.pdf

M-HOF-Opt is implemented in [DomainLab](https://github.com/marrlab/DomainLab). If you meet any problems, feel free to report them at https://github.com/marrlab/DomainLab/issues

## Dependencies and Data Preparation
#### Example dependencies installation
```
git checkout mhof # switch to mhof branch
conda create --name domainlab_py39 python=3.9 # create a virtual environment
conda activate domainlab_py39 # activate virtual environment
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.6 -c pytorch -c conda-forge
conda install torchmetrics==0.10.3
pip install -r requirements_notorch.txt
conda install tensorboard # install tensorboard
```

#### Data preparation: download the domain generalization dataset PACS

step 1:

Use the following script to download PACS to your local laptop, then upload it to your cluster:

https://github.com/marrlab/DomainLab/blob/fbopt/data/script/download_pacs.py

step 2:
make a symbolic link following the example script in https://github.com/marrlab/DomainLab/blob/master/sh_pacs.sh

where `mkdir -p data/pacs` is executed under the repository directory,

`ln -s /dir/to/yourdata/pacs/raw ./data/pacs/PACS`
will create a symbolic link under the repository directory

### M-HOF experiments reproduction

#### Run the experiment

To execute a single run of the M-HOF method, from the root folder run the command:

```
python main_out.py -c a_reproduce_pacs_diva.yaml
```

which uses the configuration file [a_reproduce_pacs_diva.yaml](https://github.com/marrlab/DomainLab/blob/mhof/a_reproduce_pacs_diva.yaml).

#### Visualization of the results

The results of the experiment are stored in the `runs` directory generated by Tensorboard.
The various loss curves with the corresponding setpoint change curves, as well as phase-portrait-like figures showing the loss dynamics between the task loss and the various regularization losses, can be obtained by running the script [script_generate_all_figures_diva.sh](https://github.com/marrlab/DomainLab/blob/mhof/script_generate_all_figures_diva.sh):

```
bash script_generate_all_figures_diva.sh
```

The resulting figures will be stored in the directory `figures_diva`, which can be changed by editing the top of the [script_generate_all_figures_diva.sh](https://github.com/marrlab/DomainLab/blob/mhof/script_generate_all_figures_diva.sh) file if needed.

Citation:
```bibtex
@misc{sun2024m,
title={M-HOF-Opt: Multi-Objective Hierarchical Output Feedback Optimization via Multiplier Induced Loss Landscape Scheduling},
author={Sun, Xudong and Chen, Nutan and Gossmann, Alexej and Xing, Yu and Dorigatti, Emilio and Drost, Felix and Feistner, Carla and Scarcella, Daniele and Beer, Lisa and Marr, Carsten},
journal={https://arxiv.org/pdf/2403.13728.pdf},
number={2403.13728},
year={2024},
publisher={https://arxiv.org/pdf/2403.13728.pdf}
}
```
24 changes: 24 additions & 0 deletions a_reproduce_pacs_diva.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Single-run configuration reproducing the M-HOF PACS/DIVA experiment
# (see README: `python main_out.py -c a_reproduce_pacs_diva.yaml`).
te_d: sketch  # held-out test domain of PACS
tpath: examples/tasks/task_pacs_aug.py  # task definition (PACS with augmentation)
bs: 32  # batch size
model: diva
trainer: fbopt  # M-HOF feedback-optimization trainer
gamma_y: 1.0
ini_setpoint_ratio: 0.99  # initial setpoint ratio — presumably relative to initial loss; confirm in trainer docs
str_diva_multiplier_type: gammad_recon
coeff_ma_output_state: 0.1  # moving-average coefficient for the output state
coeff_ma_setpoint: 0.9  # moving-average coefficient for the setpoint
exp_shoulder_clip: 5
mu_init: 0.000001  # initial multiplier value
k_i_gain_ratio: 0.5  # integral-gain ratio of the feedback controller
mu_clip: 10  # upper clip for the multiplier
epos: 1000  # maximum number of epochs
epos_min: 200  # minimum number of epochs before early stop can trigger
npath: examples/nets/resnet50domainbed.py
npath_dom: examples/nets/resnet50domainbed.py
es: 2  # early-stop patience
lr: 0.00005
zx_dim: 0
zy_dim: 64
zd_dim: 64
force_setpoint_change_once: True  # require at least one setpoint change before stopping
1 change: 1 addition & 0 deletions a_test_feedforward_irm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Smoke test: feed-forward hyper-scheduler trainer (hyperscheduler_irm_dial)
# with ERM on ColorMNIST, test domain 0, train domains 1 and 2.
python main_out.py --te_d=0 --tr_d 1 2 --task=mnistcolor10 --model=erm --nname=conv_bn_pool_2 --trainer=hyperscheduler_irm_dial --k_i_gain_ratio=0.5 --force_setpoint_change_once --epos=10 --epos_min=4 --exp_shoulder_clip=1 --mu_clip=100 --ini_setpoint_ratio=0.99999999
1 change: 1 addition & 0 deletions a_test_mhof_irm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Smoke test: M-HOF feedback-optimization trainer (fbopt_irm_dial) with ERM
# on ColorMNIST, test domain 0, train domains 1 and 2.
python main_out.py --te_d=0 --tr_d 1 2 --task=mnistcolor10 --model=erm --nname=conv_bn_pool_2 --trainer=fbopt_irm_dial --k_i_gain_ratio=0.5 --force_setpoint_change_once --epos=500 --epos_min=400 --exp_shoulder_clip=1 --mu_clip=100 --ini_setpoint_ratio=0.9 --nb4reg_over_task_ratio=0 --tr_with_init_mu --coeff_ma_setpoint=0.0 --str_setpoint_ada="SliderAnyComponent()"
9 changes: 7 additions & 2 deletions domainlab/algos/builder_diva.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
"""
from domainlab.algos.a_algo_builder import NodeAlgoBuilder
from domainlab.algos.msels.c_msel_oracle import MSelOracleVisitor
from domainlab.algos.msels.c_msel_setpoint_delay import MSelSetpointDelay
from domainlab.algos.msels.c_msel_val import MSelValPerf
from domainlab.algos.msels.c_msel_val_top_k import MSelValPerfTopK
from domainlab.algos.observers.b_obvisitor import ObVisitor
from domainlab.algos.observers.c_obvisitor_cleanup import ObVisitorCleanUp
from domainlab.algos.observers.c_obvisitor_gen import ObVisitorGen
Expand Down Expand Up @@ -35,7 +37,8 @@ def init_business(self, exp):
request = RequestVAEBuilderCHW(task.isize.c, task.isize.h, task.isize.w, args)
node = VAEChainNodeGetter(request)()
task.get_list_domains_tr_te(args.tr_d, args.te_d)
model = mk_diva(list_str_y=task.list_str_y)(
model = mk_diva(str_diva_multiplier_type=args.str_diva_multiplier_type, list_str_y=task.list_str_y)(

node,
zd_dim=args.zd_dim,
zy_dim=args.zy_dim,
Expand All @@ -48,7 +51,9 @@ def init_business(self, exp):
beta_d=args.beta_d,
)
device = get_device(args)
model_sel = MSelOracleVisitor(MSelValPerf(max_es=args.es), val_threshold=args.val_threshold)
model_sel = MSelSetpointDelay(
MSelOracleVisitor(MSelValPerfTopK(max_es=args.es)), val_threshold=args.val_threshold
)
if not args.gen:
observer = ObVisitor(model_sel)
else:
Expand Down
21 changes: 21 additions & 0 deletions domainlab/algos/builder_fbopt_dial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""
builder for feedback optimization of dial
"""
from domainlab.algos.builder_diva import NodeAlgoBuilderDIVA
from domainlab.algos.trainers.train_fbopt_b import TrainerFbOpt


class NodeAlgoBuilderFbOptDial(NodeAlgoBuilderDIVA):
    """
    Builder that reuses the DIVA algorithm construction and wraps the
    resulting trainer in a feedback-optimization (FbOpt) trainer for DIAL.
    """

    def init_business(self, exp):
        """
        Construct and return the (trainer, model, observer, device) tuple.

        The trainer handed back to the caller is a TrainerFbOpt that
        decorates the trainer produced by the parent DIVA builder.
        """
        inner_trainer, model, observer, device = super().init_business(exp)
        inner_trainer.init_business(model, exp.task, observer, device, exp.args)
        outer_trainer = TrainerFbOpt()
        outer_trainer.init_business(
            inner_trainer, exp.task, observer, device, exp.args
        )
        return outer_trainer, model, observer, device
4 changes: 3 additions & 1 deletion domainlab/algos/builder_jigen1.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
"""
from domainlab.algos.a_algo_builder import NodeAlgoBuilder
from domainlab.algos.msels.c_msel_oracle import MSelOracleVisitor
from domainlab.algos.msels.c_msel_setpoint_delay import MSelSetpointDelay
from domainlab.algos.msels.c_msel_val import MSelValPerf
from domainlab.algos.msels.c_msel_val_top_k import MSelValPerfTopK
from domainlab.algos.observers.b_obvisitor import ObVisitor
from domainlab.algos.observers.c_obvisitor_cleanup import ObVisitorCleanUp
from domainlab.algos.trainers.hyper_scheduler import HyperSchedulerWarmupExponential
Expand All @@ -30,7 +32,7 @@ def init_business(self, exp):
task = exp.task
args = exp.args
device = get_device(args)
msel = MSelOracleVisitor(msel=MSelValPerf(max_es=args.es), val_threshold=args.val_threshold)
msel = MSelSetpointDelay(MSelOracleVisitor(MSelValPerfTopK(max_es=args.es)), val_threshold=args.val_threshold)
observer = ObVisitor(msel)
observer = ObVisitorCleanUp(observer)

Expand Down
6 changes: 6 additions & 0 deletions domainlab/algos/msels/a_model_sel.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,12 @@ def sel_model_te_acc(self):
return self.msel.sel_model_te_acc
return -1

@property
def oracle_last_setpoint_sel_te_acc(self):
    """
    Delegate to the decoratee's test accuracy recorded at the last
    setpoint change; return -1 when no decoratee is set.
    """
    if self.msel is not None:
        return self.msel.oracle_last_setpoint_sel_te_acc
    return -1

@property
def model_selection_epoch(self):
"""
Expand Down
54 changes: 54 additions & 0 deletions domainlab/algos/msels/c_msel_setpoint_delay.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""
logs the best up-to-event selected model at each event when setpoint shrinks
"""
from domainlab.algos.msels.a_model_sel import AMSel
from domainlab.utils.logger import Logger


class MSelSetpointDelay(AMSel):
"""
This class decorate another model selection object, it logs the current
selected performance from the decoratee each time the setpoint shrinks
"""

def __init__(self, msel, val_threshold = None):
super().__init__(val_threshold)
# NOTE: super() has to come first always otherwise self.msel will be overwritten to be None
self.msel = msel
self._oracle_last_setpoint_sel_te_acc = 0.0

@property
def oracle_last_setpoint_sel_te_acc(self):
"""
return the last setpoint best acc
"""
return self._oracle_last_setpoint_sel_te_acc

def base_update(self, clear_counter=False):
"""
if the best model should be updated
currently, clear_counter is set via
flag = super().tr_epoch(epoch, self.flag_setpoint_updated)
"""
logger = Logger.get_logger()
logger.info(
f"setpoint selected current acc {self._oracle_last_setpoint_sel_te_acc}"
)
if clear_counter:
# for the current version of code, clear_counter = flag_setpoint_updated
log_message = (
f"setpoint msel te acc updated from "
# self._oracle_last_setpoint_sel_te_acc start from 0.0, and always saves
# the test acc when last setpoint decrease occurs
f"{self._oracle_last_setpoint_sel_te_acc} to "
# self.sel_model_te_acc defined as a property
# in a_msel, which returns self.msel.sel_model_te_acc
# is the validation acc based model selection, which
# does not take setpoint into account
f"{self.sel_model_te_acc}"
)
logger.info(log_message)
self._oracle_last_setpoint_sel_te_acc = self.sel_model_te_acc
# let decoratee decide if model should be selected or not
flag = self.msel.update(clear_counter)
return flag
61 changes: 61 additions & 0 deletions domainlab/algos/msels/c_msel_val_top_k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""
Model Selection should be decoupled from
"""
from domainlab.algos.msels.c_msel_val import MSelValPerf
from domainlab.utils.logger import Logger


class MSelValPerfTopK(MSelValPerf):
"""
1. Model selection using validation performance
2. Visitor pattern to trainer
"""

def __init__(self, max_es, top_k=2):
super().__init__(max_es) # construct self.tr_obs (observer)
self.top_k = top_k
self.list_top_k_acc = [0.0 for _ in range(top_k)]

def update(self, clear_counter=False):
"""
if the best model should be updated
"""
flag_super = super().update(clear_counter)
metric_val_current = self.tr_obs.metric_val[self.tr_obs.str_metric4msel]
acc_min = min(self.list_top_k_acc)
if metric_val_current > acc_min:
# overwrite
logger = Logger.get_logger()
logger.info(
f"top k validation acc: {self.list_top_k_acc} \
overwriting/reset counter"
)
self.es_c = 0 # restore counter
ind = self.list_top_k_acc.index(acc_min)
# avoid having identical values
if metric_val_current not in self.list_top_k_acc:
self.list_top_k_acc[ind] = metric_val_current
logger.info(
f"top k validation acc updated: \
{self.list_top_k_acc}"
)
# overwrite to ensure consistency
# issue #569: initially self.list_top_k_acc will be [xx, 0] and it does not matter since 0 will be overwriten by second epoch validation acc.
# actually, after epoch 1, most often, sefl._best_val_acc will be the higher value of self.list_top_k_acc will overwriten by min(self.list_top_k_acc)
logger.info(
f"top-2 val sel: overwriting best val acc from {self._best_val_acc} to "
f"minimum of {self.list_top_k_acc} which is {min(self.list_top_k_acc)} "
f"to ensure consistency"
)
self._best_val_acc = min(self.list_top_k_acc)
# overwrite test acc, this does not depend on if val top-k acc has been overwritten or not
metric_te_current = self.tr_obs.metric_te[self.tr_obs.str_metric4msel]
if self._sel_model_te_acc != metric_te_current:
# this can only happen if the validation acc has decreased and current val acc is only bigger than min(self.list_top_k_acc} but lower than max(self.list_top_k_acc)
logger.info(
f"top-2 val sel: overwriting selected model test acc from "
f"{self._sel_model_te_acc} to {metric_te_current} to ensure consistency"
)
self._sel_model_te_acc = metric_te_current
return True # if metric_val_current > acc_min:
return flag_super
27 changes: 24 additions & 3 deletions domainlab/algos/observers/b_obvisitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,22 @@ def __init__(self, model_sel):
self.metric_val = None
self.perf_metric = None

self.flag_setpoint_changed_once = False

@property
def str_metric4msel(self):
    """
    string representing the metric used for persisting models on the disk;
    delegated to the host trainer
    """
    return self.host_trainer.str_metric4msel

def update(self, epoch):
def reset(self):
    """
    reset this observer by resetting its model-selection visitor
    """
    self.model_sel.reset()

def update(self, epoch, flag_info=False):
logger = Logger.get_logger()
logger.info(f"epoch: {epoch}")
self.epo = epoch
Expand All @@ -53,13 +61,18 @@ def update(self, epoch):
self.loader_te, self.device
)
self.metric_te = metric_te
if self.model_sel.update(epoch):
if self.model_sel.update(epoch, flag_info):
logger.info("better model found")
self.host_trainer.model.save()
logger.info("persisted")
acc = self.metric_te.get("acc")
flag_stop = self.model_sel.if_stop(acc)
flag_enough = epoch >= self.host_trainer.aconf.epos_min

self.flag_setpoint_changed_once |= flag_info
if self.host_trainer.aconf.force_setpoint_change_once:
return flag_stop & flag_enough & self.flag_setpoint_changed_once

return flag_stop & flag_enough

def accept(self, trainer):
Expand Down Expand Up @@ -106,7 +119,15 @@ def after_all(self):
metric_te.update({"model_selection_epoch": self.model_sel.model_selection_epoch})
else:
metric_te.update({"acc_val": -1})
metric_te.update({"model_selection_epoch": -1})

if hasattr(self, "model_sel") and hasattr(
self.model_sel, "oracle_last_setpoint_sel_te_acc"
):
metric_te.update(
{"acc_setpoint": self.model_sel.oracle_last_setpoint_sel_te_acc}
)
else:
metric_te.update({"acc_setpoint": -1})
self.dump_prediction(model_ld, metric_te)
# save metric to one line in csv result file
self.host_trainer.model.visitor(metric_te)
Expand Down
Loading
Loading