From b4210c105a34a2b7f83f5e6a29095f8017318cda Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 5 Jan 2023 01:49:50 +0530 Subject: [PATCH 01/54] Merge py file changes from benchmark-algs --- src/imitation/algorithms/dagger.py | 62 +++ src/imitation/scripts/analyze.py | 24 +- src/imitation/scripts/config/parallel.py | 406 ++++++++++++++++-- .../scripts/config/train_adversarial.py | 175 +++++++- .../scripts/config/train_imitation.py | 26 ++ .../config/train_preference_comparisons.py | 128 +++++- src/imitation/scripts/config/train_rl.py | 203 ++++++++- src/imitation/scripts/ingredients/reward.py | 5 + src/imitation/scripts/parallel.py | 166 ++++++- src/imitation/scripts/train_adversarial.py | 1 + src/imitation/scripts/train_imitation.py | 4 +- .../scripts/train_preference_comparisons.py | 1 + src/imitation/scripts/train_rl.py | 4 +- tests/algorithms/test_dagger.py | 25 +- tests/scripts/test_scripts.py | 31 +- 15 files changed, 1173 insertions(+), 88 deletions(-) diff --git a/src/imitation/algorithms/dagger.py b/src/imitation/algorithms/dagger.py index a7194a5bf..0034fc4ba 100644 --- a/src/imitation/algorithms/dagger.py +++ b/src/imitation/algorithms/dagger.py @@ -65,6 +65,68 @@ def __call__(self, round_num: int) -> float: assert round_num >= 0 return min(1, max(0, (self.rampdown_rounds - round_num) / self.rampdown_rounds)) + def __repr__(self): + return f"{type(self).__name__}({self.rampdown_rounds!r})" + + +class IndicatorBetaSchedule(BetaSchedule): + """Beta schedule that switches off after a number of rounds.""" + + def __init__(self, rampdown_rounds: int): + """Builds IndicatorBetaSchedule. + + Args: + rampdown_rounds: number of rounds after which beta switches off. + """ + self.rampdown_rounds = rampdown_rounds + + def __call__(self, round_num: int) -> float: + """Computes beta value. + + Args: + round_num: the current round number. + + Returns: + beta as `1` until `self.rampdown_rounds` and then beta as `0`. + """ + assert round_num >= 0 + return 1 if round_num < self.rampdown_rounds else 0 + + def __repr__(self): + return f"{type(self).__name__}({self.rampdown_rounds!r})" + + +class ExponentialBetaSchedule(BetaSchedule): + """Exponentially decaying schedule for beta.""" + + def __init__(self, decay_probability: float): + """Builds ExponentialBetaSchedule. + + Args: + decay_probability: the decay factor for beta. + + Raises: + ValueError: if `decay_probability` not within (0, 1]. + """ + if not (0 < decay_probability <= 1): + raise ValueError("decay_probability lies outside the range (0, 1].") + self.decay_probability = decay_probability + + def __call__(self, round_num: int) -> float: + """Computes beta value. + + Args: + round_num: the current round number. 
+ + Returns: + beta as `self.decay_probability ^ round_num` + """ + assert round_num >= 0 + return self.decay_probability**round_num + + def __repr__(self): + return f"{type(self).__name__}({self.decay_probability!r})" + def reconstruct_trainer( scratch_dir: types.AnyPath, diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py index 0586f86d6..54fed52f9 100644 --- a/src/imitation/scripts/analyze.py +++ b/src/imitation/scripts/analyze.py @@ -166,6 +166,8 @@ def _get_algo_name(sd: sacred_util.SacredDicts) -> str: def _return_summaries(sd: sacred_util.SacredDicts) -> dict: imit_stats = get(sd.run, "result.imit_stats") + if imit_stats is None: + imit_stats = get(sd.run, "result.rollout") expert_stats = get(sd.run, "result.expert_stats") expert_return_summary = None @@ -232,7 +234,7 @@ def _return_summaries(sd: sacred_util.SacredDicts) -> dict: # verbosity 2 table_verbosity_mapping.append( table_verbosity_mapping[-1] - | {"status", "imit_expert_ratio", "exp_command", "run_name"}, + | {"status", "imit_expert_ratio", "exp_command", "run_name", "seed", ""}, ) @@ -268,20 +270,26 @@ def analyze_imitation( Returns: The DataFrame generated from the Sacred logs. """ - table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity) + if table_verbosity == -1: + table_entry_fns_subset = _get_table_entry_fns_subset(0) + else: + table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity) - rows = [] + df = pd.DataFrame() for sd in _gather_sacred_dicts(): - row = {} + new_df = pd.DataFrame() + if table_verbosity == -1: + new_df = pd.json_normalize(sd.config) + for col_name, make_entry_fn in table_entry_fns_subset.items(): - row[col_name] = make_entry_fn(sd) - rows.append(row) + new_df[col_name] = make_entry_fn(sd) + + df = pd.concat([df, new_df]) - df = pd.DataFrame(rows) if len(df) > 0: df.sort_values(by=["algo", "env_name"], inplace=True) - display_options = dict(index=False) + display_options: Mapping[str, Any] = dict(index=False) if csv_output_path is not None: df.to_csv(csv_output_path, **display_options) print(f"Wrote CSV file to {csv_output_path}") diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index eb206893f..59295d3d3 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -5,13 +5,15 @@ `@parallel_ex.named_config` to define a new parallel experiment. Adding custom named configs is necessary because the CLI interface can't add -search spaces to the config like `"seed": tune.grid_search([0, 1, 2, 3])`. +search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`. 
""" import numpy as np import ray.tune as tune import sacred +from torch import nn +from imitation.algorithms.dagger import ExponentialBetaSchedule, LinearBetaSchedule from imitation.util.util import make_unique_timestamp parallel_ex = sacred.Experiment("parallel") @@ -33,12 +35,39 @@ def config(): local_dir = None # `local_dir` arg for `ray.tune.run` upload_dir = None # `upload_dir` arg for `ray.tune.run` - n_seeds = 3 # Number of seeds to search over by default + # n_seeds_start = 0 + # n_seeds = 1 # Number of seeds to search over by default + experiment_checkpoint_path = "" + eval_best_trial = False + eval_trial_seeds = 5 # Number of seeds to search over by default + num_samples = 1 # Number of samples per grid search configuration + repeat = 3 + env = "seals_half_cheetah" + wandb_name_prefix = "" + + +# @parallel_ex.config +# def seeds(n_seeds_start, n_seeds): +# search_space = { +# "config_updates": { +# "seed": tune.choice( +# list(range(n_seeds_start, n_seeds_start + n_seeds)), +# ) +# } +# } @parallel_ex.config -def seeds(n_seeds): - search_space = {"config_updates": {"seed": tune.grid_search(list(range(n_seeds)))}} +def wandb(run_name): + base_config_updates = { + "common": { + "wandb": { + "wandb_name_prefix": run_name, + "wandb_kwargs": {"project": "algorithm-benchmark"}, + }, + }, + } + # base_named_configs = ["common.wandb_logging"] @parallel_ex.named_config @@ -63,7 +92,7 @@ def generate_test_data(): "config_updates": { "rl": { "rl_kwargs": { - "learning_rate": tune.grid_search( + "learning_rate": tune.choice( [3e-4 * x for x in (1 / 3, 1 / 2)], ), }, @@ -91,8 +120,8 @@ def example_cartpole_rl(): "config_updates": { "rl": { "rl_kwargs": { - "learning_rate": tune.grid_search(np.logspace(3e-6, 1e-1, num=3)), - "nminibatches": tune.grid_search([16, 32, 64]), + "learning_rate": tune.choice(np.logspace(3e-6, 1e-1, num=3)), + "nminibatches": tune.choice([16, 32, 64]), }, }, }, @@ -105,44 +134,367 @@ def example_cartpole_rl(): @parallel_ex.named_config -def example_rl_easy(): +def example_rl(): sacred_ex_name = "train_rl" - run_name = "example-rl-easy" - n_seeds = 2 + run_name = "rl_tuning" + # n_seeds = 2 + base_named_configs = ["common.wandb_logging", "seals_half_cheetah"] + base_config_updates = { + "common": { + "wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}, + "num_vec": 1, + }, + } search_space = { - "named_configs": tune.grid_search([[env] for env in EASY_ENVS]), + # "named_configs": tune.choice([[env] for env in EASY_ENVS]), "config_updates": { "rl": { + "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]), "rl_kwargs": { - "learning_rate": tune.grid_search(np.logspace(3e-6, 1e-1, num=3)), - "nminibatches": tune.grid_search([16, 32, 64]), + "learning_rate": tune.loguniform(1e-5, 1e-2), + "batch_size": tune.choice([64, 128, 256, 512]), + "n_epochs": tune.choice([5, 10, 20]), }, }, }, } - resources_per_trial = dict(cpu=4) + num_samples = 100 + eval_best_trial = True + eval_trial_seeds = 5 + repeat = 1 + resources_per_trial = dict(cpu=1) @parallel_ex.named_config -def example_gail_easy(): +def example_bc(): + sacred_ex_name = "train_imitation" + run_name = "bc_tuning_hc" + base_named_configs = ["common.wandb_logging", "seals_half_cheetah"] + base_config_updates = { + # "common": {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}}, + "common": {"num_vec": 1}, + } + search_space = { + "config_updates": { + "bc_kwargs": dict( + batch_size=tune.choice([8, 16, 32, 64]), + l2_weight=tune.loguniform(1e-6, 1e-2), # L2 regularization weight + 
optimizer_kwargs=dict( + lr=tune.loguniform(1e-5, 1e-2), + ), + ), + "bc_train_kwargs": dict( + n_epochs=tune.choice([1, 5, 10, 20]), + ), + }, + "command_name": "bc", + } + num_samples = 64 + eval_best_trial = True + eval_trial_seeds = 5 + repeat = 3 + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def example_dagger(): + sacred_ex_name = "train_imitation" + run_name = "dagger_tuning_hc" + base_named_configs = ["common.wandb_logging", "seals_half_cheetah"] + base_config_updates = { + # "common": {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}}, + "common": {"num_vec": 1}, + "dagger": {"total_timesteps": 1e5}, + "bc_kwargs": { + "batch_size": 16, + "l2_weight": 1e-4, + "optimizer_kwargs": {"lr": 1e-3}, + }, + } + search_space = { + "config_updates": { + "bc_train_kwargs": dict( + n_epochs=tune.choice([1, 5, 10]), + ), + "dagger": dict( + beta_schedule=tune.choice( + [LinearBetaSchedule(i) for i in [1, 5, 15]] + + [ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], + ), + rollout_round_min_episodes=tune.choice([3, 5, 10]), + ), + }, + "command_name": "dagger", + } + num_samples = 50 + repeat = 3 + eval_best_trial = True + eval_trial_seeds = 5 + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def example_gail(): sacred_ex_name = "train_adversarial" - run_name = "example-gail-easy" - n_seeds = 1 + run_name = "gail_tuning_hc" + base_named_configs = ["common.wandb_logging"] + base_config_updates = { + "common": {"num_vec": 1}, + "total_timesteps": 1e7, + } search_space = { - "named_configs": tune.grid_search([[env] for env in EASY_ENVS]), + # "named_configs": tune.choice([[env] for env in MY_ENVS]), "config_updates": { - "init_trainer_kwargs": { - "rl": { - "rl_kwargs": { - "learning_rate": tune.grid_search( - np.logspace(3e-6, 1e-1, num=3), - ), - "nminibatches": tune.grid_search([16, 32, 64]), - }, + "algorithm_kwargs": dict( + demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), + n_disc_updates_per_round=tune.choice([8, 16]), + # both are same as rl.batch_size + # gen_replay_buffer_capacity=tune.choice([512, 1024]), + # gen_train_timesteps=0, + ), + "rl": { + "batch_size": tune.choice([4096, 8192, 16384]), + "rl_kwargs": { + "ent_coef": tune.loguniform(1e-7, 1e-3), + "learning_rate": tune.loguniform(1e-5, 1e-2), }, }, + "algorithm_specific": {}, }, + "command_name": "gail", + } + num_samples = 100 + eval_best_trial = True + eval_trial_seeds = 5 + repeat = 3 + # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def example_airl(): + sacred_ex_name = "train_adversarial" + run_name = "airl_tuning_hc" + # n_seeds = 1 + base_named_configs = ["common.wandb_logging"] + base_config_updates = { + "common": {"num_vec": 1}, + "total_timesteps": 1e7, } search_space = { - "command_name": "gail", + # "named_configs": tune.choice([[env] for env in MY_ENVS]), + "config_updates": { + "algorithm_kwargs": dict( + demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), + n_disc_updates_per_round=tune.choice([8, 16]), + # both are same as rl.batch_size + # gen_replay_buffer_capacity=tune.choice([512, 1024]), + # gen_train_timesteps=0, + ), + "rl": { + "batch_size": tune.choice([4096, 8192, 16384]), + "rl_kwargs": { + "ent_coef": tune.loguniform(1e-7, 1e-3), + "learning_rate": tune.loguniform(1e-5, 1e-2), + }, + }, + "algorithm_specific": {}, + }, + "command_name": "airl", + } + num_samples = 100 + eval_best_trial = True + eval_trial_seeds = 5 + repeat = 3 + # 
experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def example_pc(): + sacred_ex_name = "train_preference_comparisons" + run_name = "pc_tuning" + base_named_configs = ["common.wandb_logging", "seals_half_cheetah"] + base_config_updates = { + "common": {"num_vec": 1}, + "total_timesteps": 2e7, + "total_comparisons": 5000, + "query_schedule": "hyperbolic", + "gatherer_kwargs": {"sample": True}, + } + search_space = { + "named_configs": tune.choice( + [ + ["reward.normalize_output_disable"], + # ["reward.normalize_output_running"], + # ["reward.normalize_output_ema"], + ], + ), + "config_updates": { + "train": { + "policy_kwargs": { + "activation_fn": tune.choice( + [ + nn.ReLU, + # nn.Tanh, + ], + ), + }, + }, + "num_iterations": tune.choice([25, 50]), + # "initial_comparison_frac": tune.choice([0.1, 0.25]), + # "reward_trainer_kwargs": { + # "epochs": tune.choice([1, 3, 6]), + # }, + # "query_schedule": tune.choice( + # ["constant", "hyperbolic", "inverse_quadratic"], + # ), + "rl": { + "batch_size": tune.choice([512, 2048, 8192]), + "rl_kwargs": { + "learning_rate": tune.loguniform(1e-5, 1e-2), + "ent_coef": tune.loguniform(1e-7, 1e-3), + }, + }, + }, } + num_samples = 24 + eval_best_trial = True + eval_trial_seeds = 5 + repeat = 3 + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def debug_eval(): + sacred_ex_name = "train_preference_comparisons" + run_name = "debug_eval" + eval_trial_seeds = 2 + eval_best_trial = True + # base_named_configs = ["seals_half_cheetah"] + base_config_updates = { + "common": {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}}, + "total_timesteps": 30, + "total_comparisons": 10, + # "query_schedule": "hyperbolic", + "num_iterations": 1, + "fragment_length": 2, + } + search_space = { + # "named_configs": tune.choice([[env] for env in MY_ENVS]), + "config_updates": { + # "num_iterations": tune.choice([5, 20, 50]), + "initial_comparison_frac": tune.choice([0.1, 0.2]), + # "reward_trainer_kwargs": { + # "epochs": tune.choice([1, 2, 3]), + # }, + # "query_schedule": tune.choice( + # ["constant", "hyperbolic", "inverse_quadratic"], + # ), + }, + } + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def debug_eval_adv(): + sacred_ex_name = "train_adversarial" + run_name = "airl_tuning_debug" + # n_seeds = 5 + base_named_configs = [] + eval_best_trial = True + eval_trial_seeds = 2 + base_config_updates = { + "common": { + "wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}, + # "num_env": 1, + }, + "total_timesteps": 2048, + } + search_space = { + # "named_configs": tune.choice([[env] for env in MY_ENVS]), + "config_updates": { + "algorithm_kwargs": dict( + # demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), + n_disc_updates_per_round=tune.choice([1, 2]), + # both are same as rl.batch_size + # gen_replay_buffer_capacity=tune.choice([512, 1024]), + # gen_train_timesteps=0, + ), + "rl": { + "batch_size": 8, + # "rl_kwargs": { + # "ent_coef": tune.choice([0, 1e-3, 1e-1]), + # "learning_rate": tune.loguniform(1e-5, 5e-3), + # }, + }, + "algorithm_specific": dict(demo_batch_size=1), + }, + "command_name": "airl", + } + num_samples = 2 + repeat = 2 + resources_per_trial = dict(cpu=8) + + +@parallel_ex.named_config +def debug_airl(): + sacred_ex_name = "train_adversarial" + run_name = "airl_debug" + # n_seeds = 1 + base_named_configs = ["common.wandb_logging", "seals_walker"] + base_config_updates = { + "common": {"num_vec": 
8}, + "total_timesteps": 1e7, + } + search_space = { + # "named_configs": tune.choice([[env] for env in MY_ENVS]), + "config_updates": { + "train": { + "policy_kwargs": { + "activation_fn": tune.choice( + [ + nn.ReLU, + # nn.Tanh, + ], + ), + }, + }, + "algorithm_kwargs": dict( + demo_batch_size=tune.choice([32]), + n_disc_updates_per_round=tune.choice([10]), + # both are same as rl.batch_size + # gen_replay_buffer_capacity=tune.choice([512, 1024]), + # gen_train_timesteps=0, + ), + "rl": { + "batch_size": tune.choice([10000]), + "rl_kwargs": { + "ent_coef": tune.choice([0.1]), + "learning_rate": tune.choice([1e-4]), + }, + }, + "algorithm_specific": {}, + }, + "command_name": "airl", + } + num_samples = 1 + eval_best_trial = False + # eval_trial_seeds = 5 + repeat = 5 + # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" + resources_per_trial = dict(cpu=8) + + +# @parallel_ex.config_hook +# def config_hook(config, command_name, logger): +# """Sets env.""" +# del command_name, logger +# res = {} +# print(config) +# if config["env"]: +# res["base_named_configs"] = tuple( +# config["base_named_configs"] + [config["env"]] +# ) +# print(res) +# return res diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py index aae3baeb0..bd9df6287 100644 --- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -1,6 +1,7 @@ """Configuration for imitation.scripts.train_adversarial.""" import sacred +from torch import nn from imitation.rewards import reward_nets from imitation.scripts.ingredients import demonstrations, environment, expert @@ -98,9 +99,25 @@ def pendulum(): @train_adversarial_ex.named_config def seals_ant(): - locals().update(**MUJOCO_SHARED_LOCALS) - locals().update(**ANT_SHARED_LOCALS) + # locals().update(**MUJOCO_SHARED_LOCALS) + # locals().update(**ANT_SHARED_LOCALS) environment = dict(gym_id="seals/Ant-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=16, + clip_range=0.3, + ent_coef=3.1441389214159857e-06, + gae_lambda=0.8, + gamma=0.995, + learning_rate=0.00017959211641976886, + max_grad_norm=0.9, + n_epochs=10, + # policy_kwargs are same as the defaults + vf_coef=0.4351450387648799, + ), + ) CHEETAH_SHARED_LOCALS = dict( @@ -139,40 +156,145 @@ def half_cheetah(): @train_adversarial_ex.named_config def seals_half_cheetah(): - locals().update(**CHEETAH_SHARED_LOCALS) + # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/HalfCheetah-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + rl = dict( + batch_size=512, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=3.794797423594763e-06, + gae_lambda=0.95, + gamma=0.95, + learning_rate=0.0003286871805949382, + max_grad_norm=0.8, + n_epochs=5, + vf_coef=0.11483689492120866, + ), + ) + # algorithm_specific = dict( + # airl=dict(total_timesteps=int(5e6)), + # gail=dict(total_timesteps=int(8e6)), + # ) + # reward = dict( + # algorithm_specific=dict( + # airl=dict( + # net_cls=reward_nets.BasicShapedRewardNet, + # net_kwargs=dict( + # reward_hid_sizes=(32,), + # potential_hid_sizes=(32,), + # ), + # ), + # ), + # ) + algorithm_kwargs = dict( + # Number of discriminator updates after each round of generator updates + n_disc_updates_per_round=16, + # Equivalent to no replay buffer if batch size is the same + gen_replay_buffer_capacity=512, + demo_batch_size=8192, + ) 
@train_adversarial_ex.named_config def seals_hopper(): - locals().update(**MUJOCO_SHARED_LOCALS) + # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=512, + clip_range=0.1, + ent_coef=0.0010159833764878474, + gae_lambda=0.98, + gamma=0.995, + learning_rate=0.0003904770450788824, + max_grad_norm=0.9, + n_epochs=20, + vf_coef=0.20315938606555833, + ), + ) @train_adversarial_ex.named_config -def seals_humanoid(): - locals().update(**MUJOCO_SHARED_LOCALS) - environment = dict(gym_id="seals/Humanoid-v0") - total_timesteps = int(4e6) +def seals_swimmer(): + # locals().update(**MUJOCO_SHARED_LOCALS) + environment = dict(gym_id="seals/Swimmer-v0") + total_timesteps = int(2e6) + demonstrations = dict(rollout_type="ppo-huggingface") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=5.167107294612664e-08, + gae_lambda=0.95, + gamma=0.999, + learning_rate=0.000414936134792374, + max_grad_norm=2, + n_epochs=5, + # policy_kwargs are same as the defaults + vf_coef=0.6162112311062333, + ), + ) @train_adversarial_ex.named_config -def reacher(): - environment = dict(gym_id="Reacher-v2") - algorithm_kwargs = {"allow_variable_horizon": True} +def seals_walker(): + # locals().update(**MUJOCO_SHARED_LOCALS) + environment = dict(gym_id="seals/Walker2d-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=8192, + rl_kwargs=dict( + batch_size=128, + clip_range=0.4, + ent_coef=0.00013057334805552262, + gae_lambda=0.92, + gamma=0.98, + learning_rate=0.000138575372312869, + max_grad_norm=0.6, + n_epochs=20, + # policy_kwargs are same as the defaults + vf_coef=0.6167177795726859, + ), + ) @train_adversarial_ex.named_config -def seals_swimmer(): +def seals_humanoid(): locals().update(**MUJOCO_SHARED_LOCALS) - environment = dict(gym_id="seals/Swimmer-v0") - total_timesteps = int(2e6) + environment = dict(gym_id="seals/Humanoid-v0") + total_timesteps = int(4e6) @train_adversarial_ex.named_config -def seals_walker(): - locals().update(**MUJOCO_SHARED_LOCALS) - environment = dict(gym_id="seals/Walker2d-v0") +def reacher(): + environment = dict(gym_id="Reacher-v2") + algorithm_kwargs = {"allow_variable_horizon": True} # Debug configs @@ -189,3 +311,22 @@ def fast(): demo_batch_size=1, n_disc_updates_per_round=4, ) + + +@train_adversarial_ex.named_config +def debug_nans(): + environment = {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}} + total_timesteps = 1e7 + algorithm_kwargs = dict( + demo_batch_size=128, + n_disc_updates_per_round=8, + # both are same as rl.batch_size + # gen_replay_buffer_capacity=tune.choice([512, 1024]), + # gen_train_timesteps=0, + ) + rl = { + "batch_size": 4096, + "rl_kwargs": {"ent_coef": 0.1, "learning_rate": 7.316377404994506e-05}, + } + seed = 0 + checkpoint_interval = 1 diff --git a/src/imitation/scripts/config/train_imitation.py b/src/imitation/scripts/config/train_imitation.py index 16da9c694..23e24ec0b 100644 --- 
a/src/imitation/scripts/config/train_imitation.py +++ b/src/imitation/scripts/config/train_imitation.py @@ -38,6 +38,7 @@ def config(): dagger = dict( use_offline_rollouts=False, # warm-start policy with BC from offline demos total_timesteps=1e5, + rollout_round_min_episodes=None, # use default value ) agent_path = None # Path to load agent from, optional. @@ -81,6 +82,8 @@ def ant(): @train_imitation_ex.named_config def seals_ant(): environment = dict(gym_id="seals/Ant-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + expert = {"policy_type": "ppo-huggingface"} @train_imitation_ex.named_config @@ -95,6 +98,29 @@ def seals_half_cheetah(): environment = dict(gym_id="seals/HalfCheetah-v0") bc_kwargs = dict(l2_weight=0.0) dagger = dict(total_timesteps=60000) + demonstrations = dict(rollout_type="ppo-huggingface") + expert = {"policy_type": "ppo-huggingface"} + + +@train_imitation_ex.named_config +def seals_hopper(): + environment = dict(gym_id="seals/Hopper-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + expert = {"policy_type": "ppo-huggingface"} + + +@train_imitation_ex.named_config +def seals_swimmer(): + environment = dict(gym_id="seals/Swimmer-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + expert = {"policy_type": "ppo-huggingface"} + + +@train_imitation_ex.named_config +def seals_walker(): + environment = dict(gym_id="seals/Walker2d-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + expert = {"policy_type": "ppo-huggingface"} @train_imitation_ex.named_config diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index cf25f4783..d12869bf0 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -1,6 +1,7 @@ """Configuration for imitation.scripts.train_preference_comparisons.""" import sacred +from torch import nn from imitation.algorithms import preference_comparisons from imitation.scripts.ingredients import environment @@ -72,9 +73,24 @@ def cartpole(): @train_preference_comparisons_ex.named_config def seals_ant(): - locals().update(**MUJOCO_SHARED_LOCALS) - locals().update(**ANT_SHARED_LOCALS) + # locals().update(**MUJOCO_SHARED_LOCALS) + # locals().update(**ANT_SHARED_LOCALS) environment = dict(gym_id="seals/Ant-v0") + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=16, + clip_range=0.3, + ent_coef=3.1441389214159857e-06, + gae_lambda=0.8, + gamma=0.995, + learning_rate=0.00017959211641976886, + max_grad_norm=0.9, + n_epochs=10, + # policy_kwargs are same as the defaults + vf_coef=0.4351450387648799, + ), + ) @train_preference_comparisons_ex.named_config @@ -84,10 +100,116 @@ def half_cheetah(): rl = dict(batch_size=16384, rl_kwargs=dict(batch_size=1024)) +@train_preference_comparisons_ex.named_config +def seals_half_cheetah(): + # locals().update(**MUJOCO_SHARED_LOCALS) + environment = dict(gym_id="seals/HalfCheetah-v0") + rl = dict( + batch_size=512, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=3.794797423594763e-06, + gae_lambda=0.95, + gamma=0.95, + learning_rate=0.0003286871805949382, + max_grad_norm=0.8, + n_epochs=5, + vf_coef=0.11483689492120866, + ), + ) + num_iterations = 50 + total_timesteps = 20000000 + # train = dict( + # policy_cls="MlpPolicy", + # policy_kwargs=dict( + # activation_fn=nn.ReLU, + # # net_arch=[dict(pi=[64, 64], vf=[64, 64])], + # ), + # ) + + @train_preference_comparisons_ex.named_config def 
seals_hopper(): - locals().update(**MUJOCO_SHARED_LOCALS) + # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=512, + clip_range=0.1, + ent_coef=0.0010159833764878474, + gae_lambda=0.98, + gamma=0.995, + learning_rate=0.0003904770450788824, + max_grad_norm=0.9, + n_epochs=20, + vf_coef=0.20315938606555833, + ), + ) + + +@train_preference_comparisons_ex.named_config +def seals_swimmer(): + # locals().update(**MUJOCO_SHARED_LOCALS) + environment = dict(gym_id="seals/Swimmer-v0") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=5.167107294612664e-08, + gae_lambda=0.95, + gamma=0.999, + learning_rate=0.000414936134792374, + max_grad_norm=2, + n_epochs=5, + # policy_kwargs are same as the defaults + vf_coef=0.6162112311062333, + ), + ) + + +@train_preference_comparisons_ex.named_config +def seals_walker(): + # locals().update(**MUJOCO_SHARED_LOCALS) + environment = dict(gym_id="seals/Walker2d-v0") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=8192, + rl_kwargs=dict( + batch_size=128, + clip_range=0.4, + ent_coef=0.00013057334805552262, + gae_lambda=0.92, + gamma=0.98, + learning_rate=0.000138575372312869, + max_grad_norm=0.6, + n_epochs=20, + # policy_kwargs are same as the defaults + vf_coef=0.6167177795726859, + ), + ) @train_preference_comparisons_ex.named_config diff --git a/src/imitation/scripts/config/train_rl.py b/src/imitation/scripts/config/train_rl.py index 6d48f8695..9df2581a6 100644 --- a/src/imitation/scripts/config/train_rl.py +++ b/src/imitation/scripts/config/train_rl.py @@ -1,6 +1,8 @@ """Configuration settings for train_rl, training a policy with RL.""" + import sacred +from torch import nn from imitation.scripts.ingredients import environment from imitation.scripts.ingredients import logging as logging_ingredient @@ -70,8 +72,30 @@ def cartpole(): @train_rl_ex.named_config def seals_cartpole(): - environment = dict(gym_id="seals/CartPole-v0") - total_timesteps = int(1e6) + environment = dict(gym_id="seals/CartPole-v0", num_vec=8) + total_timesteps = int(1e5) + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + normalize_reward = False + rl = dict( + batch_size=4096, + rl_kwargs=dict( + batch_size=256, + clip_range=0.4, + ent_coef=0.008508727919228772, + gae_lambda=0.9, + gamma=0.9999, + learning_rate=0.0012403278189645594, + max_grad_norm=0.8, + n_epochs=10, + vf_coef=0.489343896591493, + ), + ) @train_rl_ex.named_config @@ -80,9 +104,69 @@ def half_cheetah(): total_timesteps = int(5e6) # does OK after 1e6, but continues improving +@train_rl_ex.named_config +def seals_half_cheetah(): + environment = dict( + gym_id="seals/HalfCheetah-v0", + num_vec=1, + ) + + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.Tanh, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + # total_timesteps = int(5e6) # does OK after 1e6, but continues improving + total_timesteps = 1e6 + normalize_reward = False + + rl = dict( + 
batch_size=512, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=3.794797423594763e-06, + gae_lambda=0.95, + gamma=0.95, + learning_rate=0.0003286871805949382, + max_grad_norm=0.8, + n_epochs=5, + vf_coef=0.11483689492120866, + ), + ) + + @train_rl_ex.named_config def seals_hopper(): - environment = dict(gym_id="seals/Hopper-v0") + environment = dict(gym_id="seals/Hopper-v0", num_vec=1) + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + + total_timesteps = 1e6 + normalize_reward = False + + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=512, + clip_range=0.1, + ent_coef=0.0010159833764878474, + gae_lambda=0.98, + gamma=0.995, + learning_rate=0.0003904770450788824, + max_grad_norm=0.9, + n_epochs=20, + # policy_kwargs are same as the defaults + vf_coef=0.20315938606555833, + ), + ) @train_rl_ex.named_config @@ -104,15 +188,34 @@ def seals_mountain_car(): @train_rl_ex.named_config def pendulum(): - environment = dict(gym_id="Pendulum-v1") + environment = dict(gym_id="Pendulum-v1", num_vec=4) + total_timesteps = int(1e5) + + train = dict( + policy_cls="MlpPolicy", + # policy_kwargs=dict( + # activation_fn=nn.Tanh, + # net_arch=[dict(pi=[64, 64], vf=[64, 64])], + # ), + ) + normalize_reward = False + rl = dict( - batch_size=4096, + batch_size=1024 * 4, rl_kwargs=dict( + gae_lambda=0.95, gamma=0.9, + n_epochs=10, + ent_coef=0.0, learning_rate=1e-3, + clip_range=0.2, + use_sde=True, + sde_sample_freq=4, + # batch_size=64, + # max_grad_norm=0.8, + # vf_coef=0.11483689492120866, ), ) - total_timesteps = int(2e5) @train_rl_ex.named_config @@ -122,17 +225,99 @@ def reacher(): @train_rl_ex.named_config def seals_ant(): - environment = dict(gym_id="seals/Ant-v0") + environment = dict( + gym_id="seals/Ant-v0", + num_vec=1, + ) + + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.Tanh, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + + total_timesteps = 1e6 + normalize_reward = False + + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=16, + clip_range=0.3, + ent_coef=3.1441389214159857e-06, + gae_lambda=0.8, + gamma=0.995, + learning_rate=0.00017959211641976886, + max_grad_norm=0.9, + n_epochs=10, + # policy_kwargs are same as the defaults + vf_coef=0.4351450387648799, + ), + ) @train_rl_ex.named_config def seals_swimmer(): - environment = dict(gym_id="seals/Swimmer-v0") + environment = dict(gym_id="seals/Swimmer-v0", num_vec=1) + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + + total_timesteps = 1e6 + normalize_reward = False + + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=5.167107294612664e-08, + gae_lambda=0.95, + gamma=0.999, + learning_rate=0.000414936134792374, + max_grad_norm=2, + n_epochs=5, + # policy_kwargs are same as the defaults + vf_coef=0.6162112311062333, + ), + ) @train_rl_ex.named_config def seals_walker(): - environment = dict(gym_id="seals/Walker2d-v0") + environment = dict(gym_id="seals/Walker2d-v0", num_vec=1) + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + + total_timesteps = 1e6 + normalize_reward = False + + rl = dict( + batch_size=8192, + rl_kwargs=dict( + batch_size=128, + clip_range=0.4, + ent_coef=0.00013057334805552262, + gae_lambda=0.92, + gamma=0.98, + 
learning_rate=0.000138575372312869, + max_grad_norm=0.6, + n_epochs=20, + # policy_kwargs are same as the defaults + vf_coef=0.6167177795726859, + ), + ) # Debug configs diff --git a/src/imitation/scripts/ingredients/reward.py b/src/imitation/scripts/ingredients/reward.py index c40d3751f..a4bd98d1f 100644 --- a/src/imitation/scripts/ingredients/reward.py +++ b/src/imitation/scripts/ingredients/reward.py @@ -46,6 +46,11 @@ def normalize_output_running(): normalize_output_layer = networks.RunningNorm # noqa: F841 +@reward_ingredient.named_config +def normalize_output_ema(): + normalize_output_layer = networks.EMANorm # noqa: F841 + + @reward_ingredient.named_config def reward_ensemble(): net_cls = reward_nets.RewardEnsemble diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 6014a08b6..c196954d1 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -2,12 +2,18 @@ import collections.abc import copy +import glob import pathlib -from typing import Any, Callable, Dict, Mapping, Optional, Sequence +from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union +import numpy as np import ray import ray.tune import sacred +from pandas.api.types import is_object_dtype +from ray.tune import search +from ray.tune.registry import register_trainable +from ray.tune.search import optuna from sacred.observers import FileStorageObserver from imitation.scripts.config.parallel import parallel_ex @@ -17,6 +23,7 @@ def parallel( sacred_ex_name: str, run_name: str, + num_samples: int, search_space: Mapping[str, Any], base_named_configs: Sequence[str], base_config_updates: Mapping[str, Any], @@ -24,6 +31,12 @@ def parallel( init_kwargs: Mapping[str, Any], local_dir: Optional[str], upload_dir: Optional[str], + repeat: int = 3, + eval_best_trial: bool = False, + eval_trial_seeds: int = 5, + experiment_checkpoint_path: str = "", + syncer=None, + resume: Union[str, bool] = False, ) -> None: """Parallelize multiple runs of another Sacred Experiment using Ray Tune. @@ -40,6 +53,7 @@ def parallel( under the 'experiment.name' key. This is equivalent to using the Sacred CLI '--name' option on the inner experiment. Offline analysis jobs can use this argument to group similar data. + num_samples: Number of times to sample from the hyperparameter space. search_space: A dictionary which can contain Ray Tune search objects like `ray.tune.grid_search` and `ray.tune.sample_from`, and is passed as the `config` argument to `ray.tune.run()`. After the @@ -62,6 +76,19 @@ def parallel( init_kwargs: Arguments to pass to `ray.init`. local_dir: `local_dir` argument to `ray.tune.run()`. upload_dir: `upload_dir` argument to `ray.tune.run()`. + repeat: Number of runs to repeat each trial for. + eval_best_trial: Whether to evaluate the trial with the best mean return + at the end of tuning on a different set of seeds. + eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. + experiment_checkpoint_path: Path containing the checkpoints of a previous + experiment. ran using this script. Useful for resuming cancelled trials + of the experiments (using `resume`) or evaluating the best trial of the + experiment (using `eval_best_trial`). + resume: If true and `experiment_checkpoint_path` is given, then resumes the + experiment by restarting the trials that did not finish in the experiment + checkpoint path. + syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. 
+ Raises: TypeError: Named configs not string sequences or config updates not mappings. @@ -73,8 +100,8 @@ def parallel( if not isinstance(base_config_updates, collections.abc.Mapping): raise TypeError("base_config_updates must be a Mapping") - if not isinstance(search_space["named_configs"], collections.abc.Sequence): - raise TypeError('search_space["named_configs"] must be a Sequence') + # if not isinstance(search_space["named_configs"], collections.abc.Sequence): + # raise TypeError('search_space["named_configs"] must be a Sequence') if not isinstance(search_space["config_updates"], collections.abc.Mapping): raise TypeError('search_space["config_updates"] must be a Mapping') @@ -95,15 +122,104 @@ def parallel( ) ray.init(**init_kwargs) + search_alg = optuna.OptunaSearch() + search_alg = search.Repeater(search_alg, repeat=repeat) try: - ray.tune.run( - trainable, - config=search_space, - name=run_name, - local_dir=local_dir, - resources_per_trial=resources_per_trial, - sync_config=ray.tune.syncer.SyncConfig(upload_dir=upload_dir), + if experiment_checkpoint_path: + if resume: + register_trainable("inner", trainable) + runner = ray.tune.execution.trial_runner.TrialRunner( + local_checkpoint_dir=experiment_checkpoint_path, + sync_config=ray.tune.syncer.SyncConfig( + upload_dir=upload_dir, + syncer=syncer, + ), + metric="mean_return", + resume=resume, + ) + print( + "Live trials:", len(runner._live_trials), "/", len(runner._trials) + ) + while not runner.is_finished(): + runner.step() + print("Debug:", runner.debug_string()) + + result = ray.tune.ExperimentAnalysis(experiment_checkpoint_path) + result._load_checkpoints_from_latest( + glob.glob(experiment_checkpoint_path + "/experiment_state*.json"), + ) + result.trials = None + result.fetch_trial_dataframes() + else: + result = ray.tune.run( + trainable, + config=search_space, + num_samples=num_samples * repeat, + name=run_name, + local_dir=local_dir, + resources_per_trial=resources_per_trial, + sync_config=ray.tune.syncer.SyncConfig( + upload_dir=upload_dir, + syncer=syncer, + ), + search_alg=search_alg, + metric="mean_return", + mode="max", + ) + + key = ( + "rollout/" + if sacred_ex_name == "train_preference_comparisons" + else "" + if sacred_ex_name == "train_rl" + else "imit_stats/" ) + key += "monitor_return_mean" + if eval_best_trial: + df = result.results_df + df = df[df["config/named_configs"].notna()] + for col in df.columns: + if is_object_dtype(df[col]): + df[col] = df[col].astype("str") + + grp_keys = [ + c for c in df.columns if c.startswith("config") and "seed" not in c + ] + grps = df.groupby(grp_keys) + print(grps[key]) + df["mean_return"] = grps[key].transform(lambda x: x.mean()) + best_config_df = df[df["mean_return"] == df["mean_return"].max()] + envs_processed = set() + for i, row in best_config_df.iterrows(): + tag = row["experiment_tag"] + trial = [t for t in result.trials if tag in t.experiment_tag][0] + best_config = trial.config + env = tuple(best_config["named_configs"]) + if env in envs_processed: + continue + envs_processed.add(env) + print("Named configs:", env) + print("Mean return:", row["mean_return"]) + print("All returns:", df[df["mean_return"] == row["mean_return"]][key]) + print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) + best_config["config_updates"].update( + seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), + ) + resources_per_trial = {k: 2 * v for k, v in resources_per_trial.items()} + eval_result = ray.tune.run( + trainable, + config={ + "named_configs": 
best_config["named_configs"], + "config_updates": best_config["config_updates"], + "command_name": best_config.get("command_name", None), + }, + name=run_name + "_best_hp_eval", + resources_per_trial=resources_per_trial, + ) + returns = eval_result.results_df["mean_return"].to_numpy() + print("Returns:", returns) + print(np.mean(returns), np.std(returns)) + finally: ray.shutdown() @@ -148,7 +264,7 @@ def _ray_tune_sacred_wrapper( `ex.run`) and `reporter`. The function returns the run result. """ - def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]: + def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: """Trainable function with the correct signature for `ray.tune`. Args: @@ -169,11 +285,17 @@ def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]: # Import inside function rather than in module because Sacred experiments # are not picklable, and Ray requires this function to be picklable. from imitation.scripts.train_adversarial import train_adversarial_ex + from imitation.scripts.train_imitation import train_imitation_ex + from imitation.scripts.train_preference_comparisons import ( + train_preference_comparisons_ex, + ) from imitation.scripts.train_rl import train_rl_ex experiments = { "train_rl": train_rl_ex, "train_adversarial": train_adversarial_ex, + "train_imitation": train_imitation_ex, + "train_preference_comparisons": train_preference_comparisons_ex, } ex = experiments[sacred_ex_name] @@ -181,22 +303,28 @@ def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]: named_configs = base_named_configs + run_kwargs["named_configs"] updated_run_kwargs["named_configs"] = named_configs - config_updates = {**base_config_updates, **run_kwargs["config_updates"]} + config_updates: Mapping[str, Any] = {} + config_updates.update(base_config_updates) + config_updates.update(run_kwargs["config_updates"]) + if "__trial_index__" in run_kwargs: + config_updates.update(seed=run_kwargs.pop("__trial_index__")) updated_run_kwargs["config_updates"] = config_updates # Add other run_kwargs items to updated_run_kwargs. for k, v in run_kwargs.items(): if k not in updated_run_kwargs: updated_run_kwargs[k] = v - - run = ex.run( - **updated_run_kwargs, - options={"--run": run_name, "--file_storage": "sacred"}, - ) - + run = ex.run(**updated_run_kwargs, options={"--run": run_name}) # Ray Tune has a string formatting error if raylet completes without # any calls to `reporter`. 
- reporter(done=True) + # reporter(done=True) + # if sacred_ex_name == "train_preference_comparisons": + # #reporter(mean_return=run.result["rollout"]["monitor_return_mean"]) + # #ray.tune.report(mean_return=run.result["rollout"]["monitor_return_mean"]) + # ray.tune.report(mean_return=234) + # else: + # # reporter(mean_return=run.result["imit_stats"]["monitor_return_mean"]) + # ray.tune.report(mean_return=run.result["imit_stats"]["monitor_return_mean"]) assert run.status == "COMPLETED" return run.result diff --git a/src/imitation/scripts/train_adversarial.py b/src/imitation/scripts/train_adversarial.py index 71fc0c2c9..58f7fb4c4 100644 --- a/src/imitation/scripts/train_adversarial.py +++ b/src/imitation/scripts/train_adversarial.py @@ -162,6 +162,7 @@ def callback(round_num: int, /) -> None: return { "imit_stats": imit_stats, "expert_stats": rollout.rollout_stats(expert_trajs), + "mean_return": imit_stats["monitor_return_mean"], } diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index 2b4946668..c5673fa3e 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -125,10 +125,12 @@ def train_imitation( expert_policy=expert_policy, custom_logger=custom_logger, bc_trainer=bc_trainer, + beta_schedule=dagger["beta_schedule"], rng=_rnd, ) model.train( total_timesteps=int(dagger["total_timesteps"]), + rollout_round_min_episodes=dagger["rollout_round_min_episodes"], bc_train_kwargs=bc_train_kwargs, ) # TODO(adam): add checkpointing to DAgger? @@ -141,7 +143,7 @@ def train_imitation( imit_stats = train.eval_policy(imit_policy, venv) - stats = {"imit_stats": imit_stats} + stats = {"imit_stats": imit_stats, "mean_return": imit_stats["monitor_return_mean"]} trajectories = model._all_demos if use_dagger else expert_trajs assert trajectories is not None if all(isinstance(t, types.TrajectoryWithRew) for t in trajectories): diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index e1aab27ff..1daa306af 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -268,6 +268,7 @@ def save_callback(iteration_num): if bool(trajectory_path is None): results = dict(results) results["rollout"] = train.eval_policy(agent, venv) + results["mean_return"] = results["rollout"]["monitor_return_mean"] if save_preferences: main_trainer.dataset.save(log_dir / "preferences.pkl") diff --git a/src/imitation/scripts/train_rl.py b/src/imitation/scripts/train_rl.py index fd345ca62..a88e6096a 100644 --- a/src/imitation/scripts/train_rl.py +++ b/src/imitation/scripts/train_rl.py @@ -157,7 +157,9 @@ def train_rl( serialize.save_stable_model(output_dir, rl_algo) # Final evaluation of expert policy. 
- return train.eval_policy(rl_algo, venv) + eval_stats = train.eval_policy(rl_algo, venv) + eval_stats["mean_return"] = eval_stats["monitor_return_mean"] + return eval_stats def main_console(): diff --git a/tests/algorithms/test_dagger.py b/tests/algorithms/test_dagger.py index 549e38fd2..6cc42bc78 100644 --- a/tests/algorithms/test_dagger.py +++ b/tests/algorithms/test_dagger.py @@ -33,7 +33,7 @@ def maybe_pendulum_expert_trajectories( return None -def test_beta_schedule(): +def test_linear_beta_schedule(): one_step_sched = dagger.LinearBetaSchedule(1) three_step_sched = dagger.LinearBetaSchedule(3) for i in range(10): @@ -41,6 +41,29 @@ def test_beta_schedule(): assert np.allclose(three_step_sched(i), (3 - i) / 3 if i <= 2 else 0) +def test_indicator_beta_schedule(): + one_step_sched = dagger.IndicatorBetaSchedule(1) + three_step_sched = dagger.IndicatorBetaSchedule(3) + for i in range(10): + assert np.allclose(one_step_sched(i), 1 if i == 0 else 0) + assert np.allclose(three_step_sched(i), 1 if i <= 2 else 0) + + +def test_exponential_beta_schedule(): + constant_sched = dagger.ExponentialBetaSchedule(1) + decay = 0.5 + decaying_sched = dagger.ExponentialBetaSchedule(decay) + for i in range(10): + assert np.allclose(constant_sched(i), 1) + assert np.allclose(decaying_sched(i), decay**i) + + with pytest.raises( + ValueError, + match=r"decay_probability lies outside the range \(0, 1\]\.", + ): + decaying_sched = dagger.ExponentialBetaSchedule(1.1) + + def test_traj_collector_seed(tmpdir, pendulum_venv, rng): collector = dagger.InteractiveTrajectoryCollector( venv=pendulum_venv, diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 2196b4af1..0a2766dbb 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -375,7 +375,10 @@ def bc_config(tmpdir, request): policy_type="ppo", loader_kwargs=dict(path=CARTPOLE_TEST_POLICY_PATH / "model.zip"), ), - expert_from_huggingface=dict(policy_type="ppo-huggingface"), + expert_from_huggingface=dict( + policy_type="ppo-huggingface", + loader_kwargs=dict(env_id="seals/CartPole-v0"), + ), random_expert=dict(policy_type="random"), zero_expert=dict(policy_type="zero"), )[request.param] @@ -403,7 +406,10 @@ def test_train_bc_warmstart(tmpdir): config_updates=dict( logging=dict(log_root=tmpdir), demonstrations=dict(rollout_path=CARTPOLE_TEST_ROLLOUT_PATH), - expert=dict(policy_type="ppo-huggingface"), + expert=dict( + policy_type="ppo-huggingface", + loader_kwargs=dict(env_id="seals/CartPole-v0"), + ), ), ) assert run.status == "COMPLETED" @@ -559,6 +565,27 @@ def test_train_adversarial(tmpdir, named_configs, command): _check_train_ex_result(run.result) +def test_train_adversarial_debug(): + """Smoke test for imitation.scripts.train_adversarial.""" + named_configs = ["seals_ant", "debug_nans"] + config_updates = { + "common": dict(log_root="/home/tf/imitation/debug", parallel=False), + "demonstrations": dict( + rollout_path="/home/tf/imitation/download/final.pkl", + ), + # TensorBoard logs to get extra coverage + # "algorithm_kwargs": dict(init_tensorboard=True), + "agent_path": "/home/tf/imitation/download/01124/gen_policy", + } + run = train_adversarial.train_adversarial_ex.run( + command_name="airl", + named_configs=named_configs, + config_updates=config_updates, + ) + assert run.status == "COMPLETED" + _check_train_ex_result(run.result) + + @pytest.mark.parametrize("command", ("airl", "gail")) def test_train_adversarial_warmstart(tmpdir, command): named_configs = ["cartpole"] + 
ALGO_FAST_CONFIGS["adversarial"] From 97bc063e72e6fc769222351d954f68be28cf761f Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 10 Jan 2023 15:56:14 +0530 Subject: [PATCH 02/54] Clean parallel script --- src/imitation/scripts/parallel.py | 54 +++++++++++++++++++------------ 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index c196954d1..da492804e 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -27,12 +27,13 @@ def parallel( search_space: Mapping[str, Any], base_named_configs: Sequence[str], base_config_updates: Mapping[str, Any], - resources_per_trial: Mapping[str, Any], + resources_per_trial: Dict[str, Any], init_kwargs: Mapping[str, Any], local_dir: Optional[str], upload_dir: Optional[str], repeat: int = 3, eval_best_trial: bool = False, + eval_best_trial_resource_multiplier: int = 2, eval_trial_seeds: int = 5, experiment_checkpoint_path: str = "", syncer=None, @@ -79,6 +80,8 @@ def parallel( repeat: Number of runs to repeat each trial for. eval_best_trial: Whether to evaluate the trial with the best mean return at the end of tuning on a different set of seeds. + eval_best_trial_resource_multiplier: factor by which to multiply the + number of cpus per trial in `resources_per_trial`. eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. experiment_checkpoint_path: Path containing the checkpoints of a previous experiment. ran using this script. Useful for resuming cancelled trials @@ -122,11 +125,11 @@ def parallel( ) ray.init(**init_kwargs) - search_alg = optuna.OptunaSearch() - search_alg = search.Repeater(search_alg, repeat=repeat) + search_alg = search.Repeater(optuna.OptunaSearch(), repeat=repeat) try: if experiment_checkpoint_path: if resume: + # restart failed runs from experiment_checkpoint_path register_trainable("inner", trainable) runner = ray.tune.execution.trial_runner.TrialRunner( local_checkpoint_dir=experiment_checkpoint_path, @@ -138,16 +141,21 @@ def parallel( resume=resume, ) print( - "Live trials:", len(runner._live_trials), "/", len(runner._trials) + "Live trials:", + len(runner._live_trials), + "/", + len(runner._trials), ) while not runner.is_finished(): runner.step() print("Debug:", runner.debug_string()) + # load experiment analysis results result = ray.tune.ExperimentAnalysis(experiment_checkpoint_path) result._load_checkpoints_from_latest( glob.glob(experiment_checkpoint_path + "/experiment_state*.json"), ) + # update result.trials using all the experiment_state json files result.trials = None result.fetch_trial_dataframes() else: @@ -167,45 +175,50 @@ def parallel( mode="max", ) - key = ( + key_prefix = ( "rollout/" if sacred_ex_name == "train_preference_comparisons" else "" if sacred_ex_name == "train_rl" else "imit_stats/" ) - key += "monitor_return_mean" + key = key_prefix + "monitor_return_mean" if eval_best_trial: df = result.results_df df = df[df["config/named_configs"].notna()] + # convert object dtype to str required by df.groupby for col in df.columns: if is_object_dtype(df[col]): df[col] = df[col].astype("str") - + # group into separate HP configs grp_keys = [ c for c in df.columns if c.startswith("config") and "seed" not in c ] grps = df.groupby(grp_keys) - print(grps[key]) + # store mean return of runs across all seeds in a group df["mean_return"] = grps[key].transform(lambda x: x.mean()) best_config_df = df[df["mean_return"] == df["mean_return"].max()] - envs_processed = set() 
- for i, row in best_config_df.iterrows(): - tag = row["experiment_tag"] - trial = [t for t in result.trials if tag in t.experiment_tag][0] + row = best_config_df.loc[0] + best_config_tag = row["experiment_tag"] + if result.trials is not None: + trial = [ + t for t in result.trials if best_config_tag in t.experiment_tag + ][0] best_config = trial.config - env = tuple(best_config["named_configs"]) - if env in envs_processed: - continue - envs_processed.add(env) - print("Named configs:", env) print("Mean return:", row["mean_return"]) print("All returns:", df[df["mean_return"] == row["mean_return"]][key]) print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) best_config["config_updates"].update( seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), ) - resources_per_trial = {k: 2 * v for k, v in resources_per_trial.items()} + # update cpus per trial only if it is provided in `resources_per_trial` + # Uses the default values (cpu=1) if it is not provided + if "cpu" in resources_per_trial: + resources_per_trial["cpu"] *= eval_best_trial_resource_multiplier + best_config["config_updates"].update( + environment=dict(num_vec=resources_per_trial["cpu"]), + ) + eval_result = ray.tune.run( trainable, config={ @@ -219,7 +232,6 @@ def parallel( returns = eval_result.results_df["mean_return"].to_numpy() print("Returns:", returns) print(np.mean(returns), np.std(returns)) - finally: ray.shutdown() @@ -229,7 +241,7 @@ def _ray_tune_sacred_wrapper( run_name: str, base_named_configs: list, base_config_updates: Mapping[str, Any], -) -> Callable[[Mapping[str, Any], Any], Mapping[str, Any]]: +) -> Callable[[Dict[str, Any], Any], Mapping[str, Any]]: """From an Experiment build a wrapped run function suitable for Ray Tune. `ray.tune.run(...)` expects a trainable function that takes a dict @@ -303,7 +315,7 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: named_configs = base_named_configs + run_kwargs["named_configs"] updated_run_kwargs["named_configs"] = named_configs - config_updates: Mapping[str, Any] = {} + config_updates: Dict[str, Any] = {} config_updates.update(base_config_updates) config_updates.update(run_kwargs["config_updates"]) if "__trial_index__" in run_kwargs: From 92912256816e51ce6e4266ac80ed990c6416493d Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 26 Jan 2023 15:18:04 +0100 Subject: [PATCH 03/54] Undo the changes from #653 to the dagger benchmark config files. This change just made some error messages go away indicating the missing imitation.algorithms.dagger.ExponentialBetaSchedule but it did not fix the root cause. 
--- benchmarking/example_dagger_seals_ant_best_hp_eval.json | 2 +- .../example_dagger_seals_half_cheetah_best_hp_eval.json | 2 +- benchmarking/example_dagger_seals_hopper_best_hp_eval.json | 2 +- benchmarking/example_dagger_seals_swimmer_best_hp_eval.json | 2 +- benchmarking/example_dagger_seals_walker_best_hp_eval.json | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarking/example_dagger_seals_ant_best_hp_eval.json b/benchmarking/example_dagger_seals_ant_best_hp_eval.json index 035beab83..38f3f504a 100644 --- a/benchmarking/example_dagger_seals_ant_best_hp_eval.json +++ b/benchmarking/example_dagger_seals_ant_best_hp_eval.json @@ -16,7 +16,7 @@ }, "dagger": { "beta_schedule": { - "py/type": "imitation.algorithms.dagger.LinearBetaSchedule", + "py/object": "imitation.algorithms.dagger.LinearBetaSchedule", "rampdown_rounds": 15 }, "rollout_round_min_episodes": 5, diff --git a/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json b/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json index 8961f8c26..708c92547 100644 --- a/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json +++ b/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json @@ -17,7 +17,7 @@ "dagger": { "beta_schedule": { "decay_probability": 0.7, - "py/type": "imitation.algorithms.dagger.ExponentialBetaSchedule" + "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule" }, "rollout_round_min_episodes": 5, "total_timesteps": 60000, diff --git a/benchmarking/example_dagger_seals_hopper_best_hp_eval.json b/benchmarking/example_dagger_seals_hopper_best_hp_eval.json index fe47291e0..001479ec3 100644 --- a/benchmarking/example_dagger_seals_hopper_best_hp_eval.json +++ b/benchmarking/example_dagger_seals_hopper_best_hp_eval.json @@ -17,7 +17,7 @@ "dagger": { "beta_schedule": { "decay_probability": 0.7, - "py/type": "imitation.algorithms.dagger.ExponentialBetaSchedule" + "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule" }, "rollout_round_min_episodes": 10, "total_timesteps": 100000, diff --git a/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json b/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json index 2e6cba2c0..df1606fca 100644 --- a/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json +++ b/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json @@ -16,7 +16,7 @@ }, "dagger": { "beta_schedule": { - "py/type": "imitation.algorithms.dagger.LinearBetaSchedule", + "py/object": "imitation.algorithms.dagger.LinearBetaSchedule", "rampdown_rounds": 15 }, "rollout_round_min_episodes": 3, diff --git a/benchmarking/example_dagger_seals_walker_best_hp_eval.json b/benchmarking/example_dagger_seals_walker_best_hp_eval.json index e4569321f..ce6baff1c 100644 --- a/benchmarking/example_dagger_seals_walker_best_hp_eval.json +++ b/benchmarking/example_dagger_seals_walker_best_hp_eval.json @@ -17,7 +17,7 @@ "dagger": { "beta_schedule": { "decay_probability": 0.7, - "py/type": "imitation.algorithms.dagger.ExponentialBetaSchedule" + "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule" }, "rollout_round_min_episodes": 5, "total_timesteps": 100000, From 276d863f488512067c38408ecf1386e8199abf50 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 25 Jan 2023 17:08:27 +0100 Subject: [PATCH 04/54] Improve readability and interpretability of benchmarking tests. 
---
 tests/test_benchmarking.py | 51 ++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py
index 5c42063c6..67b9eb489 100644
--- a/tests/test_benchmarking.py
+++ b/tests/test_benchmarking.py
@@ -1,6 +1,4 @@
 """Tests for config files in benchmarking/ folder."""
-import glob
-import os
 import pathlib
 
 import pytest
@@ -10,24 +8,39 @@
 THIS_DIR = pathlib.Path(__file__).absolute().parent
 BENCHMARKING_DIR = THIS_DIR.parent / "benchmarking"
 
+ALGORITHMS = ["bc", "dagger", "airl", "gail"]
+ENVIRONMENTS = [
+    "seals_walker",
+    "seals_ant",
+    "seals_half_cheetah",
+    "seals_hopper",
+    "seals_swimmer",
+]
 
-@pytest.mark.parametrize(
-    "command_name",
-    ["bc", "dagger", "airl", "gail"],
-)
-def test_benchmarking_configs(tmpdir, command_name):
+
+@pytest.mark.parametrize("environment", ENVIRONMENTS)
+@pytest.mark.parametrize("algorithm", ALGORITHMS)
+def test_benchmarks_print_config_succeeds(algorithm: str, environment: str):
     # We test the configs using the print_config command,
     # because running the configs requires MuJoCo.
     # Requiring MuJoCo to run the tests adds too much complexity.
-    if command_name in ("bc", "dagger"):
-        ex = train_imitation.train_imitation_ex
-    elif command_name in ("airl", "gail"):
-        ex = train_adversarial.train_adversarial_ex
-    cfg_pattern = os.path.join(BENCHMARKING_DIR, f"example_{command_name}_*.json")
-    cfg_files = glob.glob(cfg_pattern)
-    assert len(cfg_files) == 5, "There should be 1 config file for each of environment."
-    for i, cfg_file in enumerate(cfg_files):
-        cfg_name = f"{tmpdir.basename}_{i}"
-        ex.add_named_config(cfg_name, cfg_file)
-        run = ex.run(command_name="print_config", named_configs=[cfg_name])
-        assert run.status == "COMPLETED"
+
+    # GIVEN
+    if algorithm in ("bc", "dagger"):
+        experiment = train_imitation.train_imitation_ex
+    elif algorithm in ("airl", "gail"):
+        experiment = train_adversarial.train_adversarial_ex
+    else:
+        raise ValueError(f"Unknown algorithm: {algorithm}")
+
+    config_name = f"{algorithm}_{environment}"
+    config_file = str(
+        BENCHMARKING_DIR / f"example_{algorithm}_{environment}_best_hp_eval.json",
+    )
+
+    # WHEN
+    experiment.add_named_config(config_name, config_file)
+    run = experiment.run(command_name="print_config", named_configs=[config_name])
+
+    # THEN
+    assert run.status == "COMPLETED"

From 37eb914cba0aaa416543b763b6f2246eae8f9fa7 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 1 Mar 2023 21:48:13 +0530
Subject: [PATCH 05/54] Add exponential beta scheduler for dagger

---
 src/imitation/algorithms/dagger.py | 29 +++++++++++++++++++
 .../scripts/config/train_imitation.py | 1 +
 src/imitation/scripts/train_imitation.py | 1 +
 3 files changed, 31 insertions(+)

diff --git a/src/imitation/algorithms/dagger.py b/src/imitation/algorithms/dagger.py
index d43ca5eec..34d8cef7e 100644
--- a/src/imitation/algorithms/dagger.py
+++ b/src/imitation/algorithms/dagger.py
@@ -66,6 +66,35 @@ def __call__(self, round_num: int) -> float:
         return min(1, max(0, (self.rampdown_rounds - round_num) / self.rampdown_rounds))
 
 
+class ExponentialBetaSchedule(BetaSchedule):
+    """Exponentially decaying schedule for beta."""
+
+    def __init__(self, decay_probability: float):
+        """Builds ExponentialBetaSchedule.
+
+        Args:
+            decay_probability: the decay factor for beta.
+
+        Raises:
+            ValueError: if `decay_probability` not within (0, 1].
+ """ + if not (0 < decay_probability <= 1): + raise ValueError("decay_probability lies outside the range (0, 1].") + self.decay_probability = decay_probability + + def __call__(self, round_num: int) -> float: + """Computes beta value. + + Args: + round_num: the current round number. + + Returns: + beta as `self.decay_probability ^ round_num` + """ + assert round_num >= 0 + return self.decay_probability**round_num + + def reconstruct_trainer( scratch_dir: types.AnyPath, venv: vec_env.VecEnv, diff --git a/src/imitation/scripts/config/train_imitation.py b/src/imitation/scripts/config/train_imitation.py index 16da9c694..2ef2eed44 100644 --- a/src/imitation/scripts/config/train_imitation.py +++ b/src/imitation/scripts/config/train_imitation.py @@ -38,6 +38,7 @@ def config(): dagger = dict( use_offline_rollouts=False, # warm-start policy with BC from offline demos total_timesteps=1e5, + beta_schedule=None, ) agent_path = None # Path to load agent from, optional. diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index 2b4946668..f8cc992fd 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -125,6 +125,7 @@ def train_imitation( expert_policy=expert_policy, custom_logger=custom_logger, bc_trainer=bc_trainer, + beta_schedule=dagger["beta_schedule"], rng=_rnd, ) model.train( From 877383b03d7d3260746997f3cab7b5272125b07b Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 2 Feb 2023 13:00:06 +0100 Subject: [PATCH 06/54] Ignore coverage for unknown algorithms. --- tests/test_benchmarking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py index 67b9eb489..ba01b38a2 100644 --- a/tests/test_benchmarking.py +++ b/tests/test_benchmarking.py @@ -31,7 +31,7 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str): elif algorithm in ("airl", "gail"): experiment = train_adversarial.train_adversarial_ex else: - raise ValueError(f"Unknown algorithm: {algorithm}") + raise ValueError(f"Unknown algorithm: {algorithm}") # pragma: no cover config_name = f"{algorithm}_{environment}" config_file = str( From c8e55cb1efee3913bf306c23f6a5c361674d7380 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 2 Feb 2023 13:04:02 +0100 Subject: [PATCH 07/54] Cleanup and extend tests for beta schedules in dagger. 
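For reference, a quick sketch of the values the two schedules produce, based on the formulas in dagger.py (linear: `(rampdown_rounds - round_num) / rampdown_rounds`, clipped to [0, 1]; exponential: `decay_probability ** round_num`). These are the kinds of sequences the extended tests below assert against:

    from imitation.algorithms import dagger

    linear = dagger.LinearBetaSchedule(3)
    print([linear(i) for i in range(6)])
    # roughly: 1, 0.667, 0.333, 0, 0, 0 -- linear rampdown, then pinned at 0

    exponential = dagger.ExponentialBetaSchedule(0.5)
    print([exponential(i) for i in range(4)])
    # [1.0, 0.5, 0.25, 0.125] -- geometric decay by decay_probability each round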
--- tests/algorithms/test_dagger.py | 39 ++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/tests/algorithms/test_dagger.py b/tests/algorithms/test_dagger.py index 525fc449a..6e5582810 100644 --- a/tests/algorithms/test_dagger.py +++ b/tests/algorithms/test_dagger.py @@ -33,12 +33,39 @@ def maybe_pendulum_expert_trajectories( return None -def test_beta_schedule(): - one_step_sched = dagger.LinearBetaSchedule(1) - three_step_sched = dagger.LinearBetaSchedule(3) - for i in range(10): - assert np.allclose(one_step_sched(i), 1 if i == 0 else 0) - assert np.allclose(three_step_sched(i), (3 - i) / 3 if i <= 2 else 0) +@pytest.mark.parametrize("num_rampdown_rounds", [1, 2, 3, 10]) +def test_linear_beta_schedule(num_rampdown_rounds): + # GIVEN + sched = dagger.LinearBetaSchedule(num_rampdown_rounds) + idx_after_rampdown = num_rampdown_rounds + 1 + + # WHEN + betas = [sched(i) for i in range(num_rampdown_rounds + 10)] + + # THEN + assert np.allclose( + betas[:idx_after_rampdown], + np.linspace(1, 0, idx_after_rampdown), + ) + assert np.allclose(betas[idx_after_rampdown:], 0) + + +@pytest.mark.parametrize("decay_probability", [0.1, 0.5, 0.9, 1]) +def test_exponential_beta_schedule(decay_probability): + # GIVEN + sched = dagger.ExponentialBetaSchedule(decay_probability) + + # WHEN + betas = [sched(i) for i in range(10)] + + # THEN + assert np.allclose(betas, decay_probability ** np.arange(10)) + + +@pytest.mark.parametrize("decay_probability", [-0.1, 0, 1.1, 2]) +def test_forbidden_decay_probability_on_exp_beta_schedule(decay_probability): + with pytest.raises(ValueError): + dagger.ExponentialBetaSchedule(decay_probability) def test_traj_collector_seed(tmpdir, pendulum_venv, rng): From d81eb68d2359ebb1927f6ebb2ba573f0c7e5745a Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 9 Feb 2023 02:02:21 +0530 Subject: [PATCH 08/54] Add optuna to dependencies --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 557015d91..867c1b775 100644 --- a/setup.py +++ b/setup.py @@ -210,6 +210,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: "chai-sacred>=0.8.3", "tensorboard>=1.14", "huggingface_sb3>=2.2.1", + "optuna>=3.0.1", ], tests_require=TESTS_REQUIRE, extras_require={ From 27467d38268a2217731f019dc0202ce3a520cf2a Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 9 Feb 2023 02:22:24 +0530 Subject: [PATCH 09/54] Fix test case --- tests/scripts/test_scripts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 78bbca9bd..ad559d2d9 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -910,7 +910,7 @@ def test_parallel_train_adversarial_custom_env(tmpdir): logging=dict(log_root=tmpdir), demonstrations=dict(rollout_path=rollout_path), ), - search_space=dict(command_name="gail"), + search_space=dict(command_name=tune.choice(["gail"])), ) config_updates.update(PARALLEL_CONFIG_LOW_RESOURCE) run = parallel.parallel_ex.run(config_updates=config_updates) From 1a3b6b81f70cdfc515dc41a264ae1e81347ac588 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 9 Feb 2023 12:04:03 +0530 Subject: [PATCH 10/54] Clean up the scripts --- src/imitation/scripts/analyze.py | 12 +- src/imitation/scripts/config/parallel.py | 219 ++---------------- .../scripts/config/train_adversarial.py | 40 +--- src/imitation/scripts/parallel.py | 39 ++-- 4 files 
changed, 48 insertions(+), 262 deletions(-) diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py index a7b52af36..b7b990800 100644 --- a/src/imitation/scripts/analyze.py +++ b/src/imitation/scripts/analyze.py @@ -167,6 +167,7 @@ def _get_algo_name(sd: sacred_util.SacredDicts) -> str: def _return_summaries(sd: sacred_util.SacredDicts) -> dict: imit_stats = get(sd.run, "result.imit_stats") if imit_stats is None: + # stored in rollout key for preference comparison imit_stats = get(sd.run, "result.rollout") expert_stats = get(sd.run, "result.expert_stats") @@ -234,7 +235,7 @@ def _return_summaries(sd: sacred_util.SacredDicts) -> dict: # verbosity 2 table_verbosity_mapping.append( table_verbosity_mapping[-1] - | {"status", "imit_expert_ratio", "exp_command", "run_name", "seed", ""}, + | {"status", "imit_expert_ratio", "exp_command", "run_name"}, ) @@ -264,14 +265,14 @@ def analyze_imitation( csv_output_path: If provided, then save a CSV output file to this path. tex_output_path: If provided, then save a LaTeX-format table to this path. print_table: If True, then print the dataframe to stdout. - table_verbosity: Increasing levels of verbosity, from 0 to 2, increase the - number of columns in the table. + table_verbosity: Increasing levels of verbosity, from 0 to 3, increase the + number of columns in the table. Level 3 prints all of the columns available. Returns: The DataFrame generated from the Sacred logs. """ - if table_verbosity == -1: - table_entry_fns_subset = _get_table_entry_fns_subset(0) + if table_verbosity == 3: + table_entry_fns_subset = _get_table_entry_fns_subset(2) else: table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity) @@ -279,6 +280,7 @@ def analyze_imitation( for sd in _gather_sacred_dicts(): new_df = pd.DataFrame() if table_verbosity == -1: + # gets all config columns new_df = pd.json_normalize(sd.config) else: new_df = new_df.append({}, ignore_index=True) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index 0525641e3..697c5d862 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -6,6 +6,11 @@ Adding custom named configs is necessary because the CLI interface can't add search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`. + +For tuning hyperparameters of an algorithm on a given environment, override +the `base_named_configs` argument with the named config of the environment. 
+Ex: python -m imitation.scripts.parallel with example_gail \ + 'base_named_configs=["logging.wandb_logging", "seals_half_cheetah"]' """ import numpy as np @@ -13,7 +18,7 @@ import sacred from torch import nn -from imitation.algorithms.dagger import ExponentialBetaSchedule, LinearBetaSchedule +from imitation.algorithms import dagger from imitation.util.util import make_unique_timestamp parallel_ex = sacred.Experiment("parallel") @@ -35,44 +40,11 @@ def config(): local_dir = None # `local_dir` arg for `ray.tune.run` upload_dir = None # `upload_dir` arg for `ray.tune.run` - # n_seeds_start = 0 - # n_seeds = 1 # Number of seeds to search over by default experiment_checkpoint_path = "" eval_best_trial = False eval_trial_seeds = 5 # Number of seeds to search over by default num_samples = 1 # Number of samples per grid search configuration - repeat = 3 - env = "seals_half_cheetah" - wandb_name_prefix = "" - - -# @parallel_ex.config -# def seeds(n_seeds_start, n_seeds): -# search_space = { -# "config_updates": { -# "seed": tune.choice( -# list(range(n_seeds_start, n_seeds_start + n_seeds)), -# ) -# } -# } - - -# @parallel_ex.config -# def wandb(run_name): -# base_config_updates = { -# "logging": { -# "wandb": { -# "wandb_name_prefix": run_name, -# "wandb_kwargs": {"project": "algorithm-benchmark"}, -# }, -# }, -# } -# base_named_configs = ["logging.wandb_logging"] - - -@parallel_ex.named_config -def s3(): - upload_dir = "s3://shwang-chai/private" + repeat = 1 # Debug named configs @@ -137,11 +109,9 @@ def example_cartpole_rl(): def example_rl(): sacred_ex_name = "train_rl" run_name = "rl_tuning" - # n_seeds = 2 - base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"] + base_named_configs = ["logging.wandb_logging"] base_config_updates = {"environment": {"num_vec": 1}} search_space = { - # "named_configs": tune.choice([[env] for env in EASY_ENVS]), "config_updates": { "rl": { "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]), @@ -163,8 +133,8 @@ def example_rl(): @parallel_ex.named_config def example_bc(): sacred_ex_name = "train_imitation" - run_name = "bc_tuning_hc" - base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"] + run_name = "bc_tuning" + base_named_configs = ["logging.wandb_logging"] base_config_updates = {"environment": {"num_vec": 1}} search_space = { "config_updates": { @@ -191,8 +161,8 @@ def example_bc(): @parallel_ex.named_config def example_dagger(): sacred_ex_name = "train_imitation" - run_name = "dagger_tuning_hc" - base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"] + run_name = "dagger_tuning" + base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, "dagger": {"total_timesteps": 1e5}, @@ -209,8 +179,8 @@ def example_dagger(): ), "dagger": dict( beta_schedule=tune.choice( - [LinearBetaSchedule(i) for i in [1, 5, 15]] - + [ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], + [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]] + + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], ), rollout_round_min_episodes=tune.choice([3, 5, 10]), ), @@ -234,14 +204,10 @@ def example_gail(): "total_timesteps": 1e7, } search_space = { - # "named_configs": tune.choice([[env] for env in MY_ENVS]), "config_updates": { "algorithm_kwargs": dict( demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), n_disc_updates_per_round=tune.choice([8, 16]), - # both are same as rl.batch_size - # gen_replay_buffer_capacity=tune.choice([512, 1024]), - # gen_train_timesteps=0, ), "rl": { 
"batch_size": tune.choice([4096, 8192, 16384]), @@ -258,29 +224,23 @@ def example_gail(): eval_best_trial = True eval_trial_seeds = 5 repeat = 3 - # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" resources_per_trial = dict(cpu=1) @parallel_ex.named_config def example_airl(): sacred_ex_name = "train_adversarial" - run_name = "airl_tuning_hc" - # n_seeds = 1 + run_name = "airl_tuning" base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, "total_timesteps": 1e7, } search_space = { - # "named_configs": tune.choice([[env] for env in MY_ENVS]), "config_updates": { "algorithm_kwargs": dict( demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), n_disc_updates_per_round=tune.choice([8, 16]), - # both are same as rl.batch_size - # gen_replay_buffer_capacity=tune.choice([512, 1024]), - # gen_train_timesteps=0, ), "rl": { "batch_size": tune.choice([4096, 8192, 16384]), @@ -297,7 +257,6 @@ def example_airl(): eval_best_trial = True eval_trial_seeds = 5 repeat = 3 - # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" resources_per_trial = dict(cpu=1) @@ -305,7 +264,7 @@ def example_airl(): def example_pc(): sacred_ex_name = "train_preference_comparisons" run_name = "pc_tuning" - base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"] + base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, "total_timesteps": 2e7, @@ -317,8 +276,6 @@ def example_pc(): "named_configs": tune.choice( [ ["reward.normalize_output_disable"], - # ["reward.normalize_output_running"], - # ["reward.normalize_output_ema"], ], ), "config_updates": { @@ -327,19 +284,15 @@ def example_pc(): "activation_fn": tune.choice( [ nn.ReLU, - # nn.Tanh, ], ), }, }, "num_iterations": tune.choice([25, 50]), - # "initial_comparison_frac": tune.choice([0.1, 0.25]), - # "reward_trainer_kwargs": { - # "epochs": tune.choice([1, 3, 6]), - # }, - # "query_schedule": tune.choice( - # ["constant", "hyperbolic", "inverse_quadratic"], - # ), + "initial_comparison_frac": tune.choice([0.1, 0.25]), + "reward_trainer_kwargs": { + "epochs": tune.choice([1, 3, 6]), + }, "rl": { "batch_size": tune.choice([512, 2048, 8192]), "rl_kwargs": { @@ -349,138 +302,8 @@ def example_pc(): }, }, } - num_samples = 24 + num_samples = 100 eval_best_trial = True eval_trial_seeds = 5 repeat = 3 resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def debug_eval(): - sacred_ex_name = "train_preference_comparisons" - run_name = "debug_eval" - eval_trial_seeds = 2 - eval_best_trial = True - # base_named_configs = ["seals_half_cheetah"] - base_config_updates = { - "total_timesteps": 30, - "total_comparisons": 10, - # "query_schedule": "hyperbolic", - "num_iterations": 1, - "fragment_length": 2, - } - search_space = { - # "named_configs": tune.choice([[env] for env in MY_ENVS]), - "config_updates": { - # "num_iterations": tune.choice([5, 20, 50]), - "initial_comparison_frac": tune.choice([0.1, 0.2]), - # "reward_trainer_kwargs": { - # "epochs": tune.choice([1, 2, 3]), - # }, - # "query_schedule": tune.choice( - # ["constant", "hyperbolic", "inverse_quadratic"], - # ), - }, - } - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def debug_eval_adv(): - sacred_ex_name = "train_adversarial" - run_name = "airl_tuning_debug" - # n_seeds = 5 - base_named_configs = [] - eval_best_trial = True - eval_trial_seeds = 2 - base_config_updates = { - "total_timesteps": 2048, - } - search_space = { - # "named_configs": 
tune.choice([[env] for env in MY_ENVS]), - "config_updates": { - "algorithm_kwargs": dict( - # demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), - n_disc_updates_per_round=tune.choice([1, 2]), - # both are same as rl.batch_size - # gen_replay_buffer_capacity=tune.choice([512, 1024]), - # gen_train_timesteps=0, - ), - "rl": { - "batch_size": 8, - # "rl_kwargs": { - # "ent_coef": tune.choice([0, 1e-3, 1e-1]), - # "learning_rate": tune.loguniform(1e-5, 5e-3), - # }, - }, - "algorithm_specific": dict(demo_batch_size=1), - }, - "command_name": "airl", - } - num_samples = 2 - repeat = 2 - resources_per_trial = dict(cpu=8) - - -@parallel_ex.named_config -def debug_airl(): - sacred_ex_name = "train_adversarial" - run_name = "airl_debug" - # n_seeds = 1 - base_named_configs = ["logging.wandb_logging", "seals_walker"] - base_config_updates = { - "environment": {"num_vec": 8}, - "total_timesteps": 1e7, - } - search_space = { - # "named_configs": tune.choice([[env] for env in MY_ENVS]), - "config_updates": { - "train": { - "policy_kwargs": { - "activation_fn": tune.choice( - [ - nn.ReLU, - # nn.Tanh, - ], - ), - }, - }, - "algorithm_kwargs": dict( - demo_batch_size=tune.choice([32]), - n_disc_updates_per_round=tune.choice([10]), - # both are same as rl.batch_size - # gen_replay_buffer_capacity=tune.choice([512, 1024]), - # gen_train_timesteps=0, - ), - "rl": { - "batch_size": tune.choice([10000]), - "rl_kwargs": { - "ent_coef": tune.choice([0.1]), - "learning_rate": tune.choice([1e-4]), - }, - }, - "algorithm_specific": {}, - }, - "command_name": "airl", - } - num_samples = 1 - eval_best_trial = False - # eval_trial_seeds = 5 - repeat = 5 - # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" - resources_per_trial = dict(cpu=8) - - -# @parallel_ex.config_hook -# def config_hook(config, command_name, logger): -# """Sets env.""" -# del command_name, logger -# res = {} -# print(config) -# if config["env"]: -# res["base_named_configs"] = tuple( -# config["base_named_configs"] + [config["env"]] -# ) -# print(res) -# return res diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py index bd9df6287..fb26c99c6 100644 --- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -99,8 +99,8 @@ def pendulum(): @train_adversarial_ex.named_config def seals_ant(): - # locals().update(**MUJOCO_SHARED_LOCALS) - # locals().update(**ANT_SHARED_LOCALS) + locals().update(**MUJOCO_SHARED_LOCALS) + locals().update(**ANT_SHARED_LOCALS) environment = dict(gym_id="seals/Ant-v0") demonstrations = dict(rollout_type="ppo-huggingface") rl = dict( @@ -173,21 +173,6 @@ def seals_half_cheetah(): vf_coef=0.11483689492120866, ), ) - # algorithm_specific = dict( - # airl=dict(total_timesteps=int(5e6)), - # gail=dict(total_timesteps=int(8e6)), - # ) - # reward = dict( - # algorithm_specific=dict( - # airl=dict( - # net_cls=reward_nets.BasicShapedRewardNet, - # net_kwargs=dict( - # reward_hid_sizes=(32,), - # potential_hid_sizes=(32,), - # ), - # ), - # ), - # ) algorithm_kwargs = dict( # Number of discriminator updates after each round of generator updates n_disc_updates_per_round=16, @@ -257,7 +242,7 @@ def seals_swimmer(): @train_adversarial_ex.named_config def seals_walker(): - # locals().update(**MUJOCO_SHARED_LOCALS) + locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Walker2d-v0") demonstrations = dict(rollout_type="ppo-huggingface") train = dict( @@ -311,22 +296,3 @@ 
def fast(): demo_batch_size=1, n_disc_updates_per_round=4, ) - - -@train_adversarial_ex.named_config -def debug_nans(): - environment = {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}} - total_timesteps = 1e7 - algorithm_kwargs = dict( - demo_batch_size=128, - n_disc_updates_per_round=8, - # both are same as rl.batch_size - # gen_replay_buffer_capacity=tune.choice([512, 1024]), - # gen_train_timesteps=0, - ) - rl = { - "batch_size": 4096, - "rl_kwargs": {"ent_coef": 0.1, "learning_rate": 7.316377404994506e-05}, - } - seed = 0 - checkpoint_interval = 1 diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 3e713777e..9ee8e6ee9 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -33,7 +33,7 @@ def parallel( upload_dir: Optional[str], repeat: int = 1, eval_best_trial: bool = False, - eval_best_trial_resource_multiplier: int = 2, + eval_best_trial_resource_multiplier: int = 1, eval_trial_seeds: int = 5, experiment_checkpoint_path: str = "", syncer=None, @@ -54,7 +54,8 @@ def parallel( under the 'experiment.name' key. This is equivalent to using the Sacred CLI '--name' option on the inner experiment. Offline analysis jobs can use this argument to group similar data. - num_samples: Number of times to sample from the hyperparameter space. + num_samples: Number of times to sample from the hyperparameter space without + considering repetition using `repeat`. search_space: A dictionary which can contain Ray Tune search objects like `ray.tune.grid_search` and `ray.tune.sample_from`, and is passed as the `config` argument to `ray.tune.run()`. After the @@ -79,12 +80,12 @@ def parallel( upload_dir: `upload_dir` argument to `ray.tune.run()`. repeat: Number of runs to repeat each trial for. eval_best_trial: Whether to evaluate the trial with the best mean return - at the end of tuning on a different set of seeds. + at the end of tuning on a separate set of seeds. eval_best_trial_resource_multiplier: factor by which to multiply the number of cpus per trial in `resources_per_trial`. eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. experiment_checkpoint_path: Path containing the checkpoints of a previous - experiment. ran using this script. Useful for resuming cancelled trials + experiment ran using this script. Useful for resuming cancelled trials of the experiments (using `resume`) or evaluating the best trial of the experiment (using `eval_best_trial`). 
resume: If true and `experiment_checkpoint_path` is given, then resumes the @@ -159,6 +160,7 @@ def parallel( result.trials = None result.fetch_trial_dataframes() else: + # run hyperparameter tuning result = ray.tune.run( trainable, config=search_space, @@ -174,15 +176,14 @@ def parallel( metric="mean_return", mode="max", ) - - key_prefix = ( - "rollout/" - if sacred_ex_name == "train_preference_comparisons" - else "" - if sacred_ex_name == "train_rl" - else "imit_stats/" - ) + if sacred_ex_name == "train_rl": + key_prefix = "" + elif sacred_ex_name == "train_preference_comparisons": + key_prefix = "rollout/" + else: + key_prefix = "imit_stats/" key = key_prefix + "monitor_return_mean" + if eval_best_trial: df = result.results_df df = df[df["config/named_configs"].notna()] @@ -230,8 +231,9 @@ def parallel( resources_per_trial=resources_per_trial, ) returns = eval_result.results_df["mean_return"].to_numpy() - print("Returns:", returns) - print(np.mean(returns), np.std(returns)) + print("All returns:", returns) + print("Mean:", np.mean(returns)) + print("Std:", np.std(returns)) finally: ray.shutdown() @@ -333,14 +335,7 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: ) # Ray Tune has a string formatting error if raylet completes without # any calls to `reporter`. - # reporter(done=True) - # if sacred_ex_name == "train_preference_comparisons": - # #reporter(mean_return=run.result["rollout"]["monitor_return_mean"]) - # #ray.tune.report(mean_return=run.result["rollout"]["monitor_return_mean"]) - # ray.tune.report(mean_return=234) - # else: - # # reporter(mean_return=run.result["imit_stats"]["monitor_return_mean"]) - # ray.tune.report(mean_return=run.result["imit_stats"]["monitor_return_mean"]) + reporter(done=True) assert run.status == "COMPLETED" return run.result From 7a438da0f5421f0d98fdb4db9747a8af10d26297 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 9 Feb 2023 19:53:14 +0530 Subject: [PATCH 11/54] Remove reporter(done) since mean_return is reported by the runs --- src/imitation/scripts/parallel.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 9ee8e6ee9..2dd2254bf 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -333,9 +333,6 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: **updated_run_kwargs, options={"--run": run_name, "--file_storage": "sacred"}, ) - # Ray Tune has a string formatting error if raylet completes without - # any calls to `reporter`. 
- reporter(done=True) assert run.status == "COMPLETED" return run.result From 2e56de8eb97713b88ada09564369214f5e4fa661 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 23 Feb 2023 23:53:12 +0530 Subject: [PATCH 12/54] Add beta_schedule parameter to dagger script --- src/imitation/scripts/train_imitation.py | 1 + src/imitation/scripts/train_preference_comparisons.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index e607339b4..56633e33a 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -119,6 +119,7 @@ def dagger( expert_policy=expert_policy, custom_logger=custom_logger, bc_trainer=bc_trainer, + beta_schedule=dagger["beta_schedule"], rng=_rnd, ) diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index 3d4fb4e33..4030317c4 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -280,7 +280,6 @@ def save_callback(iteration_num): results = dict(results) results["rollout"] = policy_evaluation.eval_policy(agent, venv) results["mean_return"] = results["rollout"]["monitor_return_mean"] - if save_preferences: main_trainer.dataset.save(log_dir / "preferences.pkl") From 73d8576fc893868c68442b657bd25aaffb7df9bf Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Fri, 17 Mar 2023 03:37:15 +0530 Subject: [PATCH 13/54] Update config policy kwargs --- src/imitation/scripts/config/train_adversarial.py | 6 +++--- .../scripts/config/train_preference_comparisons.py | 13 +++---------- src/imitation/scripts/config/train_rl.py | 12 ++++++------ 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py index 08b92fe9c..7989f3eab 100644 --- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -187,7 +187,7 @@ def seals_hopper(): # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") demonstrations = dict(rollout_type="ppo-huggingface") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -216,7 +216,7 @@ def seals_swimmer(): environment = dict(gym_id="seals/Swimmer-v0") total_timesteps = int(2e6) demonstrations = dict(rollout_type="ppo-huggingface") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -245,7 +245,7 @@ def seals_walker(): locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Walker2d-v0") demonstrations = dict(rollout_type="ppo-huggingface") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 236edad47..1a039c762 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -120,20 +120,13 @@ def seals_half_cheetah(): ) num_iterations = 50 total_timesteps = 20000000 - # train = dict( - # policy_cls="MlpPolicy", - # policy_kwargs=dict( - # activation_fn=nn.ReLU, - # # net_arch=[dict(pi=[64, 64], vf=[64, 64])], - # ), - # ) @train_preference_comparisons_ex.named_config def 
seals_hopper(): # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -160,7 +153,7 @@ def seals_hopper(): def seals_swimmer(): # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Swimmer-v0") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -188,7 +181,7 @@ def seals_swimmer(): def seals_walker(): # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Walker2d-v0") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, diff --git a/src/imitation/scripts/config/train_rl.py b/src/imitation/scripts/config/train_rl.py index 34b45250c..a5475540d 100644 --- a/src/imitation/scripts/config/train_rl.py +++ b/src/imitation/scripts/config/train_rl.py @@ -74,7 +74,7 @@ def cartpole(): def seals_cartpole(): environment = dict(gym_id="seals/CartPole-v0", num_vec=8) total_timesteps = int(1e5) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -111,7 +111,7 @@ def seals_half_cheetah(): num_vec=1, ) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.Tanh, @@ -141,7 +141,7 @@ def seals_half_cheetah(): @train_rl_ex.named_config def seals_hopper(): environment = dict(gym_id="seals/Hopper-v0", num_vec=1) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -211,7 +211,7 @@ def seals_ant(): num_vec=1, ) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.Tanh, @@ -242,7 +242,7 @@ def seals_ant(): @train_rl_ex.named_config def seals_swimmer(): environment = dict(gym_id="seals/Swimmer-v0", num_vec=1) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -273,7 +273,7 @@ def seals_swimmer(): @train_rl_ex.named_config def seals_walker(): environment = dict(gym_id="seals/Walker2d-v0", num_vec=1) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, From 9fdf8786663473334f94b24a841a832b29da435f Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 16 May 2023 19:00:32 +0530 Subject: [PATCH 14/54] Changes from review --- src/imitation/scripts/config/parallel.py | 16 ++++++++-------- .../scripts/config/train_adversarial.py | 4 ---- .../config/train_preference_comparisons.py | 6 ------ src/imitation/scripts/parallel.py | 18 +++++++----------- src/imitation/scripts/train_imitation.py | 1 + 5 files changed, 16 insertions(+), 29 deletions(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index ea90f11b8..b52446154 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -102,9 +102,6 @@ def example_cartpole_rl(): resources_per_trial = dict(cpu=4) -EASY_ENVS = ["cartpole", "pendulum", "mountain_car"] - - @parallel_ex.named_config def example_rl(): sacred_ex_name = "train_rl" @@ -135,18 +132,21 @@ def example_bc(): sacred_ex_name = "train_imitation" run_name = "bc_tuning" base_named_configs = ["logging.wandb_logging"] - base_config_updates = {"environment": {"num_vec": 1}} + base_config_updates = { + "environment": {"num_vec": 1}, + "demonstrations": {"rollout_type": "ppo-huggingface"}, + } search_space = { "config_updates": { - "bc_kwargs": dict( + "bc": 
dict( batch_size=tune.choice([8, 16, 32, 64]), l2_weight=tune.loguniform(1e-6, 1e-2), # L2 regularization weight optimizer_kwargs=dict( lr=tune.loguniform(1e-5, 1e-2), ), - ), - "bc_train_kwargs": dict( - n_epochs=tune.choice([1, 5, 10, 20]), + train_kwargs=dict( + n_epochs=tune.choice([1, 5, 10, 20]), + ), ), }, "command_name": "bc", diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py index 7989f3eab..ef675eab6 100644 --- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -156,7 +156,6 @@ def half_cheetah(): @train_adversarial_ex.named_config def seals_half_cheetah(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/HalfCheetah-v0") demonstrations = dict(rollout_type="ppo-huggingface") rl = dict( @@ -184,7 +183,6 @@ def seals_half_cheetah(): @train_adversarial_ex.named_config def seals_hopper(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") demonstrations = dict(rollout_type="ppo-huggingface") policy = dict( @@ -212,7 +210,6 @@ def seals_hopper(): @train_adversarial_ex.named_config def seals_swimmer(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Swimmer-v0") total_timesteps = int(2e6) demonstrations = dict(rollout_type="ppo-huggingface") @@ -242,7 +239,6 @@ def seals_swimmer(): @train_adversarial_ex.named_config def seals_walker(): - locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Walker2d-v0") demonstrations = dict(rollout_type="ppo-huggingface") policy = dict( diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 1a039c762..4fe9c793e 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -73,8 +73,6 @@ def cartpole(): @train_preference_comparisons_ex.named_config def seals_ant(): - # locals().update(**MUJOCO_SHARED_LOCALS) - # locals().update(**ANT_SHARED_LOCALS) environment = dict(gym_id="seals/Ant-v0") rl = dict( batch_size=2048, @@ -102,7 +100,6 @@ def half_cheetah(): @train_preference_comparisons_ex.named_config def seals_half_cheetah(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/HalfCheetah-v0") rl = dict( batch_size=512, @@ -124,7 +121,6 @@ def seals_half_cheetah(): @train_preference_comparisons_ex.named_config def seals_hopper(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") policy = dict( policy_cls="MlpPolicy", @@ -151,7 +147,6 @@ def seals_hopper(): @train_preference_comparisons_ex.named_config def seals_swimmer(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Swimmer-v0") policy = dict( policy_cls="MlpPolicy", @@ -179,7 +174,6 @@ def seals_swimmer(): @train_preference_comparisons_ex.named_config def seals_walker(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Walker2d-v0") policy = dict( policy_cls="MlpPolicy", diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 2dd2254bf..53b4c2b32 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -139,7 +139,7 @@ def parallel( syncer=syncer, ), metric="mean_return", - resume=resume, + resume=True, ) print( "Live trials:", @@ -176,14 +176,7 @@ def parallel( metric="mean_return", mode="max", ) - if 
sacred_ex_name == "train_rl": - key_prefix = "" - elif sacred_ex_name == "train_preference_comparisons": - key_prefix = "rollout/" - else: - key_prefix = "imit_stats/" - key = key_prefix + "monitor_return_mean" - + key = "mean_return" if eval_best_trial: df = result.results_df df = df[df["config/named_configs"].notna()] @@ -199,7 +192,7 @@ def parallel( # store mean return of runs across all seeds in a group df["mean_return"] = grps[key].transform(lambda x: x.mean()) best_config_df = df[df["mean_return"] == df["mean_return"].max()] - row = best_config_df.loc[0] + row = best_config_df.iloc[0] best_config_tag = row["experiment_tag"] if result.trials is not None: trial = [ @@ -215,7 +208,10 @@ def parallel( # update cpus per trial only if it is provided in `resources_per_trial` # Uses the default values (cpu=1) if it is not provided if "cpu" in resources_per_trial: - resources_per_trial["cpu"] *= eval_best_trial_resource_multiplier + resources_per_trial_eval = copy.deepcopy(resources_per_trial) + resources_per_trial_eval[ + "cpu" + ] *= eval_best_trial_resource_multiplier best_config["config_updates"].update( environment=dict(num_vec=resources_per_trial["cpu"]), ) diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index 56633e33a..5a6925eb3 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -76,6 +76,7 @@ def bc( expert_stats = _try_computing_expert_stats(expert_trajs) if expert_stats is not None: stats["expert_stats"] = expert_stats + stats["mean_return"] = imit_stats["monitor_return_mean"] return stats From 1c1dbc44970016fd5ef6bb965cf69afbf33590a1 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 16 May 2023 21:43:43 +0530 Subject: [PATCH 15/54] Fix errors with some configs --- src/imitation/scripts/config/parallel.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index b52446154..095c67107 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -165,8 +165,9 @@ def example_dagger(): base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, + "demonstrations": {"rollout_type": "ppo-huggingface"}, "dagger": {"total_timesteps": 1e5}, - "bc_kwargs": { + "bc": { "batch_size": 16, "l2_weight": 1e-4, "optimizer_kwargs": {"lr": 1e-3}, @@ -174,8 +175,10 @@ def example_dagger(): } search_space = { "config_updates": { - "bc_train_kwargs": dict( - n_epochs=tune.choice([1, 5, 10]), + "bc": dict( + train_kwargs=dict( + n_epochs=tune.choice([1, 5, 10]), + ), ), "dagger": dict( beta_schedule=tune.choice( @@ -201,6 +204,7 @@ def example_gail(): base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, + "demonstrations": {"rollout_type": "ppo-huggingface"}, "total_timesteps": 1e7, } search_space = { @@ -234,6 +238,7 @@ def example_airl(): base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, + "demonstrations": {"rollout_type": "ppo-huggingface"}, "total_timesteps": 1e7, } search_space = { @@ -273,11 +278,9 @@ def example_pc(): "gatherer_kwargs": {"sample": True}, } search_space = { - "named_configs": tune.choice( - [ - ["reward.normalize_output_disable"], - ], - ), + "named_configs": [ + ["reward.normalize_output_disable"], + ], "config_updates": { "train": { "policy_kwargs": { 
From 44c4e97d64980118b3a07f06f7c15edb273a16a1 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 14 Jun 2023 06:38:42 +0530 Subject: [PATCH 16/54] Updates based on review --- src/imitation/scripts/analyze.py | 29 ++++++++++--------- src/imitation/scripts/parallel.py | 26 ++++++++++++----- src/imitation/scripts/train_adversarial.py | 1 - src/imitation/scripts/train_imitation.py | 1 - .../scripts/train_preference_comparisons.py | 3 +- src/imitation/scripts/train_rl.py | 1 - 6 files changed, 35 insertions(+), 26 deletions(-) diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py index f036efe40..8977fed47 100644 --- a/src/imitation/scripts/analyze.py +++ b/src/imitation/scripts/analyze.py @@ -272,40 +272,43 @@ def analyze_imitation( The DataFrame generated from the Sacred logs. """ if table_verbosity == 3: + # Get column names for which we have get value using make_entry_fn + # These are same across Level 2 & 3. In Level 3, we additionally add remaining + # config columns. table_entry_fns_subset = _get_table_entry_fns_subset(2) else: table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity) - df = pd.DataFrame() + output_table = pd.DataFrame() for sd in _gather_sacred_dicts(): - new_df = pd.DataFrame() - if table_verbosity == -1: + if table_verbosity == 3: # gets all config columns - new_df = pd.json_normalize(sd.config) + row = pd.json_normalize(sd.config) else: - new_df = new_df.append({}, ignore_index=True) + # create an empty dataframe with a single row + row = pd.DataFrame(index=[0]) for col_name, make_entry_fn in table_entry_fns_subset.items(): - new_df[col_name] = make_entry_fn(sd) + row[col_name] = make_entry_fn(sd) - df = pd.concat([df, new_df]) + output_table = pd.concat([output_table, row]) - if len(df) > 0: - df.sort_values(by=["algo", "env_name"], inplace=True) + if len(output_table) > 0: + output_table.sort_values(by=["algo", "env_name"], inplace=True) display_options: Mapping[str, Any] = dict(index=False) if csv_output_path is not None: - df.to_csv(csv_output_path, **display_options) + output_table.to_csv(csv_output_path, **display_options) print(f"Wrote CSV file to {csv_output_path}") if tex_output_path is not None: - s: str = df.to_latex(**display_options) + s: str = output_table.to_latex(**display_options) with open(tex_output_path, "w") as f: f.write(s) print(f"Wrote TeX file to {tex_output_path}") if print_table: - print(df.to_string(**display_options)) - return df + print(output_table.to_string(**display_options)) + return output_table def _make_return_summary(stats: dict, prefix="") -> str: diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 53b4c2b32..2bb0129cb 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -127,6 +127,12 @@ def parallel( ray.init(**init_kwargs) search_alg = search.Repeater(optuna.OptunaSearch(), repeat=repeat) + + if sacred_ex_name == "train_rl": + return_key = "monitor_return_mean" + else: + return_key = "imit_stats/monitor_return_mean" + try: if experiment_checkpoint_path: if resume: @@ -173,10 +179,9 @@ def parallel( syncer=syncer, ), search_alg=search_alg, - metric="mean_return", + metric=return_key, mode="max", ) - key = "mean_return" if eval_best_trial: df = result.results_df df = df[df["config/named_configs"].notna()] @@ -190,7 +195,7 @@ def parallel( ] grps = df.groupby(grp_keys) # store mean return of runs across all seeds in a group - df["mean_return"] = grps[key].transform(lambda x: x.mean()) + 
df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) best_config_df = df[df["mean_return"] == df["mean_return"].max()] row = best_config_df.iloc[0] best_config_tag = row["experiment_tag"] @@ -200,20 +205,25 @@ def parallel( ][0] best_config = trial.config print("Mean return:", row["mean_return"]) - print("All returns:", df[df["mean_return"] == row["mean_return"]][key]) + print( + "All returns:", + df[df["mean_return"] == row["mean_return"]][return_key], + ) print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) best_config["config_updates"].update( seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), ) + + resources_per_trial_eval = copy.deepcopy(resources_per_trial) # update cpus per trial only if it is provided in `resources_per_trial` # Uses the default values (cpu=1) if it is not provided if "cpu" in resources_per_trial: - resources_per_trial_eval = copy.deepcopy(resources_per_trial) + resources_per_trial_eval[ "cpu" ] *= eval_best_trial_resource_multiplier best_config["config_updates"].update( - environment=dict(num_vec=resources_per_trial["cpu"]), + environment=dict(num_vec=resources_per_trial_eval["cpu"]), ) eval_result = ray.tune.run( @@ -224,9 +234,9 @@ def parallel( "command_name": best_config.get("command_name", None), }, name=run_name + "_best_hp_eval", - resources_per_trial=resources_per_trial, + resources_per_trial=resources_per_trial_eval, ) - returns = eval_result.results_df["mean_return"].to_numpy() + returns = eval_result.results_df[return_key].to_numpy() print("All returns:", returns) print("Mean:", np.mean(returns)) print("Std:", np.std(returns)) diff --git a/src/imitation/scripts/train_adversarial.py b/src/imitation/scripts/train_adversarial.py index d1f99a54b..26c8d7bcf 100644 --- a/src/imitation/scripts/train_adversarial.py +++ b/src/imitation/scripts/train_adversarial.py @@ -167,7 +167,6 @@ def callback(round_num: int, /) -> None: return { "imit_stats": imit_stats, "expert_stats": rollout.rollout_stats(expert_trajs), - "mean_return": imit_stats["monitor_return_mean"], } diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index 5a6925eb3..56633e33a 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -76,7 +76,6 @@ def bc( expert_stats = _try_computing_expert_stats(expert_trajs) if expert_stats is not None: stats["expert_stats"] = expert_stats - stats["mean_return"] = imit_stats["monitor_return_mean"] return stats diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index b054a5a6c..867a666a4 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -280,8 +280,7 @@ def save_callback(iteration_num): # Storing and evaluating policy only useful if we generated trajectory data if bool(trajectory_path is None): results = dict(results) - results["rollout"] = policy_evaluation.eval_policy(agent, venv) - results["mean_return"] = results["rollout"]["monitor_return_mean"] + results["imit_stats"] = policy_evaluation.eval_policy(agent, venv) if save_preferences: main_trainer.dataset.save(log_dir / "preferences.pkl") diff --git a/src/imitation/scripts/train_rl.py b/src/imitation/scripts/train_rl.py index 20a7b263c..6780a557b 100644 --- a/src/imitation/scripts/train_rl.py +++ b/src/imitation/scripts/train_rl.py @@ -159,7 +159,6 @@ def train_rl( # Final evaluation of expert policy. 
eval_stats = policy_evaluation.eval_policy(rl_algo, venv) - eval_stats["mean_return"] = eval_stats["monitor_return_mean"] return eval_stats From ab0126998a4f8beb44e93eb11d6c2b17e68038a8 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 14 Jun 2023 07:40:52 +0530 Subject: [PATCH 17/54] Change metric everywhere --- src/imitation/scripts/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 2bb0129cb..6f77330df 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -144,7 +144,7 @@ def parallel( upload_dir=upload_dir, syncer=syncer, ), - metric="mean_return", + metric=return_key, resume=True, ) print( From e896d7db127f9025d89387cc10e513409fd973b1 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 11 Jul 2023 16:03:02 +0530 Subject: [PATCH 18/54] Separate tuning code from parallel.py --- benchmarking/tuning.py | 102 ++++++++++ benchmarking/tuning_config.py | 237 +++++++++++++++++++++++ setup.cfg | 1 + src/imitation/scripts/config/parallel.py | 216 +-------------------- src/imitation/scripts/parallel.py | 101 ++-------- 5 files changed, 363 insertions(+), 294 deletions(-) create mode 100644 benchmarking/tuning.py create mode 100644 benchmarking/tuning_config.py diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py new file mode 100644 index 000000000..b4e62a84a --- /dev/null +++ b/benchmarking/tuning.py @@ -0,0 +1,102 @@ +"""Tunes the hyperparameters of the algorithms.""" + +import copy +import pathlib +from typing import Any, Dict + +import numpy as np +import ray +from pandas.api import types as pd_types +from sacred.observers import FileStorageObserver +from tuning_config import parallel_ex, tuning_ex + + +@tuning_ex.main +def tune( + parallel: Dict[str, Any], + eval_best_trial: bool = False, + eval_best_trial_resource_multiplier: int = 1, + eval_trial_seeds: int = 5, +) -> None: + """Tune hyperparameters of imitation algorithms using parallel script. + + Args: + parallel: A dictionary of arguments from the parallel script. + eval_best_trial: Whether to evaluate the trial with the best mean return + at the end of tuning on a separate set of seeds. + eval_best_trial_resource_multiplier: factor by which to multiply the + number of cpus per trial in `resources_per_trial`. + eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. 
+ """ + run = parallel_ex.run(config_updates=parallel) + result = run.result + + if eval_best_trial: + if parallel["sacred_ex_name"] == "train_rl": + return_key = "monitor_return_mean" + else: + return_key = "imit_stats/monitor_return_mean" + df = result.results_df + df = df[df["config/named_configs"].notna()] + # convert object dtype to str required by df.groupby + for col in df.columns: + if pd_types.is_object_dtype(df[col]): + df[col] = df[col].astype("str") + # group into separate HP configs + grp_keys = [c for c in df.columns if c.startswith("config") and "seed" not in c] + grps = df.groupby(grp_keys) + # store mean return of runs across all seeds in a group + df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) + best_config_df = df[df["mean_return"] == df["mean_return"].max()] + row = best_config_df.iloc[0] + best_config_tag = row["experiment_tag"] + if result.trials is not None: + trial = [t for t in result.trials if best_config_tag in t.experiment_tag][0] + best_config = trial.config + print("Mean return:", row["mean_return"]) + print( + "All returns:", + df[df["mean_return"] == row["mean_return"]][return_key], + ) + print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) + + best_config["config_updates"].update( + seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), + ) + + resources_per_trial_eval = copy.deepcopy(parallel["resources_per_trial"]) + # update cpus per trial only if it is provided in `resources_per_trial` + # Uses the default values (cpu=1) if it is not provided + if "cpu" in parallel["resources_per_trial"]: + resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier + + eval_config_updates = parallel.copy() + eval_config_updates.update( + run_name=parallel["run_name"] + "_best_hp_eval", + num_samples=1, + search_space=best_config, + base_named_configs=parallel["base_named_configs"], + base_config_updates=parallel["base_config_updates"], + resources_per_trial=resources_per_trial_eval, + search_alg=None, + repeat=1, + experiment_checkpoint_path="", + resume=False, + ) + eval_run = parallel_ex.run(config_updates=eval_config_updates) + eval_result = eval_run.result + returns = eval_result.results_df[return_key].to_numpy() + print("All returns:", returns) + print("Mean:", np.mean(returns)) + print("Std:", np.std(returns)) + + +def main_console(): + observer_path = pathlib.Path.cwd() / "output" / "sacred" / "tuning" + observer = FileStorageObserver(observer_path) + tuning_ex.observers.append(observer) + tuning_ex.run_commandline() + + +if __name__ == "__main__": # pragma: no cover + main_console() diff --git a/benchmarking/tuning_config.py b/benchmarking/tuning_config.py new file mode 100644 index 000000000..79c8d0347 --- /dev/null +++ b/benchmarking/tuning_config.py @@ -0,0 +1,237 @@ +"""Config files for tuning experiments.""" + +import ray.tune as tune +import sacred +from torch import nn + +from imitation.algorithms import dagger +from imitation.scripts.parallel import parallel_ex + +tuning_ex = sacred.Experiment("tuning", ingredients=[parallel_ex]) + + +@tuning_ex.named_config +def example_rl(): + parallel = dict( + sacred_ex_name="train_rl", + run_name="rl_tuning", + base_named_configs=["logging.wandb_logging"], + base_config_updates={"environment": {"num_vec": 1}}, + search_space={ + "config_updates": { + "rl": { + "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]), + "rl_kwargs": { + "learning_rate": tune.loguniform(1e-5, 1e-2), + "batch_size": tune.choice([64, 128, 256, 512]), + "n_epochs": 
tune.choice([5, 10, 20]), + }, + }, + }, + }, + num_samples=100, + repeat=1, + resources_per_trial=dict(cpu=1), + ) + eval_best_trial = True + eval_trial_seeds = 5 + + +@tuning_ex.named_config +def example_bc(): + parallel = dict( + sacred_ex_name="train_imitation", + run_name="bc_tuning", + base_named_configs=["logging.wandb_logging"], + base_config_updates={ + "environment": {"num_vec": 1}, + "demonstrations": {"source": "huggingface"}, + }, + search_space={ + "config_updates": { + "bc": dict( + batch_size=tune.choice([8, 16, 32, 64]), + l2_weight=tune.loguniform(1e-6, 1e-2), # L2 regularization weight + optimizer_kwargs=dict( + lr=tune.loguniform(1e-5, 1e-2), + ), + train_kwargs=dict( + n_epochs=tune.choice([1, 5, 10, 20]), + ), + ), + }, + "command_name": "bc", + }, + num_samples=2, + repeat=1, + resources_per_trial=dict(cpu=1), + ) + + eval_best_trial = True + eval_trial_seeds = 5 + eval_best_trial_resource_multiplier = 1 + + +@tuning_ex.named_config +def example_dagger(): + parallel = dict( + sacred_ex_name="train_imitation", + run_name="dagger_tuning", + base_named_configs=["logging.wandb_logging"], + base_config_updates={ + "environment": {"num_vec": 1}, + "demonstrations": {"source": "huggingface"}, + "dagger": {"total_timesteps": 1e5}, + "bc": { + "batch_size": 16, + "l2_weight": 1e-4, + "optimizer_kwargs": {"lr": 1e-3}, + }, + }, + search_space={ + "config_updates": { + "bc": dict( + train_kwargs=dict( + n_epochs=tune.choice([1, 5, 10]), + ), + ), + "dagger": dict( + beta_schedule=tune.choice( + [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]] + + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], + ), + rollout_round_min_episodes=tune.choice([3, 5, 10]), + ), + }, + "command_name": "dagger", + }, + num_samples=50, + repeat=3, + resources_per_trial=dict(cpu=1), + ) + eval_best_trial = True + eval_trial_seeds = 5 + + +@tuning_ex.named_config +def example_gail(): + parallel = dict( + sacred_ex_name="train_adversarial", + run_name="gail_tuning_hc", + base_named_configs=["logging.wandb_logging"], + base_config_updates={ + "environment": {"num_vec": 1}, + "demonstrations": {"source": "huggingface"}, + "total_timesteps": 1e7, + }, + search_space={ + "config_updates": { + "algorithm_kwargs": dict( + demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), + n_disc_updates_per_round=tune.choice([8, 16]), + ), + "rl": { + "batch_size": tune.choice([4096, 8192, 16384]), + "rl_kwargs": { + "ent_coef": tune.loguniform(1e-7, 1e-3), + "learning_rate": tune.loguniform(1e-5, 1e-2), + }, + }, + "algorithm_specific": {}, + }, + "command_name": "gail", + }, + num_samples=100, + repeat=3, + resources_per_trial=dict(cpu=1), + ) + eval_best_trial = True + eval_trial_seeds = 5 + + +@tuning_ex.named_config +def example_airl(): + parallel = dict( + sacred_ex_name="train_adversarial", + run_name="airl_tuning", + base_named_configs=["logging.wandb_logging"], + base_config_updates={ + "environment": {"num_vec": 1}, + "demonstrations": {"source": "huggingface"}, + "total_timesteps": 1e7, + }, + search_space={ + "config_updates": { + "algorithm_kwargs": dict( + demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), + n_disc_updates_per_round=tune.choice([8, 16]), + ), + "rl": { + "batch_size": tune.choice([4096, 8192, 16384]), + "rl_kwargs": { + "ent_coef": tune.loguniform(1e-7, 1e-3), + "learning_rate": tune.loguniform(1e-5, 1e-2), + }, + }, + "algorithm_specific": {}, + }, + "command_name": "airl", + }, + num_samples=100, + repeat=3, + resources_per_trial=dict(cpu=1), + ) + + eval_best_trial 
= True + eval_trial_seeds = 5 + + +@tuning_ex.named_config +def example_pc(): + parallel = dict( + sacred_ex_name="train_preference_comparisons", + run_name="pc_tuning", + base_named_configs=["logging.wandb_logging"], + base_config_updates={ + "environment": {"num_vec": 1}, + "demonstrations": {"source": "huggingface"}, + "total_timesteps": 2e7, + "total_comparisons": 5000, + "query_schedule": "hyperbolic", + "gatherer_kwargs": {"sample": True}, + }, + search_space={ + "named_configs": [ + ["reward.normalize_output_disable"], + ], + "config_updates": { + "train": { + "policy_kwargs": { + "activation_fn": tune.choice( + [ + nn.ReLU, + ], + ), + }, + }, + "num_iterations": tune.choice([25, 50]), + "initial_comparison_frac": tune.choice([0.1, 0.25]), + "reward_trainer_kwargs": { + "epochs": tune.choice([1, 3, 6]), + }, + "rl": { + "batch_size": tune.choice([512, 2048, 8192]), + "rl_kwargs": { + "learning_rate": tune.loguniform(1e-5, 1e-2), + "ent_coef": tune.loguniform(1e-7, 1e-3), + }, + }, + }, + }, + num_samples=100, + repeat=3, + resources_per_trial=dict(cpu=1), + ) + + eval_best_trial = True + eval_trial_seeds = 5 diff --git a/setup.cfg b/setup.cfg index 979c3ca46..f39db322f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,6 +7,7 @@ per-file-ignores = # F841 local variable unused [for Sacred config scopes] src/imitation/scripts/config/*.py:F841 ../src/imitation/scripts/config/*.py:F841 + benchmarking/tuning_config.py:F841 src/imitation/envs/examples/airl_envs/*.py:D [darglint] diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index 095c67107..e9c5b8245 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -16,9 +16,7 @@ import numpy as np import ray.tune as tune import sacred -from torch import nn -from imitation.algorithms import dagger from imitation.util.util import make_unique_timestamp parallel_ex = sacred.Experiment("parallel") @@ -45,6 +43,10 @@ def config(): eval_trial_seeds = 5 # Number of seeds to search over by default num_samples = 1 # Number of samples per grid search configuration repeat = 1 + search_alg = "optuna" # search algorithm to use + experiment_checkpoint_path = "" # Path to checkpoint of experiment to resume + syncer = None # Sacred syncer to use + resume = False # Whether to resume experiment from checkpoint # Debug named configs @@ -100,213 +102,3 @@ def example_cartpole_rl(): } base_named_configs = ["cartpole"] resources_per_trial = dict(cpu=4) - - -@parallel_ex.named_config -def example_rl(): - sacred_ex_name = "train_rl" - run_name = "rl_tuning" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = {"environment": {"num_vec": 1}} - search_space = { - "config_updates": { - "rl": { - "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]), - "rl_kwargs": { - "learning_rate": tune.loguniform(1e-5, 1e-2), - "batch_size": tune.choice([64, 128, 256, 512]), - "n_epochs": tune.choice([5, 10, 20]), - }, - }, - }, - } - num_samples = 100 - eval_best_trial = True - eval_trial_seeds = 5 - repeat = 1 - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def example_bc(): - sacred_ex_name = "train_imitation" - run_name = "bc_tuning" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = { - "environment": {"num_vec": 1}, - "demonstrations": {"rollout_type": "ppo-huggingface"}, - } - search_space = { - "config_updates": { - "bc": dict( - batch_size=tune.choice([8, 16, 32, 64]), - l2_weight=tune.loguniform(1e-6, 1e-2), # L2 
regularization weight - optimizer_kwargs=dict( - lr=tune.loguniform(1e-5, 1e-2), - ), - train_kwargs=dict( - n_epochs=tune.choice([1, 5, 10, 20]), - ), - ), - }, - "command_name": "bc", - } - num_samples = 64 - eval_best_trial = True - eval_trial_seeds = 5 - repeat = 3 - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def example_dagger(): - sacred_ex_name = "train_imitation" - run_name = "dagger_tuning" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = { - "environment": {"num_vec": 1}, - "demonstrations": {"rollout_type": "ppo-huggingface"}, - "dagger": {"total_timesteps": 1e5}, - "bc": { - "batch_size": 16, - "l2_weight": 1e-4, - "optimizer_kwargs": {"lr": 1e-3}, - }, - } - search_space = { - "config_updates": { - "bc": dict( - train_kwargs=dict( - n_epochs=tune.choice([1, 5, 10]), - ), - ), - "dagger": dict( - beta_schedule=tune.choice( - [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]] - + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], - ), - rollout_round_min_episodes=tune.choice([3, 5, 10]), - ), - }, - "command_name": "dagger", - } - num_samples = 50 - repeat = 3 - eval_best_trial = True - eval_trial_seeds = 5 - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def example_gail(): - sacred_ex_name = "train_adversarial" - run_name = "gail_tuning_hc" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = { - "environment": {"num_vec": 1}, - "demonstrations": {"rollout_type": "ppo-huggingface"}, - "total_timesteps": 1e7, - } - search_space = { - "config_updates": { - "algorithm_kwargs": dict( - demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), - n_disc_updates_per_round=tune.choice([8, 16]), - ), - "rl": { - "batch_size": tune.choice([4096, 8192, 16384]), - "rl_kwargs": { - "ent_coef": tune.loguniform(1e-7, 1e-3), - "learning_rate": tune.loguniform(1e-5, 1e-2), - }, - }, - "algorithm_specific": {}, - }, - "command_name": "gail", - } - num_samples = 100 - eval_best_trial = True - eval_trial_seeds = 5 - repeat = 3 - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def example_airl(): - sacred_ex_name = "train_adversarial" - run_name = "airl_tuning" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = { - "environment": {"num_vec": 1}, - "demonstrations": {"rollout_type": "ppo-huggingface"}, - "total_timesteps": 1e7, - } - search_space = { - "config_updates": { - "algorithm_kwargs": dict( - demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), - n_disc_updates_per_round=tune.choice([8, 16]), - ), - "rl": { - "batch_size": tune.choice([4096, 8192, 16384]), - "rl_kwargs": { - "ent_coef": tune.loguniform(1e-7, 1e-3), - "learning_rate": tune.loguniform(1e-5, 1e-2), - }, - }, - "algorithm_specific": {}, - }, - "command_name": "airl", - } - num_samples = 100 - eval_best_trial = True - eval_trial_seeds = 5 - repeat = 3 - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def example_pc(): - sacred_ex_name = "train_preference_comparisons" - run_name = "pc_tuning" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = { - "environment": {"num_vec": 1}, - "total_timesteps": 2e7, - "total_comparisons": 5000, - "query_schedule": "hyperbolic", - "gatherer_kwargs": {"sample": True}, - } - search_space = { - "named_configs": [ - ["reward.normalize_output_disable"], - ], - "config_updates": { - "train": { - "policy_kwargs": { - "activation_fn": tune.choice( - [ - nn.ReLU, - ], - ), - }, - }, - "num_iterations": tune.choice([25, 50]), 
- "initial_comparison_frac": tune.choice([0.1, 0.25]), - "reward_trainer_kwargs": { - "epochs": tune.choice([1, 3, 6]), - }, - "rl": { - "batch_size": tune.choice([512, 2048, 8192]), - "rl_kwargs": { - "learning_rate": tune.loguniform(1e-5, 1e-2), - "ent_coef": tune.loguniform(1e-7, 1e-3), - }, - }, - }, - } - num_samples = 100 - eval_best_trial = True - eval_trial_seeds = 5 - repeat = 3 - resources_per_trial = dict(cpu=1) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 6f77330df..2417414cb 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -6,11 +6,9 @@ import pathlib from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union -import numpy as np import ray import ray.tune import sacred -from pandas.api.types import is_object_dtype from ray.tune import search from ray.tune.registry import register_trainable from ray.tune.search import optuna @@ -31,14 +29,12 @@ def parallel( init_kwargs: Mapping[str, Any], local_dir: Optional[str], upload_dir: Optional[str], - repeat: int = 1, - eval_best_trial: bool = False, - eval_best_trial_resource_multiplier: int = 1, - eval_trial_seeds: int = 5, - experiment_checkpoint_path: str = "", - syncer=None, - resume: Union[str, bool] = False, -) -> None: + repeat: int, + search_alg: Optional[str], + experiment_checkpoint_path: str, + syncer, + resume: Union[str, bool], +) -> ray.tune.ExperimentAnalysis: """Parallelize multiple runs of another Sacred Experiment using Ray Tune. A Sacred FileObserver is attached to the inner experiment and writes Sacred @@ -47,7 +43,7 @@ def parallel( Args: sacred_ex_name: The Sacred experiment to tune. Either "train_rl" or - "train_adversarial". + "train_imitation" or "train_adversarial" or "train_preference_comparisons". run_name: A name describing this parallelizing experiment. This argument is also passed to `ray.tune.run` as the `name` argument. It is also saved in 'sacred/run.json' of each inner Sacred experiment @@ -78,24 +74,19 @@ def parallel( init_kwargs: Arguments to pass to `ray.init`. local_dir: `local_dir` argument to `ray.tune.run()`. upload_dir: `upload_dir` argument to `ray.tune.run()`. + search_alg: can be either "optuna" or None. repeat: Number of runs to repeat each trial for. - eval_best_trial: Whether to evaluate the trial with the best mean return - at the end of tuning on a separate set of seeds. - eval_best_trial_resource_multiplier: factor by which to multiply the - number of cpus per trial in `resources_per_trial`. - eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. - experiment_checkpoint_path: Path containing the checkpoints of a previous - experiment ran using this script. Useful for resuming cancelled trials - of the experiments (using `resume`) or evaluating the best trial of the - experiment (using `eval_best_trial`). + Not used if `search_alg` is None. resume: If true and `experiment_checkpoint_path` is given, then resumes the experiment by restarting the trials that did not finish in the experiment checkpoint path. syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. - Raises: TypeError: Named configs not string sequences or config updates not mappings. + + Returns: + The result of `ray.tune.run()`. """ # Basic validation for config options before we enter parallel jobs. 
if not isinstance(base_named_configs, collections.abc.Sequence): @@ -126,7 +117,11 @@ def parallel( ) ray.init(**init_kwargs) - search_alg = search.Repeater(optuna.OptunaSearch(), repeat=repeat) + if search_alg == "optuna": + algo = search.Repeater(optuna.OptunaSearch(), repeat=repeat) + else: + assert repeat == 1 # repeat should not be used if search_alg is None + algo = None if sacred_ex_name == "train_rl": return_key = "monitor_return_mean" @@ -166,7 +161,6 @@ def parallel( result.trials = None result.fetch_trial_dataframes() else: - # run hyperparameter tuning result = ray.tune.run( trainable, config=search_space, @@ -178,68 +172,11 @@ def parallel( upload_dir=upload_dir, syncer=syncer, ), - search_alg=search_alg, + search_alg=algo, metric=return_key, mode="max", ) - if eval_best_trial: - df = result.results_df - df = df[df["config/named_configs"].notna()] - # convert object dtype to str required by df.groupby - for col in df.columns: - if is_object_dtype(df[col]): - df[col] = df[col].astype("str") - # group into separate HP configs - grp_keys = [ - c for c in df.columns if c.startswith("config") and "seed" not in c - ] - grps = df.groupby(grp_keys) - # store mean return of runs across all seeds in a group - df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) - best_config_df = df[df["mean_return"] == df["mean_return"].max()] - row = best_config_df.iloc[0] - best_config_tag = row["experiment_tag"] - if result.trials is not None: - trial = [ - t for t in result.trials if best_config_tag in t.experiment_tag - ][0] - best_config = trial.config - print("Mean return:", row["mean_return"]) - print( - "All returns:", - df[df["mean_return"] == row["mean_return"]][return_key], - ) - print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) - best_config["config_updates"].update( - seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), - ) - - resources_per_trial_eval = copy.deepcopy(resources_per_trial) - # update cpus per trial only if it is provided in `resources_per_trial` - # Uses the default values (cpu=1) if it is not provided - if "cpu" in resources_per_trial: - - resources_per_trial_eval[ - "cpu" - ] *= eval_best_trial_resource_multiplier - best_config["config_updates"].update( - environment=dict(num_vec=resources_per_trial_eval["cpu"]), - ) - - eval_result = ray.tune.run( - trainable, - config={ - "named_configs": best_config["named_configs"], - "config_updates": best_config["config_updates"], - "command_name": best_config.get("command_name", None), - }, - name=run_name + "_best_hp_eval", - resources_per_trial=resources_per_trial_eval, - ) - returns = eval_result.results_df[return_key].to_numpy() - print("All returns:", returns) - print("Mean:", np.mean(returns)) - print("Std:", np.std(returns)) + return result finally: ray.shutdown() From 64c3a8d0deb8748eba2a69be20d7f9a464639523 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 11 Jul 2023 16:07:13 +0530 Subject: [PATCH 19/54] Fix docstring --- src/imitation/scripts/parallel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 2417414cb..10ae9f924 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -77,6 +77,10 @@ def parallel( search_alg: can be either "optuna" or None. repeat: Number of runs to repeat each trial for. Not used if `search_alg` is None. 
+ experiment_checkpoint_path: Path containing the checkpoints of a previous + experiment ran using this script. Useful for resuming cancelled trials + of the experiments (using `resume`) or evaluating the best trial of the + experiment (using `eval_best_trial`). resume: If true and `experiment_checkpoint_path` is given, then resumes the experiment by restarting the trials that did not finish in the experiment checkpoint path. From 8fba0d3ac9b690613b7526b68bd1c68b3ac6efa7 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 11 Jul 2023 17:42:08 +0530 Subject: [PATCH 20/54] Removing resume option as it is getting tricky to correctly implement --- src/imitation/scripts/config/parallel.py | 5 +--- src/imitation/scripts/parallel.py | 31 ++---------------------- tests/scripts/test_scripts.py | 1 + 3 files changed, 4 insertions(+), 33 deletions(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index e9c5b8245..3416f9442 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -39,14 +39,11 @@ def config(): local_dir = None # `local_dir` arg for `ray.tune.run` upload_dir = None # `upload_dir` arg for `ray.tune.run` experiment_checkpoint_path = "" - eval_best_trial = False - eval_trial_seeds = 5 # Number of seeds to search over by default num_samples = 1 # Number of samples per grid search configuration repeat = 1 search_alg = "optuna" # search algorithm to use - experiment_checkpoint_path = "" # Path to checkpoint of experiment to resume + experiment_checkpoint_path = "" # Path to checkpoint of experiment syncer = None # Sacred syncer to use - resume = False # Whether to resume experiment from checkpoint # Debug named configs diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 10ae9f924..bf73c1c72 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -33,7 +33,6 @@ def parallel( search_alg: Optional[str], experiment_checkpoint_path: str, syncer, - resume: Union[str, bool], ) -> ray.tune.ExperimentAnalysis: """Parallelize multiple runs of another Sacred Experiment using Ray Tune. @@ -78,12 +77,8 @@ def parallel( repeat: Number of runs to repeat each trial for. Not used if `search_alg` is None. experiment_checkpoint_path: Path containing the checkpoints of a previous - experiment ran using this script. Useful for resuming cancelled trials - of the experiments (using `resume`) or evaluating the best trial of the - experiment (using `eval_best_trial`). - resume: If true and `experiment_checkpoint_path` is given, then resumes the - experiment by restarting the trials that did not finish in the experiment - checkpoint path. + experiment ran using this script. Useful for evaluating the best trial + of the experiment. syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. 
Raises: @@ -134,28 +129,6 @@ def parallel( try: if experiment_checkpoint_path: - if resume: - # restart failed runs from experiment_checkpoint_path - register_trainable("inner", trainable) - runner = ray.tune.execution.trial_runner.TrialRunner( - local_checkpoint_dir=experiment_checkpoint_path, - sync_config=ray.tune.syncer.SyncConfig( - upload_dir=upload_dir, - syncer=syncer, - ), - metric=return_key, - resume=True, - ) - print( - "Live trials:", - len(runner._live_trials), - "/", - len(runner._trials), - ) - while not runner.is_finished(): - runner.step() - print("Debug:", runner.debug_string()) - # load experiment analysis results result = ray.tune.ExperimentAnalysis(experiment_checkpoint_path) result._load_checkpoints_from_latest( diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 4435155cd..586fa91ba 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -802,6 +802,7 @@ def test_train_rl_cnn_policy(tmpdir: str, rng): # Need absolute path because raylet runs in different working directory. "demonstrations.path": CARTPOLE_TEST_ROLLOUT_PATH.absolute(), }, + search_alg=None, # Use default search algorithm of ray. search_space={ "command_name": "airl", "config_updates": {"total_timesteps": tune.choice([5, 10])}, From 12ab31c1641b6b99abb6823cf037a3f9340cb86c Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 12 Jul 2023 04:26:17 +0530 Subject: [PATCH 21/54] Minor fixes --- src/imitation/scripts/config/analyze.py | 2 +- src/imitation/scripts/config/parallel.py | 2 +- src/imitation/scripts/parallel.py | 5 ++--- tests/scripts/test_scripts.py | 7 ++++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/imitation/scripts/config/analyze.py b/src/imitation/scripts/config/analyze.py index 5213a875d..01cc2d035 100644 --- a/src/imitation/scripts/config/analyze.py +++ b/src/imitation/scripts/config/analyze.py @@ -18,7 +18,7 @@ def config(): tex_output_path = None # Write LaTex output to this path print_table = True # Set to True to print analysis to stdout split_str = "," # str used to split source_dir_str into multiple source dirs - table_verbosity = 1 # Choose from 0, 1, or 2 + table_verbosity = 1 # Choose from 0, 1, 2 or 3 source_dirs = None diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index 3416f9442..b09f9fc4a 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -42,7 +42,7 @@ def config(): num_samples = 1 # Number of samples per grid search configuration repeat = 1 search_alg = "optuna" # search algorithm to use - experiment_checkpoint_path = "" # Path to checkpoint of experiment + experiment_checkpoint_path = "" # Path to checkpoint of experiment syncer = None # Sacred syncer to use diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index bf73c1c72..ebda17c82 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -4,13 +4,12 @@ import copy import glob import pathlib -from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union +from typing import Any, Callable, Dict, Mapping, Optional, Sequence import ray import ray.tune import sacred from ray.tune import search -from ray.tune.registry import register_trainable from ray.tune.search import optuna from sacred.observers import FileStorageObserver @@ -78,7 +77,7 @@ def parallel( Not used if `search_alg` is None. 
experiment_checkpoint_path: Path containing the checkpoints of a previous experiment ran using this script. Useful for evaluating the best trial - of the experiment. + of the experiment. syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. Raises: diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 586fa91ba..e17765471 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -802,7 +802,7 @@ def test_train_rl_cnn_policy(tmpdir: str, rng): # Need absolute path because raylet runs in different working directory. "demonstrations.path": CARTPOLE_TEST_ROLLOUT_PATH.absolute(), }, - search_alg=None, # Use default search algorithm of ray. + search_alg=None, # Use default search algorithm of ray. search_space={ "command_name": "airl", "config_updates": {"total_timesteps": tune.choice([5, 10])}, @@ -942,7 +942,7 @@ def test_analyze_imitation(tmpdir: str, run_names: List[str], run_sacred_fn): assert run.status == "COMPLETED" # Check that analyze script finds the correct number of logs. - def check(run_name: Optional[str], count: int) -> None: + def check(run_name: Optional[str], count: int, table_verbosity=1) -> None: run = analyze.analysis_ex.run( command_name="analyze_imitation", config_updates=dict( @@ -952,6 +952,7 @@ def check(run_name: Optional[str], count: int) -> None: csv_output_path=tmpdir_path / "analysis.csv", tex_output_path=tmpdir_path / "analysis.tex", print_table=True, + table_verbosity=table_verbosity, ), ) assert run.status == "COMPLETED" @@ -961,7 +962,7 @@ def check(run_name: Optional[str], count: int) -> None: for run_name, count in Counter(run_names).items(): check(run_name, count) - check(None, len(run_names)) # Check total number of logs. + check(None, len(run_names), table_verbosity=3) # Check total number of logs. def test_analyze_gather_tb(tmpdir: str): From 19b0f2c3ed8d7d2ef10aaabab21739d31b51261c Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Sun, 16 Jul 2023 10:39:12 +0530 Subject: [PATCH 22/54] Updates from review --- benchmarking/tuning.py | 202 +++++++++++++++-------- benchmarking/tuning_config.py | 36 ++-- src/imitation/scripts/config/parallel.py | 3 +- src/imitation/scripts/parallel.py | 9 +- tests/test_benchmarking.py | 27 +++ 5 files changed, 180 insertions(+), 97 deletions(-) diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py index b4e62a84a..0c18b1256 100644 --- a/benchmarking/tuning.py +++ b/benchmarking/tuning.py @@ -13,82 +13,144 @@ @tuning_ex.main def tune( - parallel: Dict[str, Any], - eval_best_trial: bool = False, + parallel_run_config: Dict[str, Any], eval_best_trial_resource_multiplier: int = 1, - eval_trial_seeds: int = 5, + num_eval_seeds: int = 5, ) -> None: """Tune hyperparameters of imitation algorithms using parallel script. Args: - parallel: A dictionary of arguments from the parallel script. - eval_best_trial: Whether to evaluate the trial with the best mean return - at the end of tuning on a separate set of seeds. - eval_best_trial_resource_multiplier: factor by which to multiply the - number of cpus per trial in `resources_per_trial`. - eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. + parallel_run_config: Dictionary of arguments to pass to the parallel script. + eval_best_trial_resource_multiplier: Factor by which to multiply the + number of cpus per trial in `resources_per_trial`. 
This is useful for + allocating more resources per trial to the evaluation trials than the + resources for hyperparameter tuning since number of evaluation trials + is usually much smaller than the number of tuning trials. + num_eval_seeds: Number of distinct seeds to evaluate the best trial on. + Set to 0 to disable evaluation. + + Raises: + ValueError: If no trials are returned by. + """ + run = parallel_ex.run(config_updates=parallel_run_config) + experiment_analysis = run.result + if not experiment_analysis.trials: + raise ValueError( + "No trials found. Please ensure that the `experiment_checkpoint_path` " + "in `parallel_run_config` is passed correctly " + "or that the tuning run finished properly.", + ) + + return_key = "imit_stats/monitor_return_mean" + if parallel_run_config["sacred_ex_name"] == "train_rl": + return_key = "monitor_return_mean" + best_trial = find_best_trial(experiment_analysis, return_key, print_return=True) + + if num_eval_seeds > 0: # evaluate the best trial + resources_per_trial_eval = copy.deepcopy( + parallel_run_config["resources_per_trial"], + ) + # update cpus per trial only if it is provided in `resources_per_trial` + # Uses the default values (cpu=1) if it is not provided + if "cpu" in parallel_run_config["resources_per_trial"]: + resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier + evaluate_best_trial( + best_trial, + num_eval_seeds, + parallel_run_config, + resources_per_trial_eval, + return_key, + ) + + +def find_best_trial( + experiment_analysis: ray.tune.analysis.ExperimentAnalysis, + return_key: str, + print_return: bool = False, +) -> ray.tune.experiment.Trial: + """Find the trial with the best mean return across all seeds. + + Args: + experiment_analysis: The result of a parallel/tuning experiment. + return_key: The key of the return metric in the results dataframe. + print_return: Whether to print the mean and std of the returns + of the best trial. + + Returns: + best_trial: The trial with the best mean return across all seeds. + """ + df = experiment_analysis.results_df + # convert object dtype to str required by df.groupby + for col in df.columns: + if pd_types.is_object_dtype(df[col]): + df[col] = df[col].astype("str") + # group into separate HP configs + grp_keys = [c for c in df.columns if c.startswith("config") and "seed" not in c] + grps = df.groupby(grp_keys) + # store mean return of runs across all seeds in a group + df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) + best_config_df = df[df["mean_return"] == df["mean_return"].max()] + row = best_config_df.iloc[0] + best_config_tag = row["experiment_tag"] + assert experiment_analysis.trials is not None # for mypy + best_trial = [ + t for t in experiment_analysis.trials if best_config_tag in t.experiment_tag + ][0] + + if print_return: + all_returns = df[df["mean_return"] == row["mean_return"]][return_key] + all_returns = all_returns.to_numpy() + print("All returns:", all_returns) + print("Mean return:", row["mean_return"]) + print("Std return:", np.std(all_returns)) + print("Total seeds:", len(all_returns)) + return best_trial + + +def evaluate_best_trial( + best_trial: ray.tune.experiment.Trial, + num_eval_seeds: int, + parallel_run_config: Dict[str, Any], + resources_per_trial: Dict[str, int], + return_key: str, + print_return: bool = False, +): + """Evaluate the best trial of a parallel run on a separate set of seeds. + + Args: + best_trial: The trial with the best mean return across all seeds. 
+ num_eval_seeds: Number of distinct seeds to evaluate the best trial on. + parallel_run_config: Dictionary of arguments passed to the parallel + script to get best_trial. + resources_per_trial: Resources to be used for each evaluation trial. + return_key: The key of the return metric in the results dataframe. + print_return: Whether to print the mean and std of the evaluation returns. + + Returns: + eval_run: The result of the evaluation run. """ - run = parallel_ex.run(config_updates=parallel) - result = run.result - - if eval_best_trial: - if parallel["sacred_ex_name"] == "train_rl": - return_key = "monitor_return_mean" - else: - return_key = "imit_stats/monitor_return_mean" - df = result.results_df - df = df[df["config/named_configs"].notna()] - # convert object dtype to str required by df.groupby - for col in df.columns: - if pd_types.is_object_dtype(df[col]): - df[col] = df[col].astype("str") - # group into separate HP configs - grp_keys = [c for c in df.columns if c.startswith("config") and "seed" not in c] - grps = df.groupby(grp_keys) - # store mean return of runs across all seeds in a group - df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) - best_config_df = df[df["mean_return"] == df["mean_return"].max()] - row = best_config_df.iloc[0] - best_config_tag = row["experiment_tag"] - if result.trials is not None: - trial = [t for t in result.trials if best_config_tag in t.experiment_tag][0] - best_config = trial.config - print("Mean return:", row["mean_return"]) - print( - "All returns:", - df[df["mean_return"] == row["mean_return"]][return_key], - ) - print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) - - best_config["config_updates"].update( - seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), - ) - - resources_per_trial_eval = copy.deepcopy(parallel["resources_per_trial"]) - # update cpus per trial only if it is provided in `resources_per_trial` - # Uses the default values (cpu=1) if it is not provided - if "cpu" in parallel["resources_per_trial"]: - resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier - - eval_config_updates = parallel.copy() - eval_config_updates.update( - run_name=parallel["run_name"] + "_best_hp_eval", - num_samples=1, - search_space=best_config, - base_named_configs=parallel["base_named_configs"], - base_config_updates=parallel["base_config_updates"], - resources_per_trial=resources_per_trial_eval, - search_alg=None, - repeat=1, - experiment_checkpoint_path="", - resume=False, - ) - eval_run = parallel_ex.run(config_updates=eval_config_updates) - eval_result = eval_run.result - returns = eval_result.results_df[return_key].to_numpy() - print("All returns:", returns) - print("Mean:", np.mean(returns)) - print("Std:", np.std(returns)) + best_config = best_trial.config + best_config["config_updates"].update( + seed=ray.tune.grid_search(list(range(100, 100 + num_eval_seeds))), + ) + eval_config_updates = parallel_run_config.copy() + eval_config_updates.update( + run_name=parallel_run_config["run_name"] + "_best_hp_eval", + num_samples=1, + search_space=best_config, + resources_per_trial=resources_per_trial, + search_alg=None, + repeat=1, + experiment_checkpoint_path="", + ) + eval_run = parallel_ex.run(config_updates=eval_config_updates) + eval_result = eval_run.result + returns = eval_result.results_df[return_key].to_numpy() + if print_return: + print("All returns:", returns) + print("Mean:", np.mean(returns)) + print("Std:", np.std(returns)) + return eval_run def main_console(): diff 
--git a/benchmarking/tuning_config.py b/benchmarking/tuning_config.py index 79c8d0347..187963d02 100644 --- a/benchmarking/tuning_config.py +++ b/benchmarking/tuning_config.py @@ -12,7 +12,7 @@ @tuning_ex.named_config def example_rl(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_rl", run_name="rl_tuning", base_named_configs=["logging.wandb_logging"], @@ -33,13 +33,12 @@ def example_rl(): repeat=1, resources_per_trial=dict(cpu=1), ) - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 @tuning_ex.named_config def example_bc(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_imitation", run_name="bc_tuning", base_named_configs=["logging.wandb_logging"], @@ -62,19 +61,18 @@ def example_bc(): }, "command_name": "bc", }, - num_samples=2, - repeat=1, + num_samples=64, + repeat=3, resources_per_trial=dict(cpu=1), ) - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 eval_best_trial_resource_multiplier = 1 @tuning_ex.named_config def example_dagger(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_imitation", run_name="dagger_tuning", base_named_configs=["logging.wandb_logging"], @@ -109,13 +107,12 @@ def example_dagger(): repeat=3, resources_per_trial=dict(cpu=1), ) - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 @tuning_ex.named_config def example_gail(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_adversarial", run_name="gail_tuning_hc", base_named_configs=["logging.wandb_logging"], @@ -145,13 +142,12 @@ def example_gail(): repeat=3, resources_per_trial=dict(cpu=1), ) - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 @tuning_ex.named_config def example_airl(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_adversarial", run_name="airl_tuning", base_named_configs=["logging.wandb_logging"], @@ -181,14 +177,12 @@ def example_airl(): repeat=3, resources_per_trial=dict(cpu=1), ) - - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 @tuning_ex.named_config def example_pc(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_preference_comparisons", run_name="pc_tuning", base_named_configs=["logging.wandb_logging"], @@ -232,6 +226,4 @@ def example_pc(): repeat=3, resources_per_trial=dict(cpu=1), ) - - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index b09f9fc4a..b38b6f28c 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -38,9 +38,8 @@ def config(): local_dir = None # `local_dir` arg for `ray.tune.run` upload_dir = None # `upload_dir` arg for `ray.tune.run` - experiment_checkpoint_path = "" num_samples = 1 # Number of samples per grid search configuration - repeat = 1 + repeat = 1 # Number of times to repeat a sampled configuration search_alg = "optuna" # search algorithm to use experiment_checkpoint_path = "" # Path to checkpoint of experiment syncer = None # Sacred syncer to use diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index ebda17c82..93aa932b9 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -72,11 +72,13 @@ def parallel( init_kwargs: Arguments to pass to `ray.init`. local_dir: `local_dir` argument to `ray.tune.run()`. upload_dir: `upload_dir` argument to `ray.tune.run()`. 
- search_alg: can be either "optuna" or None. + search_alg: can be either "optuna" or None. Setting `None` allows for + adding grid_search to the `search_space` hyperparameters but doesn't allow + for trials to be repeated. repeat: Number of runs to repeat each trial for. Not used if `search_alg` is None. experiment_checkpoint_path: Path containing the checkpoints of a previous - experiment ran using this script. Useful for evaluating the best trial + experiment ran using this script. Useful for evaluating the best trial of the experiment. syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. @@ -84,7 +86,8 @@ def parallel( TypeError: Named configs not string sequences or config updates not mappings. Returns: - The result of `ray.tune.run()`. + The result of running the parallel experiment with `ray.tune.run()`. + Useful for fetching the configs and results dataframe of all the trials. """ # Basic validation for config options before we enter parallel jobs. if not isinstance(base_named_configs, collections.abc.Sequence): diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py index ba01b38a2..4a8f6ea6f 100644 --- a/tests/test_benchmarking.py +++ b/tests/test_benchmarking.py @@ -1,5 +1,7 @@ """Tests for config files in benchmarking/ folder.""" import pathlib +import subprocess +import sys import pytest @@ -44,3 +46,28 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str): # THEN assert run.status == "COMPLETED" + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_tuning_print_config_succeeds(algorithm: str): + # We test the configs using the print_config command, + # because running the configs requires MuJoCo. + # Requiring MuJoCo to run the tests adds too much complexity. + + # We need to use sys.executable, not just "python", on Windows as + # subprocess.call ignores PATH (unless shell=True) so runs a + # system-wide Python interpreter outside of our venv. 
See: + # https://stackoverflow.com/questions/5658622/ + tuning_path = str(BENCHMARKING_DIR / "tuning.py") + env = 'parallel_run_config.base_named_configs=["seals_cartpole"]' + exit_code = subprocess.call( + [ + sys.executable, + tuning_path, + "print_config", + "with", + f"example_{algorithm}", + env, + ], + ) + assert exit_code == 0 From 046b8d9987e13a8d87f2bd52fe75be562e80db04 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Sun, 16 Jul 2023 13:04:14 +0530 Subject: [PATCH 23/54] fix lint error --- src/imitation/scripts/config/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index b38b6f28c..e81a617db 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -39,7 +39,7 @@ def config(): local_dir = None # `local_dir` arg for `ray.tune.run` upload_dir = None # `upload_dir` arg for `ray.tune.run` num_samples = 1 # Number of samples per grid search configuration - repeat = 1 # Number of times to repeat a sampled configuration + repeat = 1 # Number of times to repeat a sampled configuration search_alg = "optuna" # search algorithm to use experiment_checkpoint_path = "" # Path to checkpoint of experiment syncer = None # Sacred syncer to use From 8eee0822d3fb4686d5801a6e955fdde0c9a90ce7 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Sun, 16 Jul 2023 13:52:43 +0530 Subject: [PATCH 24/54] Add documentation for using the tuning script --- benchmarking/README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/benchmarking/README.md b/benchmarking/README.md index 3f5114545..95e67f1d3 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -17,3 +17,24 @@ python -m imitation.scripts. with benchmarking/.json') ``` + +# Tuning Hyperparameters + +The hyperparameters of any algorithm in imitation can be tuned using the `tuning.py` script. +The benchmarking hyperparameter configs were generated by tuning the hyperparameters using +the search space defined in the `tuning_config.py` script. The tuning script proceeds in two +phases: 1) The hyperparameters are tuned using the search space provided, and 2) the best +hyperparameter config found in the first phase based on the maximum mean return is +re-evaluated on a separate set of seeds, and the mean and standard deviation of these trials +are reported. + +To tune the hyperparameters of an algorithm using the default search space provided: +```bash +python tuning.py with example_{algo} 'parallel_run_config.base_named_configs=["{env}"]' +``` + +In this command, `example_{algo}` provides the default search space and settings to be used for +the specific algorithm, which is defined in the `tuning_config.py` script and +`'parallel_run_config.base_named_configs=["{env}"]'` sets the environment to tune the algorithm in. +See the documentation of `tuning.py` and `parallel.py` scripts for many other arguments that can be +provided through the command line to change the tuning behavior. 
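As a concrete usage note on the README section added above: a small dry run of the two-phase tuning workflow might look like the command below. This is a sketch rather than part of the patch itself; it assumes `benchmarking/` is the working directory, that `seals_cartpole` is available as an environment named config (it is the one used by the tuning smoke test added earlier in this series), and that a tiny sweep with the final re-evaluation phase disabled is acceptable. The extra overrides are ordinary Sacred config updates into `parallel_run_config` and `num_eval_seeds`, not new command-line flags.

```bash
# Hypothetical dry run of the tuning script. `example_bc` supplies the BC
# search space (it is renamed to plain `bc` in a later patch of this series);
# `seals_cartpole` stands in for the target environment.
python tuning.py with example_bc \
    'parallel_run_config.base_named_configs=["seals_cartpole"]' \
    'parallel_run_config.num_samples=4' \
    'parallel_run_config.repeat=1' \
    num_eval_seeds=0  # skip the second (re-evaluation) phase
```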
From 5ce765859f7cd295ae607cab2709d0f626c65de7 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Mon, 17 Jul 2023 09:08:04 +0530 Subject: [PATCH 25/54] Fix lint error --- benchmarking/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index 95e67f1d3..892908ac8 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -23,7 +23,7 @@ ex.add_config('benchmarking/.json') The hyperparameters of any algorithm in imitation can be tuned using the `tuning.py` script. The benchmarking hyperparameter configs were generated by tuning the hyperparameters using the search space defined in the `tuning_config.py` script. The tuning script proceeds in two -phases: 1) The hyperparameters are tuned using the search space provided, and 2) the best +phases: 1) The hyperparameters are tuned using the search space provided, and 2) the best hyperparameter config found in the first phase based on the maximum mean return is re-evaluated on a separate set of seeds, and the mean and standard deviation of these trials are reported. From a8be3316b653451ce8366379cf413627dd22e1ec Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 18 Jul 2023 11:09:05 +0530 Subject: [PATCH 26/54] Updates from the review --- benchmarking/README.md | 4 ++-- ....json => airl_seals_ant_best_hp_eval.json} | 0 ...airl_seals_half_cheetah_best_hp_eval.json} | 0 ...on => airl_seals_hopper_best_hp_eval.json} | 0 ...n => airl_seals_swimmer_best_hp_eval.json} | 0 ...on => airl_seals_walker_best_hp_eval.json} | 0 ...al.json => bc_seals_ant_best_hp_eval.json} | 0 ...> bc_seals_half_cheetah_best_hp_eval.json} | 0 ...json => bc_seals_hopper_best_hp_eval.json} | 0 ...son => bc_seals_swimmer_best_hp_eval.json} | 0 ...json => bc_seals_walker_best_hp_eval.json} | 0 ...son => dagger_seals_ant_best_hp_eval.json} | 0 ...gger_seals_half_cheetah_best_hp_eval.json} | 0 ... => dagger_seals_hopper_best_hp_eval.json} | 0 ...=> dagger_seals_swimmer_best_hp_eval.json} | 0 ... 
=> dagger_seals_walker_best_hp_eval.json} | 0 ....json => gail_seals_ant_best_hp_eval.json} | 0 ...gail_seals_half_cheetah_best_hp_eval.json} | 0 ...on => gail_seals_hopper_best_hp_eval.json} | 0 ...n => gail_seals_swimmer_best_hp_eval.json} | 0 ...on => gail_seals_walker_best_hp_eval.json} | 0 benchmarking/tuning.py | 23 +++++++++++-------- benchmarking/tuning_config.py | 21 +++++++++-------- benchmarking/util.py | 2 +- experiments/commands.py | 18 +++++++-------- src/imitation/scripts/config/parallel.py | 6 ++--- tests/test_benchmarking.py | 4 ++-- tests/test_experiments.py | 16 ++++++------- 28 files changed, 49 insertions(+), 45 deletions(-) rename benchmarking/{example_airl_seals_ant_best_hp_eval.json => airl_seals_ant_best_hp_eval.json} (100%) rename benchmarking/{example_airl_seals_half_cheetah_best_hp_eval.json => airl_seals_half_cheetah_best_hp_eval.json} (100%) rename benchmarking/{example_airl_seals_hopper_best_hp_eval.json => airl_seals_hopper_best_hp_eval.json} (100%) rename benchmarking/{example_airl_seals_swimmer_best_hp_eval.json => airl_seals_swimmer_best_hp_eval.json} (100%) rename benchmarking/{example_airl_seals_walker_best_hp_eval.json => airl_seals_walker_best_hp_eval.json} (100%) rename benchmarking/{example_bc_seals_ant_best_hp_eval.json => bc_seals_ant_best_hp_eval.json} (100%) rename benchmarking/{example_bc_seals_half_cheetah_best_hp_eval.json => bc_seals_half_cheetah_best_hp_eval.json} (100%) rename benchmarking/{example_bc_seals_hopper_best_hp_eval.json => bc_seals_hopper_best_hp_eval.json} (100%) rename benchmarking/{example_bc_seals_swimmer_best_hp_eval.json => bc_seals_swimmer_best_hp_eval.json} (100%) rename benchmarking/{example_bc_seals_walker_best_hp_eval.json => bc_seals_walker_best_hp_eval.json} (100%) rename benchmarking/{example_dagger_seals_ant_best_hp_eval.json => dagger_seals_ant_best_hp_eval.json} (100%) rename benchmarking/{example_dagger_seals_half_cheetah_best_hp_eval.json => dagger_seals_half_cheetah_best_hp_eval.json} (100%) rename benchmarking/{example_dagger_seals_hopper_best_hp_eval.json => dagger_seals_hopper_best_hp_eval.json} (100%) rename benchmarking/{example_dagger_seals_swimmer_best_hp_eval.json => dagger_seals_swimmer_best_hp_eval.json} (100%) rename benchmarking/{example_dagger_seals_walker_best_hp_eval.json => dagger_seals_walker_best_hp_eval.json} (100%) rename benchmarking/{example_gail_seals_ant_best_hp_eval.json => gail_seals_ant_best_hp_eval.json} (100%) rename benchmarking/{example_gail_seals_half_cheetah_best_hp_eval.json => gail_seals_half_cheetah_best_hp_eval.json} (100%) rename benchmarking/{example_gail_seals_hopper_best_hp_eval.json => gail_seals_hopper_best_hp_eval.json} (100%) rename benchmarking/{example_gail_seals_swimmer_best_hp_eval.json => gail_seals_swimmer_best_hp_eval.json} (100%) rename benchmarking/{example_gail_seals_walker_best_hp_eval.json => gail_seals_walker_best_hp_eval.json} (100%) diff --git a/benchmarking/README.md b/benchmarking/README.md index 892908ac8..3973c6181 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -30,10 +30,10 @@ are reported. 
To tune the hyperparameters of an algorithm using the default search space provided: ```bash -python tuning.py with example_{algo} 'parallel_run_config.base_named_configs=["{env}"]' +python tuning.py with {algo} 'parallel_run_config.base_named_configs=["{env}"]' ``` -In this command, `example_{algo}` provides the default search space and settings to be used for +In this command, `{algo}` provides the default search space and settings to be used for the specific algorithm, which is defined in the `tuning_config.py` script and `'parallel_run_config.base_named_configs=["{env}"]'` sets the environment to tune the algorithm in. See the documentation of `tuning.py` and `parallel.py` scripts for many other arguments that can be diff --git a/benchmarking/example_airl_seals_ant_best_hp_eval.json b/benchmarking/airl_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/example_airl_seals_ant_best_hp_eval.json rename to benchmarking/airl_seals_ant_best_hp_eval.json diff --git a/benchmarking/example_airl_seals_half_cheetah_best_hp_eval.json b/benchmarking/airl_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/example_airl_seals_half_cheetah_best_hp_eval.json rename to benchmarking/airl_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/example_airl_seals_hopper_best_hp_eval.json b/benchmarking/airl_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/example_airl_seals_hopper_best_hp_eval.json rename to benchmarking/airl_seals_hopper_best_hp_eval.json diff --git a/benchmarking/example_airl_seals_swimmer_best_hp_eval.json b/benchmarking/airl_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/example_airl_seals_swimmer_best_hp_eval.json rename to benchmarking/airl_seals_swimmer_best_hp_eval.json diff --git a/benchmarking/example_airl_seals_walker_best_hp_eval.json b/benchmarking/airl_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/example_airl_seals_walker_best_hp_eval.json rename to benchmarking/airl_seals_walker_best_hp_eval.json diff --git a/benchmarking/example_bc_seals_ant_best_hp_eval.json b/benchmarking/bc_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/example_bc_seals_ant_best_hp_eval.json rename to benchmarking/bc_seals_ant_best_hp_eval.json diff --git a/benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json b/benchmarking/bc_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json rename to benchmarking/bc_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/example_bc_seals_hopper_best_hp_eval.json b/benchmarking/bc_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/example_bc_seals_hopper_best_hp_eval.json rename to benchmarking/bc_seals_hopper_best_hp_eval.json diff --git a/benchmarking/example_bc_seals_swimmer_best_hp_eval.json b/benchmarking/bc_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/example_bc_seals_swimmer_best_hp_eval.json rename to benchmarking/bc_seals_swimmer_best_hp_eval.json diff --git a/benchmarking/example_bc_seals_walker_best_hp_eval.json b/benchmarking/bc_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/example_bc_seals_walker_best_hp_eval.json rename to benchmarking/bc_seals_walker_best_hp_eval.json diff --git a/benchmarking/example_dagger_seals_ant_best_hp_eval.json b/benchmarking/dagger_seals_ant_best_hp_eval.json 
similarity index 100% rename from benchmarking/example_dagger_seals_ant_best_hp_eval.json rename to benchmarking/dagger_seals_ant_best_hp_eval.json diff --git a/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json b/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json rename to benchmarking/dagger_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/example_dagger_seals_hopper_best_hp_eval.json b/benchmarking/dagger_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/example_dagger_seals_hopper_best_hp_eval.json rename to benchmarking/dagger_seals_hopper_best_hp_eval.json diff --git a/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json b/benchmarking/dagger_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/example_dagger_seals_swimmer_best_hp_eval.json rename to benchmarking/dagger_seals_swimmer_best_hp_eval.json diff --git a/benchmarking/example_dagger_seals_walker_best_hp_eval.json b/benchmarking/dagger_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/example_dagger_seals_walker_best_hp_eval.json rename to benchmarking/dagger_seals_walker_best_hp_eval.json diff --git a/benchmarking/example_gail_seals_ant_best_hp_eval.json b/benchmarking/gail_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/example_gail_seals_ant_best_hp_eval.json rename to benchmarking/gail_seals_ant_best_hp_eval.json diff --git a/benchmarking/example_gail_seals_half_cheetah_best_hp_eval.json b/benchmarking/gail_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/example_gail_seals_half_cheetah_best_hp_eval.json rename to benchmarking/gail_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/example_gail_seals_hopper_best_hp_eval.json b/benchmarking/gail_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/example_gail_seals_hopper_best_hp_eval.json rename to benchmarking/gail_seals_hopper_best_hp_eval.json diff --git a/benchmarking/example_gail_seals_swimmer_best_hp_eval.json b/benchmarking/gail_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/example_gail_seals_swimmer_best_hp_eval.json rename to benchmarking/gail_seals_swimmer_best_hp_eval.json diff --git a/benchmarking/example_gail_seals_walker_best_hp_eval.json b/benchmarking/gail_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/example_gail_seals_walker_best_hp_eval.json rename to benchmarking/gail_seals_walker_best_hp_eval.json diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py index 0c18b1256..324032088 100644 --- a/benchmarking/tuning.py +++ b/benchmarking/tuning.py @@ -30,7 +30,7 @@ def tune( Set to 0 to disable evaluation. Raises: - ValueError: If no trials are returned by. + ValueError: If no trials are returned by the parallel run of tuning. 
""" run = parallel_ex.run(config_updates=parallel_run_config) experiment_analysis = run.result @@ -54,9 +54,10 @@ def tune( # Uses the default values (cpu=1) if it is not provided if "cpu" in parallel_run_config["resources_per_trial"]: resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier - evaluate_best_trial( + evaluate_trial( best_trial, num_eval_seeds, + parallel_run_config["run_name"] + "_best_hp_eval", parallel_run_config, resources_per_trial_eval, return_key, @@ -107,19 +108,21 @@ def find_best_trial( return best_trial -def evaluate_best_trial( - best_trial: ray.tune.experiment.Trial, +def evaluate_trial( + trial: ray.tune.experiment.Trial, num_eval_seeds: int, + run_name: str, parallel_run_config: Dict[str, Any], resources_per_trial: Dict[str, int], return_key: str, print_return: bool = False, ): - """Evaluate the best trial of a parallel run on a separate set of seeds. + """Evaluate a given trial of a parallel run on a separate set of seeds. Args: - best_trial: The trial with the best mean return across all seeds. + trial: The trial to evaluate. num_eval_seeds: Number of distinct seeds to evaluate the best trial on. + run_name: The name of the evaluation run. parallel_run_config: Dictionary of arguments passed to the parallel script to get best_trial. resources_per_trial: Resources to be used for each evaluation trial. @@ -129,15 +132,15 @@ def evaluate_best_trial( Returns: eval_run: The result of the evaluation run. """ - best_config = best_trial.config - best_config["config_updates"].update( + config = trial.config + config["config_updates"].update( seed=ray.tune.grid_search(list(range(100, 100 + num_eval_seeds))), ) eval_config_updates = parallel_run_config.copy() eval_config_updates.update( - run_name=parallel_run_config["run_name"] + "_best_hp_eval", + run_name=run_name, num_samples=1, - search_space=best_config, + search_space=config, resources_per_trial=resources_per_trial, search_alg=None, repeat=1, diff --git a/benchmarking/tuning_config.py b/benchmarking/tuning_config.py index 187963d02..239537406 100644 --- a/benchmarking/tuning_config.py +++ b/benchmarking/tuning_config.py @@ -4,14 +4,14 @@ import sacred from torch import nn -from imitation.algorithms import dagger +from imitation.algorithms import dagger as dagger_alg from imitation.scripts.parallel import parallel_ex tuning_ex = sacred.Experiment("tuning", ingredients=[parallel_ex]) @tuning_ex.named_config -def example_rl(): +def rl(): parallel_run_config = dict( sacred_ex_name="train_rl", run_name="rl_tuning", @@ -37,7 +37,7 @@ def example_rl(): @tuning_ex.named_config -def example_bc(): +def bc(): parallel_run_config = dict( sacred_ex_name="train_imitation", run_name="bc_tuning", @@ -71,7 +71,7 @@ def example_bc(): @tuning_ex.named_config -def example_dagger(): +def dagger(): parallel_run_config = dict( sacred_ex_name="train_imitation", run_name="dagger_tuning", @@ -95,8 +95,11 @@ def example_dagger(): ), "dagger": dict( beta_schedule=tune.choice( - [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]] - + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], + [dagger_alg.LinearBetaSchedule(i) for i in [1, 5, 15]] + + [ + dagger_alg.ExponentialBetaSchedule(i) + for i in [0.3, 0.5, 0.7] + ], ), rollout_round_min_episodes=tune.choice([3, 5, 10]), ), @@ -111,7 +114,7 @@ def example_dagger(): @tuning_ex.named_config -def example_gail(): +def gail(): parallel_run_config = dict( sacred_ex_name="train_adversarial", run_name="gail_tuning_hc", @@ -146,7 +149,7 @@ def example_gail(): 
@tuning_ex.named_config -def example_airl(): +def airl(): parallel_run_config = dict( sacred_ex_name="train_adversarial", run_name="airl_tuning", @@ -181,7 +184,7 @@ def example_airl(): @tuning_ex.named_config -def example_pc(): +def pc(): parallel_run_config = dict( sacred_ex_name="train_preference_comparisons", run_name="pc_tuning", diff --git a/benchmarking/util.py b/benchmarking/util.py index 408f0d812..88416344d 100644 --- a/benchmarking/util.py +++ b/benchmarking/util.py @@ -79,7 +79,7 @@ def clean_config_file(file: pathlib.Path, write_path: pathlib.Path, /) -> None: remove_empty_dicts(config) # files are of the format - # /path/to/file/example___best_hp_eval//sacred/1/config.json + # /path/to/file/__best_hp_eval//sacred/1/config.json # we want to write to //_.json with open(write_path / f"{file.parents[3].name}.json", "w") as f: json.dump(config, f, indent=4) diff --git a/experiments/commands.py b/experiments/commands.py index 2ac737e06..9021d3738 100644 --- a/experiments/commands.py +++ b/experiments/commands.py @@ -22,13 +22,13 @@ python -m imitation.scripts.train_adversarial airl \ --capture=sys --name=run0 \ --file_storage=output/sacred/$USER-cmd-run0-airl-0-a3531726 \ - with ../benchmarking/example_airl_seals_walker_best_hp_eval.json \ + with ../benchmarking/airl_seals_walker_best_hp_eval.json \ seed=0 logging.log_root=output python -m imitation.scripts.train_adversarial gail \ --capture=sys --name=run0 \ --file_storage=output/sacred/$USER-cmd-run0-gail-0-a1ec171b \ - with ../benchmarking/example_gail_seals_walker_best_hp_eval.json \ + with ../benchmarking/gail_seals_walker_best_hp_eval.json \ seed=0 logging.log_root=output We can execute commands in parallel by piping them to GNU parallel: @@ -42,7 +42,7 @@ python commands.py \ --name=run0 \ - --cfg_pattern=../benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json \ + --cfg_pattern=../benchmarking/bc_seals_half_cheetah_best_hp_eval.json \ --output_dir=/data/output \ --remote @@ -52,7 +52,7 @@ --command "python -m imitation.scripts.train_imitation bc \ --capture=sys --name=run0 \ --file_storage=/data/output/sacred/$USER-cmd-run0-bc-0-72cb1df3 \ - with /data/imitation/benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json \ + with /data/imitation/benchmarking/bc_seals_half_cheetah_best_hp_eval.json \ seed=0 logging.log_root=/data/output" \ --container hacobe/devbox:imitation \ --login --force-pull --never-restart --gpu 0 --shared-host-dir-mount /data @@ -177,19 +177,19 @@ def parse() -> argparse.Namespace: parser.add_argument( "--cfg_pattern", type=str, - default="example_bc_seals_half_cheetah_best_hp_eval.json", + default="bc_seals_half_cheetah_best_hp_eval.json", help="""Generate a command for every file that matches this glob pattern. \ Each matching file should be a config file that has its algorithm name \ (bc, dagger, airl or gail) bookended by underscores in the filename. \ If the --remote flag is enabled, then generate a command for every file in the \ --remote_cfg_dir directory that has the same filename as a file that matches this \ glob pattern. E.g., suppose the current, local working directory is 'foo' and \ -the subdirectory 'foo/bar' contains the config files 'example_bc_best.json' and \ -'example_dagger_best.json'. If the pattern 'bar/*.json' is supplied, then globbing \ -will return ['bar/example_bc_best.json', 'bar/example_dagger_best.json']. \ +the subdirectory 'foo/bar' contains the config files 'bc_best.json' and \ +'dagger_best.json'. 
If the pattern 'bar/*.json' is supplied, then globbing \ +will return ['bar/bc_best.json', 'bar/dagger_best.json']. \ If the --remote flag is enabled, 'bar' will be replaced with `remote_cfg_dir` and \ commands will be created for the following configs: \ -[`remote_cfg_dir`/example_bc_best.json, `remote_cfg_dir`/example_dagger_best.json] \ +[`remote_cfg_dir`/bc_best.json, `remote_cfg_dir`/dagger_best.json] \ Why not just supply the pattern '`remote_cfg_dir`/*.json' directly? \ Because the `remote_cfg_dir` directory may not exist on the local machine.""", ) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index e81a617db..a591f3d9a 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -7,10 +7,8 @@ Adding custom named configs is necessary because the CLI interface can't add search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`. -For tuning hyperparameters of an algorithm on a given environment, override -the `base_named_configs` argument with the named config of the environment. -Ex: python -m imitation.scripts.parallel with example_gail \ - 'base_named_configs=["logging.wandb_logging", "seals_half_cheetah"]' +For tuning hyperparameters of an algorithm on a given environment, +check out the benchmarking/tuning.py script. """ import numpy as np diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py index 4a8f6ea6f..18d4f12cf 100644 --- a/tests/test_benchmarking.py +++ b/tests/test_benchmarking.py @@ -37,7 +37,7 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str): config_name = f"{algorithm}_{environment}" config_file = str( - BENCHMARKING_DIR / f"example_{algorithm}_{environment}_best_hp_eval.json", + BENCHMARKING_DIR / f"{algorithm}_{environment}_best_hp_eval.json", ) # WHEN @@ -66,7 +66,7 @@ def test_tuning_print_config_succeeds(algorithm: str): tuning_path, "print_config", "with", - f"example_{algorithm}", + f"{algorithm}", env, ], ) diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 0f6d314fe..0d431d0e9 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -245,13 +245,13 @@ def test_commands_hofvarpnir_config_with_special_characters_in_flags(tmpdir): def test_commands_bc_config(): if os.name == "nt": # pragma: no cover pytest.skip("commands.py not ported to Windows.") - cfg_pattern = _get_benchmarking_path("example_bc_seals_ant_best_hp_eval.json") + cfg_pattern = _get_benchmarking_path("bc_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 expected = """python -m imitation.scripts.train_imitation bc \ --capture=sys --name=run0 --file_storage=output/sacred/\ $USER-cmd-run0-bc-0-138a1475 \ -with benchmarking/example_bc_seals_ant_best_hp_eval.json \ +with benchmarking/bc_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -259,13 +259,13 @@ def test_commands_bc_config(): def test_commands_dagger_config(): if os.name == "nt": # pragma: no cover pytest.skip("commands.py not ported to Windows.") - cfg_pattern = _get_benchmarking_path("example_dagger_seals_ant_best_hp_eval.json") + cfg_pattern = _get_benchmarking_path("dagger_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 expected = """python -m imitation.scripts.train_imitation dagger \ --capture=sys --name=run0 --file_storage=output/sacred/\ 
$USER-cmd-run0-dagger-0-6a49161a \ -with benchmarking/example_dagger_seals_ant_best_hp_eval.json \ +with benchmarking/dagger_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -273,13 +273,13 @@ def test_commands_dagger_config(): def test_commands_gail_config(): if os.name == "nt": # pragma: no cover pytest.skip("commands.py not ported to Windows.") - cfg_pattern = _get_benchmarking_path("example_gail_seals_ant_best_hp_eval.json") + cfg_pattern = _get_benchmarking_path("gail_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 expected = """python -m imitation.scripts.train_adversarial gail \ --capture=sys --name=run0 --file_storage=output/sacred/\ $USER-cmd-run0-gail-0-3ec8154d \ -with benchmarking/example_gail_seals_ant_best_hp_eval.json \ +with benchmarking/gail_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -287,13 +287,13 @@ def test_commands_gail_config(): def test_commands_airl_config(): if os.name == "nt": # pragma: no cover pytest.skip("commands.py not ported to Windows.") - cfg_pattern = _get_benchmarking_path("example_airl_seals_ant_best_hp_eval.json") + cfg_pattern = _get_benchmarking_path("airl_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 expected = """python -m imitation.scripts.train_adversarial airl \ --capture=sys --name=run0 \ --file_storage=output/sacred/$USER-cmd-run0-airl-0-400e1558 \ -with benchmarking/example_airl_seals_ant_best_hp_eval.json \ +with benchmarking/airl_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected From 4ff006d1f2162c8f5085c1f824a19090846dd23c Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 18 Jul 2023 12:06:30 +0530 Subject: [PATCH 27/54] Fix file name test errors --- experiments/commands.py | 2 +- tests/test_experiments.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/experiments/commands.py b/experiments/commands.py index 9021d3738..738a55011 100644 --- a/experiments/commands.py +++ b/experiments/commands.py @@ -85,7 +85,7 @@ def _get_algo_name(cfg_file: str) -> str: """Get the algorithm name from the given config filename.""" algo_names = set() for key in _ALGO_NAME_TO_SCRIPT_NAME: - if cfg_file.find("_" + key + "_") != -1: + if cfg_file.find(key + "_") != -1: algo_names.add(key) if len(algo_names) == 0: diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 0d431d0e9..b2417a9f9 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -250,7 +250,7 @@ def test_commands_bc_config(): assert len(commands) == 1 expected = """python -m imitation.scripts.train_imitation bc \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-bc-0-138a1475 \ +$USER-cmd-run0-bc-0-78e5112a \ with benchmarking/bc_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -264,7 +264,7 @@ def test_commands_dagger_config(): assert len(commands) == 1 expected = """python -m imitation.scripts.train_imitation dagger \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-dagger-0-6a49161a \ +$USER-cmd-run0-dagger-0-c27812cf \ with benchmarking/dagger_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -278,7 +278,7 @@ def test_commands_gail_config(): assert len(commands) == 1 expected = 
"""python -m imitation.scripts.train_adversarial gail \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-gail-0-3ec8154d \ +$USER-cmd-run0-gail-0-9d8d1202 \ with benchmarking/gail_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -292,7 +292,7 @@ def test_commands_airl_config(): assert len(commands) == 1 expected = """python -m imitation.scripts.train_adversarial airl \ --capture=sys --name=run0 \ ---file_storage=output/sacred/$USER-cmd-run0-airl-0-400e1558 \ +--file_storage=output/sacred/$USER-cmd-run0-airl-0-9ed3120d \ with benchmarking/airl_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected From 6933afacb22c555fcd70a833041bd716d2d78807 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 19 Jul 2023 14:41:39 +0530 Subject: [PATCH 28/54] Add tune_run_kwargs in parallel script --- src/imitation/scripts/config/parallel.py | 3 -- src/imitation/scripts/parallel.py | 39 +++++++++++------------- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index a591f3d9a..4773b713e 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -34,13 +34,10 @@ def config(): "config_updates": {}, } # `config` argument to `ray.tune.run(trainable, config)` - local_dir = None # `local_dir` arg for `ray.tune.run` - upload_dir = None # `upload_dir` arg for `ray.tune.run` num_samples = 1 # Number of samples per grid search configuration repeat = 1 # Number of times to repeat a sampled configuration search_alg = "optuna" # search algorithm to use experiment_checkpoint_path = "" # Path to checkpoint of experiment - syncer = None # Sacred syncer to use # Debug named configs diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 93aa932b9..7bf3db16f 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -26,12 +26,9 @@ def parallel( base_config_updates: Mapping[str, Any], resources_per_trial: Dict[str, Any], init_kwargs: Mapping[str, Any], - local_dir: Optional[str], - upload_dir: Optional[str], repeat: int, - search_alg: Optional[str], experiment_checkpoint_path: str, - syncer, + tune_run_kwargs: Dict[str, Any], ) -> ray.tune.ExperimentAnalysis: """Parallelize multiple runs of another Sacred Experiment using Ray Tune. @@ -70,17 +67,13 @@ def parallel( generated Ray directory name, unlike config updates from `search_space`. resources_per_trial: Argument to `ray.tune.run()`. init_kwargs: Arguments to pass to `ray.init`. - local_dir: `local_dir` argument to `ray.tune.run()`. - upload_dir: `upload_dir` argument to `ray.tune.run()`. - search_alg: can be either "optuna" or None. Setting `None` allows for - adding grid_search to the `search_space` hyperparameters but doesn't allow - for trials to be repeated. repeat: Number of runs to repeat each trial for. - Not used if `search_alg` is None. + If `repeat` > 1, then optuna is used as the default search algorithm + unless specified otherwise in `tune_run_kwargs`. experiment_checkpoint_path: Path containing the checkpoints of a previous experiment ran using this script. Useful for evaluating the best trial of the experiment. - syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. + tune_run_kwargs: Other arguments to pass to `ray.tune.run()`. Raises: TypeError: Named configs not string sequences or config updates not mappings. 
@@ -118,11 +111,18 @@ def parallel( ) ray.init(**init_kwargs) - if search_alg == "optuna": - algo = search.Repeater(optuna.OptunaSearch(), repeat=repeat) - else: - assert repeat == 1 # repeat should not be used if search_alg is None - algo = None + if repeat > 1: + if "search_alg" not in tune_run_kwargs: + tune_run_kwargs["search_alg"] = optuna.OptunaSearch() + try: + algo = tune_run_kwargs["search_alg"] + algo = search.Repeater(algo, repeat) + tune_run_kwargs["search_alg"] = algo + except AttributeError: + raise ValueError( + "repeat > 1 but search_alg is not an instance of " + "ray.tune.search.SearchAlgorithm", + ) if sacred_ex_name == "train_rl": return_key = "monitor_return_mean" @@ -145,15 +145,10 @@ def parallel( config=search_space, num_samples=num_samples * repeat, name=run_name, - local_dir=local_dir, resources_per_trial=resources_per_trial, - sync_config=ray.tune.syncer.SyncConfig( - upload_dir=upload_dir, - syncer=syncer, - ), - search_alg=algo, metric=return_key, mode="max", + **tune_run_kwargs, ) return result finally: From 77f9d9b74ddcb42e9181f9f493ca2f144b6a443f Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 19 Jul 2023 16:10:15 +0530 Subject: [PATCH 29/54] Fix test errors --- src/imitation/scripts/config/parallel.py | 1 + src/imitation/scripts/parallel.py | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index 4773b713e..bdc591422 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -38,6 +38,7 @@ def config(): repeat = 1 # Number of times to repeat a sampled configuration search_alg = "optuna" # search algorithm to use experiment_checkpoint_path = "" # Path to checkpoint of experiment + tune_run_kwargs = {} # Additional kwargs to pass to `tune.run` # Debug named configs diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 7bf3db16f..65a72eae3 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -111,13 +111,14 @@ def parallel( ) ray.init(**init_kwargs) + updated_tune_run_kwargs = copy.deepcopy(tune_run_kwargs) if repeat > 1: - if "search_alg" not in tune_run_kwargs: - tune_run_kwargs["search_alg"] = optuna.OptunaSearch() + if "search_alg" not in updated_tune_run_kwargs: + updated_tune_run_kwargs["search_alg"] = optuna.OptunaSearch() try: - algo = tune_run_kwargs["search_alg"] + algo = updated_tune_run_kwargs["search_alg"] algo = search.Repeater(algo, repeat) - tune_run_kwargs["search_alg"] = algo + updated_tune_run_kwargs["search_alg"] = algo except AttributeError: raise ValueError( "repeat > 1 but search_alg is not an instance of " @@ -148,7 +149,7 @@ def parallel( resources_per_trial=resources_per_trial, metric=return_key, mode="max", - **tune_run_kwargs, + **updated_tune_run_kwargs, ) return result finally: From 54eb8a6f44ea599236b6165fa5de9079df7ca49a Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 19 Jul 2023 16:31:49 +0530 Subject: [PATCH 30/54] Fix test --- tests/scripts/test_scripts.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index e17765471..146048c42 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -969,7 +969,10 @@ def test_analyze_gather_tb(tmpdir: str): if os.name == "nt": # pragma: no cover pytest.skip("gather_tb uses symlinks: not supported by 
Windows") num_runs = 2 - config_updates: Dict[str, Any] = dict(local_dir=tmpdir, run_name="test") + config_updates: Dict[str, Any] = dict( + tune_run_kwargs=dict(local_dir=tmpdir), + run_name="test", + ) config_updates.update(PARALLEL_CONFIG_LOW_RESOURCE) config_updates.update(num_samples=num_runs) parallel_run = parallel.parallel_ex.run( From d50238f1b900b05296d081954624cac9e2bcf6ab Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 19 Jul 2023 17:02:37 +0530 Subject: [PATCH 31/54] Fix lint --- src/imitation/scripts/parallel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 65a72eae3..a7a08064b 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -4,7 +4,7 @@ import copy import glob import pathlib -from typing import Any, Callable, Dict, Mapping, Optional, Sequence +from typing import Any, Callable, Dict, Mapping, Sequence import ray import ray.tune @@ -77,6 +77,8 @@ def parallel( Raises: TypeError: Named configs not string sequences or config updates not mappings. + ValueError: `repeat` > 1 but `search_alg` is not an instance of + `ray.tune.search.SearchAlgorithm`. Returns: The result of running the parallel experiment with `ray.tune.run()`. From 3fe22d4e6904c60c581a69004788b08b0184c8ed Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 19 Jul 2023 21:37:18 +0530 Subject: [PATCH 32/54] Updates from review --- benchmarking/tuning.py | 21 +++++++++++++++------ src/imitation/scripts/config/parallel.py | 1 - src/imitation/scripts/parallel.py | 2 +- tests/scripts/test_scripts.py | 1 - 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py index 324032088..409d0b5af 100644 --- a/benchmarking/tuning.py +++ b/benchmarking/tuning.py @@ -7,6 +7,7 @@ import numpy as np import ray from pandas.api import types as pd_types +from ray.tune.search import optuna from sacred.observers import FileStorageObserver from tuning_config import parallel_ex, tuning_ex @@ -32,7 +33,15 @@ def tune( Raises: ValueError: If no trials are returned by the parallel run of tuning. 
""" - run = parallel_ex.run(config_updates=parallel_run_config) + search_alg = optuna.OptunaSearch() + updated_parallel_run_config = copy.deepcopy(parallel_run_config) + if "tune_run_kwargs" not in updated_parallel_run_config: + tune_run_kwargs = {} + else: + tune_run_kwargs = updated_parallel_run_config["tune_run_kwargs"] + tune_run_kwargs.update(search_alg=search_alg) + updated_parallel_run_config.update(tune_run_kwargs=tune_run_kwargs) + run = parallel_ex.run(config_updates=updated_parallel_run_config) experiment_analysis = run.result if not experiment_analysis.trials: raise ValueError( @@ -42,23 +51,23 @@ def tune( ) return_key = "imit_stats/monitor_return_mean" - if parallel_run_config["sacred_ex_name"] == "train_rl": + if updated_parallel_run_config["sacred_ex_name"] == "train_rl": return_key = "monitor_return_mean" best_trial = find_best_trial(experiment_analysis, return_key, print_return=True) if num_eval_seeds > 0: # evaluate the best trial resources_per_trial_eval = copy.deepcopy( - parallel_run_config["resources_per_trial"], + updated_parallel_run_config["resources_per_trial"], ) # update cpus per trial only if it is provided in `resources_per_trial` # Uses the default values (cpu=1) if it is not provided - if "cpu" in parallel_run_config["resources_per_trial"]: + if "cpu" in updated_parallel_run_config["resources_per_trial"]: resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier evaluate_trial( best_trial, num_eval_seeds, - parallel_run_config["run_name"] + "_best_hp_eval", - parallel_run_config, + updated_parallel_run_config["run_name"] + "_best_hp_eval", + updated_parallel_run_config, resources_per_trial_eval, return_key, ) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index bdc591422..c9c898feb 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -36,7 +36,6 @@ def config(): num_samples = 1 # Number of samples per grid search configuration repeat = 1 # Number of times to repeat a sampled configuration - search_alg = "optuna" # search algorithm to use experiment_checkpoint_path = "" # Path to checkpoint of experiment tune_run_kwargs = {} # Additional kwargs to pass to `tune.run` diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index a7a08064b..57503d6e0 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -34,7 +34,7 @@ def parallel( A Sacred FileObserver is attached to the inner experiment and writes Sacred logs to "{RAY_LOCAL_DIR}/sacred/". These files are automatically copied over - to `upload_dir` if that argument is provided. + to `upload_dir` if that argument is provided in `tune_run_kwargs`. Args: sacred_ex_name: The Sacred experiment to tune. Either "train_rl" or diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 146048c42..7ff241323 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -802,7 +802,6 @@ def test_train_rl_cnn_policy(tmpdir: str, rng): # Need absolute path because raylet runs in different working directory. "demonstrations.path": CARTPOLE_TEST_ROLLOUT_PATH.absolute(), }, - search_alg=None, # Use default search algorithm of ray. 
search_space={ "command_name": "airl", "config_updates": {"total_timesteps": tune.choice([5, 10])}, From c50aa20ddfa9f7ce5987a3fd08083d22757925a7 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 20 Jul 2023 16:19:04 +0530 Subject: [PATCH 33/54] Simplify few lines of code --- benchmarking/tuning.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py index 409d0b5af..9c3f52498 100644 --- a/benchmarking/tuning.py +++ b/benchmarking/tuning.py @@ -33,14 +33,12 @@ def tune( Raises: ValueError: If no trials are returned by the parallel run of tuning. """ - search_alg = optuna.OptunaSearch() updated_parallel_run_config = copy.deepcopy(parallel_run_config) - if "tune_run_kwargs" not in updated_parallel_run_config: - tune_run_kwargs = {} + search_alg = optuna.OptunaSearch() + if "tune_run_kwargs" in updated_parallel_run_config: + updated_parallel_run_config["tune_run_kwargs"]["search_alg"] = search_alg else: - tune_run_kwargs = updated_parallel_run_config["tune_run_kwargs"] - tune_run_kwargs.update(search_alg=search_alg) - updated_parallel_run_config.update(tune_run_kwargs=tune_run_kwargs) + updated_parallel_run_config["tune_run_kwargs"] = dict(search_alg=search_alg) run = parallel_ex.run(config_updates=updated_parallel_run_config) experiment_analysis = run.result if not experiment_analysis.trials: From 000af616fb159c165f4806df11d865ee2a6b3663 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Fri, 4 Aug 2023 21:54:48 +0530 Subject: [PATCH 34/54] Updates from review --- benchmarking/README.md | 3 ++- src/imitation/scripts/analyze.py | 3 --- .../scripts/config/train_adversarial.py | 4 ++++ .../config/train_preference_comparisons.py | 4 ++++ src/imitation/scripts/config/train_rl.py | 5 +++++ src/imitation/scripts/parallel.py | 16 +++++++--------- tests/scripts/test_scripts.py | 3 +++ 7 files changed, 25 insertions(+), 13 deletions(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index 3973c6181..ba89da69d 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -15,7 +15,8 @@ python -m imitation.scripts. with benchmarking/.json') +from imitation.scripts. import +.run(command_name="", named_configs=["benchmarking/.json"]) ``` # Tuning Hyperparameters diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py index 8977fed47..96b34bd6e 100644 --- a/src/imitation/scripts/analyze.py +++ b/src/imitation/scripts/analyze.py @@ -166,9 +166,6 @@ def _get_algo_name(sd: sacred_util.SacredDicts) -> str: def _return_summaries(sd: sacred_util.SacredDicts) -> dict: imit_stats = get(sd.run, "result.imit_stats") - if imit_stats is None: - # stored in rollout key for preference comparison - imit_stats = get(sd.run, "result.rollout") expert_stats = get(sd.run, "result.expert_stats") expert_return_summary = None diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py index ef675eab6..acc842095 100644 --- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -8,6 +8,10 @@ from imitation.scripts.ingredients import logging as logging_ingredient from imitation.scripts.ingredients import policy_evaluation, reward, rl +# Note: All the hyperparameter configs in the file are of the tuned +# hyperparameters of the RL algorithm of the respective environment. 
+# Taken from imitation/scripts/config/train_rl.py + train_adversarial_ex = sacred.Experiment( "train_adversarial", ingredients=[ diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 4fe9c793e..4d8531732 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -8,6 +8,10 @@ from imitation.scripts.ingredients import logging as logging_ingredient from imitation.scripts.ingredients import policy_evaluation, reward, rl +# Note: All the hyperparameter configs in the file are of the tuned +# hyperparameters of the RL algorithm of the respective environment. +# Taken from imitation/scripts/config/train_rl.py + train_preference_comparisons_ex = sacred.Experiment( "train_preference_comparisons", ingredients=[ diff --git a/src/imitation/scripts/config/train_rl.py b/src/imitation/scripts/config/train_rl.py index a5475540d..e4ab71da1 100644 --- a/src/imitation/scripts/config/train_rl.py +++ b/src/imitation/scripts/config/train_rl.py @@ -8,6 +8,11 @@ from imitation.scripts.ingredients import logging as logging_ingredient from imitation.scripts.ingredients import policy_evaluation, rl +# Note: All the hyperparameter configs in the file are tuned +# for the PPO algorithm on the respective environment using the +# RL Baselines Zoo library: +# https://github.com/HumanCompatibleAI/rl-baselines3-zoo/ + train_rl_ex = sacred.Experiment( "train_rl", ingredients=[ diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 57503d6e0..9f5478a6e 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -24,7 +24,7 @@ def parallel( search_space: Mapping[str, Any], base_named_configs: Sequence[str], base_config_updates: Mapping[str, Any], - resources_per_trial: Dict[str, Any], + resources_per_trial: Mapping[str, Any], init_kwargs: Mapping[str, Any], repeat: int, experiment_checkpoint_path: str, @@ -115,17 +115,15 @@ def parallel( ray.init(**init_kwargs) updated_tune_run_kwargs = copy.deepcopy(tune_run_kwargs) if repeat > 1: - if "search_alg" not in updated_tune_run_kwargs: - updated_tune_run_kwargs["search_alg"] = optuna.OptunaSearch() try: - algo = updated_tune_run_kwargs["search_alg"] - algo = search.Repeater(algo, repeat) - updated_tune_run_kwargs["search_alg"] = algo - except AttributeError: + # Use optuna as the default search algorithm for repeat runs. + algo = tune_run_kwargs.get("search_alg", optuna.OptunaSearch()) + updated_tune_run_kwargs["search_alg"] = search.Repeater(algo, repeat) + except AttributeError as e: raise ValueError( "repeat > 1 but search_alg is not an instance of " "ray.tune.search.SearchAlgorithm", - ) + ) from e if sacred_ex_name == "train_rl": return_key = "monitor_return_mean" @@ -198,7 +196,7 @@ def _ray_tune_sacred_wrapper( `ex.run`) and `reporter`. The function returns the run result. """ - def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: + def inner(config: dict[str, Any], reporter) -> Mapping[str, Any]: """Trainable function with the correct signature for `ray.tune`. 
Args: diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 7ff241323..b0271d83b 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -889,6 +889,9 @@ def test_parallel_train_adversarial_custom_env(tmpdir): logging=dict(log_root=tmpdir), demonstrations=dict(path=path), ), + # specifying repeat=2 uses the optuna search algorithm which + # requires the search space to be non-empty. So we provide + # the command name using tune.choice. search_space=dict(command_name=tune.choice(["gail"])), ) config_updates.update(PARALLEL_CONFIG_LOW_RESOURCE) From 8b551341a89a5008fd5c35e04110710ea746d52a Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Fri, 4 Aug 2023 23:11:15 +0530 Subject: [PATCH 35/54] Fix test --- .../algorithms/adversarial/common.py | 37 +++++++++++++++---- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py index 62b459a0d..545109b0d 100644 --- a/src/imitation/algorithms/adversarial/common.py +++ b/src/imitation/algorithms/adversarial/common.py @@ -2,13 +2,13 @@ import abc import dataclasses import logging -from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload +from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload, List import numpy as np import torch as th import torch.utils.tensorboard as thboard import tqdm -from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env +from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env, callbacks from stable_baselines3.sac import policies as sac_policies from torch.nn import functional as F @@ -86,6 +86,30 @@ def compute_train_stats( } +class TrainDiscriminatorCallback(callbacks.BaseCallback): + """Callback for training discriminator after collecting rollouts.""" + + def __init__(self, adversarial_trainer, *args, **kwargs): + """Builds TrainDiscriminatorCallback. + + Args: + *args: Passed through to `callbacks.BaseCallback`. + **kwargs: Passed through to `callbacks.BaseCallback`. + """ + self.adversarial_trainer = adversarial_trainer + super().__init__(*args, **kwargs) + + def _on_step(self) -> bool: + return True + + def _on_rollout_end(self) -> None: + self.adversarial_trainer.model.train_disc() + for _ in range(self.adversarial_trainer.n_disc_updates_per_round): + with networks.training(self.adversarial_trainer.reward_train): + # switch to training mode (affects dropout, normalization) + self.adversarial_trainer.train_disc() + + class AdversarialTrainer(base.DemonstrationAlgorithm[types.Transitions]): """Base class for adversarial imitation learning algorithms like GAIL and AIRL.""" @@ -222,16 +246,17 @@ def __init__( self.venv_buffering = wrappers.BufferingWrapper(self.venv) + self.disc_trainer_callback = TrainDiscriminatorCallback(self) if debug_use_ground_truth: # Would use an identity reward fn here, but RewardFns can't see rewards. 
self.venv_wrapped = self.venv_buffering - self.gen_callback = None + self.gen_callback: List[callbacks.BaseCallback] = [self.disc_trainer_callback] else: self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper( self.venv_buffering, reward_fn=self.reward_train.predict_processed, ) - self.gen_callback = self.venv_wrapped.make_log_callback() + self.gen_callback = [self.venv_wrapped.make_log_callback(), self.disc_trainer_callback] self.venv_train = self.venv_wrapped self.gen_algo.set_env(self.venv_train) @@ -446,10 +471,6 @@ def train( ) for r in tqdm.tqdm(range(0, n_rounds), desc="round"): self.train_gen(self.gen_train_timesteps) - for _ in range(self.n_disc_updates_per_round): - with networks.training(self.reward_train): - # switch to training mode (affects dropout, normalization) - self.train_disc() if callback: callback(r) self.logger.dump(self._global_step) From f3ba2b5ec01331f03295856e4219c68212fc7aee Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Fri, 4 Aug 2023 23:13:59 +0530 Subject: [PATCH 36/54] Revert "Fix test" This reverts commit 8b551341a89a5008fd5c35e04110710ea746d52a. --- .../algorithms/adversarial/common.py | 37 ++++--------------- 1 file changed, 8 insertions(+), 29 deletions(-) diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py index 545109b0d..62b459a0d 100644 --- a/src/imitation/algorithms/adversarial/common.py +++ b/src/imitation/algorithms/adversarial/common.py @@ -2,13 +2,13 @@ import abc import dataclasses import logging -from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload, List +from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload import numpy as np import torch as th import torch.utils.tensorboard as thboard import tqdm -from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env, callbacks +from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env from stable_baselines3.sac import policies as sac_policies from torch.nn import functional as F @@ -86,30 +86,6 @@ def compute_train_stats( } -class TrainDiscriminatorCallback(callbacks.BaseCallback): - """Callback for training discriminator after collecting rollouts.""" - - def __init__(self, adversarial_trainer, *args, **kwargs): - """Builds TrainDiscriminatorCallback. - - Args: - *args: Passed through to `callbacks.BaseCallback`. - **kwargs: Passed through to `callbacks.BaseCallback`. - """ - self.adversarial_trainer = adversarial_trainer - super().__init__(*args, **kwargs) - - def _on_step(self) -> bool: - return True - - def _on_rollout_end(self) -> None: - self.adversarial_trainer.model.train_disc() - for _ in range(self.adversarial_trainer.n_disc_updates_per_round): - with networks.training(self.adversarial_trainer.reward_train): - # switch to training mode (affects dropout, normalization) - self.adversarial_trainer.train_disc() - - class AdversarialTrainer(base.DemonstrationAlgorithm[types.Transitions]): """Base class for adversarial imitation learning algorithms like GAIL and AIRL.""" @@ -246,17 +222,16 @@ def __init__( self.venv_buffering = wrappers.BufferingWrapper(self.venv) - self.disc_trainer_callback = TrainDiscriminatorCallback(self) if debug_use_ground_truth: # Would use an identity reward fn here, but RewardFns can't see rewards. 
self.venv_wrapped = self.venv_buffering - self.gen_callback: List[callbacks.BaseCallback] = [self.disc_trainer_callback] + self.gen_callback = None else: self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper( self.venv_buffering, reward_fn=self.reward_train.predict_processed, ) - self.gen_callback = [self.venv_wrapped.make_log_callback(), self.disc_trainer_callback] + self.gen_callback = self.venv_wrapped.make_log_callback() self.venv_train = self.venv_wrapped self.gen_algo.set_env(self.venv_train) @@ -471,6 +446,10 @@ def train( ) for r in tqdm.tqdm(range(0, n_rounds), desc="round"): self.train_gen(self.gen_train_timesteps) + for _ in range(self.n_disc_updates_per_round): + with networks.training(self.reward_train): + # switch to training mode (affects dropout, normalization) + self.train_disc() if callback: callback(r) self.logger.dump(self._global_step) From f8251c70e98f0ccf29e10f1b1ac35ce08e25a580 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Fri, 4 Aug 2023 23:14:49 +0530 Subject: [PATCH 37/54] Fix test --- src/imitation/scripts/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 9f5478a6e..bb90f6174 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -196,7 +196,7 @@ def _ray_tune_sacred_wrapper( `ex.run`) and `reporter`. The function returns the run result. """ - def inner(config: dict[str, Any], reporter) -> Mapping[str, Any]: + def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: """Trainable function with the correct signature for `ray.tune`. Args: From 664fc37c0dfd118768186e83006fc06def21a48b Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Mon, 7 Aug 2023 22:58:00 +0530 Subject: [PATCH 38/54] Convert Dict to Mapping in input argument --- src/imitation/scripts/parallel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index bb90f6174..38881ee2b 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -196,7 +196,7 @@ def _ray_tune_sacred_wrapper( `ex.run`) and `reporter`. The function returns the run result. """ - def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: + def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]: """Trainable function with the correct signature for `ray.tune`. Args: @@ -212,7 +212,7 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: # TODO(shwang): Stop modifying CAPTURE_MODE once the issue is fixed. sacred.SETTINGS.CAPTURE_MODE = "sys" - run_kwargs = config + run_kwargs = dict(**config) updated_run_kwargs: Dict[str, Any] = {} # Import inside function rather than in module because Sacred experiments # are not picklable, and Ray requires this function to be picklable. From 8690e1dcb01fc96fcfa1813c038f2b1ac26f4a3c Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 30 Aug 2023 10:47:28 +0200 Subject: [PATCH 39/54] Ignore coverage in script configurations. 
--- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index f39db322f..85dedb3e3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,6 +42,8 @@ source = imitation include= src/* tests/* +omit = + src/imitation/scripts/config/* [coverage:report] exclude_lines = From dd9eb6a5b7e62b5cf1faf84d9111bac9bef77e9d Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 30 Aug 2023 11:12:10 +0200 Subject: [PATCH 40/54] Pin huggingface_sb3 version. --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1781a4031..6d1f2489c 100644 --- a/setup.py +++ b/setup.py @@ -207,7 +207,9 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: STABLE_BASELINES3, "sacred>=0.8.4", "tensorboard>=1.14", - "huggingface_sb3>=2.2.1", + # TODO: remove once https://github.com/huggingface/huggingface_sb3/issues/37 is + # fixed + "huggingface_sb3==2.2.5", "optuna>=3.0.1", "datasets>=2.8.0", ], From 40d87ef2e99dcb8a34041d27dd62327ec8faf8b4 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 26 Sep 2023 16:46:04 +0200 Subject: [PATCH 41/54] Update to the newest seals environment versions. --- benchmarking/airl_seals_ant_best_hp_eval.json | 2 +- benchmarking/airl_seals_half_cheetah_best_hp_eval.json | 2 +- benchmarking/airl_seals_hopper_best_hp_eval.json | 2 +- benchmarking/airl_seals_swimmer_best_hp_eval.json | 4 ++-- benchmarking/airl_seals_walker_best_hp_eval.json | 4 ++-- benchmarking/bc_seals_ant_best_hp_eval.json | 2 +- benchmarking/bc_seals_half_cheetah_best_hp_eval.json | 2 +- benchmarking/bc_seals_hopper_best_hp_eval.json | 2 +- benchmarking/bc_seals_swimmer_best_hp_eval.json | 2 +- benchmarking/bc_seals_walker_best_hp_eval.json | 2 +- benchmarking/dagger_seals_ant_best_hp_eval.json | 2 +- benchmarking/dagger_seals_half_cheetah_best_hp_eval.json | 2 +- benchmarking/dagger_seals_hopper_best_hp_eval.json | 2 +- benchmarking/dagger_seals_swimmer_best_hp_eval.json | 2 +- benchmarking/dagger_seals_walker_best_hp_eval.json | 2 +- benchmarking/gail_seals_ant_best_hp_eval.json | 2 +- benchmarking/gail_seals_half_cheetah_best_hp_eval.json | 2 +- benchmarking/gail_seals_hopper_best_hp_eval.json | 2 +- benchmarking/gail_seals_swimmer_best_hp_eval.json | 4 ++-- benchmarking/gail_seals_walker_best_hp_eval.json | 4 ++-- 20 files changed, 24 insertions(+), 24 deletions(-) diff --git a/benchmarking/airl_seals_ant_best_hp_eval.json b/benchmarking/airl_seals_ant_best_hp_eval.json index 17f969ff0..d4131433e 100644 --- a/benchmarking/airl_seals_ant_best_hp_eval.json +++ b/benchmarking/airl_seals_ant_best_hp_eval.json @@ -62,6 +62,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Ant-v0" + "gym_id": "seals/Ant-v1" } } diff --git a/benchmarking/airl_seals_half_cheetah_best_hp_eval.json b/benchmarking/airl_seals_half_cheetah_best_hp_eval.json index 754ba6736..f69ba5cb5 100644 --- a/benchmarking/airl_seals_half_cheetah_best_hp_eval.json +++ b/benchmarking/airl_seals_half_cheetah_best_hp_eval.json @@ -62,6 +62,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/HalfCheetah-v0" + "gym_id": "seals/HalfCheetah-v1" } } diff --git a/benchmarking/airl_seals_hopper_best_hp_eval.json b/benchmarking/airl_seals_hopper_best_hp_eval.json index 91080d7ce..58c2475f5 100644 --- a/benchmarking/airl_seals_hopper_best_hp_eval.json +++ b/benchmarking/airl_seals_hopper_best_hp_eval.json @@ -75,6 +75,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Hopper-v0" + "gym_id": "seals/Hopper-v1" } } diff 
--git a/benchmarking/airl_seals_swimmer_best_hp_eval.json b/benchmarking/airl_seals_swimmer_best_hp_eval.json index fcca8e6b3..8529c58b5 100644 --- a/benchmarking/airl_seals_swimmer_best_hp_eval.json +++ b/benchmarking/airl_seals_swimmer_best_hp_eval.json @@ -12,7 +12,7 @@ }, "expert": { "loader_kwargs": { - "gym_id": "seals/Swimmer-v0", + "gym_id": "seals/Swimmer-v1", "organization": "HumanCompatibleAI" } }, @@ -81,6 +81,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Swimmer-v0" + "gym_id": "seals/Swimmer-v1" } } diff --git a/benchmarking/airl_seals_walker_best_hp_eval.json b/benchmarking/airl_seals_walker_best_hp_eval.json index c63070751..edd99806d 100644 --- a/benchmarking/airl_seals_walker_best_hp_eval.json +++ b/benchmarking/airl_seals_walker_best_hp_eval.json @@ -12,7 +12,7 @@ }, "expert": { "loader_kwargs": { - "gym_id": "seals/Walker2d-v0", + "gym_id": "seals/Walker2d-v1", "organization": "HumanCompatibleAI" } }, @@ -81,6 +81,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Walker2d-v0" + "gym_id": "seals/Walker2d-v1" } } diff --git a/benchmarking/bc_seals_ant_best_hp_eval.json b/benchmarking/bc_seals_ant_best_hp_eval.json index 108a93ce7..e9baa8fc1 100644 --- a/benchmarking/bc_seals_ant_best_hp_eval.json +++ b/benchmarking/bc_seals_ant_best_hp_eval.json @@ -43,6 +43,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Ant-v0" + "gym_id": "seals/Ant-v1" } } diff --git a/benchmarking/bc_seals_half_cheetah_best_hp_eval.json b/benchmarking/bc_seals_half_cheetah_best_hp_eval.json index ecaff2eb0..041f159b0 100644 --- a/benchmarking/bc_seals_half_cheetah_best_hp_eval.json +++ b/benchmarking/bc_seals_half_cheetah_best_hp_eval.json @@ -43,6 +43,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/HalfCheetah-v0" + "gym_id": "seals/HalfCheetah-v1" } } diff --git a/benchmarking/bc_seals_hopper_best_hp_eval.json b/benchmarking/bc_seals_hopper_best_hp_eval.json index e8c821841..9a7872d37 100644 --- a/benchmarking/bc_seals_hopper_best_hp_eval.json +++ b/benchmarking/bc_seals_hopper_best_hp_eval.json @@ -43,6 +43,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Hopper-v0" + "gym_id": "seals/Hopper-v1" } } diff --git a/benchmarking/bc_seals_swimmer_best_hp_eval.json b/benchmarking/bc_seals_swimmer_best_hp_eval.json index 30884c9c4..8a8f2456a 100644 --- a/benchmarking/bc_seals_swimmer_best_hp_eval.json +++ b/benchmarking/bc_seals_swimmer_best_hp_eval.json @@ -43,6 +43,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Swimmer-v0" + "gym_id": "seals/Swimmer-v1" } } diff --git a/benchmarking/bc_seals_walker_best_hp_eval.json b/benchmarking/bc_seals_walker_best_hp_eval.json index 0ca30120e..f33e6c5a2 100644 --- a/benchmarking/bc_seals_walker_best_hp_eval.json +++ b/benchmarking/bc_seals_walker_best_hp_eval.json @@ -43,6 +43,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Walker2d-v0" + "gym_id": "seals/Walker2d-v1" } } diff --git a/benchmarking/dagger_seals_ant_best_hp_eval.json b/benchmarking/dagger_seals_ant_best_hp_eval.json index de75b80f1..e02828667 100644 --- a/benchmarking/dagger_seals_ant_best_hp_eval.json +++ b/benchmarking/dagger_seals_ant_best_hp_eval.json @@ -47,6 +47,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Ant-v0" + "gym_id": "seals/Ant-v1" } } diff --git a/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json b/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json index 7f42bfdf9..d1c9e5923 100644 --- 
a/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json +++ b/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json @@ -47,6 +47,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/HalfCheetah-v0" + "gym_id": "seals/HalfCheetah-v1" } } diff --git a/benchmarking/dagger_seals_hopper_best_hp_eval.json b/benchmarking/dagger_seals_hopper_best_hp_eval.json index 1cf29a1a4..b91f66298 100644 --- a/benchmarking/dagger_seals_hopper_best_hp_eval.json +++ b/benchmarking/dagger_seals_hopper_best_hp_eval.json @@ -47,6 +47,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Hopper-v0" + "gym_id": "seals/Hopper-v1" } } diff --git a/benchmarking/dagger_seals_swimmer_best_hp_eval.json b/benchmarking/dagger_seals_swimmer_best_hp_eval.json index c112db680..545761cbc 100644 --- a/benchmarking/dagger_seals_swimmer_best_hp_eval.json +++ b/benchmarking/dagger_seals_swimmer_best_hp_eval.json @@ -47,6 +47,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Swimmer-v0" + "gym_id": "seals/Swimmer-v1" } } diff --git a/benchmarking/dagger_seals_walker_best_hp_eval.json b/benchmarking/dagger_seals_walker_best_hp_eval.json index e59bef464..7b694c8d2 100644 --- a/benchmarking/dagger_seals_walker_best_hp_eval.json +++ b/benchmarking/dagger_seals_walker_best_hp_eval.json @@ -47,6 +47,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Walker2d-v0" + "gym_id": "seals/Walker2d-v1" } } diff --git a/benchmarking/gail_seals_ant_best_hp_eval.json b/benchmarking/gail_seals_ant_best_hp_eval.json index 81399b00c..3d43b34ba 100644 --- a/benchmarking/gail_seals_ant_best_hp_eval.json +++ b/benchmarking/gail_seals_ant_best_hp_eval.json @@ -62,6 +62,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Ant-v0" + "gym_id": "seals/Ant-v1" } } diff --git a/benchmarking/gail_seals_half_cheetah_best_hp_eval.json b/benchmarking/gail_seals_half_cheetah_best_hp_eval.json index 1d2f26648..914f3712a 100644 --- a/benchmarking/gail_seals_half_cheetah_best_hp_eval.json +++ b/benchmarking/gail_seals_half_cheetah_best_hp_eval.json @@ -62,6 +62,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/HalfCheetah-v0" + "gym_id": "seals/HalfCheetah-v1" } } diff --git a/benchmarking/gail_seals_hopper_best_hp_eval.json b/benchmarking/gail_seals_hopper_best_hp_eval.json index 70787ff7e..cebdae71c 100644 --- a/benchmarking/gail_seals_hopper_best_hp_eval.json +++ b/benchmarking/gail_seals_hopper_best_hp_eval.json @@ -75,6 +75,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Hopper-v0" + "gym_id": "seals/Hopper-v1" } } diff --git a/benchmarking/gail_seals_swimmer_best_hp_eval.json b/benchmarking/gail_seals_swimmer_best_hp_eval.json index 650c5f46a..b0bd0e645 100644 --- a/benchmarking/gail_seals_swimmer_best_hp_eval.json +++ b/benchmarking/gail_seals_swimmer_best_hp_eval.json @@ -12,7 +12,7 @@ }, "expert": { "loader_kwargs": { - "gym_id": "seals/Swimmer-v0", + "gym_id": "seals/Swimmer-v1", "organization": "HumanCompatibleAI" } }, @@ -81,6 +81,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Swimmer-v0" + "gym_id": "seals/Swimmer-v1" } } diff --git a/benchmarking/gail_seals_walker_best_hp_eval.json b/benchmarking/gail_seals_walker_best_hp_eval.json index d85eb46d5..2626b4c43 100644 --- a/benchmarking/gail_seals_walker_best_hp_eval.json +++ b/benchmarking/gail_seals_walker_best_hp_eval.json @@ -12,7 +12,7 @@ }, "expert": { "loader_kwargs": { - "gym_id": "seals/Walker2d-v0", + "gym_id": "seals/Walker2d-v1", "organization": "HumanCompatibleAI" } }, @@ 
-81,6 +81,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Walker2d-v0" + "gym_id": "seals/Walker2d-v1" } } From 71f6c9283a387d35ed94f832ca660711942052e3 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 27 Sep 2023 09:49:28 +0200 Subject: [PATCH 42/54] Push gymnasium dependency to 0.29 to ensure mujoco envs work. --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7bc4051a9..0384014ee 100644 --- a/setup.py +++ b/setup.py @@ -187,7 +187,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: # encode only known incompatibilities here. This prevents nasty dependency issues # for our users. install_requires=[ - "gymnasium[classic-control]~=0.28.1", + "gymnasium[classic-control]~=0.29", "matplotlib", "numpy>=1.15", "torch>=1.4.0", @@ -220,7 +220,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: "docs": DOCS_REQUIRE, "parallel": PARALLEL_REQUIRE, "mujoco": [ - "gymnasium[classic-control,mujoco]~=0.28.1", + "gymnasium[classic-control,mujoco]~=0.29", ], "atari": ATARI_REQUIRE, }, From 747ad32787e56a6939f6064eedb0cda8a67c3b1a Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 4 Oct 2023 05:58:13 +0530 Subject: [PATCH 43/54] Incorporate review comments --- src/imitation/scripts/analyze.py | 11 +++----- .../imitation/scripts/config/tuning.py | 12 ++++----- src/imitation/scripts/parallel.py | 12 ++------- .../imitation/scripts}/tuning.py | 26 +++++++++++++------ tests/scripts/test_scripts.py | 2 +- 5 files changed, 31 insertions(+), 32 deletions(-) rename benchmarking/tuning_config.py => src/imitation/scripts/config/tuning.py (97%) rename {benchmarking => src/imitation/scripts}/tuning.py (85%) diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py index 96b34bd6e..b63538f6d 100644 --- a/src/imitation/scripts/analyze.py +++ b/src/imitation/scripts/analyze.py @@ -268,13 +268,10 @@ def analyze_imitation( Returns: The DataFrame generated from the Sacred logs. """ - if table_verbosity == 3: - # Get column names for which we have get value using make_entry_fn - # These are same across Level 2 & 3. In Level 3, we additionally add remaining - # config columns. - table_entry_fns_subset = _get_table_entry_fns_subset(2) - else: - table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity) + # Get column names for which we have get value using make_entry_fn + # These are same across Level 2 & 3. In Level 3, we additionally add remaining + # config columns. 
+ table_entry_fns_subset = _get_table_entry_fns_subset(min(table_verbosity, 2)) output_table = pd.DataFrame() for sd in _gather_sacred_dicts(): diff --git a/benchmarking/tuning_config.py b/src/imitation/scripts/config/tuning.py similarity index 97% rename from benchmarking/tuning_config.py rename to src/imitation/scripts/config/tuning.py index 239537406..07161d04c 100644 --- a/benchmarking/tuning_config.py +++ b/src/imitation/scripts/config/tuning.py @@ -49,24 +49,24 @@ def bc(): search_space={ "config_updates": { "bc": dict( - batch_size=tune.choice([8, 16, 32, 64]), + batch_size=tune.choice([8]), l2_weight=tune.loguniform(1e-6, 1e-2), # L2 regularization weight optimizer_kwargs=dict( lr=tune.loguniform(1e-5, 1e-2), ), train_kwargs=dict( - n_epochs=tune.choice([1, 5, 10, 20]), + n_epochs=tune.choice([1]), ), ), }, "command_name": "bc", }, - num_samples=64, - repeat=3, + num_samples=2, + repeat=2, resources_per_trial=dict(cpu=1), ) - num_eval_seeds = 5 + num_eval_seeds = 1 eval_best_trial_resource_multiplier = 1 @@ -117,7 +117,7 @@ def dagger(): def gail(): parallel_run_config = dict( sacred_ex_name="train_adversarial", - run_name="gail_tuning_hc", + run_name="gail_tuning", base_named_configs=["logging.wandb_logging"], base_config_updates={ "environment": {"num_vec": 1}, diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 38881ee2b..d5e5e2378 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -2,7 +2,6 @@ import collections.abc import copy -import glob import pathlib from typing import Any, Callable, Dict, Mapping, Sequence @@ -37,8 +36,8 @@ def parallel( to `upload_dir` if that argument is provided in `tune_run_kwargs`. Args: - sacred_ex_name: The Sacred experiment to tune. Either "train_rl" or - "train_imitation" or "train_adversarial" or "train_preference_comparisons". + sacred_ex_name: The Sacred experiment to tune. Either "train_rl", + "train_imitation", "train_adversarial" or "train_preference_comparisons". run_name: A name describing this parallelizing experiment. This argument is also passed to `ray.tune.run` as the `name` argument. It is also saved in 'sacred/run.json' of each inner Sacred experiment @@ -132,14 +131,7 @@ def parallel( try: if experiment_checkpoint_path: - # load experiment analysis results result = ray.tune.ExperimentAnalysis(experiment_checkpoint_path) - result._load_checkpoints_from_latest( - glob.glob(experiment_checkpoint_path + "/experiment_state*.json"), - ) - # update result.trials using all the experiment_state json files - result.trials = None - result.fetch_trial_dataframes() else: result = ray.tune.run( trainable, diff --git a/benchmarking/tuning.py b/src/imitation/scripts/tuning.py similarity index 85% rename from benchmarking/tuning.py rename to src/imitation/scripts/tuning.py index 9c3f52498..a605a206a 100644 --- a/benchmarking/tuning.py +++ b/src/imitation/scripts/tuning.py @@ -9,7 +9,9 @@ from pandas.api import types as pd_types from ray.tune.search import optuna from sacred.observers import FileStorageObserver -from tuning_config import parallel_ex, tuning_ex + +from imitation.scripts.config.parallel import parallel_ex +from imitation.scripts.config.tuning import tuning_ex @tuning_ex.main @@ -18,10 +20,15 @@ def tune( eval_best_trial_resource_multiplier: int = 1, num_eval_seeds: int = 5, ) -> None: - """Tune hyperparameters of imitation algorithms using parallel script. + """Tune hyperparameters of imitation algorithms using the parallel script. 
+ + The parallel script is called twice in this function. The first call is to + tune the hyperparameters. The second call is to evaluate the best trial on + a separate set of seeds. Args: parallel_run_config: Dictionary of arguments to pass to the parallel script. + This is used to define the search space for tuning the hyperparameters. eval_best_trial_resource_multiplier: Factor by which to multiply the number of cpus per trial in `resources_per_trial`. This is useful for allocating more resources per trial to the evaluation trials than the @@ -35,10 +42,8 @@ def tune( """ updated_parallel_run_config = copy.deepcopy(parallel_run_config) search_alg = optuna.OptunaSearch() - if "tune_run_kwargs" in updated_parallel_run_config: - updated_parallel_run_config["tune_run_kwargs"]["search_alg"] = search_alg - else: - updated_parallel_run_config["tune_run_kwargs"] = dict(search_alg=search_alg) + tune_run_kwargs = updated_parallel_run_config.setdefault("tune_run_kwargs", dict()) + tune_run_kwargs["search_alg"] = search_alg run = parallel_ex.run(config_updates=updated_parallel_run_config) experiment_analysis = run.result if not experiment_analysis.trials: @@ -93,9 +98,13 @@ def find_best_trial( if pd_types.is_object_dtype(df[col]): df[col] = df[col].astype("str") # group into separate HP configs - grp_keys = [c for c in df.columns if c.startswith("config") and "seed" not in c] + grp_keys = [c for c in df.columns if c.startswith("config")] + grp_keys = [c for c in grp_keys if "seed" not in c and "trial_index" not in c] grps = df.groupby(grp_keys) # store mean return of runs across all seeds in a group + # the transform method is applied to get the mean return for every trial + # instead of for every group. So every trial in a group will have the same + # mean return column. df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) best_config_df = df[df["mean_return"] == df["mean_return"].max()] row = best_config_df.iloc[0] @@ -149,10 +158,11 @@ def evaluate_trial( num_samples=1, search_space=config, resources_per_trial=resources_per_trial, - search_alg=None, repeat=1, experiment_checkpoint_path="", ) + # required for grid search + eval_config_updates["tune_run_kwargs"].update(search_alg=None) eval_run = parallel_ex.run(config_updates=eval_config_updates) eval_result = eval_run.result returns = eval_result.results_df[return_key].to_numpy() diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index a44639cef..5fc2f122d 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -981,7 +981,7 @@ def test_analyze_imitation(tmpdir: str, run_names: List[str], run_sacred_fn): assert run.status == "COMPLETED" # Check that analyze script finds the correct number of logs. 
- def check(run_name: Optional[str], count: int, table_verbosity=1) -> None: + def check(run_name: Optional[str], count: int, table_verbosity: int = 1) -> None: run = analyze.analysis_ex.run( command_name="analyze_imitation", config_updates=dict( From 691e75945579cd8aaea3a133ffd1178bb978a450 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 4 Oct 2023 07:49:02 +0530 Subject: [PATCH 44/54] Fix test errors --- src/imitation/scripts/tuning.py | 6 +++--- tests/test_benchmarking.py | 34 +++++++++++++-------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/src/imitation/scripts/tuning.py b/src/imitation/scripts/tuning.py index a605a206a..24095b1de 100644 --- a/src/imitation/scripts/tuning.py +++ b/src/imitation/scripts/tuning.py @@ -2,7 +2,7 @@ import copy import pathlib -from typing import Any, Dict +from typing import Dict import numpy as np import ray @@ -16,7 +16,7 @@ @tuning_ex.main def tune( - parallel_run_config: Dict[str, Any], + parallel_run_config, eval_best_trial_resource_multiplier: int = 1, num_eval_seeds: int = 5, ) -> None: @@ -128,7 +128,7 @@ def evaluate_trial( trial: ray.tune.experiment.Trial, num_eval_seeds: int, run_name: str, - parallel_run_config: Dict[str, Any], + parallel_run_config, resources_per_trial: Dict[str, int], return_key: str, print_return: bool = False, diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py index 18d4f12cf..0a93943ef 100644 --- a/tests/test_benchmarking.py +++ b/tests/test_benchmarking.py @@ -1,11 +1,9 @@ """Tests for config files in benchmarking/ folder.""" import pathlib -import subprocess -import sys import pytest -from imitation.scripts import train_adversarial, train_imitation +from imitation.scripts import train_adversarial, train_imitation, tuning THIS_DIR = pathlib.Path(__file__).absolute().parent BENCHMARKING_DIR = THIS_DIR.parent / "benchmarking" @@ -48,26 +46,20 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str): assert run.status == "COMPLETED" +@pytest.mark.parametrize("environment", ENVIRONMENTS) @pytest.mark.parametrize("algorithm", ALGORITHMS) -def test_tuning_print_config_succeeds(algorithm: str): +def test_tuning_print_config_succeeds(algorithm: str, environment: str): # We test the configs using the print_config command, # because running the configs requires MuJoCo. # Requiring MuJoCo to run the tests adds too much complexity. - - # We need to use sys.executable, not just "python", on Windows as - # subprocess.call ignores PATH (unless shell=True) so runs a - # system-wide Python interpreter outside of our venv. 
See: - # https://stackoverflow.com/questions/5658622/ - tuning_path = str(BENCHMARKING_DIR / "tuning.py") - env = 'parallel_run_config.base_named_configs=["seals_cartpole"]' - exit_code = subprocess.call( - [ - sys.executable, - tuning_path, - "print_config", - "with", - f"{algorithm}", - env, - ], + experiment = tuning.tuning_ex + run = experiment.run( + command_name="print_config", + named_configs=[algorithm], + config_updates=dict( + parallel_run_config=dict( + base_named_configs=[environment], + ), + ), ) - assert exit_code == 0 + assert run.status == "COMPLETED" From 2038e60f9935372ca91a6fad15d665e68e85e5a2 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 5 Oct 2023 05:08:23 +0530 Subject: [PATCH 45/54] Move benchmarking/ to scripts/ and add named configs for tuned hyperparams --- experiments/commands.py | 17 +- setup.cfg | 1 - .../config}/airl_seals_ant_best_hp_eval.json | 0 .../airl_seals_walker_best_hp_eval.json | 0 src/imitation/scripts/config/parallel.py | 2 +- .../scripts/config/train_adversarial.py | 157 +++--------------- .../scripts/config/train_imitation.py | 59 +++---- .../airl_seals_ant_best_hp_eval.json | 67 ++++++++ .../airl_seals_half_cheetah_best_hp_eval.json | 0 .../airl_seals_hopper_best_hp_eval.json | 0 .../airl_seals_swimmer_best_hp_eval.json | 0 .../airl_seals_walker_best_hp_eval.json | 86 ++++++++++ .../tuned_hps}/bc_seals_ant_best_hp_eval.json | 0 .../bc_seals_half_cheetah_best_hp_eval.json | 0 .../bc_seals_hopper_best_hp_eval.json | 0 .../bc_seals_swimmer_best_hp_eval.json | 0 .../bc_seals_walker_best_hp_eval.json | 0 .../dagger_seals_ant_best_hp_eval.json | 0 ...agger_seals_half_cheetah_best_hp_eval.json | 0 .../dagger_seals_hopper_best_hp_eval.json | 0 .../dagger_seals_swimmer_best_hp_eval.json | 0 .../dagger_seals_walker_best_hp_eval.json | 0 .../fast_dagger_seals_cartpole.json | 0 .../gail_seals_ant_best_hp_eval.json | 0 .../gail_seals_half_cheetah_best_hp_eval.json | 0 .../gail_seals_hopper_best_hp_eval.json | 0 .../gail_seals_swimmer_best_hp_eval.json | 0 .../gail_seals_walker_best_hp_eval.json | 0 tests/test_benchmarking.py | 17 +- tests/test_experiments.py | 58 +++---- 30 files changed, 240 insertions(+), 224 deletions(-) rename {benchmarking => src/imitation/scripts/config}/airl_seals_ant_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config}/airl_seals_walker_best_hp_eval.json (100%) create mode 100644 src/imitation/scripts/config/tuned_hps/airl_seals_ant_best_hp_eval.json rename {benchmarking => src/imitation/scripts/config/tuned_hps}/airl_seals_half_cheetah_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/airl_seals_hopper_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/airl_seals_swimmer_best_hp_eval.json (100%) create mode 100644 src/imitation/scripts/config/tuned_hps/airl_seals_walker_best_hp_eval.json rename {benchmarking => src/imitation/scripts/config/tuned_hps}/bc_seals_ant_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/bc_seals_half_cheetah_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/bc_seals_hopper_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/bc_seals_swimmer_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/bc_seals_walker_best_hp_eval.json (100%) rename {benchmarking => 
src/imitation/scripts/config/tuned_hps}/dagger_seals_ant_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/dagger_seals_half_cheetah_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/dagger_seals_hopper_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/dagger_seals_swimmer_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/dagger_seals_walker_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/fast_dagger_seals_cartpole.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/gail_seals_ant_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/gail_seals_half_cheetah_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/gail_seals_hopper_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/gail_seals_swimmer_best_hp_eval.json (100%) rename {benchmarking => src/imitation/scripts/config/tuned_hps}/gail_seals_walker_best_hp_eval.json (100%) diff --git a/experiments/commands.py b/experiments/commands.py index 738a55011..0dc0cce7c 100644 --- a/experiments/commands.py +++ b/experiments/commands.py @@ -12,9 +12,10 @@ For example, we can run: +TUNED_HPS_DIR=../src/imitation/scripts/config/tuned_hps python commands.py \ --name=run0 \ - --cfg_pattern=../benchmarking/*ai*_seals_walker_*.json \ + --cfg_pattern=$TUNED_HPS_DIR/*ai*_seals_walker_*.json \ --output_dir=output And get the following commands printed out: @@ -22,13 +23,13 @@ python -m imitation.scripts.train_adversarial airl \ --capture=sys --name=run0 \ --file_storage=output/sacred/$USER-cmd-run0-airl-0-a3531726 \ - with ../benchmarking/airl_seals_walker_best_hp_eval.json \ + with ../src/imitation/scripts/config/tuned_hps/airl_seals_walker_best_hp_eval.json \ seed=0 logging.log_root=output python -m imitation.scripts.train_adversarial gail \ --capture=sys --name=run0 \ --file_storage=output/sacred/$USER-cmd-run0-gail-0-a1ec171b \ - with ../benchmarking/gail_seals_walker_best_hp_eval.json \ + with $TUNED_HPS_DIR/gail_seals_walker_best_hp_eval.json \ seed=0 logging.log_root=output We can execute commands in parallel by piping them to GNU parallel: @@ -40,9 +41,10 @@ For example, we can run: +TUNED_HPS_DIR=../src/imitation/scripts/config/tuned_hps python commands.py \ --name=run0 \ - --cfg_pattern=../benchmarking/bc_seals_half_cheetah_best_hp_eval.json \ + --cfg_pattern=$TUNED_HPS_DIR/bc_seals_half_cheetah_best_hp_eval.json \ --output_dir=/data/output \ --remote @@ -51,8 +53,9 @@ ctl job run --name $USER-cmd-run0-bc-0-72cb1df3 \ --command "python -m imitation.scripts.train_imitation bc \ --capture=sys --name=run0 \ - --file_storage=/data/output/sacred/$USER-cmd-run0-bc-0-72cb1df3 \ - with /data/imitation/benchmarking/bc_seals_half_cheetah_best_hp_eval.json \ + --file_storage=/data/output/sacred/$USER-cmd-run0-bc-0-72cb1df3 with \ + /data/imitation/src/imitation/scripts/config/tuned_hps/ + bc_seals_half_cheetah_best_hp_eval.json \ seed=0 logging.log_root=/data/output" \ --container hacobe/devbox:imitation \ --login --force-pull --never-restart --gpu 0 --shared-host-dir-mount /data @@ -220,7 +223,7 @@ def parse() -> argparse.Namespace: parser.add_argument( "--remote_cfg_dir", type=str, - default="/data/imitation/benchmarking", + default="/data/imitation/src/imitation/scripts/config/tuned_hps", help="""Path to a directory storing config 
files \ accessible from each container. """, ) diff --git a/setup.cfg b/setup.cfg index 95f2223d9..560cac137 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,7 +7,6 @@ per-file-ignores = # F841 local variable unused [for Sacred config scopes] src/imitation/scripts/config/*.py:F841 ../src/imitation/scripts/config/*.py:F841 - benchmarking/tuning_config.py:F841 src/imitation/envs/examples/airl_envs/*.py:D [darglint] diff --git a/benchmarking/airl_seals_ant_best_hp_eval.json b/src/imitation/scripts/config/airl_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/airl_seals_ant_best_hp_eval.json rename to src/imitation/scripts/config/airl_seals_ant_best_hp_eval.json diff --git a/benchmarking/airl_seals_walker_best_hp_eval.json b/src/imitation/scripts/config/airl_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/airl_seals_walker_best_hp_eval.json rename to src/imitation/scripts/config/airl_seals_walker_best_hp_eval.json diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index c9c898feb..62ebbd9e3 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -8,7 +8,7 @@ search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`. For tuning hyperparameters of an algorithm on a given environment, -check out the benchmarking/tuning.py script. +check out the imitation/scripts/tuning.py script. """ import numpy as np diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py index acc842095..ff32a551b 100644 --- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -1,7 +1,8 @@ """Configuration for imitation.scripts.train_adversarial.""" +import pathlib + import sacred -from torch import nn from imitation.rewards import reward_nets from imitation.scripts.ingredients import demonstrations, environment, expert @@ -101,29 +102,6 @@ def pendulum(): # Standard MuJoCo Gym environment named configs -@train_adversarial_ex.named_config -def seals_ant(): - locals().update(**MUJOCO_SHARED_LOCALS) - locals().update(**ANT_SHARED_LOCALS) - environment = dict(gym_id="seals/Ant-v0") - demonstrations = dict(rollout_type="ppo-huggingface") - rl = dict( - batch_size=2048, - rl_kwargs=dict( - batch_size=16, - clip_range=0.3, - ent_coef=3.1441389214159857e-06, - gae_lambda=0.8, - gamma=0.995, - learning_rate=0.00017959211641976886, - max_grad_norm=0.9, - n_epochs=10, - # policy_kwargs are same as the defaults - vf_coef=0.4351450387648799, - ), - ) - - CHEETAH_SHARED_LOCALS = dict( MUJOCO_SHARED_LOCALS, rl=dict(batch_size=16384, rl_kwargs=dict(batch_size=1024)), @@ -158,117 +136,6 @@ def half_cheetah(): environment = dict(gym_id="HalfCheetah-v2") -@train_adversarial_ex.named_config -def seals_half_cheetah(): - environment = dict(gym_id="seals/HalfCheetah-v0") - demonstrations = dict(rollout_type="ppo-huggingface") - rl = dict( - batch_size=512, - rl_kwargs=dict( - batch_size=64, - clip_range=0.1, - ent_coef=3.794797423594763e-06, - gae_lambda=0.95, - gamma=0.95, - learning_rate=0.0003286871805949382, - max_grad_norm=0.8, - n_epochs=5, - vf_coef=0.11483689492120866, - ), - ) - algorithm_kwargs = dict( - # Number of discriminator updates after each round of generator updates - n_disc_updates_per_round=16, - # Equivalent to no replay buffer if batch size is the same - gen_replay_buffer_capacity=512, - demo_batch_size=8192, - ) - - -@train_adversarial_ex.named_config -def 
seals_hopper(): - environment = dict(gym_id="seals/Hopper-v0") - demonstrations = dict(rollout_type="ppo-huggingface") - policy = dict( - policy_cls="MlpPolicy", - policy_kwargs=dict( - activation_fn=nn.ReLU, - net_arch=[dict(pi=[64, 64], vf=[64, 64])], - ), - ) - rl = dict( - batch_size=2048, - rl_kwargs=dict( - batch_size=512, - clip_range=0.1, - ent_coef=0.0010159833764878474, - gae_lambda=0.98, - gamma=0.995, - learning_rate=0.0003904770450788824, - max_grad_norm=0.9, - n_epochs=20, - vf_coef=0.20315938606555833, - ), - ) - - -@train_adversarial_ex.named_config -def seals_swimmer(): - environment = dict(gym_id="seals/Swimmer-v0") - total_timesteps = int(2e6) - demonstrations = dict(rollout_type="ppo-huggingface") - policy = dict( - policy_cls="MlpPolicy", - policy_kwargs=dict( - activation_fn=nn.ReLU, - net_arch=[dict(pi=[64, 64], vf=[64, 64])], - ), - ) - rl = dict( - batch_size=2048, - rl_kwargs=dict( - batch_size=64, - clip_range=0.1, - ent_coef=5.167107294612664e-08, - gae_lambda=0.95, - gamma=0.999, - learning_rate=0.000414936134792374, - max_grad_norm=2, - n_epochs=5, - # policy_kwargs are same as the defaults - vf_coef=0.6162112311062333, - ), - ) - - -@train_adversarial_ex.named_config -def seals_walker(): - environment = dict(gym_id="seals/Walker2d-v0") - demonstrations = dict(rollout_type="ppo-huggingface") - policy = dict( - policy_cls="MlpPolicy", - policy_kwargs=dict( - activation_fn=nn.ReLU, - net_arch=[dict(pi=[64, 64], vf=[64, 64])], - ), - ) - rl = dict( - batch_size=8192, - rl_kwargs=dict( - batch_size=128, - clip_range=0.4, - ent_coef=0.00013057334805552262, - gae_lambda=0.92, - gamma=0.98, - learning_rate=0.000138575372312869, - max_grad_norm=0.6, - n_epochs=20, - # policy_kwargs are same as the defaults - vf_coef=0.6167177795726859, - ), - ) - - @train_adversarial_ex.named_config def seals_humanoid(): locals().update(**MUJOCO_SHARED_LOCALS) @@ -296,3 +163,23 @@ def fast(): demo_batch_size=1, n_disc_updates_per_round=4, ) + + +hyperparam_dir = pathlib.Path(__file__).absolute().parent / "tuned_hps" +tuned_alg_envs = [ + "airl_seals_ant", + "airl_seals_half_cheetah", + "airl_seals_hopper", + "airl_seals_swimmer", + "airl_seals_walker", + "gail_seals_ant", + "gail_seals_half_cheetah", + "gail_seals_hopper", + "gail_seals_swimmer", + "gail_seals_walker", +] + +for tuned_alg_env in tuned_alg_envs: + config_file = hyperparam_dir / f"{tuned_alg_env}_best_hp_eval.json" + assert config_file.is_file(), f"{config_file} does not exist" + train_adversarial_ex.add_named_config(tuned_alg_env, str(config_file)) diff --git a/src/imitation/scripts/config/train_imitation.py b/src/imitation/scripts/config/train_imitation.py index 4f3a8a415..f151e768e 100644 --- a/src/imitation/scripts/config/train_imitation.py +++ b/src/imitation/scripts/config/train_imitation.py @@ -1,5 +1,7 @@ """Configuration settings for train_dagger, training DAgger from synthetic demos.""" +import pathlib + import sacred from imitation.scripts.ingredients import bc @@ -67,13 +69,6 @@ def ant(): environment = dict(gym_id="Ant-v2") -@train_imitation_ex.named_config -def seals_ant(): - environment = dict(gym_id="seals/Ant-v0") - demonstrations = dict(rollout_type="ppo-huggingface") - expert = {"policy_type": "ppo-huggingface"} - - @train_imitation_ex.named_config def half_cheetah(): environment = dict(gym_id="HalfCheetah-v2") @@ -81,36 +76,6 @@ def half_cheetah(): dagger = dict(total_timesteps=60000) -@train_imitation_ex.named_config -def seals_half_cheetah(): - environment = dict(gym_id="seals/HalfCheetah-v0") - bc 
= dict(l2_weight=0.0) - dagger = dict(total_timesteps=60000) - demonstrations = dict(rollout_type="ppo-huggingface") - expert = {"policy_type": "ppo-huggingface"} - - -@train_imitation_ex.named_config -def seals_hopper(): - environment = dict(gym_id="seals/Hopper-v0") - demonstrations = dict(rollout_type="ppo-huggingface") - expert = {"policy_type": "ppo-huggingface"} - - -@train_imitation_ex.named_config -def seals_swimmer(): - environment = dict(gym_id="seals/Swimmer-v0") - demonstrations = dict(rollout_type="ppo-huggingface") - expert = {"policy_type": "ppo-huggingface"} - - -@train_imitation_ex.named_config -def seals_walker(): - environment = dict(gym_id="seals/Walker2d-v0") - demonstrations = dict(rollout_type="ppo-huggingface") - expert = {"policy_type": "ppo-huggingface"} - - @train_imitation_ex.named_config def humanoid(): environment = dict(gym_id="Humanoid-v2") @@ -126,3 +91,23 @@ def fast(): dagger = dict(total_timesteps=50) bc = dict(train_kwargs=dict(n_batches=50)) sqil = dict(total_timesteps=50) + + +hyperparam_dir = pathlib.Path(__file__).absolute().parent / "tuned_hps" +tuned_alg_envs = [ + "bc_seals_ant", + "bc_seals_half_cheetah", + "bc_seals_hopper", + "bc_seals_swimmer", + "bc_seals_walker", + "dagger_seals_ant", + "dagger_seals_half_cheetah", + "dagger_seals_hopper", + "dagger_seals_swimmer", + "dagger_seals_walker", +] + +for tuned_alg_env in tuned_alg_envs: + config_file = hyperparam_dir / f"{tuned_alg_env}_best_hp_eval.json" + assert config_file.is_file(), f"{config_file} does not exist" + train_imitation_ex.add_named_config(tuned_alg_env, str(config_file)) diff --git a/src/imitation/scripts/config/tuned_hps/airl_seals_ant_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/airl_seals_ant_best_hp_eval.json new file mode 100644 index 000000000..d4131433e --- /dev/null +++ b/src/imitation/scripts/config/tuned_hps/airl_seals_ant_best_hp_eval.json @@ -0,0 +1,67 @@ +{ + "algorithm_kwargs": { + "demo_batch_size": 8192, + "gen_replay_buffer_capacity": 8192, + "n_disc_updates_per_round": 16 + }, + "checkpoint_interval": 0, + "demonstrations": { + "source": "huggingface", + "algo_name": "ppo", + "n_expert_demos": null + }, + "reward": { + "add_std_alpha": null, + "ensemble_size": null, + "net_cls": { + "py/type": "imitation.rewards.reward_nets.BasicShapedRewardNet" + }, + "net_kwargs": { + "normalize_input_layer": { + "py/type": "imitation.util.networks.RunningNorm" + } + }, + "normalize_output_layer": { + "py/type": "imitation.util.networks.RunningNorm" + } + }, + "rl": { + "batch_size": 8192, + "rl_cls": { + "py/type": "stable_baselines3.ppo.ppo.PPO" + }, + "rl_kwargs": { + "batch_size": 16, + "clip_range": 0.3, + "ent_coef": 3.27750078482474e-6, + "gae_lambda": 0.8, + "gamma": 0.995, + "learning_rate": 3.249429831179079e-5, + "max_grad_norm": 0.9, + "n_epochs": 10, + "vf_coef": 0.4351450387648799 + } + }, + "total_timesteps": 10000000, + "policy": { + "policy_cls": { + "py/type": "imitation.policies.base.FeedForward32Policy" + }, + "policy_kwargs": { + "features_extractor_class": { + "py/type": "imitation.policies.base.NormalizeFeaturesExtractor" + }, + "features_extractor_kwargs": { + "normalize_class": { + "py/type": "imitation.util.networks.RunningNorm" + } + } + } + }, + "policy_evaluation": { + "n_episodes_eval": 50 + }, + "environment": { + "gym_id": "seals/Ant-v1" + } +} diff --git a/benchmarking/airl_seals_half_cheetah_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/airl_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from 
benchmarking/airl_seals_half_cheetah_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/airl_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/airl_seals_hopper_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/airl_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/airl_seals_hopper_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/airl_seals_hopper_best_hp_eval.json diff --git a/benchmarking/airl_seals_swimmer_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/airl_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/airl_seals_swimmer_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/airl_seals_swimmer_best_hp_eval.json diff --git a/src/imitation/scripts/config/tuned_hps/airl_seals_walker_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/airl_seals_walker_best_hp_eval.json new file mode 100644 index 000000000..edd99806d --- /dev/null +++ b/src/imitation/scripts/config/tuned_hps/airl_seals_walker_best_hp_eval.json @@ -0,0 +1,86 @@ +{ + "algorithm_kwargs": { + "demo_batch_size": 512, + "gen_replay_buffer_capacity": 16384, + "n_disc_updates_per_round": 16 + }, + "checkpoint_interval": 0, + "demonstrations": { + "source": "huggingface", + "algo_name": "ppo", + "n_expert_demos": null + }, + "expert": { + "loader_kwargs": { + "gym_id": "seals/Walker2d-v1", + "organization": "HumanCompatibleAI" + } + }, + "reward": { + "add_std_alpha": null, + "ensemble_size": null, + "net_cls": { + "py/type": "imitation.rewards.reward_nets.BasicShapedRewardNet" + }, + "net_kwargs": { + "normalize_input_layer": { + "py/type": "imitation.util.networks.RunningNorm" + } + }, + "normalize_output_layer": { + "py/type": "imitation.util.networks.RunningNorm" + } + }, + "rl": { + "batch_size": 16384, + "rl_cls": { + "py/type": "stable_baselines3.ppo.ppo.PPO" + }, + "rl_kwargs": { + "batch_size": 128, + "clip_range": 0.4, + "ent_coef": 0.002003867232707145, + "gae_lambda": 0.92, + "gamma": 0.98, + "learning_rate": 3.052170958603811e-5, + "max_grad_norm": 0.6, + "n_epochs": 20, + "vf_coef": 0.6167177795726859 + } + }, + "total_timesteps": 10000000, + "policy": { + "policy_cls": "MlpPolicy", + "policy_kwargs": { + "activation_fn": { + "py/type": "torch.nn.modules.activation.ReLU" + }, + "features_extractor_class": { + "py/type": "imitation.policies.base.NormalizeFeaturesExtractor" + }, + "features_extractor_kwargs": { + "normalize_class": { + "py/type": "imitation.util.networks.RunningNorm" + } + }, + "net_arch": [ + { + "pi": [ + 64, + 64 + ], + "vf": [ + 64, + 64 + ] + } + ] + } + }, + "policy_evaluation": { + "n_episodes_eval": 50 + }, + "environment": { + "gym_id": "seals/Walker2d-v1" + } +} diff --git a/benchmarking/bc_seals_ant_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/bc_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/bc_seals_ant_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/bc_seals_ant_best_hp_eval.json diff --git a/benchmarking/bc_seals_half_cheetah_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/bc_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/bc_seals_half_cheetah_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/bc_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/bc_seals_hopper_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/bc_seals_hopper_best_hp_eval.json similarity index 100% rename from 
benchmarking/bc_seals_hopper_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/bc_seals_hopper_best_hp_eval.json diff --git a/benchmarking/bc_seals_swimmer_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/bc_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/bc_seals_swimmer_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/bc_seals_swimmer_best_hp_eval.json diff --git a/benchmarking/bc_seals_walker_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/bc_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/bc_seals_walker_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/bc_seals_walker_best_hp_eval.json diff --git a/benchmarking/dagger_seals_ant_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/dagger_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/dagger_seals_ant_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/dagger_seals_ant_best_hp_eval.json diff --git a/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/dagger_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/dagger_seals_half_cheetah_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/dagger_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/dagger_seals_hopper_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/dagger_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/dagger_seals_hopper_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/dagger_seals_hopper_best_hp_eval.json diff --git a/benchmarking/dagger_seals_swimmer_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/dagger_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/dagger_seals_swimmer_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/dagger_seals_swimmer_best_hp_eval.json diff --git a/benchmarking/dagger_seals_walker_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/dagger_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/dagger_seals_walker_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/dagger_seals_walker_best_hp_eval.json diff --git a/benchmarking/fast_dagger_seals_cartpole.json b/src/imitation/scripts/config/tuned_hps/fast_dagger_seals_cartpole.json similarity index 100% rename from benchmarking/fast_dagger_seals_cartpole.json rename to src/imitation/scripts/config/tuned_hps/fast_dagger_seals_cartpole.json diff --git a/benchmarking/gail_seals_ant_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/gail_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/gail_seals_ant_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/gail_seals_ant_best_hp_eval.json diff --git a/benchmarking/gail_seals_half_cheetah_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/gail_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/gail_seals_half_cheetah_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/gail_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/gail_seals_hopper_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/gail_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/gail_seals_hopper_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/gail_seals_hopper_best_hp_eval.json diff 
--git a/benchmarking/gail_seals_swimmer_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/gail_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/gail_seals_swimmer_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/gail_seals_swimmer_best_hp_eval.json diff --git a/benchmarking/gail_seals_walker_best_hp_eval.json b/src/imitation/scripts/config/tuned_hps/gail_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/gail_seals_walker_best_hp_eval.json rename to src/imitation/scripts/config/tuned_hps/gail_seals_walker_best_hp_eval.json diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py index 0a93943ef..cbae34688 100644 --- a/tests/test_benchmarking.py +++ b/tests/test_benchmarking.py @@ -1,13 +1,9 @@ -"""Tests for config files in benchmarking/ folder.""" -import pathlib +"""Tests for config files in imitation/scripts/config/tuned_hps/ folder.""" import pytest from imitation.scripts import train_adversarial, train_imitation, tuning -THIS_DIR = pathlib.Path(__file__).absolute().parent -BENCHMARKING_DIR = THIS_DIR.parent / "benchmarking" - ALGORITHMS = ["bc", "dagger", "airl", "gail"] ENVIRONMENTS = [ "seals_walker", @@ -25,7 +21,6 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str): # because running the configs requires MuJoCo. # Requiring MuJoCo to run the tests adds too much complexity. - # GIVEN if algorithm in ("bc", "dagger"): experiment = train_imitation.train_imitation_ex elif algorithm in ("airl", "gail"): @@ -34,15 +29,7 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str): raise ValueError(f"Unknown algorithm: {algorithm}") # pragma: no cover config_name = f"{algorithm}_{environment}" - config_file = str( - BENCHMARKING_DIR / f"{algorithm}_{environment}_best_hp_eval.json", - ) - - # WHEN - experiment.add_named_config(config_name, config_file) run = experiment.run(command_name="print_config", named_configs=[config_name]) - - # THEN assert run.status == "COMPLETED" @@ -58,7 +45,7 @@ def test_tuning_print_config_succeeds(algorithm: str, environment: str): named_configs=[algorithm], config_updates=dict( parallel_run_config=dict( - base_named_configs=[environment], + base_named_configs=[f"{algorithm}_{environment}"], ), ), ) diff --git a/tests/test_experiments.py b/tests/test_experiments.py index b2417a9f9..9efb1be33 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -3,6 +3,7 @@ import glob import os import pathlib +import re import subprocess from typing import List @@ -18,30 +19,31 @@ ) THIS_DIR = pathlib.Path(__file__).absolute().parent -BENCHMARKING_DIR = THIS_DIR.parent / "benchmarking" +BENCHMARKING_DIR = THIS_DIR.parent / "src/imitation/scripts/config/tuned_hps" EXPERIMENTS_DIR = THIS_DIR.parent / "experiments" COMMANDS_PY_PATH = EXPERIMENTS_DIR / "commands.py" -EXPECTED_LOCAL_CONFIG_TEMPLATE = """python -m imitation.scripts.train_imitation dagger \ ---capture=sys --name=run0 --file_storage={output_dir}/sacred/\ -$USER-cmd-run0-dagger-0-8bf911a8 \ -with benchmarking/fast_dagger_seals_cartpole.json \ -seed=0 logging.log_root={output_dir}""" +EXPECTED_LOCAL_CONFIG_TEMPLATE = f"""python -m imitation.scripts.train_imitation \ +dagger --capture=sys --name=run0 --file_storage={{output_dir}}/sacred/\ +$USER-cmd-run0-dagger-0-72542943 \ +with {BENCHMARKING_DIR}/fast_dagger_seals_cartpole.json \ +seed=0 logging.log_root={{output_dir}}""" -EXPECTED_HOFVARPNIR_CONFIG_TEMPLATE = """ctl job run \ ---name 
$USER-cmd-run0-dagger-0-c3ac179d \ +BENCHMARKING_DIR_SUFFIX = re.sub(r".*/src/", "", str(BENCHMARKING_DIR)) +EXPECTED_HOFVARPNIR_CONFIG_TEMPLATE = f"""ctl job run \ +--name $USER-cmd-run0-dagger-0-aab021ce \ --command "python -m imitation.scripts.train_imitation dagger \ ---capture=sys --name=run0 --file_storage={output_dir}/sacred/\ -$USER-cmd-run0-dagger-0-c3ac179d \ -with /data/imitation/benchmarking/fast_dagger_seals_cartpole.json \ -seed=0 logging.log_root={output_dir}" \ +--capture=sys --name=run0 --file_storage={{output_dir}}/sacred/\ +$USER-cmd-run0-dagger-0-aab021ce \ +with /data/imitation/src/{BENCHMARKING_DIR_SUFFIX}/fast_dagger_seals_cartpole.json \ +seed=0 logging.log_root={{output_dir}}" \ --container hacobe/devbox:imitation \ --login --force-pull --never-restart --gpu 0 \ --shared-host-dir-mount /data""" def _get_benchmarking_path(benchmarking_file): - return os.path.join(BENCHMARKING_DIR.stem, benchmarking_file) + return os.path.join(BENCHMARKING_DIR, benchmarking_file) def _run_commands_from_flags(**kwargs) -> List[str]: @@ -148,10 +150,10 @@ def test_commands_local_config_with_custom_flags(): output_dir="/foo/bar", ) assert len(commands) == 1 - expected = """python -m imitation.scripts.train_imitation dagger \ + expected = f"""python -m imitation.scripts.train_imitation dagger \ --capture=sys --name=baz --file_storage=/foo/bar/sacred/\ -$USER-cmd-baz-dagger-1-8bf911a8 \ -with benchmarking/fast_dagger_seals_cartpole.json \ +$USER-cmd-baz-dagger-1-72542943 \ +with {BENCHMARKING_DIR}/fast_dagger_seals_cartpole.json \ seed=1 logging.log_root=/foo/bar""" assert commands[0] == expected @@ -248,10 +250,10 @@ def test_commands_bc_config(): cfg_pattern = _get_benchmarking_path("bc_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 - expected = """python -m imitation.scripts.train_imitation bc \ + expected = f"""python -m imitation.scripts.train_imitation bc \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-bc-0-78e5112a \ -with benchmarking/bc_seals_ant_best_hp_eval.json \ +$USER-cmd-run0-bc-0-47a528c5 \ +with {BENCHMARKING_DIR}/bc_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -262,10 +264,10 @@ def test_commands_dagger_config(): cfg_pattern = _get_benchmarking_path("dagger_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 - expected = """python -m imitation.scripts.train_imitation dagger \ + expected = f"""python -m imitation.scripts.train_imitation dagger \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-dagger-0-c27812cf \ -with benchmarking/dagger_seals_ant_best_hp_eval.json \ +$USER-cmd-run0-dagger-0-efa42a6a \ +with {BENCHMARKING_DIR}/dagger_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -276,10 +278,10 @@ def test_commands_gail_config(): cfg_pattern = _get_benchmarking_path("gail_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 - expected = """python -m imitation.scripts.train_adversarial gail \ + expected = f"""python -m imitation.scripts.train_adversarial gail \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-gail-0-9d8d1202 \ -with benchmarking/gail_seals_ant_best_hp_eval.json \ +$USER-cmd-run0-gail-0-9b83299d \ +with {BENCHMARKING_DIR}/gail_seals_ant_best_hp_eval.json \ seed=0 
logging.log_root=output""" assert commands[0] == expected @@ -290,10 +292,10 @@ def test_commands_airl_config(): cfg_pattern = _get_benchmarking_path("airl_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 - expected = """python -m imitation.scripts.train_adversarial airl \ + expected = f"""python -m imitation.scripts.train_adversarial airl \ --capture=sys --name=run0 \ ---file_storage=output/sacred/$USER-cmd-run0-airl-0-9ed3120d \ -with benchmarking/airl_seals_ant_best_hp_eval.json \ +--file_storage=output/sacred/$USER-cmd-run0-airl-0-9cc929a8 \ +with {BENCHMARKING_DIR}/airl_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected From 35c7265d836ac9e2c0f2cd0b7b3d19ccf98d0340 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 5 Oct 2023 05:42:20 +0530 Subject: [PATCH 46/54] Bump cache version & remove unnecessary files --- .circleci/config.yml | 12 +-- .../config/airl_seals_ant_best_hp_eval.json | 67 --------------- .../airl_seals_walker_best_hp_eval.json | 86 ------------------- 3 files changed, 6 insertions(+), 159 deletions(-) delete mode 100644 src/imitation/scripts/config/airl_seals_ant_best_hp_eval.json delete mode 100644 src/imitation/scripts/config/airl_seals_walker_best_hp_eval.json diff --git a/.circleci/config.yml b/.circleci/config.yml index 029bf4cd6..b8aff85cb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -65,7 +65,7 @@ commands: # Download and cache dependencies - restore_cache: keys: - - v8linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} + - v9linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} - run: name: install dependencies @@ -75,7 +75,7 @@ commands: - save_cache: paths: - /venv - key: v8linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} + key: v9linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} - run: name: install imitation @@ -96,7 +96,7 @@ commands: - restore_cache: keys: - - v7macos-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} + - v8macos-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} - run: name: install dependencies @@ -108,7 +108,7 @@ commands: - save_cache: paths: - ~/venv - key: v7macos-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} + key: v8macos-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} - run: name: install imitation @@ -138,7 +138,7 @@ commands: # Download and cache dependencies - restore_cache: keys: - - v10win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }} + - v11win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }} - run: name: install python and binary dependencies @@ -168,7 +168,7 @@ commands: - save_cache: paths: - .\venv - key: v10win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }} + key: v11win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }} - run: name: install imitation diff --git a/src/imitation/scripts/config/airl_seals_ant_best_hp_eval.json b/src/imitation/scripts/config/airl_seals_ant_best_hp_eval.json deleted file mode 100644 index d4131433e..000000000 --- a/src/imitation/scripts/config/airl_seals_ant_best_hp_eval.json 
+++ /dev/null @@ -1,67 +0,0 @@ -{ - "algorithm_kwargs": { - "demo_batch_size": 8192, - "gen_replay_buffer_capacity": 8192, - "n_disc_updates_per_round": 16 - }, - "checkpoint_interval": 0, - "demonstrations": { - "source": "huggingface", - "algo_name": "ppo", - "n_expert_demos": null - }, - "reward": { - "add_std_alpha": null, - "ensemble_size": null, - "net_cls": { - "py/type": "imitation.rewards.reward_nets.BasicShapedRewardNet" - }, - "net_kwargs": { - "normalize_input_layer": { - "py/type": "imitation.util.networks.RunningNorm" - } - }, - "normalize_output_layer": { - "py/type": "imitation.util.networks.RunningNorm" - } - }, - "rl": { - "batch_size": 8192, - "rl_cls": { - "py/type": "stable_baselines3.ppo.ppo.PPO" - }, - "rl_kwargs": { - "batch_size": 16, - "clip_range": 0.3, - "ent_coef": 3.27750078482474e-6, - "gae_lambda": 0.8, - "gamma": 0.995, - "learning_rate": 3.249429831179079e-5, - "max_grad_norm": 0.9, - "n_epochs": 10, - "vf_coef": 0.4351450387648799 - } - }, - "total_timesteps": 10000000, - "policy": { - "policy_cls": { - "py/type": "imitation.policies.base.FeedForward32Policy" - }, - "policy_kwargs": { - "features_extractor_class": { - "py/type": "imitation.policies.base.NormalizeFeaturesExtractor" - }, - "features_extractor_kwargs": { - "normalize_class": { - "py/type": "imitation.util.networks.RunningNorm" - } - } - } - }, - "policy_evaluation": { - "n_episodes_eval": 50 - }, - "environment": { - "gym_id": "seals/Ant-v1" - } -} diff --git a/src/imitation/scripts/config/airl_seals_walker_best_hp_eval.json b/src/imitation/scripts/config/airl_seals_walker_best_hp_eval.json deleted file mode 100644 index edd99806d..000000000 --- a/src/imitation/scripts/config/airl_seals_walker_best_hp_eval.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "algorithm_kwargs": { - "demo_batch_size": 512, - "gen_replay_buffer_capacity": 16384, - "n_disc_updates_per_round": 16 - }, - "checkpoint_interval": 0, - "demonstrations": { - "source": "huggingface", - "algo_name": "ppo", - "n_expert_demos": null - }, - "expert": { - "loader_kwargs": { - "gym_id": "seals/Walker2d-v1", - "organization": "HumanCompatibleAI" - } - }, - "reward": { - "add_std_alpha": null, - "ensemble_size": null, - "net_cls": { - "py/type": "imitation.rewards.reward_nets.BasicShapedRewardNet" - }, - "net_kwargs": { - "normalize_input_layer": { - "py/type": "imitation.util.networks.RunningNorm" - } - }, - "normalize_output_layer": { - "py/type": "imitation.util.networks.RunningNorm" - } - }, - "rl": { - "batch_size": 16384, - "rl_cls": { - "py/type": "stable_baselines3.ppo.ppo.PPO" - }, - "rl_kwargs": { - "batch_size": 128, - "clip_range": 0.4, - "ent_coef": 0.002003867232707145, - "gae_lambda": 0.92, - "gamma": 0.98, - "learning_rate": 3.052170958603811e-5, - "max_grad_norm": 0.6, - "n_epochs": 20, - "vf_coef": 0.6167177795726859 - } - }, - "total_timesteps": 10000000, - "policy": { - "policy_cls": "MlpPolicy", - "policy_kwargs": { - "activation_fn": { - "py/type": "torch.nn.modules.activation.ReLU" - }, - "features_extractor_class": { - "py/type": "imitation.policies.base.NormalizeFeaturesExtractor" - }, - "features_extractor_kwargs": { - "normalize_class": { - "py/type": "imitation.util.networks.RunningNorm" - } - }, - "net_arch": [ - { - "pi": [ - 64, - 64 - ], - "vf": [ - 64, - 64 - ] - } - ] - } - }, - "policy_evaluation": { - "n_episodes_eval": 50 - }, - "environment": { - "gym_id": "seals/Walker2d-v1" - } -} From fdf4f4903fd8ff5769aaebfc5f0f501bb3e73e64 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: 
Thu, 5 Oct 2023 06:34:41 +0530 Subject: [PATCH 47/54] Include tuned hyperparam json files in package data --- .circleci/config.yml | 12 ++++++------ setup.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b8aff85cb..029bf4cd6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -65,7 +65,7 @@ commands: # Download and cache dependencies - restore_cache: keys: - - v9linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} + - v8linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} - run: name: install dependencies @@ -75,7 +75,7 @@ commands: - save_cache: paths: - /venv - key: v9linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} + key: v8linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} - run: name: install imitation @@ -96,7 +96,7 @@ commands: - restore_cache: keys: - - v8macos-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} + - v7macos-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} - run: name: install dependencies @@ -108,7 +108,7 @@ commands: - save_cache: paths: - ~/venv - key: v8macos-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} + key: v7macos-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }} - run: name: install imitation @@ -138,7 +138,7 @@ commands: # Download and cache dependencies - restore_cache: keys: - - v11win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }} + - v10win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }} - run: name: install python and binary dependencies @@ -168,7 +168,7 @@ commands: - save_cache: paths: - .\venv - key: v11win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }} + key: v10win-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.ps1" }} - run: name: install imitation diff --git a/setup.py b/setup.py index 0384014ee..1c069c463 100644 --- a/setup.py +++ b/setup.py @@ -181,7 +181,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: python_requires=">=3.8.0", packages=find_packages("src"), package_dir={"": "src"}, - package_data={"imitation": ["py.typed", "envs/examples/airl_envs/assets/*.xml"]}, + package_data={"imitation": ["py.typed", "scripts/config/tuned_hps/*.json"]}, # Note: while we are strict with our test and doc requirement versions, we try to # impose as little restrictions on the install requirements as possible. Try to # encode only known incompatibilities here. 
This prevents nasty dependency issues From 5f9a4e633988a0f8d319c1d93f41c0cf1814f01a Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 5 Oct 2023 07:44:00 +0530 Subject: [PATCH 48/54] Update storage hash --- tests/test_experiments.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 9efb1be33..f6b4a8e39 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -25,7 +25,7 @@ EXPECTED_LOCAL_CONFIG_TEMPLATE = f"""python -m imitation.scripts.train_imitation \ dagger --capture=sys --name=run0 --file_storage={{output_dir}}/sacred/\ -$USER-cmd-run0-dagger-0-72542943 \ +$USER-cmd-run0-dagger-0-152b2005 \ with {BENCHMARKING_DIR}/fast_dagger_seals_cartpole.json \ seed=0 logging.log_root={{output_dir}}""" @@ -152,7 +152,7 @@ def test_commands_local_config_with_custom_flags(): assert len(commands) == 1 expected = f"""python -m imitation.scripts.train_imitation dagger \ --capture=sys --name=baz --file_storage=/foo/bar/sacred/\ -$USER-cmd-baz-dagger-1-72542943 \ +$USER-cmd-baz-dagger-1-152b2005 \ with {BENCHMARKING_DIR}/fast_dagger_seals_cartpole.json \ seed=1 logging.log_root=/foo/bar""" assert commands[0] == expected @@ -252,7 +252,7 @@ def test_commands_bc_config(): assert len(commands) == 1 expected = f"""python -m imitation.scripts.train_imitation bc \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-bc-0-47a528c5 \ +$USER-cmd-run0-bc-0-f3ab1f87 \ with {BENCHMARKING_DIR}/bc_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -266,7 +266,7 @@ def test_commands_dagger_config(): assert len(commands) == 1 expected = f"""python -m imitation.scripts.train_imitation dagger \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-dagger-0-efa42a6a \ +$USER-cmd-run0-dagger-0-76c1212c \ with {BENCHMARKING_DIR}/dagger_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -280,7 +280,7 @@ def test_commands_gail_config(): assert len(commands) == 1 expected = f"""python -m imitation.scripts.train_adversarial gail \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-gail-0-9b83299d \ +$USER-cmd-run0-gail-0-351c205f \ with {BENCHMARKING_DIR}/gail_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -294,7 +294,7 @@ def test_commands_airl_config(): assert len(commands) == 1 expected = f"""python -m imitation.scripts.train_adversarial airl \ --capture=sys --name=run0 \ ---file_storage=output/sacred/$USER-cmd-run0-airl-0-9cc929a8 \ +--file_storage=output/sacred/$USER-cmd-run0-airl-0-3662206a \ with {BENCHMARKING_DIR}/airl_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected From 91bb785f77892c3ec936f5f008700b27fb1ff5fe Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 5 Oct 2023 21:47:57 +0530 Subject: [PATCH 49/54] Update search space of bc --- src/imitation/scripts/config/tuning.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/imitation/scripts/config/tuning.py b/src/imitation/scripts/config/tuning.py index 07161d04c..73313770a 100644 --- a/src/imitation/scripts/config/tuning.py +++ b/src/imitation/scripts/config/tuning.py @@ -49,24 +49,24 @@ def bc(): search_space={ "config_updates": { "bc": dict( - batch_size=tune.choice([8]), + batch_size=tune.choice([8, 16, 32, 64]), 
l2_weight=tune.loguniform(1e-6, 1e-2), # L2 regularization weight optimizer_kwargs=dict( lr=tune.loguniform(1e-5, 1e-2), ), train_kwargs=dict( - n_epochs=tune.choice([1]), + n_epochs=tune.choice([1, 5, 10, 20]), ), ), }, "command_name": "bc", }, - num_samples=2, - repeat=2, + num_samples=64, + repeat=3, resources_per_trial=dict(cpu=1), ) - num_eval_seeds = 1 + num_eval_seeds = 5 eval_best_trial_resource_multiplier = 1 From f59fea232d1af874a5f387407591d450444fce0c Mon Sep 17 00:00:00 2001 From: ZiyueWang25 Date: Thu, 5 Oct 2023 11:40:01 -0700 Subject: [PATCH 50/54] update benchmark and hyper parameter tuning readme --- benchmarking/README.md | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index ba89da69d..fb21a0223 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -7,7 +7,7 @@ Configuration files can be loaded either from the CLI or from the Python API. Th ## CLI ```bash -python -m imitation.scripts. with benchmarking/.json +python -m imitation.scripts. with src/imitation/config/tuned_hps/.json ``` `train_script` can be either 1) `train_imitation` with `algo` as `bc` or `dagger` or 2) `train_adversarial` with `algo` as `gail` or `airl`. @@ -16,26 +16,27 @@ python -m imitation.scripts. with benchmarking/ import -.run(command_name="", named_configs=["benchmarking/.json"]) +.run(command_name="", named_configs=["src/imitation/config/tuned_hps/.json"]) ``` # Tuning Hyperparameters -The hyperparameters of any algorithm in imitation can be tuned using the `tuning.py` script. +The hyperparameters of any algorithm in imitation can be tuned using the `scripts/tuning.py`. The benchmarking hyperparameter configs were generated by tuning the hyperparameters using -the search space defined in the `tuning_config.py` script. The tuning script proceeds in two -phases: 1) The hyperparameters are tuned using the search space provided, and 2) the best -hyperparameter config found in the first phase based on the maximum mean return is -re-evaluated on a separate set of seeds, and the mean and standard deviation of these trials -are reported. +the search space defined in the `scripts/config/tuning.py`. -To tune the hyperparameters of an algorithm using the default search space provided: +The tuning script proceeds in two phases: +1. Tune the hyperparameters using the search space provided. +2. Re-evaluate the best hyperparameter config found in the first phase based on the maximum mean return on a separate set of seeds. Report the mean and standard deviation of these trials. + +To use it with the default search space: ```bash -python tuning.py with {algo} 'parallel_run_config.base_named_configs=["{env}"]' +python src/imitation/scripts/tuning.py with 'parallel_run_config.base_named_configs=[""]' ``` -In this command, `{algo}` provides the default search space and settings to be used for -the specific algorithm, which is defined in the `tuning_config.py` script and -`'parallel_run_config.base_named_configs=["{env}"]'` sets the environment to tune the algorithm in. -See the documentation of `tuning.py` and `parallel.py` scripts for many other arguments that can be +In this command: +- `` provides the default search space and settings for the specific algorithm, which is defined in the `scripts/config/tuning.py` +- `` sets the environment to tune the algorithm in. They are defined in the algo-specifc `scripts/config/train_[adversarial/imitation/preference_comparisons/rl].py` files. 
+
+See the documentation of `scripts/tuning.py` and `scripts/parallel.py` for many other arguments that can be
 provided through the command line to change the tuning behavior.

From 95110dc21673cbcaef04a432d0fd38b874ecb501 Mon Sep 17 00:00:00 2001
From: Mohammad Taufeeque <9taufeeque9@gmail.com>
Date: Fri, 6 Oct 2023 00:30:24 +0530
Subject: [PATCH 51/54] Update README.md

---
 benchmarking/README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmarking/README.md b/benchmarking/README.md
index fb21a0223..7e7e7c652 100644
--- a/benchmarking/README.md
+++ b/benchmarking/README.md
@@ -1,22 +1,22 @@
 # Benchmarking imitation

-This directory contains sacred configuration files for benchmarking imitation's algorithms. For v0.3.2, these correspond to the hyperparameters used in the paper [imitation: Clean Imitation Learning Implementations](https://www.rocamonde.com/publication/gleave-imitation-2022/).
+The `src/imitation/scripts/config/tuned_hps` directory provides the tuned hyperparameter configs for benchmarking imitation. For v0.4.0, these correspond to the hyperparameters used in the paper [imitation: Clean Imitation Learning Implementations](https://www.rocamonde.com/publication/gleave-imitation-2022/).

-Configuration files can be loaded either from the CLI or from the Python API. The examples below assume that your current working directory is the root of the `imitation` repository. This is not necessarily the case and you should adjust your paths accordingly.
+Configuration files can be loaded either from the CLI or from the Python API.

 ## CLI

 ```bash
-python -m imitation.scripts.<train_script> <algo> with src/imitation/config/tuned_hps/<algo>_<env>_best_hp_eval.json
+python -m imitation.scripts.<train_script> <algo> with <algo>_<env>
 ```
-`train_script` can be either 1) `train_imitation` with `algo` as `bc` or `dagger` or 2) `train_adversarial` with `algo` as `gail` or `airl`.
+`train_script` can be either 1) `train_imitation` with `algo` as `bc` or `dagger` or 2) `train_adversarial` with `algo` as `gail` or `airl`. The `env` can be any of `seals_ant`, `seals_half_cheetah`, `seals_hopper`, `seals_swimmer`, or `seals_walker`. The hyperparameters for other environments are not tuned yet. You can either use the tuned hyperparameters for any of the other environments or tune the hyperparameters using the `tuning` script.

 ## Python

 ```python
 ...
 from imitation.scripts.<train_script> import <train_script>_ex
-<train_script>_ex.run(command_name="<algo>", named_configs=["src/imitation/config/tuned_hps/<algo>_<env>_best_hp_eval.json"])
+<train_script>_ex.run(command_name="<algo>", named_configs=["<algo>_<env>"])
 ```

 # Tuning Hyperparameters

@@ -31,12 +31,12 @@ The tuning script proceeds in two phases:

 To use it with the default search space:
 ```bash
-python src/imitation/scripts/tuning.py with <algo> 'parallel_run_config.base_named_configs=["<env>"]'
+python -m imitation.scripts.tuning with <algo> 'parallel_run_config.base_named_configs=["<env>"]'
 ```

 In this command:
-- `<algo>` provides the default search space and settings for the specific algorithm, which is defined in the `scripts/config/tuning.py`
-- `<env>` sets the environment to tune the algorithm in. They are defined in the algo-specifc `scripts/config/train_[adversarial/imitation/preference_comparisons/rl].py` files.
+- `<algo>` provides the default search space and settings for the specific algorithm, which is defined in `scripts/config/tuning.py`
+- `<env>` sets the environment to tune the algorithm in. These are defined in the algo-specific `scripts/config/train_[adversarial/imitation/preference_comparisons/rl].py` files. For the already tuned environments, use the `<algo>_<env>` named configs here.

 See the documentation of `scripts/tuning.py` and `scripts/parallel.py` for many other arguments that can be
 provided through the command line to change the tuning behavior.
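For illustration, a minimal sketch of the Python API usage described in the README above, using one of the tuned named configs registered in `scripts/config/train_imitation.py`. This is only an assumed example invocation (mirroring `tests/test_benchmarking.py`), not part of the patch series; it assumes `imitation` is installed with the packaged `tuned_hps` JSON files, and it runs `print_config` because actually training requires MuJoCo:

```python
# Minimal sketch: run a Sacred experiment with a tuned-hyperparameter named config.
# Assumes the "bc_seals_half_cheetah" named config has been registered from the
# tuned_hps directory by scripts/config/train_imitation.py.
from imitation.scripts import train_imitation

run = train_imitation.train_imitation_ex.run(
    command_name="print_config",  # swap for "bc" to actually train (needs MuJoCo)
    named_configs=["bc_seals_half_cheetah"],
)
assert run.status == "COMPLETED"
```

The same pattern applies to `train_adversarial.train_adversarial_ex` with the `gail_*` and `airl_*` named configs.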
From 10ec8a2fe003f0bdc7e9440a6bd83f1fc43fed25 Mon Sep 17 00:00:00 2001 From: ZiyueWang25 Date: Thu, 5 Oct 2023 20:06:30 -0700 Subject: [PATCH 52/54] mce_irl_train --- src/imitation/scripts/train_imitation.py | 64 ++++++++++++++++++++++-- tests/scripts/test_scripts.py | 6 +++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index c47ed29bd..6c03684f5 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -1,21 +1,30 @@ """Trains DAgger on synthetic demonstrations generated from an expert policy.""" +from functools import partial import logging import os.path as osp import pathlib from typing import Any, Dict, Mapping, Optional, Sequence, cast + import numpy as np +import torch as th from sacred.observers import FileStorageObserver - -from imitation.algorithms import dagger as dagger_algorithm -from imitation.algorithms import sqil as sqil_algorithm +from seals import base_envs +from seals.diagnostics.cliff_world import CliffWorldEnv +from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv + +from imitation.algorithms import ( + dagger as dagger_algorithm, + sqil as sqil_algorithm, + mce_irl as mceirl_algorithm, +) from imitation.data import rollout, types from imitation.scripts.config.train_imitation import train_imitation_ex from imitation.scripts.ingredients import bc as bc_ingredient from imitation.scripts.ingredients import demonstrations, environment, expert from imitation.scripts.ingredients import logging as logging_ingredient -from imitation.scripts.ingredients import policy_evaluation +from imitation.scripts.ingredients import policy_evaluation, reward from imitation.util import util logger = logging.getLogger(__name__) @@ -185,6 +194,53 @@ def sqil( return stats +@train_imitation_ex.command +def mceirl( + mceirl: Mapping[str, Any], + optimizer_cls: th.optim.Optimizer, # not sure + optimizer_kwargs: Mapping[str, Any], + env_kwargs: Mapping[str, Any], + num_vec: int, + parallel: bool, + _run, + _rnd: np.random.Generator, +) -> Mapping[str, Mapping[str, float]]: + custom_logger, log_dir = logging_ingredient.setup_logging() + expert_trajs = demonstrations.get_expert_trajectories() + env_creator = partial(CliffWorldEnv, **env_kwargs) + env = env_creator() + + env_fns = [lambda: base_envs.ExposePOMDPStateWrapper(env_creator())] * num_vec + # This is just a vectorized environment because `generate_trajectories` expects one + if parallel: + # See GH hill-a/stable-baselines issue #217 + state_venv = SubprocVecEnv(env_fns, start_method="forkserver") + else: + state_venv = DummyVecEnv(env_fns) + + reward_net = reward.make_reward_net(state_venv) + mceirl_trainer = mceirl_algorithm.MCEIRL( + env=env, + demonstrations=expert_trajs, + reward_net=reward_net, + rng=_rnd, + optimizer_cls=optimizer_cls, + optimizer_kwargs=optimizer_kwargs, + discount=mceirl["discount"], + linf_eps=mceirl["linf_eps"], + grad_l2_eps=mceirl["grad_l2_eps"], + log_interval=mceirl["log_interval"], + custom_logger=custom_logger, + ) + mceirl_trainer.train( + max_iter=int(mceirl["max_iter"]), + ) + util.save_policy(mceirl_trainer.policy, policy_path=osp.join(log_dir, "final.th")) + imit_stats = policy_evaluation.eval_policy(mceirl_trainer.policy, state_venv) + stats = _collect_stats(imit_stats, expert_trajs) + return stats + + def main_console(): observer_path = pathlib.Path.cwd() / "output" / "sacred" / "train_imitation" observer = 
FileStorageObserver(observer_path) diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index ae39116e7..8dc748480 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -425,6 +425,12 @@ def test_train_bc_warmstart(tmpdir): assert isinstance(run_warmstart.result, dict) +def test_train_mceirl_main(mceirl_config): + run = train_imitation.train_imitation_ex.run(**mceirl_config) + assert run.status == "COMPLETED" + assert isinstance(run.result, dict) + + def test_train_sqil_main(sqil_config): # NOTE: Having four different expert types as in bc might be overkill for sqil run = train_imitation.train_imitation_ex.run(**sqil_config) From 7436784aa62291214bb899e3b1b06f48e703e385 Mon Sep 17 00:00:00 2001 From: ZiyueWang25 Date: Fri, 6 Oct 2023 15:44:17 -0700 Subject: [PATCH 53/54] add train_mce_irl script --- src/imitation/scripts/config/train_mce_irl.py | 48 ++++++++++ src/imitation/scripts/train_imitation.py | 64 +------------- src/imitation/scripts/train_mce_irl.py | 88 +++++++++++++++++++ 3 files changed, 140 insertions(+), 60 deletions(-) create mode 100644 src/imitation/scripts/config/train_mce_irl.py create mode 100644 src/imitation/scripts/train_mce_irl.py diff --git a/src/imitation/scripts/config/train_mce_irl.py b/src/imitation/scripts/config/train_mce_irl.py new file mode 100644 index 000000000..fb3aca2b1 --- /dev/null +++ b/src/imitation/scripts/config/train_mce_irl.py @@ -0,0 +1,48 @@ +"""Configuration for imitation.scripts.train_mce_irl.""" +import sacred +from torch import nn +import torch as th + +from imitation.scripts.ingredients import environment +from imitation.scripts.ingredients import logging as logging_ingredient +from imitation.scripts.ingredients import policy_evaluation, reward, rl + +train_mce_irl_ex = sacred.Experiment( + "train_mce_irl", + ingredients=[ + logging_ingredient.logging_ingredient, + environment.environment_ingredient, + reward.reward_ingredient, + rl.rl_ingredient, + policy_evaluation.policy_evaluation_ingredient, + ], +) + + +MUJOCO_SHARED_LOCALS = dict(rl=dict(rl_kwargs=dict(ent_coef=0.1))) +ANT_SHARED_LOCALS = dict( + total_timesteps=int(3e7), + rl=dict(batch_size=16384), +) + + +@train_mce_irl_ex.config +def train_defaults(): + mceirl = { + "discount": 1, + "linf_eps": 0.001, + "grad_l2_eps": 0.0001, + "log_interval": 100, + } + optimizer_cls = th.optim.Adam + optimizer_kwargs = dict( + lr=4e-4, + ) + env_kwargs = { + "height": 4, + "horizon": 40, + "width": 7, + "use_xy_obs": True, + } + num_vec = 8 # number of environments in VecEnv + parallel = False diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index 6c03684f5..2c280cc46 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -1,24 +1,15 @@ """Trains DAgger on synthetic demonstrations generated from an expert policy.""" -from functools import partial import logging import os.path as osp import pathlib -from typing import Any, Dict, Mapping, Optional, Sequence, cast - +from typing import Any, Dict, Mapping, Optional, Sequence, Type, cast import numpy as np -import torch as th from sacred.observers import FileStorageObserver -from seals import base_envs -from seals.diagnostics.cliff_world import CliffWorldEnv -from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv - -from imitation.algorithms import ( - dagger as dagger_algorithm, - sqil as sqil_algorithm, - mce_irl as mceirl_algorithm, -) + +from imitation.algorithms import dagger as 
dagger_algorithm +from imitation.algorithms import sqil as sqil_algorithm from imitation.data import rollout, types from imitation.scripts.config.train_imitation import train_imitation_ex from imitation.scripts.ingredients import bc as bc_ingredient @@ -194,53 +185,6 @@ def sqil( return stats -@train_imitation_ex.command -def mceirl( - mceirl: Mapping[str, Any], - optimizer_cls: th.optim.Optimizer, # not sure - optimizer_kwargs: Mapping[str, Any], - env_kwargs: Mapping[str, Any], - num_vec: int, - parallel: bool, - _run, - _rnd: np.random.Generator, -) -> Mapping[str, Mapping[str, float]]: - custom_logger, log_dir = logging_ingredient.setup_logging() - expert_trajs = demonstrations.get_expert_trajectories() - env_creator = partial(CliffWorldEnv, **env_kwargs) - env = env_creator() - - env_fns = [lambda: base_envs.ExposePOMDPStateWrapper(env_creator())] * num_vec - # This is just a vectorized environment because `generate_trajectories` expects one - if parallel: - # See GH hill-a/stable-baselines issue #217 - state_venv = SubprocVecEnv(env_fns, start_method="forkserver") - else: - state_venv = DummyVecEnv(env_fns) - - reward_net = reward.make_reward_net(state_venv) - mceirl_trainer = mceirl_algorithm.MCEIRL( - env=env, - demonstrations=expert_trajs, - reward_net=reward_net, - rng=_rnd, - optimizer_cls=optimizer_cls, - optimizer_kwargs=optimizer_kwargs, - discount=mceirl["discount"], - linf_eps=mceirl["linf_eps"], - grad_l2_eps=mceirl["grad_l2_eps"], - log_interval=mceirl["log_interval"], - custom_logger=custom_logger, - ) - mceirl_trainer.train( - max_iter=int(mceirl["max_iter"]), - ) - util.save_policy(mceirl_trainer.policy, policy_path=osp.join(log_dir, "final.th")) - imit_stats = policy_evaluation.eval_policy(mceirl_trainer.policy, state_venv) - stats = _collect_stats(imit_stats, expert_trajs) - return stats - - def main_console(): observer_path = pathlib.Path.cwd() / "output" / "sacred" / "train_imitation" observer = FileStorageObserver(observer_path) diff --git a/src/imitation/scripts/train_mce_irl.py b/src/imitation/scripts/train_mce_irl.py new file mode 100644 index 000000000..41e25d862 --- /dev/null +++ b/src/imitation/scripts/train_mce_irl.py @@ -0,0 +1,88 @@ +"""Train Finite-horizon tabular Maximum Causal Entropy IRL. + +Can be used as a CLI script, or the `train_mce_irl` function +can be called directly. 
+""" + +from functools import partial +import logging +import pathlib +import os.path as osp +from typing import Any, Mapping, Type + + +import numpy as np +import torch as th +from sacred.observers import FileStorageObserver +from seals import base_envs +from seals.diagnostics.cliff_world import CliffWorldEnv +from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv + +from imitation.algorithms import mce_irl as mceirl_algorithm +from imitation.data import rollout +from imitation.scripts.config.train_mce_irl import train_mce_irl_ex +from imitation.scripts.ingredients import demonstrations +from imitation.scripts.ingredients import logging as logging_ingredient +from imitation.scripts.ingredients import policy_evaluation, reward +from imitation.util import util + +logger = logging.getLogger(__name__) + + +@train_mce_irl_ex.command +def train_mce_irl( + mceirl: Mapping[str, Any], + optimizer_cls: Type[th.optim.Optimizer], + optimizer_kwargs: Mapping[str, Any], + env_kwargs: Mapping[str, Any], + num_vec: int, + parallel: bool, + _run, + _rnd: np.random.Generator, +) -> Mapping[str, Mapping[str, float]]: + custom_logger, log_dir = logging_ingredient.setup_logging() + expert_trajs = demonstrations.get_expert_trajectories() + env_creator = partial(CliffWorldEnv, **env_kwargs) + env = env_creator() + + env_fns = [lambda: base_envs.ExposePOMDPStateWrapper(env_creator())] * num_vec + # This is just a vectorized environment because `generate_trajectories` expects one + if parallel: + # See GH hill-a/stable-baselines issue #217 + state_venv = SubprocVecEnv(env_fns, start_method="forkserver") + else: + state_venv = DummyVecEnv(env_fns) + + reward_net = reward.make_reward_net(state_venv) + mceirl_trainer = mceirl_algorithm.MCEIRL( + demonstrations=expert_trajs, + env=env, + reward_net=reward_net, + rng=_rnd, + optimizer_cls=optimizer_cls, + optimizer_kwargs=optimizer_kwargs, + discount=mceirl["discount"], + linf_eps=mceirl["linf_eps"], + grad_l2_eps=mceirl["grad_l2_eps"], + log_interval=mceirl["log_interval"], + custom_logger=custom_logger, + ) + mceirl_trainer.train(max_iter=int(mceirl["max_iter"])) + util.save_policy(mceirl_trainer.policy, policy_path=osp.join(log_dir, "final.th")) + th.save(reward_net, osp.join(log_dir, "reward_net.pt")) + imit_stats = policy_evaluation.eval_policy(mceirl_trainer.policy, state_venv) + return { + "imit_stats": imit_stats, + "expert_stats": rollout.rollout_stats(expert_trajs), + } + + +def main_console(): + observer_path = pathlib.Path.cwd() / "output" / "sacred" / "train_mce_irl" + observer = FileStorageObserver(observer_path) + train_mce_irl_ex.observers.append(observer) + train_mce_irl_ex.run_commandline() + + +if __name__ == "__main__": # pragma: no cover + main_console() From 1ac7848b71913e1f44c4966e82434055788a7ac3 Mon Sep 17 00:00:00 2001 From: ZiyueWang25 Date: Fri, 6 Oct 2023 15:50:12 -0700 Subject: [PATCH 54/54] small fix --- src/imitation/scripts/train_imitation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index 2c280cc46..c47ed29bd 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -3,7 +3,7 @@ import logging import os.path as osp import pathlib -from typing import Any, Dict, Mapping, Optional, Sequence, Type, cast +from typing import Any, Dict, Mapping, Optional, Sequence, cast import numpy as np from sacred.observers import FileStorageObserver @@ -15,7 +15,7 @@ from 
imitation.scripts.ingredients import bc as bc_ingredient from imitation.scripts.ingredients import demonstrations, environment, expert from imitation.scripts.ingredients import logging as logging_ingredient -from imitation.scripts.ingredients import policy_evaluation, reward +from imitation.scripts.ingredients import policy_evaluation from imitation.util import util logger = logging.getLogger(__name__)
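
Below is a minimal sketch of the MCE IRL pipeline that the new `train_mce_irl` command wires together, runnable outside of Sacred. The CliffWorldEnv kwargs, optimizer settings, and MCEIRL arguments mirror the config defaults in the patches above; everything else is an assumption rather than part of this patch set: `reward_nets.BasicRewardNet` stands in for the `reward` ingredient's `make_reward_net`, `mce_partition_fh`/`mce_occupancy_measures` stand in for the `demonstrations` ingredient, and `max_iter` is passed explicitly because the config defaults shown above do not define a `mceirl["max_iter"]` entry.

    from functools import partial

    import numpy as np
    import torch as th
    from seals import base_envs
    from seals.diagnostics.cliff_world import CliffWorldEnv
    from stable_baselines3.common.vec_env import DummyVecEnv

    from imitation.algorithms.mce_irl import (
        MCEIRL,
        mce_occupancy_measures,
        mce_partition_fh,
    )
    from imitation.data import rollout
    from imitation.rewards import reward_nets

    rng = np.random.default_rng(0)

    # Tabular environment matching the env_kwargs defaults in config/train_mce_irl.py.
    env_creator = partial(CliffWorldEnv, height=4, horizon=40, width=7, use_xy_obs=True)
    env = env_creator()

    # Synthetic "expert" demonstrations: the optimal policy's state-occupancy measure.
    _, _, expert_pi = mce_partition_fh(env)
    _, expert_om = mce_occupancy_measures(env, pi=expert_pi)

    # Vectorized env exposing the underlying POMDP state, as the script builds for evaluation.
    state_venv = DummyVecEnv(
        [lambda: base_envs.ExposePOMDPStateWrapper(env_creator())] * 4,
    )

    # Stand-in for the reward ingredient: a state-only reward network.
    reward_net = reward_nets.BasicRewardNet(
        env.observation_space,
        env.action_space,
        use_action=False,
        use_next_state=False,
        use_done=False,
    )

    mceirl_trainer = MCEIRL(
        expert_om,
        env,
        reward_net,
        rng=rng,
        optimizer_cls=th.optim.Adam,
        optimizer_kwargs=dict(lr=4e-4),
        discount=1.0,
        linf_eps=1e-3,
        grad_l2_eps=1e-4,
        log_interval=100,
    )
    # max_iter supplied explicitly; the config defaults above do not set it.
    mceirl_trainer.train(max_iter=200)

    # Evaluate the recovered policy on the state-exposing vectorized env.
    trajs = rollout.rollout(
        mceirl_trainer.policy,
        state_venv,
        rollout.make_sample_until(min_episodes=10),
        rng=rng,
    )
    print(rollout.rollout_stats(trajs))

Evaluation uses envs wrapped in `ExposePOMDPStateWrapper`, as in the script, because the tabular policy learned by MCEIRL acts on the underlying state rather than the raw observation.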