From 2dde85c21abd5f532574db33bdd53dfe4dd98857 Mon Sep 17 00:00:00 2001
From: jordiclive <jordiclive19@imperial.ac.uk>
Date: Wed, 17 May 2023 13:42:36 +0100
Subject: [PATCH 1/6] add full evaluation into training loop. Other training
 changes for A100 node.

---
 bsmetadata/deepspeed_configs/v2.json          |  22 +-
 bsmetadata/evaluation.py                      | 212 ++++++++++--------
 .../experiments/with_metadata_datasetv2_tf.py |  27 ++-
 bsmetadata/hydra_configs/v2.yaml              |  21 +-
 bsmetadata/train.py                           |  79 +++++--
 5 files changed, 216 insertions(+), 145 deletions(-)

diff --git a/bsmetadata/deepspeed_configs/v2.json b/bsmetadata/deepspeed_configs/v2.json
index 1d5c0311..35a24626 100644
--- a/bsmetadata/deepspeed_configs/v2.json
+++ b/bsmetadata/deepspeed_configs/v2.json
@@ -30,19 +30,19 @@
         }
     },
     "zero_optimization": {
-        "stage": 1,
-        "allgather_partitions": true,
-        "allgather_bucket_size": 500000000,
-        "overlap_comm": true,
-        "reduce_scatter": true,
-        "reduce_bucket_size": 500000000,
-        "contiguous_gradients": true,
-        "cpu_offload": true
-    },
-    "gradient_accumulation_steps": 16,
+    "stage": 2,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 2e8,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 2e8,
+    "contiguous_gradients": true,
+    "cpu_offload": false
+},
+    "gradient_accumulation_steps": 1,
     "gradient_clipping": "auto",
     "steps_per_print": 100,
-    "train_batch_size": 256,
+    "train_batch_size": 512,
     "train_micro_batch_size_per_gpu": "auto",
     "wall_clock_breakdown": false
 }
\ No newline at end of file
diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py
index b75c2aa5..235e7263 100644
--- a/bsmetadata/evaluation.py
+++ b/bsmetadata/evaluation.py
@@ -264,73 +264,24 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
     return cfg.metadata_sep.join(sorted_metadata) + cfg.metadata_prefix_sep if sorted_metadata else ""
 
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--repo_id",
-        type=str,
-        default="bs-modeling-metadata/checkpoints_all_04_23",
-        help="Repository ID for the model to compute perplexity for",
-    )
-    parser.add_argument(
-        "--subfolder",
-        type=str,
-        default="checkpoint-2000step",
-        help="subfolder in the respository with the specific checkpoint to evaluate perplexity for",
-    )
-    parser.add_argument(
-        "--config_file_path",
-        type=str,
-        help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml",
-    )
-    parser.add_argument(
-        "--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to"
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU")
-    parser.add_argument(
-        "--save_data",
-        action="store_true",
-        help="If set to true, save tokens & losses",
-    )
-    parser.add_argument(
-        "--test",
-        action="store_true",
-        help="If set to true, the script runs in test mode and only takes 10 examples per dataset",
-    )
-    parser.add_argument(
-        "--max_n_examples",
-        type=int,
-        default=1500,
-        help="how many examples per metadata type to evaluate",
-    )
-    parser.add_argument(
-        "--metadata_to_test",
-        type=str,
-        default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph",
-        help="metadata types to test",
-    )
-    parser.add_argument(
-        "--untrained",
-        action="store_true",
-        help="If set to true, will load gpt2-xl",
-    )
-    parser.add_argument(
-        "--prompt",
-        action="store_true",
-        help="If set to true, the script evaluates metadata in prompt style",
-    )
-
-    args = parser.parse_args()
-    print(f"Parameters: {args}")
-
-    # Load config
-    if args.config_file_path:
-        config_file_path = args.config_file_path
-    else:
+def evaluate_main(
+    metadata_to_test: str = "title,html,entity_paragraph,website_desc,generation_datasource,timestamp",
+    output_file: str = "evaluation.txt",
+    repo_id: str = None,
+    subfolder: str = None,
+    test: bool = False,
+    max_n_examples: int = 1500,
+    prompt: bool = False,
+    no_cuda: bool = False,
+    save_data: bool = False,
+    untrained: bool = False,
+    config_file_path: str = None,
+    model: str = None,
+    tokenizer: str = None,
+) -> dict:
+    if config_file_path is None:
         try:
-            config_file_path = hf_hub_download(
-                repo_id=args.repo_id, filename="actual_config.yaml", use_auth_token=True
-            )
+            config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True)
         except Exception:
             config_file_path = "bsmetadata/hydra_configs/v2.yaml"
     repo_args = OmegaConf.load(config_file_path)
@@ -341,15 +292,17 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
 
     # Load model
     print("Loading model...")
-    if args.untrained:
-        model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
-    else:
-        model = AutoModelForCausalLM.from_pretrained(args.repo_id, subfolder=args.subfolder, use_auth_token=True)
-    model.eval().cuda() if not args.no_cuda else model.eval()
-
-    # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name)
-    tokenizer.pad_token = tokenizer.eos_token
+    if model is None or tokenizer is None:
+        if untrained:
+            model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
+            tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name)
+            tokenizer.pad_token = tokenizer.eos_token
+        else:
+            model = AutoModelForCausalLM.from_pretrained(repo_id, subfolder=subfolder, use_auth_token=True)
+            tokenizer = AutoTokenizer.from_pretrained(
+                "bs-modeling-metadata/checkpoints_all_04_23", subfolder="tokenizer", use_auth_token=True
+            )
+    model.eval().cuda() if not no_cuda else model.eval()
 
     # Config preprocess function
     cfg = data_config.metadata_config
@@ -358,7 +311,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
     cfg.metadata_list.append("entity")
     cfg.metadata_list.append("paragraph")
 
-    if args.prompt:
+    if prompt:
         cfg.metadata_sep = "; "  # Instead of " | "
         cfg.metadata_prefix_sep = ""  # Instead of " |||"; there's already an implicit " "
         DatasourceProcessor.process_global = datasource_process_global_for_prompt
@@ -381,8 +334,8 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
         "bs-modeling-metadata/c4-en-html-with-validation_metadata_url",
         "bs-modeling-metadata/c4-en-html-with-validation_metadata_paragraph",
     ]
-    dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in args.metadata_to_test.split(",")]
-
+    dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in metadata_to_test.split(",")]
+    results = {}
     for path in dataset_paths:
         n_examples = 0
         total_normal_len = []
@@ -394,11 +347,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
         # Load validation dataset from hugging face
         metadata_type = path.split("_metadata_")[1]
         print(f"Loading {metadata_type} data...")
-        split = "validation" if not args.test else "validation[:10]"
+        split = "validation" if not test else "validation[:10]"
         validation_dataset = load_dataset(path, use_auth_token=True, split=split)
 
         data = []
-        max_n_examples_ord = len(str(args.max_n_examples))
+        max_n_examples_ord = len(str(max_n_examples))
         for idx, example in tqdm(enumerate(validation_dataset), desc=f"Calculating perplexity for {metadata_type}..."):
             # for idx in [136,]:
             example = validation_dataset[idx]
@@ -409,7 +362,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
             except Exception as e:
                 # Write error to output file and continue with next dataset
                 print(e)
-                with open(args.output_file, "a", encoding="utf8") as f:
+                with open(output_file, "a", encoding="utf8") as f:
                     f.write(f"=== RESULT [{metadata_type}] ===\n")
                     f.write(f"{e}\n\n")
                 exit_flag = True
@@ -445,7 +398,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
                 normal_batch = default_data_collator([normal_example])
                 metadata_example["labels"] = metadata_example["input_ids"]
                 metadata_batch = default_data_collator([metadata_example])
-                if not args.no_cuda:
+                if not no_cuda:
                     normal_batch = {k: v.cuda() for k, v in normal_batch.items()}
                     metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()}
                 if n_examples == 1:
@@ -461,13 +414,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
                     # rich.print(tokenizer.decode(metadata_batch["input_ids"][0]))
 
                 # Calculate nll (natural-log loss)
-                normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=args.save_data, idx=idx)  # [0]
+                normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=save_data, idx=idx)  # [0]
                 # print("PPL")
                 # print(normal_ppl)
                 total_normal_nll.append(normal_nll)  # * normal_example_len
-                metadata_nll, metadata_example_len = get_mean_loss(
-                    metadata_batch, save_data=args.save_data, idx=idx
-                )  # [0]
+                metadata_nll, metadata_example_len = get_mean_loss(metadata_batch, save_data=save_data, idx=idx)  # [0]
                 # print(metadata_ppl)
                 total_metadata_nll.append(metadata_nll)  # * metadata_example_len
 
@@ -521,7 +472,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
 
                     # sys.exit()
 
-                if n_examples > args.max_n_examples:
+                if n_examples > max_n_examples:
                     break
 
         if exit_flag:
@@ -554,9 +505,86 @@ def ppl(examples_mean_loss, examples_len):
         else:
             final_metadata_ppl = final_normal_ppl = 0
 
-        # Write results to output file
-        with open(args.output_file, "a", encoding="utf8") as f:
-            f.write(f"=== RESULT [{metadata_type}] ===\n")
-            f.write("Perplexity (metadata): {:>6,.3f}\n".format(final_metadata_ppl))
-            f.write("Perplexity (normal):   {:>6,.3f}\n\n".format(final_normal_ppl))
+        results[metadata_type] = {"final_normal_ppl": final_normal_ppl, "final_metadata_ppl": final_metadata_ppl}
         torch.save(data, "eva.data")
+    return results
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--repo_id",
+        type=str,
+        default="bs-modeling-metadata/checkpoints_all_04_23",
+        help="Repository ID for the model to compute perplexity for",
+    )
+    parser.add_argument(
+        "--subfolder",
+        type=str,
+        default="checkpoint-2000step",
+        help="subfolder in the respository with the specific checkpoint to evaluate perplexity for",
+    )
+    parser.add_argument(
+        "--config_file_path",
+        type=str,
+        help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml",
+    )
+    parser.add_argument(
+        "--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to"
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU")
+    parser.add_argument(
+        "--save_data",
+        action="store_true",
+        help="If set to true, save tokens & losses",
+    )
+    parser.add_argument(
+        "--test",
+        action="store_true",
+        help="If set to true, the script runs in test mode and only takes 10 examples per dataset",
+    )
+    parser.add_argument(
+        "--max_n_examples",
+        type=int,
+        default=1500,
+        help="how many examples per metadata type to evaluate",
+    )
+    parser.add_argument(
+        "--metadata_to_test",
+        type=str,
+        default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph",
+        help="metadata types to test",
+    )
+    parser.add_argument(
+        "--untrained",
+        action="store_true",
+        help="If set to true, will load gpt2-xl",
+    )
+    parser.add_argument(
+        "--prompt",
+        action="store_true",
+        help="If set to true, the script evaluates metadata in prompt style",
+    )
+
+    args = parser.parse_args()
+    print(f"Parameters: {args}")
+    # Pass by keyword: evaluate_main's parameter order differs from the order used here.
+    results = evaluate_main(
+        repo_id=args.repo_id,
+        subfolder=args.subfolder,
+        config_file_path=args.config_file_path,
+        output_file=args.output_file,
+        save_data=args.save_data,
+        test=args.test,
+        max_n_examples=args.max_n_examples,
+        metadata_to_test=args.metadata_to_test,
+        untrained=args.untrained,
+        prompt=args.prompt,
+        no_cuda=args.no_cuda,
+    )
+    # Write results to output file
+    with open(args.output_file, "a", encoding="utf8") as f:
+        for k, v in results.items():
+            f.write(f"=== RESULT [{k}] ===\n")
+            f.write("Perplexity (metadata): {:>6,.3f}\n".format(v["final_metadata_ppl"]))
+            f.write("Perplexity (normal):   {:>6,.3f}\n\n".format(v["final_normal_ppl"]))
diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py
index e4ebdd28..779bcea4 100644
--- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py
+++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py
@@ -54,11 +54,7 @@ def from_json_string(t):
 
         examples = {k: [v] for k, v in example.items()}
         metadata_type_sample_weights = data_config.metadata_config.random_sample_metadata_weights
-        examples = random_sample_metadata_v2(
-            examples,
-            metadata_type_sample_weights=metadata_type_sample_weights,
-            html_overall_sample_rate=data_config.metadata_config.html_overall_sample_rate,
-        )
+        examples = random_sample_metadata_v2(examples, metadata_type_sample_weights=metadata_type_sample_weights)
         # example = {k: v[0] for k, v in examples.items()}
 
         result = add_metadata_and_chunk_examples(examples, tokenizer, data_config.metadata_config)
@@ -87,7 +83,7 @@ def filter_empty(t):
     return data
 
 
-def get_dataloader(*, tokenizer, args, num_gpus, gpu_id):
+def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True):
     """returns a tensorflow dataloader"""
     data_config = args
     local_dir = Path(data_config.dataset_name)
@@ -99,19 +95,28 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id):
     file_paths = list(Path(local_dir).glob(data_config.train_file))
     assert len(file_paths) > 0, f"no files found for {data_config.train_file}"
 
+
     files_with_entities = [x for x in file_paths if x.name in data_files_with_entities]
     files_without_entities = [x for x in file_paths if x.name not in data_files_with_entities]
     print(f"{len(files_with_entities)} files with entities")
     print(f"{len(files_without_entities)} files without entities")
 
+    if train:
+        files_with_entities = [x for x in files_with_entities if
+                               'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' not in x.name]
+    else:
+        files_with_entities = [x for x in files_with_entities if
+                               'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' in x.name]
+
     data_with_entities = get_dataset(files_with_entities, num_gpus, gpu_id, data_config, tokenizer)
-    data_without_entities = get_dataset(files_without_entities, num_gpus, gpu_id, data_config, tokenizer)
+
+
+
     data = tf.data.Dataset.sample_from_datasets(
-        [data_with_entities, data_without_entities],
-        weights=[float(len(files_with_entities)), float(len(files_without_entities))],
+        [data_with_entities],
+        weights=[float(len(files_with_entities))],
         seed=42,
     )
-
     data = data.shuffle(1000, reshuffle_each_iteration=True)
     data = data.batch(data_config.per_device_train_batch_size)
     data = data.prefetch(tf.data.AUTOTUNE)
@@ -137,4 +142,4 @@ def get_dummy_dataloader(batch_size):
         shuffle=True,
         num_workers=0,
         pin_memory=True,
-    )
+    )
\ No newline at end of file
diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml
index 42a0044a..df1f27d6 100644
--- a/bsmetadata/hydra_configs/v2.yaml
+++ b/bsmetadata/hydra_configs/v2.yaml
@@ -75,11 +75,12 @@ data_config:
     local_metadata_special_token_end:
         entity_paragraph: " </ENTITY_CHAIN> "
     local_metadata_special_token_state: true
-    html_overall_sample_rate: 0.25
+    html_overall_sample_rate: 1
     without_metadata_same_context: false
+    use_full_evaluation_for_val: false
   experiment: with_metadata_datasetv2_tf
-  per_device_eval_batch_size: 8
-  per_device_train_batch_size: 8
+  per_device_eval_batch_size: 64 # 32 for 40gb
+  per_device_train_batch_size: 64
   dataset_name: bs-modeling-metadata/c4-en-html-with-training_metadata_all
   dataset_config_name: null
   train_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz
@@ -87,12 +88,12 @@ data_config:
   overwrite_cache: false
   cache_dir: null
   extension: null
-  preprocessing_num_workers: 6
+  preprocessing_num_workers: 40
   validation_split_percentage: 5
   block_size: null
   map_batch_size: 1
 weight_decay: 0.01
-learning_rate: 5e-5
+learning_rate: 0.0001
 num_train_epochs: 1
 max_train_steps: 100000
 lr_scheduler_type: linear
@@ -103,16 +104,16 @@ model_name: gpt2
 project_name: metadata_lm
 jobid: ''
 start_with_eval: false
-extra_steps_to_eval_save_at:
-- 2
+#extra_steps_to_eval_save_at:
+#- 2
 evaluation_strategy: STEPS
 eval_num_per_epoch: 3
-eval_steps: 2000
+eval_steps: 250
 save_strategy: STEPS
 save_num_per_epoch: 3
-save_steps: 150
+save_steps: 250
 do_train: true
 do_eval: true
 gradient_checkpointing: true
 resume_from_checkpoint_dir: null
-gradient_accumulation_steps: 16
+gradient_accumulation_steps: 1
diff --git a/bsmetadata/train.py b/bsmetadata/train.py
index d97853b6..c3c8552c 100644
--- a/bsmetadata/train.py
+++ b/bsmetadata/train.py
@@ -16,11 +16,11 @@
 import wandb
 from accelerate import Accelerator
 from accelerate.utils import DistributedType, DummyOptim, DummyScheduler
+from evaluation import evaluate_main
 from hydra.core.config_store import ConfigStore
 from omegaconf import OmegaConf
-from torch.optim import AdamW
 from tqdm.auto import tqdm as original_tqdm
-from transformers import AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed
+from transformers import AdamW, AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed
 from transformers.trainer_utils import IntervalStrategy
 
 from bsmetadata.input_pipeline import DataConfig, get_dataloaders
@@ -89,6 +89,9 @@ class CFG:
     gradient_checkpointing: bool = field(
         default=False, metadata={"help": "Whether to use gradient_checkpointing to save memory."}
     )
+    use_full_evaluation_for_val: bool = field(
+        default=False, metadata={"help": "Whether to use full evaluation for val"}
+    )
 
 
 cs = ConfigStore.instance()
@@ -217,8 +220,8 @@ def main(args: CFG) -> None:
     is_local_main_process = accelerator.is_local_main_process
     tqdm = partial(original_tqdm, disable=not is_local_main_process, position=0)
     use_deepspeed = accelerator.state.deepspeed_plugin is not None
-    use_deepspeed_optimzer = use_deepspeed and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
-    use_deepspeed_scheduler = use_deepspeed and "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config
+    use_deepspeed_optimzer = use_deepspeed and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
+    use_deepspeed_scheduler = use_deepspeed and "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config
 
     if accelerator.distributed_type == DistributedType.DEEPSPEED and not use_deepspeed_scheduler:
         assert False, "Please set scheduler in DeepSpeed config file otherwise it may not be checkpointed properly"
@@ -294,7 +297,13 @@ def main(args: CFG) -> None:
             gpu_id=accelerator.process_index,
         )
         dummy_dataloader = get_dummy_dataloader(args.data_config.per_device_train_batch_size)
-        eval_dataloaders = dict()
+        eval_dataloader, format_fn_eval = get_dataloader(
+            tokenizer=tokenizer,
+            args=args.data_config,
+            num_gpus=accelerator.num_processes,
+            gpu_id=accelerator.process_index,
+            train=False,
+        )
         model, optimizer, dummy_dataloader, scheduler = accelerator.prepare(
             model, optimizer, dummy_dataloader, scheduler
         )
@@ -348,7 +357,7 @@ def format_fn(x):
         save_per_n_step = args.max_train_steps + 1  # will never eval
 
     @torch.no_grad()
-    def evaluate(eval_dataloader):
+    def evaluate(eval_dataloader, only_first_n_steps=120):
         model.eval()
         losses = []
         for step, batch in enumerate(tqdm(eval_dataloader, desc="eval")):  # , leave=False)
@@ -359,7 +368,8 @@ def evaluate(eval_dataloader):
             loss = loss_fn(batch, outputs, metadata_mask)
 
             losses.append(accelerator.gather(loss.repeat(args.data_config.per_device_eval_batch_size)))
-
+            if step + 1 >= only_first_n_steps:  # stop after exactly only_first_n_steps batches
+                break
         model.train()
         if not losses:
             # in case the dataloader is empty
@@ -368,12 +378,21 @@ def evaluate(eval_dataloader):
         perplexity = math.exp(torch.mean(losses))
         return {"perplexity": perplexity}
 
-    def evaluate_multiple_dateloaders(eval_dataloaders):
-        for key, eval_dataloader in eval_dataloaders.items():
-            logger.info(f"Evaluating split {key}")
-            metrics = evaluate(eval_dataloader)
-            metrics_logger.log({key: metrics})
-        logger.info("Evaluation finished")
+    def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val):
+        if use_full_evaluation_for_val:
+            results = evaluate_main(
+                output_file="eval.txt",
+                metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp",
+            )
+            for k, v in results.items():
+                metrics_logger.log({k: v})
+            logger.info("Evaluation finished")
+        else:
+            for key, eval_dataloader in eval_dataloaders.items():
+                logger.info(f"Evaluating split {key}")
+                metrics = evaluate(eval_dataloader)
+                metrics_logger.log({key: metrics})
+            logger.info("Evaluation finished")
 
     if not args.do_train and not args.do_eval:
         return
@@ -384,7 +403,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders):
     do_eval = args.do_eval and args.start_with_eval
     if do_eval:
         logger.info("Start with an evaluation")
-        evaluate_multiple_dateloaders(eval_dataloaders)
+        evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val)
 
     if not args.do_train:
         return
@@ -406,7 +425,7 @@ def save(path):
             model.save_checkpoint(path)
         else:
             accelerator.save_state(path)
-        save_model_and_tokenizer(accelerator, model, path)
+        save_model_and_tokenizer(accelerator, model, path, tokenizer=tokenizer)
         if is_local_main_process:
             train_state.save(path / "train_state.json")
 
@@ -426,6 +445,17 @@ def get_data_iter():
                     batch = {k: v.to(accelerator.device) for k, v in batch.items()}
                 yield batch
 
+    def get_eval_data_iter():
+        while True:
+            for batch in eval_dataloader:
+                batch = format_fn_eval(batch)
+                if args.data_config.experiment == "with_metadata_datasetv2_tf":
+                    batch = {k: v.to(accelerator.device) for k, v in batch.items()}
+                yield batch
+
+    eval_iter = get_eval_data_iter()
+    eval_dataloaders = {"validation": eval_iter}
+
     data_iter = get_data_iter()
 
     for _ in tqdm(
@@ -461,11 +491,18 @@ def get_data_iter():
                 optimizer.zero_grad()
 
             step_loss_gathered = accelerator.gather(step_loss).mean().item()
-            metrics = {
-                "loss": step_loss_gathered,
-                "lr": max(scheduler.get_lr()),
-                "gradient_step": train_state.completed_steps,
-            }
+            if step < 20:
+                metrics = {
+                    "loss": step_loss_gathered,
+                    "lr": 0,
+                    "gradient_step": train_state.completed_steps,
+                }
+            else:
+                metrics = {
+                    "loss": step_loss_gathered,
+                    "lr": max(scheduler.get_last_lr()),
+                    "gradient_step": train_state.completed_steps,
+                }
             if not args.data_config.streaming:
                 metrics["epoch"] = step / len(train_dataloader)
 
@@ -488,7 +525,7 @@ def get_data_iter():
             path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step"
             save(path)
         if do_eval:
-            evaluate_multiple_dateloaders(eval_dataloaders)
+            evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val)
 
         if completed_steps >= args.max_train_steps:
             # finished = True

From 6c4b6a7a7b19cc656546d03d4f8d8f0058c2930d Mon Sep 17 00:00:00 2001
From: jordiclive <jordiclive19@imperial.ac.uk>
Date: Wed, 17 May 2023 13:44:52 +0100
Subject: [PATCH 2/6] restore html_overall_sample_rate argument; dedupe "Evaluation finished" log

---
 bsmetadata/experiments/with_metadata_datasetv2_tf.py | 6 +++++-
 bsmetadata/train.py                                  | 3 +--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py
index 779bcea4..b635a4ee 100644
--- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py
+++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py
@@ -54,7 +54,11 @@ def from_json_string(t):
 
         examples = {k: [v] for k, v in example.items()}
         metadata_type_sample_weights = data_config.metadata_config.random_sample_metadata_weights
-        examples = random_sample_metadata_v2(examples, metadata_type_sample_weights=metadata_type_sample_weights)
+        examples = random_sample_metadata_v2(
+            examples,
+            metadata_type_sample_weights=metadata_type_sample_weights,
+            html_overall_sample_rate=data_config.metadata_config.html_overall_sample_rate,
+        )
         # example = {k: v[0] for k, v in examples.items()}
 
         result = add_metadata_and_chunk_examples(examples, tokenizer, data_config.metadata_config)
diff --git a/bsmetadata/train.py b/bsmetadata/train.py
index c3c8552c..4d00edec 100644
--- a/bsmetadata/train.py
+++ b/bsmetadata/train.py
@@ -386,13 +386,12 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val)
             )
             for k, v in results.items():
                 metrics_logger.log({k: v})
-            logger.info("Evaluation finished")
         else:
             for key, eval_dataloader in eval_dataloaders.items():
                 logger.info(f"Evaluating split {key}")
                 metrics = evaluate(eval_dataloader)
                 metrics_logger.log({key: metrics})
-            logger.info("Evaluation finished")
+        logger.info("Evaluation finished")
 
     if not args.do_train and not args.do_eval:
         return

From 98598ba6ea0d23d8ce6baf0d8a590c25c522ebfa Mon Sep 17 00:00:00 2001
From: jordiclive <jordiclive19@imperial.ac.uk>
Date: Wed, 17 May 2023 13:51:24 +0100
Subject: [PATCH 3/6] fix format

---
 .../experiments/with_metadata_datasetv2_tf.py   | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py
index b635a4ee..1f45ad4a 100644
--- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py
+++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py
@@ -87,7 +87,7 @@ def filter_empty(t):
     return data
 
 
-def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True):
+def get_dataloader(*, tokenizer, args, num_gpus, gpu_id, train=True):
     """returns a tensorflow dataloader"""
     data_config = args
     local_dir = Path(data_config.dataset_name)
@@ -99,23 +99,22 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True):
     file_paths = list(Path(local_dir).glob(data_config.train_file))
     assert len(file_paths) > 0, f"no files found for {data_config.train_file}"
 
-
     files_with_entities = [x for x in file_paths if x.name in data_files_with_entities]
     files_without_entities = [x for x in file_paths if x.name not in data_files_with_entities]
     print(f"{len(files_with_entities)} files with entities")
     print(f"{len(files_without_entities)} files without entities")
 
     if train:
-        files_with_entities = [x for x in files_with_entities if
-                               'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' not in x.name]
+        files_with_entities = [
+            x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" not in x.name
+        ]
     else:
-        files_with_entities = [x for x in files_with_entities if
-                               'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' in x.name]
+        files_with_entities = [
+            x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" in x.name
+        ]
 
     data_with_entities = get_dataset(files_with_entities, num_gpus, gpu_id, data_config, tokenizer)
 
-
-
     data = tf.data.Dataset.sample_from_datasets(
         [data_with_entities],
         weights=[float(len(files_with_entities))],
@@ -146,4 +145,4 @@ def get_dummy_dataloader(batch_size):
         shuffle=True,
         num_workers=0,
         pin_memory=True,
-    )
\ No newline at end of file
+    )

From f9af120bf445aa8421151f8e064f50a3d6722843 Mon Sep 17 00:00:00 2001
From: jordiclive <jordiclive19@imperial.ac.uk>
Date: Wed, 17 May 2023 17:40:22 +0100
Subject: [PATCH 4/6] fix evaluation and config for 40gb

---
 bsmetadata/deepspeed_configs/v2.json |  2 +-
 bsmetadata/evaluation.py             | 18 ++++++++++++++----
 bsmetadata/hydra_configs/v2.yaml     | 14 +++++++-------
 bsmetadata/train.py                  | 12 +++++++-----
 4 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/bsmetadata/deepspeed_configs/v2.json b/bsmetadata/deepspeed_configs/v2.json
index 35a24626..dca70cc9 100644
--- a/bsmetadata/deepspeed_configs/v2.json
+++ b/bsmetadata/deepspeed_configs/v2.json
@@ -39,7 +39,7 @@
     "contiguous_gradients": true,
     "cpu_offload": false
 },
-    "gradient_accumulation_steps": 1,
+    "gradient_accumulation_steps": 2,
     "gradient_clipping": "auto",
     "steps_per_print": 100,
     "train_batch_size": 512,
diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py
index 235e7263..86f2510c 100644
--- a/bsmetadata/evaluation.py
+++ b/bsmetadata/evaluation.py
@@ -180,6 +180,7 @@ def get_mean_loss(
     batch: Dict[str, torch.Tensor],
     save_data: bool = False,
     idx: int = None,
+    model=None,
 ) -> torch.Tensor:
     """Prepares the arguments for perplexity calculation and passes them to the perplexity function.
 
@@ -272,18 +273,20 @@ def evaluate_main(
     test: bool = False,
     max_n_examples: int = 1500,
     prompt: bool = False,
-    no_cuda: bool = False,
+    no_cuda: bool = True,
     save_data: bool = False,
     untrained: bool = False,
     config_file_path: str = None,
     model: str = None,
     tokenizer: str = None,
+    accelerator=None,
 ) -> dict:
     if config_file_path is None:
         try:
             config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True)
         except Exception:
             config_file_path = "bsmetadata/hydra_configs/v2.yaml"
+    config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml"
     repo_args = OmegaConf.load(config_file_path)
     data_config = repo_args.data_config
 
@@ -398,7 +401,10 @@ def evaluate_main(
                 normal_batch = default_data_collator([normal_example])
                 metadata_example["labels"] = metadata_example["input_ids"]
                 metadata_batch = default_data_collator([metadata_example])
-                if not no_cuda:
+                if accelerator is not None:
+                    normal_batch = {k: v.to(accelerator.device) for k, v in normal_batch.items()}
+                    metadata_batch = {k: v.to(accelerator.device) for k, v in metadata_batch.items()}
+                elif not no_cuda:
                     normal_batch = {k: v.cuda() for k, v in normal_batch.items()}
                     metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()}
                 if n_examples == 1:
@@ -414,11 +420,15 @@ def evaluate_main(
                     # rich.print(tokenizer.decode(metadata_batch["input_ids"][0]))
 
                 # Calculate nll (natural-log loss)
-                normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=save_data, idx=idx)  # [0]
+                normal_nll, normal_example_len = get_mean_loss(
+                    normal_batch, save_data=save_data, idx=idx, model=model
+                )  # [0]
                 # print("PPL")
                 # print(normal_ppl)
                 total_normal_nll.append(normal_nll)  # * normal_example_len
-                metadata_nll, metadata_example_len = get_mean_loss(metadata_batch, save_data=save_data, idx=idx)  # [0]
+                metadata_nll, metadata_example_len = get_mean_loss(
+                    metadata_batch, save_data=save_data, idx=idx, model=model
+                )  # [0]
                 # print(metadata_ppl)
                 total_metadata_nll.append(metadata_nll)  # * metadata_example_len
 
diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml
index df1f27d6..62786638 100644
--- a/bsmetadata/hydra_configs/v2.yaml
+++ b/bsmetadata/hydra_configs/v2.yaml
@@ -1,6 +1,7 @@
 data_config:
   streaming: True
   validation_size_max: 1024
+  use_full_evaluation_for_val: true
   metadata_config:
     random_sample_metadata: true
     random_sample_metadata_calculate_size: 16384
@@ -38,7 +39,7 @@ data_config:
     #- generation_length_sentence
     #- generation_length_text
     - entity_paragraph
-    local_metadata_special_tokens: 
+    local_metadata_special_tokens:
       entity_paragraph: "entity"
     metadata_sep: ' | '
     metadata_key_value_sep: ': '
@@ -77,10 +78,9 @@ data_config:
     local_metadata_special_token_state: true
     html_overall_sample_rate: 1
     without_metadata_same_context: false
-    use_full_evaluation_for_val: false
   experiment: with_metadata_datasetv2_tf
-  per_device_eval_batch_size: 64 # 32 for 40gb
-  per_device_train_batch_size: 64
+  per_device_eval_batch_size: 32 # 32 for 40gb
+  per_device_train_batch_size: 32
   dataset_name: bs-modeling-metadata/c4-en-html-with-training_metadata_all
   dataset_config_name: null
   train_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz
@@ -104,8 +104,8 @@ model_name: gpt2
 project_name: metadata_lm
 jobid: ''
 start_with_eval: false
-#extra_steps_to_eval_save_at:
-#- 2
+extra_steps_to_eval_save_at:
+- 2
 evaluation_strategy: STEPS
 eval_num_per_epoch: 3
 eval_steps: 250
@@ -116,4 +116,4 @@ do_train: true
 do_eval: true
 gradient_checkpointing: true
 resume_from_checkpoint_dir: null
-gradient_accumulation_steps: 1
+gradient_accumulation_steps: 2
\ No newline at end of file
diff --git a/bsmetadata/train.py b/bsmetadata/train.py
index 4d00edec..9c5a928e 100644
--- a/bsmetadata/train.py
+++ b/bsmetadata/train.py
@@ -89,9 +89,6 @@ class CFG:
     gradient_checkpointing: bool = field(
         default=False, metadata={"help": "Whether to use gradient_checkpointing to save memory."}
     )
-    use_full_evaluation_for_val: bool = field(
-        default=False, metadata={"help": "Whether to use full evaluation for val"}
-    )
 
 
 cs = ConfigStore.instance()
@@ -382,8 +379,13 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val)
         if use_full_evaluation_for_val:
             results = evaluate_main(
                 output_file="eval.txt",
+                # metadata_to_test="entity_paragraph",
                 metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp",
+                model=model,
+                tokenizer=tokenizer,
+                accelerator=accelerator,
             )
+            model.train()
             for k, v in results.items():
                 metrics_logger.log({k: v})
         else:
@@ -402,7 +404,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val)
     do_eval = args.do_eval and args.start_with_eval
     if do_eval:
         logger.info("Start with an evaluation")
-        evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val)
+        evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val)
 
     if not args.do_train:
         return
@@ -524,7 +526,7 @@ def get_eval_data_iter():
             path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step"
             save(path)
         if do_eval:
-            evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val)
+            evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val)
 
         if completed_steps >= args.max_train_steps:
             # finished = True

From 1e181af5bd09cb64f657a025c59b3c84f2a7c950 Mon Sep 17 00:00:00 2001
From: Jordan Clive <jordan.clive19@imperial.ac.uk>
Date: Thu, 18 May 2023 09:25:46 +0100
Subject: [PATCH 5/6] Update evaluation.py

Disable the hard-coded config_file_path override in evaluate_main (commented out).
---
 bsmetadata/evaluation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py
index 86f2510c..e1a7f54b 100644
--- a/bsmetadata/evaluation.py
+++ b/bsmetadata/evaluation.py
@@ -286,7 +286,7 @@ def evaluate_main(
             config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True)
         except Exception:
             config_file_path = "bsmetadata/hydra_configs/v2.yaml"
-    config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml"
+#     config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" need to add this path to PYTHONPATH
     repo_args = OmegaConf.load(config_file_path)
     data_config = repo_args.data_config
 

From efb6414314f66b2b0677fba0a610e2a28432bf6e Mon Sep 17 00:00:00 2001
From: jordiclive <jordiclive19@imperial.ac.uk>
Date: Thu, 18 May 2023 13:34:08 +0100
Subject: [PATCH 6/6] specify kwargs for non-training usage

---
 bsmetadata/evaluation.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py
index e1a7f54b..88ecd73d 100644
--- a/bsmetadata/evaluation.py
+++ b/bsmetadata/evaluation.py
@@ -579,17 +579,17 @@ def ppl(examples_mean_loss, examples_len):
     args = parser.parse_args()
     print(f"Parameters: {args}")
     results = evaluate_main(
-        args.repo_id,
-        args.subfolder,
-        args.config_file_path,
-        args.output_file,
-        args.save_data,
-        args.test,
-        args.max_n_examples,
-        args.metadata_to_test,
-        args.untrained,
-        args.prompt,
-        args.no_cuda,
+        repo_id=args.repo_id,
+        subfolder=args.subfolder,
+        config_file_path=args.config_file_path,
+        output_file=args.output_file,
+        save_data=args.save_data,
+        test=args.test,
+        max_n_examples=args.max_n_examples,
+        metadata_to_test=args.metadata_to_test,
+        untrained=args.untrained,
+        prompt=args.prompt,
+        no_cuda=args.no_cuda,
     )
     # Load config
     # Write results to output file