From 2dde85c21abd5f532574db33bdd53dfe4dd98857 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 13:42:36 +0100 Subject: [PATCH 1/6] add full evaluation into training loop. Other training changes for A100 node. --- bsmetadata/deepspeed_configs/v2.json | 22 +- bsmetadata/evaluation.py | 212 ++++++++++-------- .../experiments/with_metadata_datasetv2_tf.py | 27 ++- bsmetadata/hydra_configs/v2.yaml | 21 +- bsmetadata/train.py | 79 +++++-- 5 files changed, 216 insertions(+), 145 deletions(-) diff --git a/bsmetadata/deepspeed_configs/v2.json b/bsmetadata/deepspeed_configs/v2.json index 1d5c0311..35a24626 100644 --- a/bsmetadata/deepspeed_configs/v2.json +++ b/bsmetadata/deepspeed_configs/v2.json @@ -30,19 +30,19 @@ } }, "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": true - }, - "gradient_accumulation_steps": 16, + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": false +}, + "gradient_accumulation_steps": 1, "gradient_clipping": "auto", "steps_per_print": 100, - "train_batch_size": 256, + "train_batch_size": 512, "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } \ No newline at end of file diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index b75c2aa5..235e7263 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -264,73 +264,24 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: return cfg.metadata_sep.join(sorted_metadata) + cfg.metadata_prefix_sep if sorted_metadata else "" -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--repo_id", - type=str, - default="bs-modeling-metadata/checkpoints_all_04_23", - help="Repository ID for the model to compute perplexity for", - ) - parser.add_argument( - "--subfolder", - type=str, - default="checkpoint-2000step", - help="subfolder in the respository with the specific checkpoint to evaluate perplexity for", - ) - parser.add_argument( - "--config_file_path", - type=str, - help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml", - ) - parser.add_argument( - "--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to" - ) - parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU") - parser.add_argument( - "--save_data", - action="store_true", - help="If set to true, save tokens & losses", - ) - parser.add_argument( - "--test", - action="store_true", - help="If set to true, the script runs in test mode and only takes 10 examples per dataset", - ) - parser.add_argument( - "--max_n_examples", - type=int, - default=1500, - help="how many examples per metadata type to evaluate", - ) - parser.add_argument( - "--metadata_to_test", - type=str, - default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph", - help="metadata types to test", - ) - parser.add_argument( - "--untrained", - action="store_true", - help="If set to true, will load gpt2-xl", - ) - parser.add_argument( - "--prompt", - action="store_true", - help="If set to true, the script evaluates metadata in prompt style", - ) - - args = parser.parse_args() - print(f"Parameters: {args}") - - # Load config - if args.config_file_path: - config_file_path = args.config_file_path - else: +def evaluate_main( + metadata_to_test: str = "title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + output_file: str = "evaluation.txt", + repo_id: str = None, + subfolder: str = None, + test: bool = False, + max_n_examples: int = 1500, + prompt: bool = False, + no_cuda: bool = False, + save_data: bool = False, + untrained: bool = False, + config_file_path: str = None, + model: str = None, + tokenizer: str = None, +) -> dict: + if config_file_path is None: try: - config_file_path = hf_hub_download( - repo_id=args.repo_id, filename="actual_config.yaml", use_auth_token=True - ) + config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) except Exception: config_file_path = "bsmetadata/hydra_configs/v2.yaml" repo_args = OmegaConf.load(config_file_path) @@ -341,15 +292,17 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: # Load model print("Loading model...") - if args.untrained: - model = AutoModelForCausalLM.from_pretrained("gpt2-xl") - else: - model = AutoModelForCausalLM.from_pretrained(args.repo_id, subfolder=args.subfolder, use_auth_token=True) - model.eval().cuda() if not args.no_cuda else model.eval() - - # Load tokenizer - tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name) - tokenizer.pad_token = tokenizer.eos_token + if model is None or tokenizer is None: + if untrained: + model = AutoModelForCausalLM.from_pretrained("gpt2-xl") + tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name) + tokenizer.pad_token = tokenizer.eos_token + else: + model = AutoModelForCausalLM.from_pretrained(repo_id, subfolder=subfolder, use_auth_token=True) + tokenizer = AutoTokenizer.from_pretrained( + "bs-modeling-metadata/checkpoints_all_04_23", subfolder="tokenizer", use_auth_token=True + ) + model.eval().cuda() if not no_cuda else model.eval() # Config preprocess function cfg = data_config.metadata_config @@ -358,7 +311,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: cfg.metadata_list.append("entity") cfg.metadata_list.append("paragraph") - if args.prompt: + if prompt: cfg.metadata_sep = "; " # Instead of " | " cfg.metadata_prefix_sep = "" # Instead of " |||"; there's already an implicit " " DatasourceProcessor.process_global = datasource_process_global_for_prompt @@ -381,8 +334,8 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: "bs-modeling-metadata/c4-en-html-with-validation_metadata_url", "bs-modeling-metadata/c4-en-html-with-validation_metadata_paragraph", ] - dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in args.metadata_to_test.split(",")] - + dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in metadata_to_test.split(",")] + results = {} for path in dataset_paths: n_examples = 0 total_normal_len = [] @@ -394,11 +347,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: # Load validation dataset from hugging face metadata_type = path.split("_metadata_")[1] print(f"Loading {metadata_type} data...") - split = "validation" if not args.test else "validation[:10]" + split = "validation" if not test else "validation[:10]" validation_dataset = load_dataset(path, use_auth_token=True, split=split) data = [] - max_n_examples_ord = len(str(args.max_n_examples)) + max_n_examples_ord = len(str(max_n_examples)) for idx, example in tqdm(enumerate(validation_dataset), desc=f"Calculating perplexity for {metadata_type}..."): # for idx in [136,]: example = validation_dataset[idx] @@ -409,7 +362,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: except Exception as e: # Write error to output file and continue with next dataset print(e) - with open(args.output_file, "a", encoding="utf8") as f: + with open(output_file, "a", encoding="utf8") as f: f.write(f"=== RESULT [{metadata_type}] ===\n") f.write(f"{e}\n\n") exit_flag = True @@ -445,7 +398,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: normal_batch = default_data_collator([normal_example]) metadata_example["labels"] = metadata_example["input_ids"] metadata_batch = default_data_collator([metadata_example]) - if not args.no_cuda: + if not no_cuda: normal_batch = {k: v.cuda() for k, v in normal_batch.items()} metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()} if n_examples == 1: @@ -461,13 +414,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: # rich.print(tokenizer.decode(metadata_batch["input_ids"][0])) # Calculate nll (natural-log loss) - normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=args.save_data, idx=idx) # [0] + normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=save_data, idx=idx) # [0] # print("PPL") # print(normal_ppl) total_normal_nll.append(normal_nll) # * normal_example_len - metadata_nll, metadata_example_len = get_mean_loss( - metadata_batch, save_data=args.save_data, idx=idx - ) # [0] + metadata_nll, metadata_example_len = get_mean_loss(metadata_batch, save_data=save_data, idx=idx) # [0] # print(metadata_ppl) total_metadata_nll.append(metadata_nll) # * metadata_example_len @@ -521,7 +472,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: # sys.exit() - if n_examples > args.max_n_examples: + if n_examples > max_n_examples: break if exit_flag: @@ -554,9 +505,86 @@ def ppl(examples_mean_loss, examples_len): else: final_metadata_ppl = final_normal_ppl = 0 - # Write results to output file - with open(args.output_file, "a", encoding="utf8") as f: - f.write(f"=== RESULT [{metadata_type}] ===\n") - f.write("Perplexity (metadata): {:>6,.3f}\n".format(final_metadata_ppl)) - f.write("Perplexity (normal): {:>6,.3f}\n\n".format(final_normal_ppl)) + results[metadata_type] = {"final_normal_ppl": final_normal_ppl, "final_metadata_ppl": final_metadata_ppl} torch.save(data, "eva.data") + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--repo_id", + type=str, + default="bs-modeling-metadata/checkpoints_all_04_23", + help="Repository ID for the model to compute perplexity for", + ) + parser.add_argument( + "--subfolder", + type=str, + default="checkpoint-2000step", + help="subfolder in the respository with the specific checkpoint to evaluate perplexity for", + ) + parser.add_argument( + "--config_file_path", + type=str, + help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml", + ) + parser.add_argument( + "--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to" + ) + parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU") + parser.add_argument( + "--save_data", + action="store_true", + help="If set to true, save tokens & losses", + ) + parser.add_argument( + "--test", + action="store_true", + help="If set to true, the script runs in test mode and only takes 10 examples per dataset", + ) + parser.add_argument( + "--max_n_examples", + type=int, + default=1500, + help="how many examples per metadata type to evaluate", + ) + parser.add_argument( + "--metadata_to_test", + type=str, + default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph", + help="metadata types to test", + ) + parser.add_argument( + "--untrained", + action="store_true", + help="If set to true, will load gpt2-xl", + ) + parser.add_argument( + "--prompt", + action="store_true", + help="If set to true, the script evaluates metadata in prompt style", + ) + + args = parser.parse_args() + print(f"Parameters: {args}") + results = evaluate_main( + args.repo_id, + args.subfolder, + args.config_file_path, + args.output_file, + args.save_data, + args.test, + args.max_n_examples, + args.metadata_to_test, + args.untrained, + args.prompt, + args.no_cuda, + ) + # Load config + # Write results to output file + with open(args.output_file, "a", encoding="utf8") as f: + for k, v in results.items(): + f.write(f"=== RESULT [{k}] ===\n") + f.write("Perplexity (metadata): {:>6,.3f}\n".format(v["final_metadata_ppl"])) + f.write("Perplexity (normal): {:>6,.3f}\n\n".format(v["final_normal_ppl"])) diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py index e4ebdd28..779bcea4 100644 --- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py +++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py @@ -54,11 +54,7 @@ def from_json_string(t): examples = {k: [v] for k, v in example.items()} metadata_type_sample_weights = data_config.metadata_config.random_sample_metadata_weights - examples = random_sample_metadata_v2( - examples, - metadata_type_sample_weights=metadata_type_sample_weights, - html_overall_sample_rate=data_config.metadata_config.html_overall_sample_rate, - ) + examples = random_sample_metadata_v2(examples, metadata_type_sample_weights=metadata_type_sample_weights) # example = {k: v[0] for k, v in examples.items()} result = add_metadata_and_chunk_examples(examples, tokenizer, data_config.metadata_config) @@ -87,7 +83,7 @@ def filter_empty(t): return data -def get_dataloader(*, tokenizer, args, num_gpus, gpu_id): +def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True): """returns a tensorflow dataloader""" data_config = args local_dir = Path(data_config.dataset_name) @@ -99,19 +95,28 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id): file_paths = list(Path(local_dir).glob(data_config.train_file)) assert len(file_paths) > 0, f"no files found for {data_config.train_file}" + files_with_entities = [x for x in file_paths if x.name in data_files_with_entities] files_without_entities = [x for x in file_paths if x.name not in data_files_with_entities] print(f"{len(files_with_entities)} files with entities") print(f"{len(files_without_entities)} files without entities") + if train: + files_with_entities = [x for x in files_with_entities if + 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' not in x.name] + else: + files_with_entities = [x for x in files_with_entities if + 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' in x.name] + data_with_entities = get_dataset(files_with_entities, num_gpus, gpu_id, data_config, tokenizer) - data_without_entities = get_dataset(files_without_entities, num_gpus, gpu_id, data_config, tokenizer) + + + data = tf.data.Dataset.sample_from_datasets( - [data_with_entities, data_without_entities], - weights=[float(len(files_with_entities)), float(len(files_without_entities))], + [data_with_entities], + weights=[float(len(files_with_entities))], seed=42, ) - data = data.shuffle(1000, reshuffle_each_iteration=True) data = data.batch(data_config.per_device_train_batch_size) data = data.prefetch(tf.data.AUTOTUNE) @@ -137,4 +142,4 @@ def get_dummy_dataloader(batch_size): shuffle=True, num_workers=0, pin_memory=True, - ) + ) \ No newline at end of file diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 42a0044a..df1f27d6 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -75,11 +75,12 @@ data_config: local_metadata_special_token_end: entity_paragraph: " " local_metadata_special_token_state: true - html_overall_sample_rate: 0.25 + html_overall_sample_rate: 1 without_metadata_same_context: false + use_full_evaluation_for_val: false experiment: with_metadata_datasetv2_tf - per_device_eval_batch_size: 8 - per_device_train_batch_size: 8 + per_device_eval_batch_size: 64 # 32 for 40gb + per_device_train_batch_size: 64 dataset_name: bs-modeling-metadata/c4-en-html-with-training_metadata_all dataset_config_name: null train_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz @@ -87,12 +88,12 @@ data_config: overwrite_cache: false cache_dir: null extension: null - preprocessing_num_workers: 6 + preprocessing_num_workers: 40 validation_split_percentage: 5 block_size: null map_batch_size: 1 weight_decay: 0.01 -learning_rate: 5e-5 +learning_rate: 0.0001 num_train_epochs: 1 max_train_steps: 100000 lr_scheduler_type: linear @@ -103,16 +104,16 @@ model_name: gpt2 project_name: metadata_lm jobid: '' start_with_eval: false -extra_steps_to_eval_save_at: -- 2 +#extra_steps_to_eval_save_at: +#- 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 -eval_steps: 2000 +eval_steps: 250 save_strategy: STEPS save_num_per_epoch: 3 -save_steps: 150 +save_steps: 250 do_train: true do_eval: true gradient_checkpointing: true resume_from_checkpoint_dir: null -gradient_accumulation_steps: 16 +gradient_accumulation_steps: 1 diff --git a/bsmetadata/train.py b/bsmetadata/train.py index d97853b6..c3c8552c 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -16,11 +16,11 @@ import wandb from accelerate import Accelerator from accelerate.utils import DistributedType, DummyOptim, DummyScheduler +from evaluation import evaluate_main from hydra.core.config_store import ConfigStore from omegaconf import OmegaConf -from torch.optim import AdamW from tqdm.auto import tqdm as original_tqdm -from transformers import AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed +from transformers import AdamW, AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed from transformers.trainer_utils import IntervalStrategy from bsmetadata.input_pipeline import DataConfig, get_dataloaders @@ -89,6 +89,9 @@ class CFG: gradient_checkpointing: bool = field( default=False, metadata={"help": "Whether to use gradient_checkpointing to save memory."} ) + use_full_evaluation_for_val: bool = field( + default=False, metadata={"help": "Whether to use full evaluation for val"} + ) cs = ConfigStore.instance() @@ -217,8 +220,8 @@ def main(args: CFG) -> None: is_local_main_process = accelerator.is_local_main_process tqdm = partial(original_tqdm, disable=not is_local_main_process, position=0) use_deepspeed = accelerator.state.deepspeed_plugin is not None - use_deepspeed_optimzer = use_deepspeed and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config - use_deepspeed_scheduler = use_deepspeed and "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config + use_deepspeed_optimzer = use_deepspeed or "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config + use_deepspeed_scheduler = use_deepspeed or "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config if accelerator.distributed_type == DistributedType.DEEPSPEED and not use_deepspeed_scheduler: assert False, "Please set scheduler in DeepSpeed config file otherwise it may not be checkpointed properly" @@ -294,7 +297,13 @@ def main(args: CFG) -> None: gpu_id=accelerator.process_index, ) dummy_dataloader = get_dummy_dataloader(args.data_config.per_device_train_batch_size) - eval_dataloaders = dict() + eval_dataloader, format_fn_eval = get_dataloader( + tokenizer=tokenizer, + args=args.data_config, + num_gpus=accelerator.num_processes, + gpu_id=accelerator.process_index, + train=False, + ) model, optimizer, dummy_dataloader, scheduler = accelerator.prepare( model, optimizer, dummy_dataloader, scheduler ) @@ -348,7 +357,7 @@ def format_fn(x): save_per_n_step = args.max_train_steps + 1 # will never eval @torch.no_grad() - def evaluate(eval_dataloader): + def evaluate(eval_dataloader, only_first_n_steps=120): model.eval() losses = [] for step, batch in enumerate(tqdm(eval_dataloader, desc="eval")): # , leave=False) @@ -359,7 +368,8 @@ def evaluate(eval_dataloader): loss = loss_fn(batch, outputs, metadata_mask) losses.append(accelerator.gather(loss.repeat(args.data_config.per_device_eval_batch_size))) - + if step == only_first_n_steps: + break model.train() if not losses: # in case the dataloader is empty @@ -368,12 +378,21 @@ def evaluate(eval_dataloader): perplexity = math.exp(torch.mean(losses)) return {"perplexity": perplexity} - def evaluate_multiple_dateloaders(eval_dataloaders): - for key, eval_dataloader in eval_dataloaders.items(): - logger.info(f"Evaluating split {key}") - metrics = evaluate(eval_dataloader) - metrics_logger.log({key: metrics}) - logger.info("Evaluation finished") + def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val): + if use_full_evaluation_for_val: + results = evaluate_main( + output_file="eval.txt", + metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + ) + for k, v in results.items(): + metrics_logger.log({k: v}) + logger.info("Evaluation finished") + else: + for key, eval_dataloader in eval_dataloaders.items(): + logger.info(f"Evaluating split {key}") + metrics = evaluate(eval_dataloader) + metrics_logger.log({key: metrics}) + logger.info("Evaluation finished") if not args.do_train and not args.do_eval: return @@ -384,7 +403,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders): do_eval = args.do_eval and args.start_with_eval if do_eval: logger.info("Start with an evaluation") - evaluate_multiple_dateloaders(eval_dataloaders) + evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) if not args.do_train: return @@ -406,7 +425,7 @@ def save(path): model.save_checkpoint(path) else: accelerator.save_state(path) - save_model_and_tokenizer(accelerator, model, path) + save_model_and_tokenizer(accelerator, model, path, tokenizer=tokenizer) if is_local_main_process: train_state.save(path / "train_state.json") @@ -426,6 +445,17 @@ def get_data_iter(): batch = {k: v.to(accelerator.device) for k, v in batch.items()} yield batch + def get_eval_data_iter(): + while True: + for batch in eval_dataloader: + batch = format_fn_eval(batch) + if args.data_config.experiment == "with_metadata_datasetv2_tf": + batch = {k: v.to(accelerator.device) for k, v in batch.items()} + yield batch + + eval_iter = get_eval_data_iter() + eval_dataloaders = {"validation": eval_iter} + data_iter = get_data_iter() for _ in tqdm( @@ -461,11 +491,18 @@ def get_data_iter(): optimizer.zero_grad() step_loss_gathered = accelerator.gather(step_loss).mean().item() - metrics = { - "loss": step_loss_gathered, - "lr": max(scheduler.get_lr()), - "gradient_step": train_state.completed_steps, - } + if step < 20: + metrics = { + "loss": step_loss_gathered, + "lr": 0, + "gradient_step": train_state.completed_steps, + } + else: + metrics = { + "loss": step_loss_gathered, + "lr": max(scheduler.get_last_lr()), + "gradient_step": train_state.completed_steps, + } if not args.data_config.streaming: metrics["epoch"] = step / len(train_dataloader) @@ -488,7 +525,7 @@ def get_data_iter(): path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step" save(path) if do_eval: - evaluate_multiple_dateloaders(eval_dataloaders) + evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) if completed_steps >= args.max_train_steps: # finished = True From 6c4b6a7a7b19cc656546d03d4f8d8f0058c2930d Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 13:44:52 +0100 Subject: [PATCH 2/6] no message --- bsmetadata/experiments/with_metadata_datasetv2_tf.py | 6 +++++- bsmetadata/train.py | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py index 779bcea4..b635a4ee 100644 --- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py +++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py @@ -54,7 +54,11 @@ def from_json_string(t): examples = {k: [v] for k, v in example.items()} metadata_type_sample_weights = data_config.metadata_config.random_sample_metadata_weights - examples = random_sample_metadata_v2(examples, metadata_type_sample_weights=metadata_type_sample_weights) + examples = random_sample_metadata_v2( + examples, + metadata_type_sample_weights=metadata_type_sample_weights, + html_overall_sample_rate=data_config.metadata_config.html_overall_sample_rate, + ) # example = {k: v[0] for k, v in examples.items()} result = add_metadata_and_chunk_examples(examples, tokenizer, data_config.metadata_config) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index c3c8552c..4d00edec 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -386,13 +386,12 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) ) for k, v in results.items(): metrics_logger.log({k: v}) - logger.info("Evaluation finished") else: for key, eval_dataloader in eval_dataloaders.items(): logger.info(f"Evaluating split {key}") metrics = evaluate(eval_dataloader) metrics_logger.log({key: metrics}) - logger.info("Evaluation finished") + logger.info("Evaluation finished") if not args.do_train and not args.do_eval: return From 98598ba6ea0d23d8ce6baf0d8a590c25c522ebfa Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 13:51:24 +0100 Subject: [PATCH 3/6] fix format --- .../experiments/with_metadata_datasetv2_tf.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py index b635a4ee..1f45ad4a 100644 --- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py +++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py @@ -87,7 +87,7 @@ def filter_empty(t): return data -def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True): +def get_dataloader(*, tokenizer, args, num_gpus, gpu_id, train=True): """returns a tensorflow dataloader""" data_config = args local_dir = Path(data_config.dataset_name) @@ -99,23 +99,22 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True): file_paths = list(Path(local_dir).glob(data_config.train_file)) assert len(file_paths) > 0, f"no files found for {data_config.train_file}" - files_with_entities = [x for x in file_paths if x.name in data_files_with_entities] files_without_entities = [x for x in file_paths if x.name not in data_files_with_entities] print(f"{len(files_with_entities)} files with entities") print(f"{len(files_without_entities)} files without entities") if train: - files_with_entities = [x for x in files_with_entities if - 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' not in x.name] + files_with_entities = [ + x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" not in x.name + ] else: - files_with_entities = [x for x in files_with_entities if - 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' in x.name] + files_with_entities = [ + x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" in x.name + ] data_with_entities = get_dataset(files_with_entities, num_gpus, gpu_id, data_config, tokenizer) - - data = tf.data.Dataset.sample_from_datasets( [data_with_entities], weights=[float(len(files_with_entities))], @@ -146,4 +145,4 @@ def get_dummy_dataloader(batch_size): shuffle=True, num_workers=0, pin_memory=True, - ) \ No newline at end of file + ) From f9af120bf445aa8421151f8e064f50a3d6722843 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 17:40:22 +0100 Subject: [PATCH 4/6] fix evaluation and config for 40gb --- bsmetadata/deepspeed_configs/v2.json | 2 +- bsmetadata/evaluation.py | 18 ++++++++++++++---- bsmetadata/hydra_configs/v2.yaml | 14 +++++++------- bsmetadata/train.py | 12 +++++++----- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/bsmetadata/deepspeed_configs/v2.json b/bsmetadata/deepspeed_configs/v2.json index 35a24626..dca70cc9 100644 --- a/bsmetadata/deepspeed_configs/v2.json +++ b/bsmetadata/deepspeed_configs/v2.json @@ -39,7 +39,7 @@ "contiguous_gradients": true, "cpu_offload": false }, - "gradient_accumulation_steps": 1, + "gradient_accumulation_steps": 2, "gradient_clipping": "auto", "steps_per_print": 100, "train_batch_size": 512, diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index 235e7263..86f2510c 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -180,6 +180,7 @@ def get_mean_loss( batch: Dict[str, torch.Tensor], save_data: bool = False, idx: int = None, + model=None, ) -> torch.Tensor: """Prepares the arguments for perplexity calculation and passes them to the perplexity function. @@ -272,18 +273,20 @@ def evaluate_main( test: bool = False, max_n_examples: int = 1500, prompt: bool = False, - no_cuda: bool = False, + no_cuda: bool = True, save_data: bool = False, untrained: bool = False, config_file_path: str = None, model: str = None, tokenizer: str = None, + accelerator=None, ) -> dict: if config_file_path is None: try: config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) except Exception: config_file_path = "bsmetadata/hydra_configs/v2.yaml" + config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" repo_args = OmegaConf.load(config_file_path) data_config = repo_args.data_config @@ -398,7 +401,10 @@ def evaluate_main( normal_batch = default_data_collator([normal_example]) metadata_example["labels"] = metadata_example["input_ids"] metadata_batch = default_data_collator([metadata_example]) - if not no_cuda: + if accelerator is not None: + normal_batch = {k: v.to(accelerator.device) for k, v in normal_batch.items()} + metadata_batch = {k: v.to(accelerator.device) for k, v in metadata_batch.items()} + elif not no_cuda: normal_batch = {k: v.cuda() for k, v in normal_batch.items()} metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()} if n_examples == 1: @@ -414,11 +420,15 @@ def evaluate_main( # rich.print(tokenizer.decode(metadata_batch["input_ids"][0])) # Calculate nll (natural-log loss) - normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=save_data, idx=idx) # [0] + normal_nll, normal_example_len = get_mean_loss( + normal_batch, save_data=save_data, idx=idx, model=model + ) # [0] # print("PPL") # print(normal_ppl) total_normal_nll.append(normal_nll) # * normal_example_len - metadata_nll, metadata_example_len = get_mean_loss(metadata_batch, save_data=save_data, idx=idx) # [0] + metadata_nll, metadata_example_len = get_mean_loss( + metadata_batch, save_data=save_data, idx=idx, model=model + ) # [0] # print(metadata_ppl) total_metadata_nll.append(metadata_nll) # * metadata_example_len diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index df1f27d6..62786638 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -1,6 +1,7 @@ data_config: streaming: True validation_size_max: 1024 + use_full_evaluation_for_val: true metadata_config: random_sample_metadata: true random_sample_metadata_calculate_size: 16384 @@ -38,7 +39,7 @@ data_config: #- generation_length_sentence #- generation_length_text - entity_paragraph - local_metadata_special_tokens: + local_metadata_special_tokens: entity_paragraph: "entity" metadata_sep: ' | ' metadata_key_value_sep: ': ' @@ -77,10 +78,9 @@ data_config: local_metadata_special_token_state: true html_overall_sample_rate: 1 without_metadata_same_context: false - use_full_evaluation_for_val: false experiment: with_metadata_datasetv2_tf - per_device_eval_batch_size: 64 # 32 for 40gb - per_device_train_batch_size: 64 + per_device_eval_batch_size: 32 # 32 for 40gb + per_device_train_batch_size: 32 dataset_name: bs-modeling-metadata/c4-en-html-with-training_metadata_all dataset_config_name: null train_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz @@ -104,8 +104,8 @@ model_name: gpt2 project_name: metadata_lm jobid: '' start_with_eval: false -#extra_steps_to_eval_save_at: -#- 2 +extra_steps_to_eval_save_at: +- 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 eval_steps: 250 @@ -116,4 +116,4 @@ do_train: true do_eval: true gradient_checkpointing: true resume_from_checkpoint_dir: null -gradient_accumulation_steps: 1 +gradient_accumulation_steps: 2 \ No newline at end of file diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 4d00edec..9c5a928e 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -89,9 +89,6 @@ class CFG: gradient_checkpointing: bool = field( default=False, metadata={"help": "Whether to use gradient_checkpointing to save memory."} ) - use_full_evaluation_for_val: bool = field( - default=False, metadata={"help": "Whether to use full evaluation for val"} - ) cs = ConfigStore.instance() @@ -382,8 +379,13 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) if use_full_evaluation_for_val: results = evaluate_main( output_file="eval.txt", + # metadata_to_test="entity_paragraph", metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + model=model, + tokenizer=tokenizer, + accelerator=accelerator, ) + model.train() for k, v in results.items(): metrics_logger.log({k: v}) else: @@ -402,7 +404,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) do_eval = args.do_eval and args.start_with_eval if do_eval: logger.info("Start with an evaluation") - evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) + evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val) if not args.do_train: return @@ -524,7 +526,7 @@ def get_eval_data_iter(): path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step" save(path) if do_eval: - evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) + evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val) if completed_steps >= args.max_train_steps: # finished = True From 1e181af5bd09cb64f657a025c59b3c84f2a7c950 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Thu, 18 May 2023 09:25:46 +0100 Subject: [PATCH 5/6] Update evaluation.py remove hard coded --- bsmetadata/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index 86f2510c..e1a7f54b 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -286,7 +286,7 @@ def evaluate_main( config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) except Exception: config_file_path = "bsmetadata/hydra_configs/v2.yaml" - config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" +# config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" need to add this path to PYTHONPATH repo_args = OmegaConf.load(config_file_path) data_config = repo_args.data_config From efb6414314f66b2b0677fba0a610e2a28432bf6e Mon Sep 17 00:00:00 2001 From: jordiclive Date: Thu, 18 May 2023 13:34:08 +0100 Subject: [PATCH 6/6] specify kwargs for non training usage --- bsmetadata/evaluation.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index e1a7f54b..88ecd73d 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -579,17 +579,17 @@ def ppl(examples_mean_loss, examples_len): args = parser.parse_args() print(f"Parameters: {args}") results = evaluate_main( - args.repo_id, - args.subfolder, - args.config_file_path, - args.output_file, - args.save_data, - args.test, - args.max_n_examples, - args.metadata_to_test, - args.untrained, - args.prompt, - args.no_cuda, + repo_id=args.repo_id, + subfolder=args.subfolder, + config_file_path=args.config_file_path, + output_file=args.output_file, + save_data=args.save_data, + test=args.test, + max_n_examples=args.max_n_examples, + metadata_to_test=args.metadata_to_test, + untrained=args.untrained, + prompt=args.prompt, + no_cuda=args.no_cuda, ) # Load config # Write results to output file