diff --git a/readme.md b/readme.md
index c23f3c0..516f7b1 100644
--- a/readme.md
+++ b/readme.md
@@ -8,7 +8,7 @@ A powerful training toolkit for image generation models using Flow Matching tech
 - Flexible configuration via JSON
 - Multi-GPU training support with automatic device detection
 - Configurable inference during training
-- Wandb and Hugging Face integration
+- Aim and Hugging Face integration
 - Parameter efficient training with layer rotation and offloading
 
 ## Installation
@@ -70,10 +70,10 @@ The trainer is configured via a JSON file with the following sections:
   "trained_double_blocks": 2,
   "save_every": 6,
   "save_folder": "checkpoints",
-  "wandb_key": null,
-  "wandb_project": null,
-  "wandb_run": "chroma",
-  "wandb_entity": null,
+  "aim_path": "./training",
+  "aim_experiment_name": "base",
+  "aim_hash": null,
+  "aim_steps": 0,
   "hf_repo_id": null,
   "hf_token": null
 }
@@ -93,10 +93,10 @@ The trainer is configured via a JSON file with the following sections:
 | `trained_double_blocks` | Number of trainable transformer double blocks |
 | `save_every` | Save model checkpoint every X steps |
 | `save_folder` | Directory to save model checkpoints |
-| `wandb_key` | Weights & Biases API key (optional) |
-| `wandb_project` | Weights & Biases project name (optional) |
-| `wandb_run` | Weights & Biases run name (optional) |
-| `wandb_entity` | Weights & Biases entity name (optional) |
+| `aim_path` | Aim directory path (optional; on Windows, install Aim manually) |
+| `aim_experiment_name` | Aim experiment name (optional) |
+| `aim_hash` | Aim run hash; set to resume an existing run (optional) |
+| `aim_steps` | Aim step to resume logging from (optional) |
 | `hf_repo_id` | Hugging Face repository ID for pushing models (optional) |
 | `hf_token` | Hugging Face API token (optional) |
 
@@ -243,10 +243,10 @@ You can set up multiple inference configurations to test different settings duri
   "trained_double_blocks": 2,
   "save_every": 6,
   "save_folder": "testing",
-  "wandb_key": null,
-  "wandb_project": null,
-  "wandb_run": "chroma",
-  "wandb_entity": null,
+  "aim_path": "./training",
+  "aim_experiment_name": "base",
+  "aim_hash": null,
+  "aim_steps": 0,
   "hf_repo_id": null,
   "hf_token": null
 },
diff --git a/requirements.txt b/requirements.txt
index 7245b0f..69de9f4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,18 +1,22 @@
-torch==2.6.0
-torchvision==0.21.0
+torch>=2.6.0
+torchvision>=0.21.0
 numpy
 matplotlib
 tensorboard
 torch_tb_profiler
 tqdm
-wandb
 einops
+pillow-avif-plugin
 pillow-jxl-plugin
+imageio[pyav]
 transformers
 safetensors
 sentencepiece
-bitsandbytes==0.45.3
+bitsandbytes>=0.45.3
 torchastic
 torch-linear-assignment
 huggingface_hub
-scipy
\ No newline at end of file
+scipy
+aim; sys_platform != "win32"
+triton; sys_platform != "win32"
+triton-windows; sys_platform == "win32"
\ No newline at end of file
diff --git a/src/dataloaders/bucketing_logic.py b/src/dataloaders/bucketing_logic.py
index 1709348..d9249ce 100644
--- a/src/dataloaders/bucketing_logic.py
+++ b/src/dataloaders/bucketing_logic.py
@@ -15,7 +15,10 @@
 from .utils import read_jsonl
 
 
-csv.field_size_limit(sys.maxsize)
+try:
+    csv.field_size_limit(sys.maxsize)
+except OverflowError:
+    csv.field_size_limit(2147483647)
 
 
 log = logging.getLogger(__name__)
diff --git a/src/dataloaders/dataloader.py b/src/dataloaders/dataloader.py
index 60ade19..1de0464 100644
--- a/src/dataloaders/dataloader.py
+++ b/src/dataloaders/dataloader.py
@@ -7,13 +7,14 @@
 from tqdm import tqdm
 
 from PIL import Image
+import pillow_avif  # noqa: F401 (imported for its side effect: registers AVIF support in Pillow)
+import imageio.v3 as iio
 import torch
 from torch.utils.data import Dataset
 import torchvision.transforms.v2 as v2
 from io import BytesIO
 import concurrent.futures
 
-import requests
 from requests.exceptions import RequestException, Timeout
 
 from .utils import read_jsonl
@@ -70,7 +71,7 @@
         random.seed(seed)
         # just simple pil image to tensor conversion
         self.image_transforms = v2.Compose(
-            [v2.ToTensor(), v2.Normalize(mean=[0.5], std=[0.5])]
+            [v2.ToImage(), v2.ToDtype(torch.float32, scale=True)]
        )
 
         # TODO: batches has to be preprocessed for batching!!!!
@@ -78,7 +79,6 @@
         # slice batches using round robbin
         self._round_robin()
 
-        self.session = requests.Session()
         self.thread_per_worker = thread_per_worker
         # self.executor = concurrent.futures.ThreadPoolExecutor(thread_per_worker)
 
@@ -225,12 +225,11 @@ def _round_robin(self):
 
     #
 
-    def _load_image(self, sample, session, image_folder_path, timeout):
+    def _load_image(self, sample, image_folder_path, timeout):
         try:
+            img_array = None
             if sample["is_url_based"]:
-                response = session.get(sample["filename"], timeout=timeout)
-                response.raise_for_status()  # Raises an HTTPError if the status code is 4xx/5xx
-                return Image.open(BytesIO(response.content)).convert("RGB")
+                img_array = iio.imread(sample["filename"])
 
             else:
                 image_path = os.path.join(image_folder_path, sample["filename"])
@@ -243,18 +242,41 @@ def _load_image(self, sample, session, image_folder_path, timeout):
                 )
                elif os.path.exists(image_path):
                     # Standard handling if the specified file exists
-                    return Image.open(image_path).convert("RGB")
+                    img_array = iio.imread(image_path)
                else:
                     # Try alternative extensions if the main file doesn't exist
                     filename, _ = os.path.splitext(sample["filename"])
-                    extensions = ["png", "jpg", "jpeg", "webp"]
+                    extensions = ["png", "jpg", "jpeg", "webp", "bmp", "avif", "tif", "tiff"]
                     for ext in extensions:
                         alt_image_path = os.path.join(
                             image_folder_path, f"{filename}.{ext}"
                         )
                         if os.path.exists(alt_image_path):
-                            return Image.open(alt_image_path).convert("RGB")
-                    return None
+                            img_array = iio.imread(alt_image_path)
+                            break  # stop at the first matching extension
+            if img_array is None:
+                return None
+            else:
+                if img_array.ndim == 2:
+                    image = Image.fromarray(img_array, mode='L')
+                elif img_array.ndim in (3, 4):
+                    if img_array.ndim == 4:
+                        # When the image has a frame dimension, only the first frame is taken.
+                        img_array = img_array[0]
+                    height, width, channels = img_array.shape
+                    if channels == 3:  # RGB
+                        image = Image.fromarray(img_array, mode='RGB')
+                    elif channels == 4:  # RGBA
+                        image = Image.fromarray(img_array, mode='RGBA')
+                    else:
+                        raise ValueError(f"Unsupported number of channels: {channels}")
+                else:
+                    raise ValueError(f"Unsupported image shape: {img_array.shape}")
+
+            if image.mode != 'RGB':
+                image = image.convert('RGB')
+
+            return image
         except Exception as e:
             log.error(
                 f"An error occurred: {e} for {sample['filename']} on rank {self.rank}"
             )
@@ -274,7 +295,6 @@ def __getitem__(self, index):
                 executor.submit(
                     self._load_image,
                     sample,
-                    self.session,
                     self.image_folder_path,
                     self.timeout,
                 )
diff --git a/src/dataloaders/utils.py b/src/dataloaders/utils.py
index b2daaed..64ad3f8 100644
--- a/src/dataloaders/utils.py
+++ b/src/dataloaders/utils.py
@@ -5,7 +5,10 @@
 import random
 from tqdm import tqdm
 
-csv.field_size_limit(sys.maxsize)
+try:
+    csv.field_size_limit(sys.maxsize)
+except OverflowError:
+    csv.field_size_limit(2147483647)
 
 
 def save_as_jsonl(data, filename):
@@ -18,9 +21,9 @@
     if os.path.join(*os.path.split(filename)[:-1]) != "":
         os.makedirs(os.path.join(*os.path.split(filename)[:-1]), exist_ok=True)
 
-    with open(filename, "w") as f:
+    with open(filename, "w", encoding="utf-8") as f:
         for item in tqdm(data):
-            json.dump(item, f)
+            json.dump(item, f, ensure_ascii=False)
             f.write("\n")
 
 
@@ -35,7 +38,7 @@ def read_jsonl(filename):
     """
 
     data = []
-    with open(filename, "r") as f:
+    with open(filename, "r", encoding="utf-8") as f:
         for line in tqdm(f):
             data.append(json.loads(line))
     return data
diff --git a/src/general_utils.py b/src/general_utils.py
index d63b20f..736f565 100644
--- a/src/general_utils.py
+++ b/src/general_utils.py
@@ -77,7 +77,7 @@ def save_file_multipart(
     index["metadata"].update(metadata)
 
     with open(os.path.join(base_folder, "model.safetensors.index.json"), "w") as f:
-        json.dump(index, f, indent=2)
+        json.dump(index, f, indent=2, ensure_ascii=False)
 
     return num_shards
 
diff --git a/src/math_utils.py b/src/math_utils.py
index 2b16228..3af1cb7 100644
--- a/src/math_utils.py
+++ b/src/math_utils.py
@@ -42,8 +42,12 @@ def _cuda_assignment(C):
     from torch_linear_assignment import batch_linear_assignment
     from torch_linear_assignment import assignment_to_indices
 
-    assignment = batch_linear_assignment(C.unsqueeze(dim=0))
+    original_device = C.device
+    C_cpu = C.cpu().to(torch.float32).unsqueeze(dim=0)
+    assignment = batch_linear_assignment(C_cpu)
     row_indices, col_indices = assignment_to_indices(assignment)
+    row_indices = row_indices.to(original_device)
+    col_indices = col_indices.to(original_device)
     matching_pairs = (row_indices, col_indices)
     return C, matching_pairs
 
diff --git a/src/models/chroma/model.py b/src/models/chroma/model.py
index c13bf2c..5731936 100644
--- a/src/models/chroma/model.py
+++ b/src/models/chroma/model.py
@@ -255,7 +255,7 @@ def forward(
             # just in case in different GPU for simple pipeline parallel
             if self.training:
                 img, txt = ckpt.checkpoint(
-                    block, img, txt, pe, double_mod, txt_img_mask
+                    block, img, txt, pe, double_mod, txt_img_mask, use_reentrant=False
                 )
             else:
                 img, txt = block(
@@ -266,7 +266,7 @@
         for i, block in enumerate(self.single_blocks):
             single_mod = mod_vectors_dict[f"single_blocks.{i}.modulation.lin"]
             if self.training:
-                img = ckpt.checkpoint(block, img, pe, single_mod, txt_img_mask)
+                img = ckpt.checkpoint(block, img, pe, single_mod, txt_img_mask, use_reentrant=False)
             else:
                 img = block(img, pe=pe, distill_vec=single_mod, mask=txt_img_mask)
         img = img[:, txt.shape[1] :, ...]
diff --git a/src/models/chroma/module/t5.py b/src/models/chroma/module/t5.py
index e9d4252..33de33b 100644
--- a/src/models/chroma/module/t5.py
+++ b/src/models/chroma/module/t5.py
@@ -606,6 +606,7 @@ def forward(
                     if position_bias != None
                     else position_bias
                 ),
+                use_reentrant=False
             )
             pass
         else:
@@ -668,6 +669,7 @@ def forward_mid(
                     if position_bias != None
                     else position_bias
                 ),
+                use_reentrant=False
             )
             pass
         else:
diff --git a/src/models/lumina/model.py b/src/models/lumina/model.py
index 669f63e..74e9567 100644
--- a/src/models/lumina/model.py
+++ b/src/models/lumina/model.py
@@ -689,7 +689,7 @@ def patchify_and_embed(
         # refine context
         for layer in self.context_refiner:
             if self.training:
-                cap_feats = ckpt.checkpoint(layer, cap_feats, cap_mask, cap_freqs_cis)
+                cap_feats = ckpt.checkpoint(layer, cap_feats, cap_mask, cap_freqs_cis, use_reentrant=False)
             else:
                 cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis)
 
@@ -718,7 +718,7 @@
         for layer in self.noise_refiner:
             if self.training:
                 padded_img_embed = ckpt.checkpoint(
-                    layer, padded_img_embed, padded_img_mask, img_freqs_cis, t
+                    layer, padded_img_embed, padded_img_mask, img_freqs_cis, t, use_reentrant=False
                 )
             else:
                 padded_img_embed = layer(
@@ -804,7 +804,7 @@ def forward(self, x, t, cap_feats, cap_mask):
 
         for layer in self.layers:
             if self.training:
-                x = ckpt.checkpoint(layer, x, mask, freqs_cis, adaln_input)
+                x = ckpt.checkpoint(layer, x, mask, freqs_cis, adaln_input, use_reentrant=False)
             else:
                 x = layer(x, mask, freqs_cis, adaln_input)
 
diff --git a/src/trainer/train_chroma.py b/src/trainer/train_chroma.py
index 6d6ef82..4da4f7b 100644
--- a/src/trainer/train_chroma.py
+++ b/src/trainer/train_chroma.py
@@ -1,3 +1,4 @@
+import platform
 import sys
 import os
 import json
@@ -120,7 +121,11 @@ def setup_distributed(rank, world_size):
     os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
     # Initialize process group
-    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    backend = "nccl"  # Default backend for distributed training
+    if platform.system() == "Windows":
+        # Windows does not support NCCL; use Gloo instead
+        backend = "gloo"
+    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
     torch.cuda.set_device(rank)
 
 
@@ -307,17 +312,17 @@ def cast_linear(module, dtype):
 
 
 def save_config_to_json(filepath: str, **configs):
     json_data = {key: asdict(value) for key, value in configs.items()}
-    with open(filepath, "w") as json_file:
-        json.dump(json_data, json_file, indent=4)
+    with open(filepath, "w", encoding="utf-8") as json_file:
+        json.dump(json_data, json_file, indent=4, ensure_ascii=False)
 
 
 def dump_dict_to_json(data, file_path):
-    with open(file_path, "w") as json_file:
-        json.dump(data, json_file, indent=4)
+    with open(file_path, "w", encoding="utf-8") as json_file:
+        json.dump(data, json_file, indent=4, ensure_ascii=False)
 
 
 def load_config_from_json(filepath: str):
-    with open(filepath, "r") as json_file:
+    with open(filepath, "r", encoding="utf-8") as json_file:
         return json.load(json_file)
 
@@ -768,9 +773,8 @@ def train_chroma(rank, world_size, debug=False):
 
         if not debug:
             synchronize_gradients(model)
 
-            scheduler.step()
-
             optimizer.step()
+            scheduler.step()
             optimizer.zero_grad()
 
         if rank == 0:
diff --git a/src/trainer/train_chroma_lora.py b/src/trainer/train_chroma_lora.py
index 4626a1b..1a442a7 100644
--- a/src/trainer/train_chroma_lora.py
+++ b/src/trainer/train_chroma_lora.py
@@ -1,4 +1,4 @@
-import sys
+import platform
 import os
 import json
 from datetime import datetime
@@ -21,7 +21,6 @@
 import random
 
 from transformers import T5Tokenizer
-import wandb
 
 from src.dataloaders.dataloader import TextImageDataset
 from src.models.chroma.model import Chroma, chroma_params
@@ -131,7 +130,11 @@ def setup_distributed(rank, world_size):
     os.environ["WORLD_SIZE"] = str(world_size)
 
     # Initialize process group
-    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    backend = "nccl"  # Default backend for distributed training
+    if platform.system() == "Windows":
+        # Windows does not support NCCL; use Gloo instead
+        backend = "gloo"
+    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
     torch.cuda.set_device(rank)
 
 
@@ -315,17 +318,17 @@ def cast_linear(module, dtype):
 
 
 def save_config_to_json(filepath: str, **configs):
     json_data = {key: asdict(value) for key, value in configs.items()}
-    with open(filepath, "w") as json_file:
-        json.dump(json_data, json_file, indent=4)
+    with open(filepath, "w", encoding="utf-8") as json_file:
+        json.dump(json_data, json_file, indent=4, ensure_ascii=False)
 
 
 def dump_dict_to_json(data, file_path):
-    with open(file_path, "w") as json_file:
-        json.dump(data, json_file, indent=4)
+    with open(file_path, "w", encoding="utf-8") as json_file:
+        json.dump(data, json_file, indent=4, ensure_ascii=False)
 
 
 def load_config_from_json(filepath: str):
-    with open(filepath, "r") as json_file:
+    with open(filepath, "r", encoding="utf-8") as json_file:
         return json.load(json_file)
 
@@ -783,9 +786,8 @@ def train_chroma(rank, world_size, debug=False):
 
         if not debug:
             synchronize_gradients(model)
 
-            scheduler.step()
-
             optimizer.step()
+            scheduler.step()
             optimizer.zero_grad()
 
         if rank == 0:
diff --git a/src/trainer/train_chroma_rectification.py b/src/trainer/train_chroma_rectification.py
index 8427e8f..c506888 100644
--- a/src/trainer/train_chroma_rectification.py
+++ b/src/trainer/train_chroma_rectification.py
@@ -1,3 +1,4 @@
+import platform
 import sys
 import os
 import json
@@ -124,7 +125,11 @@ def setup_distributed(rank, world_size):
     os.environ["WORLD_SIZE"] = str(world_size)
 
     # Initialize process group
-    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    backend = "nccl"  # Default backend for distributed training
+    if platform.system() == "Windows":
+        # Windows does not support NCCL; use Gloo instead
+        backend = "gloo"
+    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
     torch.cuda.set_device(rank)
 
 
@@ -414,17 +419,17 @@ def cast_linear(module, dtype):
 
 
 def save_config_to_json(filepath: str, **configs):
     json_data = {key: asdict(value) for key, value in configs.items()}
-    with open(filepath, "w") as json_file:
-        json.dump(json_data, json_file, indent=4)
+    with open(filepath, "w", encoding="utf-8") as json_file:
+        json.dump(json_data, json_file, indent=4, ensure_ascii=False)
 
 
 def dump_dict_to_json(data, file_path):
-    with open(file_path, "w") as json_file:
-        json.dump(data, json_file, indent=4)
+    with open(file_path, "w", encoding="utf-8") as json_file:
+        json.dump(data, json_file, indent=4, ensure_ascii=False)
 
 
 def load_config_from_json(filepath: str):
-    with open(filepath, "r") as json_file:
+    with open(filepath, "r", encoding="utf-8") as json_file:
         return json.load(json_file)
 
@@ -932,9 +937,8 @@ def train_chroma(rank, world_size, debug=False):
 
         if not debug:
             synchronize_gradients(model)
 
-            scheduler.step()
-
             optimizer.step()
+            scheduler.step()
             optimizer.zero_grad()
 
         if rank == 0:
diff --git a/src/trainer/train_lumina.py b/src/trainer/train_lumina.py
index 4f5bce7..32ef825 100644
--- a/src/trainer/train_lumina.py
+++ b/src/trainer/train_lumina.py
@@ -1,3 +1,4 @@
+import platform
 import sys
 import os
 import json
@@ -106,7 +107,11 @@ def setup_distributed(rank, world_size):
     os.environ["WORLD_SIZE"] = str(world_size)
 
     # Initialize process group
-    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    backend = "nccl"  # Default backend for distributed training
+    if platform.system() == "Windows":
+        # Windows does not support NCCL; use Gloo instead
+        backend = "gloo"
+    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
     torch.cuda.set_device(rank)
 
 
@@ -295,17 +300,17 @@ def cast_linear(module, dtype):
 
 
 def save_config_to_json(filepath: str, **configs):
     json_data = {key: asdict(value) for key, value in configs.items()}
-    with open(filepath, "w") as json_file:
-        json.dump(json_data, json_file, indent=4)
+    with open(filepath, "w", encoding="utf-8") as json_file:
+        json.dump(json_data, json_file, indent=4, ensure_ascii=False)
 
 
 def dump_dict_to_json(data, file_path):
-    with open(file_path, "w") as json_file:
-        json.dump(data, json_file, indent=4)
+    with open(file_path, "w", encoding="utf-8") as json_file:
+        json.dump(data, json_file, indent=4, ensure_ascii=False)
 
 
 def load_config_from_json(filepath: str):
-    with open(filepath, "r") as json_file:
+    with open(filepath, "r", encoding="utf-8") as json_file:
         return json.load(json_file)
 
@@ -729,9 +734,8 @@ def train_lumina(rank, world_size, debug=False):
 
         if not debug:
             synchronize_gradients(model)
 
-            scheduler.step()
-
             optimizer.step()
+            scheduler.step()
             optimizer.zero_grad()
 
         if training_config.wandb_project is not None and rank == 0:
diff --git a/test/dataloaders/dataloader_test.py b/test/dataloaders/dataloader_test.py
index 58c48b1..c68badb 100644
--- a/test/dataloaders/dataloader_test.py
+++ b/test/dataloaders/dataloader_test.py
@@ -36,7 +36,7 @@
 
     images, caption, index = dataset[i]
-    with open(f"preview/{i}.jsonl", 'w') as f:
+    with open(f"preview/{i}.jsonl", 'w', encoding="utf-8") as f:
         for item in caption:
-            json.dump(item, f)  # Dump the item as a JSON object
+            json.dump(item, f, ensure_ascii=False)  # Dump the item as a JSON object
             f.write('\n')  # Write a newline after each JSON object
     save_image(make_grid(images.clip(-1, 1)), f"preview/{i}.jpg", normalize=True)
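Note on the csv.field_size_limit change (bucketing_logic.py and utils.py above): on Windows, Python's csv module passes the limit to a C long, which is 32-bit even on 64-bit builds, so sys.maxsize (2**63 - 1) raises OverflowError; the except branch caps the limit at 2147483647, i.e. 2**31 - 1. A minimal standalone illustration:

import csv
import sys

try:
    csv.field_size_limit(sys.maxsize)  # fine where C long is 64-bit (Linux, macOS)
except OverflowError:
    csv.field_size_limit(2**31 - 1)  # Windows: C long is 32-bit, so cap at LONG_MAX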
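Note on the reworked _load_image: it decodes with imageio.v3 into a numpy array and then normalizes to a 3-channel PIL image. The essential path, extracted as a standalone sketch; load_rgb is an illustrative name rather than repository API, and AVIF decoding assumes pillow-avif-plugin from requirements.txt is installed:

import imageio.v3 as iio
from PIL import Image

def load_rgb(path_or_url):
    arr = iio.imread(path_or_url)  # one call covers local files and URLs
    if arr.ndim == 4:
        arr = arr[0]  # multi-frame input (e.g. animated webp): keep the first frame
    if arr.ndim == 2:
        image = Image.fromarray(arr, mode="L")  # grayscale
    elif arr.ndim == 3 and arr.shape[2] in (3, 4):
        image = Image.fromarray(arr, mode="RGB" if arr.shape[2] == 3 else "RGBA")
    else:
        raise ValueError(f"Unsupported image shape: {arr.shape}")
    return image.convert("RGB")  # collapse grayscale/alpha down to RGB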
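Note on use_reentrant=False: every ckpt.checkpoint call now opts into PyTorch's non-reentrant activation checkpointing, the variant recent PyTorch versions recommend (they emit a warning whenever the flag is left unset). A minimal usage example with an illustrative toy block:

import torch
import torch.utils.checkpoint as ckpt

def block(x):
    return x * x  # stand-in for a transformer block

x = torch.ones(4, requires_grad=True)
y = ckpt.checkpoint(block, x, use_reentrant=False)  # activations recomputed during backward
y.sum().backward()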
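Note on the backend switch repeated across all four trainers: NCCL is not built for Windows, so setup_distributed falls back to Gloo there. Condensed into one helper (a sketch; it assumes MASTER_ADDR and MASTER_PORT are already exported, as the trainers do):

import platform
import torch.distributed as dist

def init_process_group_portable(rank, world_size):
    # NCCL ships only in Linux builds of PyTorch; Windows builds provide Gloo.
    backend = "gloo" if platform.system() == "Windows" else "nccl"
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)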