pytorch
diff --git a/‎torchtitan/distributed/utils.py
Lines changed: 23 additions & 12 deletions b/‎torchtitan/distributed/utils.py
Lines changed: 23 additions & 12 deletions
diff --git a/‎torchtitan/experiments/flux/README.md
Lines changed: 0 additions & 1 deletion b/‎torchtitan/experiments/flux/README.md
Lines changed: 0 additions & 1 deletion
diff --git a/‎torchtitan/experiments/flux/__init__.py
Lines changed: 0 additions & 3 deletions b/‎torchtitan/experiments/flux/__init__.py
Lines changed: 0 additions & 3 deletions
diff --git a/‎torchtitan/experiments/flux/dataset/flux_dataset.py
Lines changed: 22 additions & 9 deletions b/‎torchtitan/experiments/flux/dataset/flux_dataset.py
Lines changed: 22 additions & 9 deletions
diff --git a/‎torchtitan/experiments/flux/flux_argparser.py
Lines changed: 39 additions & 3 deletions b/‎torchtitan/experiments/flux/flux_argparser.py
Lines changed: 39 additions & 3 deletions
diff --git a/‎torchtitan/experiments/flux/model/model.py
Lines changed: 1 addition & 16 deletions b/‎torchtitan/experiments/flux/model/model.py
Lines changed: 1 addition & 16 deletions
diff --git a/‎torchtitan/experiments/flux/parallelize_flux.py
Lines changed: 0 additions & 1 deletion b/‎torchtitan/experiments/flux/parallelize_flux.py
Lines changed: 0 additions & 1 deletion
@@ -46,10 +46,12 @@ def set_determinism(
     device: torch.device,
     seed: int | None = None,
     deterministic: bool = False,
+    distinct_seed_mesh_dim: str = "pp",
 ) -> None:
     """
-    Set the same DTensor manual seed for all ranks within the same DTensor SPMD group, but different
-    seeds across PP groups (if applicable).
+    Set the same DTensor manual seed across all dimensions of the world mesh, except that seeds differ
+    across the dimension denoted by `distinct_seed_mesh_dim`. An example use case is pipeline parallelism,
+    where we want to have the same seed across SPMD groups, but different seeds across PP groups.
 
     Currently, does not set seeds for the CUDA RNG since TorchTitan always uses DTensor for SPMD parallelisms,
     and DTensor manages its own RNG tracker, but we could extend to support both if needed.
@@ -81,22 +83,31 @@ def set_determinism(
         torch.distributed.broadcast(seed_tensor, src=0)
         seed = seed_tensor.to("cpu").view(torch.uint64).item()
 
+    # Set a distinct seed for each rank along the mesh dimension whose name is provided by `distinct_seed_mesh_dim`.
     # For PP + SPMD cases, we want to separate the world into the SPMD mesh and the PP mesh,
     # and choose a unique seed for each rank on the PP mesh.
-    if c10d.get_world_size() > 1 and "pp" in world_mesh.mesh_dim_names:
-        pp_mesh = world_mesh["pp"]
-        seed += pp_mesh.get_local_rank()
+    # TODO(jianiw): We could further extend this to support multiple distinct dimensions instead of just one.
+    if (
+        c10d.get_world_size() > 1
+        and distinct_seed_mesh_dim in world_mesh.mesh_dim_names
+    ):
+        distinct_mesh = world_mesh[distinct_seed_mesh_dim]
+        seed += distinct_mesh.get_local_rank()
         seed %= 2**64
 
         logger.debug(
-            f"PP rank {pp_mesh.get_local_rank()}, Global rank {c10d.get_rank()} using seed: {seed}"
+            f"{distinct_seed_mesh_dim} rank {distinct_mesh.get_local_rank()}, Global rank {c10d.get_rank()} using seed: {seed}"
         )
-        spmd_mesh_dims = list(
-            filter(lambda name: name != "pp", world_mesh.mesh_dim_names)
+        duplicate_seed_mesh = list(
+            filter(
+                lambda name: name != distinct_seed_mesh_dim, world_mesh.mesh_dim_names
+            )
+        )
+        duplicate_seed_mesh = (
+            world_mesh[duplicate_seed_mesh] if len(duplicate_seed_mesh) else None
         )
-        spmd_mesh = world_mesh[spmd_mesh_dims] if len(spmd_mesh_dims) else None
     else:
-        spmd_mesh = world_mesh
+        duplicate_seed_mesh = world_mesh
         logger.debug(f"Global Rank {c10d.get_rank()} using seed: {seed}")
 
     # The native RNGs and python RNG may not be important, except for the 1-D PP case, but we seed them for consistency.
@@ -106,8 +117,8 @@ def set_determinism(
 
     # As long as we are not in the 1-D (PP-only) case, we will have a seed to use for all ranks of the SPMD mesh.
     # IF PP is also used, this seed is unique per PP rank.
-    if spmd_mesh and spmd_mesh.get_coordinate() is not None:
-        torch.distributed.tensor._random.manual_seed(seed, spmd_mesh)
+    if duplicate_seed_mesh and duplicate_seed_mesh.get_coordinate() is not None:
+        torch.distributed.tensor._random.manual_seed(seed, duplicate_seed_mesh)
 
 
 def create_context_parallel_ctx(
 
@@ -25,6 +25,5 @@ Run the following command to train the model on a single GPU:
 ## TODO
 - [ ] More parallelism support (Tensor Parallelism, Context Parallelism, etc)
 - [ ] Support for distributed checkpointing and loading
-- [ ] Implement init_weights() function to initialize the model weights
 - [ ] Implement the num_flops_per_token calculation in get_nparams_and_flops() function
 - [ ] Implement test cases in CI for FLUX model. Adding more unit tests for FLUX model (eg, unit test for preprocessor, etc)
@@ -39,7 +39,6 @@
         axes_dim=(16, 56, 56),
         theta=10_000,
         qkv_bias=True,
-        guidance_embed=True,
         autoencoder_params=AutoEncoderParams(
             resolution=256,
             in_channels=3,
@@ -65,7 +64,6 @@
         axes_dim=(16, 56, 56),
         theta=10_000,
         qkv_bias=True,
-        guidance_embed=False,
         autoencoder_params=AutoEncoderParams(
             resolution=256,
             in_channels=3,
@@ -91,7 +89,6 @@
         axes_dim=(16, 56, 56),
         theta=10_000,
         qkv_bias=True,
-        guidance_embed=True,
         autoencoder_params=AutoEncoderParams(
             resolution=256,
             in_channels=3,
 
@@ -10,12 +10,11 @@
 from typing import Any, Callable, Optional
 
 import numpy as np
+import PIL
 
 import torch
-
 from datasets import Dataset, load_dataset
 from datasets.distributed import split_dataset_by_node
-from PIL import Image
 
 from torch.distributed.checkpoint.stateful import Stateful
 
@@ -28,7 +27,7 @@
 
 
 def _process_cc12m_image(
-    img: Image.Image,
+    img: PIL.Image.Image,
     output_size: int = 256,
 ) -> Optional[torch.Tensor]:
     """Process CC12M image to the desired size."""
@@ -56,9 +55,9 @@ def _process_cc12m_image(
 
     assert resized_img.size[0] == resized_img.size[1] == output_size
 
-    # Skip grayscale images, and RGBA, CMYK images
+    # Convert grayscale, RGBA, and CMYK images to RGB
     if resized_img.mode != "RGB":
-        return None
+        resized_img = resized_img.convert("RGB")
 
     np_img = np.array(resized_img).transpose((2, 0, 1))
     tensor_img = torch.tensor(np_img).float() / 255.0
@@ -76,7 +75,7 @@ def _process_cc12m_image(
     return tensor_img
 
 
-def _flux_data_processor(
+def _cc12m_wds_data_processor(
     sample: dict[str, Any],
     t5_tokenizer: FluxTokenizer,
     clip_tokenizer: FluxTokenizer,
@@ -111,10 +110,10 @@ class TextToImageDatasetConfig:
 
 
 DATASETS = {
-    "cc12m": TextToImageDatasetConfig(
+    "cc12m-wds": TextToImageDatasetConfig(
         path="pixparse/cc12m-wds",
         loader=lambda path: load_dataset(path, split="train", streaming=True),
-        data_processor=_flux_data_processor,
+        data_processor=_cc12m_wds_data_processor,
     ),
 }
 
@@ -171,7 +170,9 @@ def __init__(
         self._data = split_dataset_by_node(ds, dp_rank, dp_world_size)
 
         self._t5_tokenizer = t5_tokenizer
+        self._t5_empty_token = t5_tokenizer.encode("")
         self._clip_tokenizer = clip_tokenizer
+        self._clip_empty_token = clip_tokenizer.encode("")
         self._data_processor = data_processor
         self.job_config = job_config
 
@@ -195,7 +196,10 @@ def __iter__(self):
             for sample in self._get_data_iter():
                 # Use the dataset-specific preprocessor
                 sample_dict = self._data_processor(
-                    sample, self._t5_tokenizer, self._clip_tokenizer, output_size=256
+                    sample,
+                    self._t5_tokenizer,
+                    self._clip_tokenizer,
+                    output_size=self.job_config.training.img_size,
                 )
 
                 # skip low quality image or image with color channel = 1
@@ -205,6 +209,14 @@ def __iter__(self):
                     )
                     continue
 
+                # Classifier-free guidance: Replace some of the strings with empty strings.
+                # Distinct random seed is initialized at the beginning of training for each FSDP rank.
+                dropout_prob = self.job_config.training.classifer_free_guidance_prob
+                if dropout_prob > 0.0:
+                    if random.random() < dropout_prob:
+                        sample_dict["t5_tokens"] = self._t5_empty_token
+                        sample_dict["clip_tokens"] = self._clip_empty_token
+
                 self._all_samples.extend(sample_dict)
                 self._sample_idx += 1
 
@@ -254,6 +266,7 @@ def build_flux_dataloader(
         clip_tokenizer=FluxTokenizer(
             clip_encoder_name, max_length=77
         ),  # fix max_length for CLIP
+        job_config=job_config,
         dp_rank=dp_rank,
         dp_world_size=dp_world_size,
         infinite=infinite,
 
@@ -11,10 +11,16 @@
 
 def extend_parser(parser: argparse.ArgumentParser) -> None:
     parser.add_argument(
-        "--training.guidance",
+        "--training.classifer_free_guidance_prob",
         type=float,
-        default=3.5,
-        help="guidance value used for guidance distillation",
+        default=0.0,
+        help="Classifier-free guidance with probability p to dropout the text conditioning",
+    )
+    parser.add_argument(
+        "--training.img_size",
+        type=int,
+        default=256,
+        help="Image width to sample",
     )
     parser.add_argument(
         "--encoder.t5_encoder",
@@ -45,3 +51,33 @@ def extend_parser(parser: argparse.ArgumentParser) -> None:
         action="store_true",
         help="Whether to shard the encoder using FSDP",
     )
+    # eval configs
+    parser.add_argument(
+        "--eval.enable_classifer_free_guidance",
+        action="store_true",
+        help="Whether to use classifier-free guidance during sampling",
+    )
+    parser.add_argument(
+        "--eval.classifier_free_guidance_scale",
+        type=float,
+        default=5.0,
+        help="Classifier-free guidance scale when sampling",
+    )
+    parser.add_argument(
+        "--eval.denoising_steps",
+        type=int,
+        default=50,
+        help="How many denoising steps to sample when generating an image",
+    )
+    parser.add_argument(
+        "--eval.eval_freq",
+        type=int,
+        default=100,
+        help="Frequency of evaluation/sampling during training",
+    )
+    parser.add_argument(
+        "--eval.save_img_folder",
+        type=str,
+        default="img",
+        help="Directory to save image generated/sampled from the model",
+    )
@@ -40,7 +40,6 @@ class FluxModelArgs(BaseModelArgs):
     axes_dim: tuple = (16, 56, 56)
     theta: int = 10_000
     qkv_bias: bool = True
-    guidance_embed: bool = True
     autoencoder_params: AutoEncoderParams = field(default_factory=AutoEncoderParams)
 
     def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
@@ -89,11 +88,6 @@ def __init__(self, model_args: FluxModelArgs):
         self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
         self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
         self.vector_in = MLPEmbedder(model_args.vec_in_dim, self.hidden_size)
-        self.guidance_in = (
-            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
-            if model_args.guidance_embed
-            else nn.Identity()
-        )
         self.txt_in = nn.Linear(model_args.context_in_dim, self.hidden_size)
 
         self.double_blocks = nn.ModuleList(
@@ -127,11 +121,9 @@ def init_weights(self, buffer_device=None):
         nn.init.xavier_uniform_(self.txt_in.weight)
         nn.init.constant_(self.txt_in.bias, 0)
 
-        # Initialize time_in, vector_in, guidance_in (MLPEmbedder)
+        # Initialize time_in, vector_in (MLPEmbedder)
         self.time_in.init_weights(init_std=0.02)
         self.vector_in.init_weights(init_std=0.02)
-        if self.model_args.guidance_embed:
-            self.guidance_in.init_weights(init_std=0.02)
 
         # Initialize transformer blocks:
         for block in self.single_blocks:
@@ -150,20 +142,13 @@ def forward(
         txt_ids: Tensor,
         timesteps: Tensor,
         y: Tensor,
-        guidance: Tensor | None = None,
     ) -> Tensor:
         if img.ndim != 3 or txt.ndim != 3:
             raise ValueError("Input img and txt tensors must have 3 dimensions.")
 
         # running on sequences img
         img = self.img_in(img)
         vec = self.time_in(timestep_embedding(timesteps, 256))
-        if self.model_args.guidance_embed:
-            if guidance is None:
-                raise ValueError(
-                    "Didn't get guidance strength for guidance distilled model."
-                )
-            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
         vec = vec + self.vector_in(y)
         txt = self.txt_in(txt)
 
 
@@ -77,7 +77,6 @@ def apply_fsdp(
     linear_layers = [
         model.img_in,
         model.time_in,
-        model.guidance_in,
         model.vector_in,
         model.txt_in,
     ]
Original file line number	Diff line number	Diff line change
`@@ -77,7 +77,6 @@ def apply_fsdp(`
`77`	`77`	`linear_layers = [`
`78`	`78`	`model.img_in,`
`79`	`79`	`model.time_in,`
`80`		`- model.guidance_in,`
`81`	`80`	`model.vector_in,`
`82`	`81`	`model.txt_in,`
`83`	`82`	`]`