
Commit 2a46586

committed
Stub
1 parent 15da0db commit 2a46586

9 files changed, +273 -55 lines changed

Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
# On-Policy Distillation for Math Reasoning

This app implements on-policy distillation (OPD) following the approach described in the [Thinking Machines blog post](https://thinkingmachines.ai/blog/on-policy-distillation/). OPD combines the benefits of on-policy training with dense reward signals for efficient post-training.

## Overview

On-policy distillation trains a student model by:
1. Sampling trajectories from the student model itself
2. Using a teacher model to grade each token with dense rewards (per-token KL divergence)
3. Training the student to minimize the reverse KL with the teacher

This approach is **10-30x more compute efficient** than traditional RL while achieving comparable or better performance; a minimal sketch of the loop is shown below.
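For orientation, here is a minimal, framework-agnostic sketch of that loop. It is illustrative only: `sample_fn`, `student_logprob_fn`, and `teacher_logprob_fn` are hypothetical callables standing in for the actors and endpoints this app actually uses.

```python
# Illustrative OPD training step; the callables are hypothetical placeholders,
# not the APIs used in this repo.
import torch


def opd_step(sample_fn, student_logprob_fn, teacher_logprob_fn, prompts,
             optimizer, samples_per_prompt=4):
    losses = []
    for prompt in prompts:
        # 1. Sample trajectories from the student itself (on-policy).
        for completion in sample_fn(prompt, samples_per_prompt):
            # 2. Grade every sampled token with the teacher (dense signal).
            student_lp = student_logprob_fn(prompt, completion)  # [seq_len]
            teacher_lp = teacher_logprob_fn(prompt, completion)  # [seq_len]

            # 3. Per-token reverse KL estimate, detached so it acts as a
            #    per-token advantage rather than a differentiable target.
            reverse_kl = (student_lp - teacher_lp).detach()
            losses.append((student_lp * reverse_kl).mean())

    loss = torch.stack(losses).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```

Detaching the reverse-KL term and multiplying it by the student's log-probabilities gives a surrogate whose gradient matches the reverse-KL objective on the student's own samples.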
## Experimental Setup

### Models
- **Student**: Qwen3-0.6B-Base (or Qwen3-8B for larger experiments)
- **Teacher**: Qwen3-8B (or Qwen3-32B)
- **Evaluation**: AIME'24 benchmark

### Training Pipeline

#### Phase 1: Supervised Fine-Tuning (SFT)
First, establish a strong baseline through off-policy distillation:

```bash
python -m apps.sft.main --config apps/sft/qwen3_0_6.yaml
```

- **Dataset**: OpenThoughts3-1.2M (400k prompts); see the dataset config sketch below
- **Expected Performance**: ~60% on AIME'24
- **Purpose**: Teaches the model basic math reasoning patterns
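The SFT entry point reads its dataset settings from the job config (see the changes to `apps/sft/main.py` further down in this commit). A dataset block along these lines selects the OpenThoughts message transform instead of the default Alpaca one; the exact layout of `apps/sft/qwen3_0_6.yaml` may differ, so treat this as a sketch:

```yaml
# Sketch only: key names match the fields read in setup_data() of
# apps/sft/main.py; the surrounding config layout is assumed.
dataset:
  path: "open-thoughts/OpenThoughts3-1.2M"
  split: "train"
  message_transform: "openthoughts"      # falls back to "alpaca" if omitted
  masking_strategy: "train_on_assistant"
```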
#### Phase 2: On-Policy Distillation
Refine the model using on-policy learning with dense supervision:

```bash
python -m apps.on_policy_distillation.main --config apps/on_policy_distillation/qwen_opd.yaml
```

- **Starting Point**: SFT checkpoint from Phase 1
- **Dataset**: Math prompts (from OpenThoughts3 or DeepMath; prompts only, not the solutions)
- **Training**: ~150 steps (77k prompts with 4 samples each)
- **Expected Performance**: ~70% on AIME'24
### Key Implementation Details

1. **Loss Function**: Per-token reverse KL divergence, estimated on tokens sampled from the student (a fuller loss sketch follows this list):

   ```python
   # Reverse KL per sampled token; its negative is the dense per-token reward.
   reverse_kl = student_logprobs - teacher_logprobs
   ```

2. **Sampling**: Generate multiple trajectories per prompt (n=16 in config)

3. **No Discount Factor**: Optimize only the immediate next token (discount=0)

4. **Efficient Batching**: Can use smaller batch sizes than RL due to the dense per-token rewards
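As a concrete illustration, a masked per-token reverse-KL loss with the same signature as `importance_sampling_loss` in `main.py` could look like the sketch below. The body is illustrative, not a copy of the repo's implementation; it assumes `logits` are the student's logits over the response tokens and that `teacher_logprobs` are already aligned with `response`.

```python
import torch
import torch.nn.functional as F


def reverse_kl_loss(
    logits: torch.Tensor,            # [batch, seq, vocab] student logits for the response
    response: torch.Tensor,          # [batch, seq] sampled response token ids
    teacher_logprobs: torch.Tensor,  # [batch, seq] teacher log-probs of those tokens
    padding_mask: torch.Tensor,      # [batch, seq] True for real (non-pad) tokens
    **kwargs,
) -> torch.Tensor:
    # Log-probability the student assigns to each sampled token.
    student_logprobs = torch.gather(
        F.log_softmax(logits.float(), dim=-1), dim=-1, index=response.unsqueeze(-1)
    ).squeeze(-1)

    # Per-token reverse KL estimate, detached so it acts as a per-token
    # advantage (reward = -reverse_kl) rather than a differentiable target.
    reverse_kl = (student_logprobs - teacher_logprobs).detach()

    # REINFORCE-style surrogate: its gradient matches the reverse-KL objective.
    per_token_loss = student_logprobs * reverse_kl * padding_mask

    # Average over real (non-padding) tokens only.
    return per_token_loss.sum() / padding_mask.sum().clamp(min=1)
```

The repo's `importance_sampling_loss` presumably also corrects for any mismatch between the sampling engine and the trainer's recomputed logits; that correction is omitted here.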
## Evaluation

Evaluate on AIME'24 benchmark after each phase:

```bash
python -m apps.eval.aime --checkpoint <path_to_checkpoint>
```
## Expected Results

| Method | AIME'24 Score | Training Cost |
|--------|---------------|---------------|
| SFT (400k prompts) | ~60% | Baseline |
| SFT (2M prompts, extrapolated) | ~70% | 5x baseline |
| OPD (150 steps) | ~70% | 0.1-0.3x baseline |
## Key Advantages

- **Compute Efficiency**: 10-30x reduction vs traditional RL
- **Dense Supervision**: Learns from every token, not just final rewards
- **Data Efficiency**: Can reuse prompts multiple times effectively
- **Stability**: More stable training than sparse RL rewards
## Notes for Reproduction

1. **Ensure proper initialization**: Load the SFT checkpoint before starting OPD
2. **Use prompts only**: During OPD, sample completions from the student; don't use the dataset solutions (see the extraction sketch below)
3. **Teacher quality matters**: Better teachers provide better supervision
4. **Monitor reverse KL**: It should decrease toward zero as training progresses
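For point 2, the OPD loop in `main.py` takes only the first (human) turn of each OpenThoughts conversation as the prompt. A standalone sketch of that extraction, with a made-up example row standing in for a dataset sample:

```python
# Illustrative prompt extraction for OpenThoughts-style rows, mirroring the
# sample["conversations"][0]["value"] access in main.py; the example row below
# is invented for demonstration.
def extract_prompt(row: dict) -> str:
    conversation = row["conversations"]
    # The first turn is the human prompt; later turns (the solutions) are
    # intentionally ignored so completions come from the student itself.
    return conversation[0]["value"]


row = {
    "conversations": [
        {"from": "human", "value": "Compute 1 + 2 + ... + 100."},
        {"from": "gpt", "value": "5050"},
    ]
}
print(extract_prompt(row))  # Compute 1 + 2 + ... + 100.
```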
## References

- [On-Policy Distillation Blog Post](https://thinkingmachines.ai/blog/on-policy-distillation/)
- [Tinker Cookbook](https://github.com/thinking-machines-lab/tinker-cookbook)
- [OpenThoughts3 Dataset](https://huggingface.co/datasets/open-thoughts/OpenThoughts3-1.2M)

---
**Important Code Modification Needed**: Your current OPD implementation should:

1. Load from an SFT checkpoint (not raw base model)
2. Extract only prompts from the dataset (not use the solutions)
3. Add proper checkpoint loading in the trainer config:

```yaml
trainer:
  checkpoint:
    initial_load_path: ./checkpoint_student/sft_final  # Load SFT checkpoint
    # ... rest of config
```
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
from dataclasses import dataclass


@dataclass
class DatasetConfig:
    source: str
    split: str = "train"

apps/on-policy-distillation/main.py renamed to apps/on_policy_distillation/main.py

Lines changed: 17 additions & 11 deletions
@@ -1,4 +1,6 @@
 import asyncio
+import itertools
+import time
 from dataclasses import dataclass
 from typing import Any
 
@@ -63,13 +65,17 @@ def collate(
         teacher_logprobs = [t.teacher_logprobs for t in batch]
         teacher_logprobs = torch.stack(teacher_logprobs)
 
+        # student_logprobs = [t.completion.logprobs for t in batch]
+        # student_logprobs = torch.stack(student_logprobs)
+
         pad_id = batch[0].pad_id
         padding_mask = response != pad_id
 
         input = {"tokens": torch.cat([request, response], dim=1)}
         target = {
             "response": response,
             "teacher_logprobs": teacher_logprobs,
+            # "student_logprobs": student_logprobs,
             "padding_mask": padding_mask,
         }
         inputs.append(input)
@@ -81,6 +87,7 @@ def importance_sampling_loss(
     logits: torch.Tensor,
     response: torch.Tensor,
     teacher_logprobs: torch.Tensor,
+    # student_logprobs: torch.Tensor,
     padding_mask: torch.Tensor,
     **kwargs,
 ) -> torch.Tensor:
@@ -135,32 +142,28 @@ async def main(cfg: DictConfig):
     tokenizer = get_tokenizer(cfg.student_model)
     pad_id = tokenizer.pad_token_id
     dataset = load_dataset(cfg.dataset.path, split=cfg.dataset.get("split", "train"))
-    dataset = dataset.filter(lambda x: x["domain"] == cfg.dataset["domain"])
+    # dataset = dataset.filter(lambda x: x["domain"] == cfg.dataset["domain"])
     dataset_iter = iter(dataset)
 
     print("All services initialized successfully!")
 
     step = 0
     for epoch in range(max_steps):
+        # start time
+        start = time.perf_counter()
         if step >= max_steps:
             break
 
-        # Collect rollout
         trajectories = []
         while len(trajectories) < train_batch_size:
             try:
                 sample = next(dataset_iter)
-                # Extract the human prompt from OpenThoughts format
-                conversations = sample.get("conversations", [])
-                if conversations and len(conversations) > 0:
-                    prompt = conversations[0].get("value", "")
-                else:
-                    prompt = sample.get("prompt", sample.get("text", ""))
+                conversation = sample["conversations"]
+                prompt = conversation[0]["value"]
 
-                print(f"Starting request with prompt: {prompt}")
-                completions = await student_generator.generate.route(prompt)
+                completions = await student_generator.generate.fanout(prompt)
 
-                for completion in completions:
+                for completion in itertools.chain(*completions):
                     # Create trajectory with raw completion
                     trajectory = Trajectory(
                         pad_id=pad_id,
@@ -201,6 +204,9 @@ async def main(cfg: DictConfig):
         await student_trainer.push_weights.call(step)
         await student_generator.update_weights.fanout(step)
 
+        end = time.perf_counter()
+        print(f"Step {step} took {end - start} seconds")
+
         await mlogger.flush.call_one(step)
 
     print(f"Training completed after {step} steps")

apps/on-policy-distillation/qwen_0_6b_to_8b.yaml renamed to apps/on_policy_distillation/qwen_0_6b_to_8b.yaml

Lines changed: 21 additions & 22 deletions
@@ -2,17 +2,16 @@
 # >>> python -m apps.on-policy-distillation.main --config apps/on-policy-distillation/qwen_0_6b_to_8b.yaml
 
 # Global configuration
-train_batch_size: 4 # Number of trajectories per training step
-max_req_tokens: 512
-max_res_tokens: 65536
-student_model: "Qwen/Qwen3-1.7B"
+train_batch_size: 64 # Number of trajectories per training step
+max_req_tokens: 2048
+max_res_tokens: 4096
+student_model: "Qwen/Qwen3-0.6B"
 teacher_model: "Qwen/Qwen3-8B"
 
 # Dataset configuration
 dataset:
   path: "open-thoughts/OpenThoughts3-1.2M"
   split: "train"
-  domain: "math"
 
 # Student Generator configuration (inference model)
 student_generator:
@@ -22,7 +21,7 @@ student_generator:
   pipeline_parallel_size: 1
   enforce_eager: false
   sampling_params:
-    n: 2 # Single response per prompt
+    n: 16
     max_tokens: ${max_res_tokens}
     temperature: 1.0
     top_p: 0.95
@@ -31,7 +30,7 @@ student_generator:
 trainer:
   model:
     name: qwen3
-    flavor: 1.7B
+    flavor: 0.6B
     hf_assets_path: hf://${student_model}
   optimizer:
     name: AdamW
@@ -41,32 +40,32 @@ trainer:
     warmup_steps: 10
   training:
     local_batch_size: ${train_batch_size} # Per-device batch size
-    seq_len: 66048
+    seq_len: 8192
     max_norm: 1.0
     steps: 10000
     dtype: bfloat16
-    gc_freq: 1
+    gc_freq: 5
   compile:
     enable: false
   parallelism:
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: 2
+    data_parallel_shard_degree: 1
     tensor_parallel_degree: 1
     pipeline_parallel_degree: 1
     context_parallel_degree: 1
     expert_parallel_degree: 1
     disable_loss_parallel: true
   checkpoint:
     enable: true
-    # folder: ./checkpoint_student
+    folder: ./checkpoint_student
     initial_load_path: hf://${student_model}
     initial_load_in_hf: true
     last_save_in_hf: true
-    interval: 500
+    interval: 250
     async_mode: "disabled"
   activation_checkpoint:
-    mode: selective
-    selective_ac_option: op
+    mode: none
+    # selective_ac_option: op
 
 # Teacher model configuration
 teacher:
@@ -77,13 +76,13 @@ teacher:
   training:
     seq_len: ${trainer.training.seq_len}
     dtype: bfloat16
-    gc_freq: 1
+    gc_freq: 10
   compile:
     enable: false
   parallelism:
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: 2
-    tensor_parallel_degree: 1 # Use 2 GPUs for teacher
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 1
     pipeline_parallel_degree: 1
     context_parallel_degree: 1
     expert_parallel_degree: 1
@@ -95,17 +94,17 @@ teacher:
 # Resource allocations (3 GPUs total)
 services:
   student_generator:
-    procs: 1 # Student inference: 1 GPU
-    num_replicas: 1
+    procs: 1
+    num_replicas: 4
     mesh_name: student_generator
     with_gpus: true
   teacher:
-    procs: 2 # Teacher: 2 GPUs with TP
-    num_replicas: 1
+    procs: 1
+    num_replicas: 2
     mesh_name: teacher
     with_gpus: true
   trainer:
-    procs: 2 # Student training: shares GPU with student_generator
+    procs: 1
     num_replicas: 1
     mesh_name: trainer
     with_gpus: true

apps/sft/main.py

Lines changed: 28 additions & 5 deletions
@@ -25,7 +25,11 @@
 from forge.controller import ForgeActor
 from forge.data.collate import collate_packed
 from forge.data.datasets.packed import PackedDataset, TextPacker
-from forge.data.datasets.sft_dataset import AlpacaToMessages, sft_iterable_dataset
+from forge.data.datasets.sft_dataset import (
+    AlpacaToMessages,
+    OpenThoughtsToMessages,
+    sft_iterable_dataset,
+)
 from forge.data.tokenizer import HuggingFaceModelTokenizer
 from forge.observability import get_or_create_metric_logger, record_metric, Reduce
 from forge.util.config import parse
@@ -165,13 +169,32 @@ def setup_data(self):
             ),
         )
 
+        # Get dataset configuration from job_config
+        dataset_config = self.job_config["dataset"]
+        dataset_path = dataset_config["path"]
+        dataset_split = dataset_config["split"]
+        message_transform_type = dataset_config.get("message_transform", "alpaca")
+        masking_strategy = dataset_config.get("masking_strategy", "train_on_assistant")
+
+        # Select the appropriate message transform
+        if message_transform_type == "openthoughts":
+            message_transform = OpenThoughtsToMessages(
+                masking_strategy=masking_strategy
+            )
+        elif message_transform_type == "alpaca":
+            message_transform = AlpacaToMessages(masking_strategy=masking_strategy)
+        else:
+            raise ValueError(
+                f"Unknown message_transform type: {message_transform_type}"
+            )
+
         dataset = sft_iterable_dataset(
             model_transform=tokenizer,
-            message_transform=AlpacaToMessages(),
-            path="yahma/alpaca-cleaned",
-            split="train",
+            message_transform=message_transform,
+            path=dataset_path,
+            split=dataset_split,
         )
-        packer = TextPacker(padding_idx=0)
+        packer = TextPacker(padding_idx=151643)
         dataset = PackedDataset(
             dataset=dataset,
             packer=packer,

src/forge/actors/reference_model.py

Lines changed: 4 additions & 4 deletions
@@ -15,10 +15,6 @@
 import torch
 import torch.nn.functional as F
 
-from forge.controller import ForgeActor
-from forge.observability.metrics import record_metric, Reduce
-from forge.observability.perf_tracker import Tracer
-
 # from forge.util.ops import compute_logprobs
 from monarch.actor import current_rank, current_size, endpoint
 from torch.distributed.tensor import DTensor
@@ -34,6 +30,10 @@
 from torchtitan.experiments.forge.engine import ForgeEngine
 from torchtitan.experiments.forge.job_config import ForgeJobConfig
 
+from forge.controller import ForgeActor
+from forge.observability.metrics import record_metric, Reduce
+from forge.observability.perf_tracker import Tracer
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
0 commit comments
