Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 168 additions & 0 deletions examples/v1/config/mpo_qwen3_vl_8B.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
"""
DPO (Direct Preference Optimization) Configuration for Qwen3-VL-8B

This configuration demonstrates how to use DPO/MPO for offline preference learning
in xtuner v1 framework, following the same pattern as RL configs.

Supported loss types:
- sigmoid: Standard DPO loss for preference learning
- bco_pair: Binary Classifier Optimization for absolute quality
- sft: Supervised Fine-Tuning loss to maintain generation quality
Comment on lines +5 to +10
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude: Critical: The DPO feature is non-functional — missing module xtuner.v1.rl.dpo.

This config imports DPOLossConfig from xtuner.v1.rl.dpo, and the trainer imports DPOLossContext, DPOColateItem, etc., but the module defining these classes is not included in this PR. Additionally:

  • Qwen3VLDPOTokenizeFnConfig is not defined anywhere
  • VLMPreferenceJsonlDataset is not defined anywhere
  • qwen3_vl_dpo_collator is not defined anywhere, and also not in the Literal type for DataloaderConfig.collator

The MPO/DPO feature cannot work as submitted.


For MPO (Mixed Preference Optimization), use the loss combination from the MPO paper:
loss_types=["sigmoid", "bco_pair", "sft"]
loss_weights=[0.8, 0.2, 1.0]
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Critical: Hardcoded internal storage paths

This config file contains hardcoded paths to internal shared storage (/mnt/shared-storage-user/lisongze/...). These should use environment variables like the pattern in the original rl_qwen3_vl_8B_grpo.py (which this PR also breaks -- see separate comment).

Suggested change
loss_weights=[0.8, 0.2, 1.0]
ceph_config = os.environ.get("CEPH_CONFIG", "")
meta_data_path = os.environ["META_DATA_PATH"]
model_path = os.environ["MODEL_PATH"]
work_dir = os.environ["WORK_DIR"]
tokenizer_cache_dir = os.environ.get("TOKENIZER_CACHE_DIR", os.path.join(work_dir, "tokenizer_cache"))


Usage:
# Set environment variables
export WORK_DIR=/path/to/work_dir
export MODEL_PATH=/path/to/model
export META_DATA_PATH=/path/to/meta.json
export CEPH_CONFIG=/path/to/ceph.conf
export TOKENIZER_CACHE_DIR=/path/to/tokenizer_cache_dir
# Run with torchrun
torchrun --nproc_per_node=8 xtuner/v1/train/cli/dpo.py --config mpo_qwen3_vl_8B.py
"""

# Standard library imports first, then project imports (PEP 8 grouping;
# `import os` previously trailed the project imports).
import json
import os

from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig
from xtuner.v1.datasets import Qwen3VLDPOTokenizeFnConfig
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
from xtuner.v1.datasets.mllm_tokenize_fn import OSSLoaderConfig
from xtuner.v1.model import Qwen3VLDense8BConfig
from xtuner.v1.rl.dpo import DPOLossConfig
from xtuner.v1.train.dpo_trainer import DPOTrainerConfig


# All machine-specific locations come from the environment (see the module
# docstring for the export list) so no internal storage paths are hardcoded.
# CEPH_CONFIG is optional -- defaults to "" when no OSS/Ceph backend is used
# (NOTE(review): confirm OSSLoaderConfig accepts an empty conf_path).
ceph_config = os.environ.get("CEPH_CONFIG", "")
meta_data_path = os.environ["META_DATA_PATH"]  # required: dataset meta JSON
model_path = os.environ["MODEL_PATH"]          # required: model/processor dir
work_dir = os.environ["WORK_DIR"]              # required: checkpoints and logs
# Tokenizer cache defaults to a subdirectory of the work dir.
tokenizer_cache_dir = os.environ.get("TOKENIZER_CACHE_DIR", os.path.join(work_dir, "tokenizer_cache"))

# --- Batch / schedule settings ---------------------------------------------
# Effective batch = num_gpus * per_device_batch_size * grad_accum * sp_size.
total_epochs = 1
global_batch_size = 64  # consider ~256 for full-scale runs
per_device_batch_size = 1
gradient_accumulation_steps = 4

# Sequence lengths in tokens: 8k context, packed to the same length.
max_length = 8192
pack_max_length = 8192

# Dataloading and checkpoint/logging cadence.
num_workers = 8
save_interval = 5000
log_interval = 1000

# --- Optimization settings -------------------------------------------------
lr = 5e-6            # DPO uses a lower LR than SFT
lr_min = 0           # cosine decay all the way to zero (per the paper)
warmup_ratio = 0.05  # linear warmup over the first 5% of steps
weight_decay = 0.05

# Model architecture config; weights are loaded via the trainer's `load_from`.
model_cfg = Qwen3VLDense8BConfig()

# DPO loss configuration.
# Option 1: Standard DPO (sigmoid loss only):
# loss_cfg = DPOLossConfig(
#     loss_types=["sigmoid"],
#     loss_weights=[1.0],
#     beta=0.1,
# )

# Option 2 (active): MPO (Mixed Preference Optimization) -- combines the DPO
# sigmoid loss, BCO pairwise loss, and an SFT loss; weights follow the MPO paper.
loss_cfg = DPOLossConfig(
    loss_types=["sigmoid", "bco_pair", "sft"],
    loss_weights=[0.8, 0.2, 1.0],  # weights for sigmoid / bco_pair / sft respectively
    beta=0.1,
    label_smoothing=0.0,  # 0.0 = no label smoothing (standard DPO)
    reference_free=False,  # use a frozen reference model (see trainer config below)
    use_average_log_prob=False,  # presumably sum rather than mean of token log-probs -- confirm
    mode="chunk",  # NOTE(review): looks like chunked loss computation to bound memory -- confirm
    chunk_size=512,
    ignore_idx=-100,  # label index excluded from the loss
)

# Dataset configuration (same layout as sft_internvl3.5_8B_config_tiny.py).
oss_loader_cfg = OSSLoaderConfig(backend_kwargs={"conf_path": ceph_config})

# The meta file maps dataset name -> per-dataset settings, e.g.
# {"my_ds": {"annotation": ".../a.jsonl", "media_root": "...", "sample_ratio": 1.0}}.
# Use a context manager + json.load (the previous json.loads(open(...).read())
# leaked the file handle).
with open(meta_data_path, encoding="utf-8") as _meta_f:
    ds_collections = json.load(_meta_f)

dataset_config = []
for name, _data in ds_collections.items():
    _data_cfg = {
        "dataset": DatasetConfig(
            name=name,
            anno_path=_data['annotation'],
            media_root=_data.get('media_root', ''),
            sample_ratio=_data.get('sample_ratio', 1.0),
            class_name='VLMPreferenceJsonlDataset',  # preference (chosen/rejected) dataset class
            enable_sequential_sampler=True,
            cache_tag='cache_tags_dpo_v1',
            cache_dir=tokenizer_cache_dir,
        ),
        "tokenize_fn": Qwen3VLDPOTokenizeFnConfig(
            processor_path=model_path,
            max_length=max_length,
            min_pixels=_data.get('min_pixels', None),
            max_pixels=_data.get('max_pixels', None),
            oss_loader_cfg=oss_loader_cfg,
            # Field names of the preference jsonl records (parent-class fields).
            prompt_key="prompt",
            chosen_key="chosen",
            rejected_key="rejected",
            images_key="images",
            add_eos_token=True,
            system_message=_data.get('system_message', None),
            hash=_data.get('hash', None),
        ),
    }
    dataset_config.append(_data_cfg)

# Dataloader: pack sequences to a fixed length and collate with the
# DPO-specific collator.
dataloader_config = DataloaderConfig(
    dataset_config_list=dataset_config,
    pack_max_length=pack_max_length,
    pack_to_max_length=True,  # must be True when sp_size > 1
    pack_level="none",
    collator="qwen3_vl_dpo_collator",  # DPO collator for Qwen3-VL preference pairs
    num_workers=num_workers,
    group_by_length=False,  # must be False when pack_level="none"
)

# Optimizer and learning-rate schedule (cosine decay with linear warmup).
optim_cfg = AdamWConfig(lr=lr, weight_decay=weight_decay, foreach=False)
lr_cfg = LRConfig(lr_type="cosine", warmup_ratio=warmup_ratio, lr_min=lr_min)

# FSDP configuration.
fsdp_cfg = FSDPConfig(
    recompute_ratio=1.0,  # presumably full activation recomputation -- confirm semantics
    vision_recompute_ratio=1.0,  # same ratio applied to the vision tower
    reshard_after_forward=True,  # trades re-gather time for lower memory
    checkpoint_preserve_rng_state=False,
    torch_compile=True,
)

# DPO trainer configuration: ties together model, data, loss, and optimizer.
trainer = DPOTrainerConfig(
    model_cfg=model_cfg,
    optim_cfg=optim_cfg,
    loss_cfg=loss_cfg,
    lr_cfg=lr_cfg,
    fsdp_cfg=fsdp_cfg,
    load_from=model_path,  # policy model checkpoint
    ref_load_from=None,  # None: reference model starts from the same weights as the policy
    tokenizer_path=model_path,
    work_dir=work_dir,
    sp_size=1,  # sequence parallelism disabled
    total_epochs=total_epochs,
    global_batch_size=global_batch_size,
    per_device_batch_size=per_device_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    max_length=max_length,
    save_interval=save_interval,
    log_interval=log_interval,
    seed=42,
    freeze_ref_model=True,  # the reference model is never updated
    use_vlm_collator=True,
    num_workers=num_workers,
    dataloader_cfg=dataloader_config,
)
24 changes: 24 additions & 0 deletions examples/v1/scripts/run_mpo_qwen3_vl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
set -ex
# Worker pools for xtuner's dataset packing / tokenization preprocessing.
export XTUNER_PACK_WORKERS=8
export XTUNER_TOKENIZE_WORKERS=16
# Collective-communication timeouts in seconds (10800s = 3h).
export NCCL_TIMEOUT=10800
export TORCH_DISTRIBUTED_TIMEOUT=10800
# Presumably enables FlashAttention-3 kernels inside xtuner -- confirm.
export XTUNER_USE_FA3=1
# Make the in-tree xtuner package importable when launching from the repo root.
export PYTHONPATH="$(pwd)"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Warning: Hardcoded absolute paths in shell scripts

Both config_file (line 12) and the torchrun target (line 24) use hardcoded absolute paths to a developer's workspace. These should use relative paths or environment variables, following the pattern of other scripts in the repo.

# Keep HF caches in the workspace; verbose dynamo logs help debug torch.compile.
export HF_HOME="$(pwd)/"
export TORCHDYNAMO_VERBOSE=1

# Distributed launch settings: overridable from the environment, with
# single-node defaults. (Previously these were commented out, so torchrun
# received empty values whenever they were unset.)
NODE_COUNT=${NODE_COUNT:-1}
NODE_RANK=${NODE_RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_PORT=${MASTER_PORT:-20500}
PROC_PER_NODE=${PROC_PER_NODE:-8}

# Paths are relative to the repo root (this script assumes it is run from
# there; PYTHONPATH is exported as $(pwd) earlier in the script). No absolute
# paths into developer workspaces.
config_file="examples/v1/config/mpo_qwen3_vl_8B.py"

torchrun \
    --nnodes=$NODE_COUNT \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --master_port=$MASTER_PORT \
    --nproc_per_node=$PROC_PER_NODE \
    xtuner/v1/train/cli/dpo.py --config ${config_file}
26 changes: 26 additions & 0 deletions xtuner/v1/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
DatasetConfigList,
DatasetConfigListAdatper,
)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Warning: Breaking change -- CustomPackDataset, CustomSampler, LongTextPretrainTokenizeFunction, and LongTextPretrainTokenizeFunctionConfig removed from public API

These classes are removed from the datasets __init__.py. Any downstream code importing these will break. This removal is unrelated to the MPO feature and should be in a separate refactoring PR.

from .dpo_collator import (
DPOColateItem,
qwen3_vl_dpo_collator
)
from .custom_pack import CustomPackDataset
from .custom_sampler import CustomSampler
from .ftdp import FTDPTokenizeFnConfig, FtdpTokenizeFunction
Expand All @@ -19,6 +23,15 @@
Qwen3VLTokenizeFunction,
)
from .packing import ExpandSoftPackDataset, HardPackDataset, MLLMPretrainHybridPackDataset, _LegacySoftPackDataset
from .preference_dataset import (
InMemoryPreferenceDataset,
PreferenceDataItem,
PreferenceJsonlDataset,
PreferenceTokenizeFunction,
VLMPreferenceJsonlDataset,
Qwen3VLDPOTokenizeFnConfig,
Qwen3VLDPOTokenizeFunction,
)
from .pt_tokenize_fn import (
LongTextPretrainTokenizeFunction,
LongTextPretrainTokenizeFunctionConfig,
Expand All @@ -31,6 +44,7 @@
from .sft_tokenize_fn import OpenaiTokenizeFunction, OpenaiTokenizeFunctionConfig
from .utils import CachableTokenizeFunction, calculate_file_sha256, calculate_xxhash, tokenizer_hash
from .vlm_jsonl import VLMJsonlDataset
from .qwen3vl_vision_process import process_vision_info


from . import _hardcode_patch # isort: skip
Expand Down Expand Up @@ -78,4 +92,16 @@
"DatasetConfig",
"OpenaiTokenizeFunctionConfig",
"OpenaiTokenizeFunction",
# DPO collators (xtuner v1 style)
"DPOColateItem",
"qwen3_vl_dpo_collator",
# Preference datasets
"PreferenceDataItem",
"PreferenceTokenizeFunction",
"PreferenceJsonlDataset",
"VLMPreferenceJsonlDataset",
"InMemoryPreferenceDataset",
"Qwen3VLDPOTokenizeFnConfig",
"Qwen3VLDPOTokenizeFunction",
"process_vision_info"
]
17 changes: 16 additions & 1 deletion xtuner/v1/datasets/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
qwen3_vl_sft_collator,
sft_llm_collator,
)
from .dpo_collator import qwen3_vl_dpo_collator
from .custom_pack import CustomPackDataset
from .custom_sampler import CustomSampler
from .dataloader import BaseDataloader, Dataloader
Expand All @@ -33,6 +34,7 @@
from .sampler import LengthGroupedSampler, ParallelSampler
from .utils import CachableTokenizeFunction, tokenizer_xxhash
from .vlm_jsonl import VLMJsonlDataset
from .preference_dataset import VLMPreferenceJsonlDataset


logger = get_logger()
Expand Down Expand Up @@ -77,6 +79,17 @@ def build(
cache_dir=self.cache_dir,
cache_tag=self.cache_tag,
)
elif self.class_name == "VLMPreferenceJsonlDataset":
return VLMPreferenceJsonlDataset(
tokenize_fn=tokenize_fn,
anno_path=self.anno_path,
sample_ratio=self.sample_ratio,
enable_sequential_sampler=self.enable_sequential_sampler,
name=self.name,
media_root=self.media_root,
cache_dir=self.cache_dir,
cache_tag=self.cache_tag,
)
else:
raise ValueError(f"Unsupported class_name: {self.class_name}")

Expand Down Expand Up @@ -276,7 +289,7 @@ class DataloaderConfig(BaseDataloaderConfig):
dataset_config_list: DatasetConfigList | None = None

collator: Annotated[
Literal["sft_llm_collator", "intern_s1_vl_sft_collator", "qwen3_vl_sft_collator", "fake_collator"] | str,
Literal["sft_llm_collator", "intern_s1_vl_sft_collator", "qwen3_vl_sft_collator", "fake_collator", "qwen3_vl_dpo_collator"] | str,
Parameter(help="collator func name"),
] = "sft_llm_collator"
pack_to_max_length: Annotated[bool, Parameter(help="whether to pack to max length")] = True
Expand Down Expand Up @@ -312,6 +325,8 @@ def build_collator(self):
return qwen3_vl_sft_collator
elif self.collator == "fake_collator":
return fake_collator # for RL
elif self.collator == "qwen3_vl_dpo_collator":
return qwen3_vl_dpo_collator
else:
collator = pydoc.locate(self.collator)
if collator is None:
Expand Down
Loading