Skip to content

Commit 122e898

Browse files
authored
Create make_llava_ov_wds.sh
1 parent 1386c8d commit 122e898

File tree

1 file changed

+97
-0
lines changed

1 file changed

+97
-0
lines changed
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
# ---------------------------------------------------------------------------
# Process environment for the launcher.
#
# The stray diff-gutter tokens ("1+", "2+", ...) that were interleaved with
# these lines have been removed: the shell would try to execute each of them
# as a command. Every exported value is unchanged.
#
# NOTE(review): interface / HCA names below are cluster specific — adjust them
# to the hardware this is run on.
# ---------------------------------------------------------------------------

# GPUs visible to CUDA processes started from this environment.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_DEVICE_MAX_CONNECTIONS=1

# NCCL transport selection: socket interface, InfiniBand left enabled
# (NCCL_IB_DISABLE=0), GID index and the HCAs NCCL may use.
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_IB_GID_INDEX=0
export NCCL_DEBUG=INFO

export OMP_NUM_THREADS=4

# Gloo is pinned to the same interface as NCCL's socket transport.
export GLOO_SOCKET_IFNAME=eth0
export NCCL_IB_HCA=mlx5_2,mlx5_5

# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging setting
# (synchronous kernel launches) — confirm it is intended for real runs.
export CUDA_LAUNCH_BLOCKING=1
# ---------------------------------------------------------------------------
# Multi-node launcher for llava_ov_wds.py.
#
# Usage: <this script> DATA_PATH EXPNAME_PATH HOSTFILE
#   DATA_PATH     value forwarded to --data_path
#   EXPNAME_PATH  experiment directory; used for --run_name and --output_dir,
#                 and as the directory of the per-node logs (exp.log.<host>)
#   HOSTFILE      one node per line; the first whitespace-separated field of
#                 each non-empty line is the host to ssh into, and the first
#                 host is used as the torchrun master address
#
# One `ssh <host> torchrun ...` is started in the background per host and the
# script then returns without waiting for them.
# ---------------------------------------------------------------------------

# Shell identifiers cannot contain '-': the former `LLaVA-NeXT-HOME="..."`
# was run as a command instead of being an assignment, and
# `$LLaVA-NeXT-HOME` expanded to "${LLaVA}-NeXT-HOME". Use a valid name.
LLAVA_NEXT_HOME="Path_Of_LLaVA-NeXT"
VISION_MODEL_PATH="Path_Of_VISION_MODEL"
PROMPT_VERSION="qwen_1_5"

# Aspect-ratio mode passed to --image_aspect_ratio.
#   stage1:        square
#   other stages:  anyres_max_9
# Previously both values were assigned one after the other, so only the
# second ever took effect; that effective value is kept here.
image_aspect_ratio=anyres_max_9

# Fail with a readable message when an argument is missing.
USAGE="usage: $0 DATA_PATH EXPNAME_PATH HOSTFILE"
DATA_PATH=${1:?$USAGE}
EXPNAME_PATH=${2:?$USAGE}
HOSTFILE=${3:?$USAGE}

if [ ! -r "${HOSTFILE}" ]; then
  printf 'error: cannot read hostfile: %s\n' "${HOSTFILE}" >&2
  exit 1
fi

echo "BASE_RUN_NAME: ${EXPNAME_PATH}"

CKPT_PATH="./checkpoints"

# These directories are created on the machine running this launcher.
# NOTE(review): the remote command below writes its log under EXPNAME_PATH
# on each node — presumably a shared filesystem; confirm.
mkdir -p -- "${CKPT_PATH}" "${EXPNAME_PATH}" || exit 1
LOGFILE=$EXPNAME_PATH/exp.log
i=0

# Derive the host list once and take both the node count and the master from
# it. Counting with `wc -l` while iterating over word-split `cut` output could
# disagree (blank lines, missing trailing newline, tab-separated fields),
# which would make torchrun wait for nodes that are never started.
# The positional parameters are reused as the host list; the three script
# arguments have already been copied into named variables above.
# shellcheck disable=SC2046  # intentional word splitting: one word per host
set -- $(awk 'NF { print $1 }' "${HOSTFILE}")
NNodes=$#
if [ "${NNodes}" -eq 0 ]; then
  printf 'error: no hosts found in hostfile: %s\n' "${HOSTFILE}" >&2
  exit 1
fi
MASTER_ADDR=$1

echo "Master node: ${MASTER_ADDR}"
echo "${NNodes}"
echo "${i}"
echo "${MASTER_ADDR}"

# NOTE(review): variables exported by this launcher are expanded/used locally
# only; ssh does not forward them unless the remote side is configured to
# accept them — confirm the nodes get their NCCL/CUDA settings some other way.
# NOTE(review): --nproc_per_node is 4 — confirm this matches the number of
# GPUs that should be used on each node.
for ip in "$@"; do
  echo "Starting node ${i}/${NNodes}: ${ip}"
  # Everything between the double quotes is expanded here and then executed
  # by the remote shell, including the trailing redirection, so the log file
  # is written on the node (relative paths resolve against ${PWD} there).
  # The nested double quotes that used to appear around some option values
  # only terminated and re-opened this string; they are dropped, which leaves
  # the command sent to the node unchanged.
  ssh "${ip}" \
    "cd ${PWD} && \
    export WANDB_MODE=offline && \
    export ACCELERATE_CPU_AFFINITY=1 && \
    export PYTHONPATH=${LLAVA_NEXT_HOME}:$PYTHONPATH && \
    torchrun --nproc_per_node=4 --nnodes=${NNodes} --node_rank=${i} --master_addr=${MASTER_ADDR} --master_port=29513 llava_ov_wds.py \
    --model_name_or_path ${CKPT_PATH} \
    --version ${PROMPT_VERSION} \
    --data_path $DATA_PATH \
    --image_folder playground/data \
    --video_folder ./onevision_data/videos \
    --mm_tunable_parts=mm_mlp_adapter \
    --mm_vision_tower_lr=2e-6 \
    --vision_tower ${VISION_MODEL_PATH} \
    --mm_projector_type mlp2x_gelu \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --mm_spatial_pool_mode bilinear \
    --group_by_modality_length True \
    --image_aspect_ratio ${image_aspect_ratio} \
    --image_grid_pinpoints '(1x1),...,(6x6)' \
    --mm_patch_merge_type spatial_unpad \
    --bf16 True \
    --run_name $EXPNAME_PATH \
    --output_dir ${EXPNAME_PATH} \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy no \
    --save_strategy steps \
    --save_steps 1000 \
    --save_total_limit 20 \
    --learning_rate 1e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type cosine \
    --logging_steps 1 \
    --model_max_length 32768 \
    --gradient_checkpointing True \
    --dataloader_num_workers 2 \
    --lazy_preprocess True \
    --torch_compile True \
    --torch_compile_backend inductor \
    --dataloader_drop_last True \
    --seed 42 \
    --do_train False \
    --frames_upbound 32 1>>$LOGFILE.$ip 2>&1" &
  i=$((i + 1))
done

0 commit comments

Comments
 (0)