#!/usr/bin/env bash
#
# Launch llava_ov_wds.py on several machines: one `torchrun` is started over
# ssh on every host listed in HOSTFILE, each in the background. The launcher
# returns as soon as all ssh sessions have been started; it does not wait for
# training to finish.
#
# Usage: <this script> DATA_PATH EXPNAME_PATH HOSTFILE
#   DATA_PATH     value passed to --data_path
#   EXPNAME_PATH  used as --run_name and --output_dir; created if missing.
#                 Each node appends stdout/stderr to EXPNAME_PATH/exp.log.<host>
#   HOSTFILE      one host per line; only the first whitespace-separated field
#                 of a line is used. Blank lines and lines starting with '#'
#                 are ignored. The first host is the rendezvous master and
#                 gets node rank 0.

set -u

# NOTE(review): these exports only affect this launcher process and its local
# children. ssh does not forward them to the remote shells unless
# SendEnv/AcceptEnv is configured, so confirm the nodes receive them some other
# way if training depends on them.
# NOTE(review): eight devices are made visible here while torchrun is started
# with 4 processes per node below; confirm that this is intended.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_IB_GID_INDEX=0
export NCCL_DEBUG=INFO
export OMP_NUM_THREADS=4
export GLOO_SOCKET_IFNAME=eth0
export NCCL_IB_HCA=mlx5_2,mlx5_5
export CUDA_LAUNCH_BLOCKING=1

# Shell variable names may not contain '-', hence LLAVA_NEXT_HOME.
LLAVA_NEXT_HOME="Path_Of_LLaVA-NeXT"
VISION_MODEL_PATH="Path_Of_VISION_MODEL"
PROMPT_VERSION="qwen_1_5"

# Value passed to --image_aspect_ratio:
#   stage1:       square
#   other stages: anyres_max_9
image_aspect_ratio=anyres_max_9

CKPT_PATH="./checkpoints"
TRAIN_SCRIPT="llava_ov_wds.py"
NPROC_PER_NODE=4
MASTER_PORT=29513

# Print a message to stderr and abort the launcher.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Parse arguments, read the host list and start one remote torchrun per host.
main() {
  if (( $# < 3 )); then
    die "Usage: ${0##*/} DATA_PATH EXPNAME_PATH HOSTFILE"
  fi
  local data_path=$1
  local expname_path=$2
  local hostfile=$3

  echo "BASE_RUN_NAME: ${expname_path}"

  [[ -r "${hostfile}" ]] || die "cannot read hostfile: ${hostfile}"
  mkdir -p -- "${CKPT_PATH}" || die "cannot create ${CKPT_PATH}"
  mkdir -p -- "${expname_path}" || die "cannot create ${expname_path}"
  local logfile="${expname_path}/exp.log"

  # Collect the hosts first so that --nnodes is exactly the number of ranks
  # that get launched, even with blank lines or a missing final newline.
  local hosts=() host
  while read -r host _ || [[ -n "${host-}" ]]; do
    if [[ -z "${host}" || "${host}" == \#* ]]; then
      continue
    fi
    hosts+=("${host}")
  done < "${hostfile}"
  (( ${#hosts[@]} > 0 )) || die "no hosts found in ${hostfile}"

  local num_nodes=${#hosts[@]}
  local master_addr=${hosts[0]}
  echo "Master node: ${master_addr}"
  echo "Number of nodes: ${num_nodes}"

  local train_args=(
    --model_name_or_path "${CKPT_PATH}"
    --version "${PROMPT_VERSION}"
    --data_path "${data_path}"
    --image_folder playground/data
    --video_folder ./onevision_data/videos
    --mm_tunable_parts=mm_mlp_adapter
    --mm_vision_tower_lr=2e-6
    --vision_tower "${VISION_MODEL_PATH}"
    --mm_projector_type mlp2x_gelu
    --mm_vision_select_layer -2
    --mm_use_im_start_end False
    --mm_use_im_patch_token False
    --mm_spatial_pool_mode bilinear
    --group_by_modality_length True
    --image_aspect_ratio "${image_aspect_ratio}"
    --image_grid_pinpoints '(1x1),...,(6x6)'
    --mm_patch_merge_type spatial_unpad
    --bf16 True
    --run_name "${expname_path}"
    --output_dir "${expname_path}"
    --num_train_epochs 1
    --per_device_train_batch_size 1
    --per_device_eval_batch_size 1
    --gradient_accumulation_steps 1
    --evaluation_strategy no
    --save_strategy steps
    --save_steps 1000
    --save_total_limit 20
    --learning_rate 1e-5
    --weight_decay 0.
    --warmup_ratio 0.03
    --lr_scheduler_type cosine
    --logging_steps 1
    --model_max_length 32768
    --gradient_checkpointing True
    --dataloader_num_workers 2
    --lazy_preprocess True
    --torch_compile True
    --torch_compile_backend inductor
    --dataloader_drop_last True
    --seed 42
    --do_train False
    --frames_upbound 32
  )

  # ssh hands the command to the remote shell as a single string, so every
  # interpolated value is shell-quoted here to survive that second parse.
  local train_args_q workdir_q pythonpath_q master_addr_q
  printf -v train_args_q '%q ' "${train_args[@]}"
  printf -v workdir_q '%q' "${PWD}"
  printf -v master_addr_q '%q' "${master_addr}"
  # The launcher's own PYTHONPATH (if any) is appended and sent to the nodes.
  printf -v pythonpath_q '%q' "${LLAVA_NEXT_HOME}${PYTHONPATH:+:${PYTHONPATH}}"

  local rank log_q remote_cmd
  for rank in "${!hosts[@]}"; do
    host=${hosts[rank]}
    echo "Starting node ${rank}/${num_nodes}: ${host}"
    printf -v log_q '%q' "${logfile}.${host}"

    remote_cmd="cd ${workdir_q}"
    remote_cmd+=" && export WANDB_MODE=offline"
    remote_cmd+=" && export ACCELERATE_CPU_AFFINITY=1"
    remote_cmd+=" && export PYTHONPATH=${pythonpath_q}"
    remote_cmd+=" && torchrun --nproc_per_node=${NPROC_PER_NODE}"
    remote_cmd+=" --nnodes=${num_nodes} --node_rank=${rank}"
    remote_cmd+=" --master_addr=${master_addr_q} --master_port=${MASTER_PORT}"
    # The redirection is part of the remote command: the log is written on the
    # node, relative to the directory entered by the `cd` above.
    remote_cmd+=" ${TRAIN_SCRIPT} ${train_args_q}1>>${log_q} 2>&1"

    # -n: a backgrounded ssh must never read from the launcher's stdin.
    ssh -n "${host}" "${remote_cmd}" &
  done

  echo "Launched ${num_nodes} node(s); per-node logs: ${logfile}.<host>"
}

main "$@"
# Stop explicitly so that nothing following this line is ever interpreted.
exit $?
0 commit comments