Ported the GLPN model's inference code to the MindSpore framework, for the application development task of the open-source internship #2002


Open
wants to merge 14 commits into master
3 changes: 1 addition & 2 deletions .github/workflows/ci_pipeline.yaml
@@ -129,8 +129,7 @@ jobs:
OS: ubuntu-latest
PYTHON: 3.11
run: |
-          python .github/install_mindspore.py
-          pip install -r download.txt
+          pip install mindspore
- name: Test with pytest
run: |
pytest -vs tests/transformers/models/${{ matrix.alpha }}*/test_modeling*
2 changes: 1 addition & 1 deletion .github/workflows/make_wheel_releases.yml
@@ -27,7 +27,7 @@ jobs:
run: python -m build --wheel

- name: Upload file
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
with:
name: mindnlp-whl
path: dist/*
160 changes: 160 additions & 0 deletions llm/finetune/Align/eval_model.py
@@ -0,0 +1,160 @@
import collections
import collections.abc

# Compatibility shim: older pycocotools releases reference collections.Iterable,
# which was removed from the collections namespace in Python 3.10.
collections.Iterable = collections.abc.Iterable

import mindspore as ms
from mindnlp.transformers import AlignModel, AlignProcessor
from mindspore import Tensor, nn, ops, Parameter
from PIL import Image  # used by get_image_and_caption below
from pycocotools.coco import COCO
import os
from tqdm import tqdm
import pickle
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import gc

HYPERPARAMS = {
"model_name": "E:/Code/align_ft_torch/cache/model/kakaobrain/align-base",
"batch_size": 4,
"val_samples": 50,
"max_length": 128,
"num_workers": 8,
"data_dir": "MSCOCO",
"data_type": "val2017",
"val_cache_file": "mscoco_preprocessed_val_50.pkl",
"save_dir": "cache/model",
"model_save_path": "cache/model/finetuned_align_model_epoch_{epoch}.ckpt",
"processor_save_path": "cache/model/finetuned_align_processor"
}

ms.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend")
ms.context.reset_auto_parallel_context()


def setup_coco():
dataDir = HYPERPARAMS["data_dir"]
dataType = HYPERPARAMS["data_type"]
os.makedirs(dataDir, exist_ok=True)
os.makedirs(f"{dataDir}/annotations", exist_ok=True)
os.makedirs(f"{dataDir}/{dataType}", exist_ok=True)
ann_file = f"{dataDir}/annotations/captions_{dataType}.json"
if not os.path.exists(ann_file):
ann_zip = f"{dataDir}/annotations_trainval2017.zip"
if not os.path.exists(ann_zip):
raise FileNotFoundError(f"{ann_zip} not found. Please download it manually.")
print("Extracting annotations...")
os.system(f"unzip -o {ann_zip} -d {dataDir}")
return dataDir, dataType


dataDir, dataType = setup_coco()
annFile = f'{dataDir}/annotations/captions_{dataType}.json'
coco = COCO(annFile)


def get_image_and_caption(coco, img_id, cache_dir=f"{HYPERPARAMS['data_dir']}/{HYPERPARAMS['data_type']}"):
ann_ids = coco.getAnnIds(imgIds=img_id)
anns = coco.loadAnns(ann_ids)
caption = anns[0]['caption']
img_info = coco.loadImgs(img_id)[0]
img_path = f"{cache_dir}/{img_info['file_name']}"
image = Image.open(img_path)
if image.mode != "RGB":
image = image.convert("RGB")
return image, caption


def process_sample(img_id, coco):
image, caption = get_image_and_caption(coco, img_id)
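    # Note: the processor is re-loaded from disk on every call; hoisting this
    # from_pretrained to module level (as finetune.py does) would avoid the
    # repeated loads inside the thread pool.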
processor = AlignProcessor.from_pretrained(HYPERPARAMS["processor_save_path"])
inputs = processor(
text=caption,
images=image,
return_tensors="ms",
padding="max_length",
max_length=HYPERPARAMS["max_length"]
)
return (inputs["input_ids"][0], inputs["attention_mask"][0], inputs["pixel_values"][0])


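# The pickle cache below is keyed only by file name: delete the .pkl (or change
# "val_cache_file") whenever val_samples or the preprocessing changes, otherwise
# stale data is silently reused.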
def preprocess_and_save(coco, num_samples, cache_file):
if os.path.exists(cache_file):
print(f"Loading preprocessed data from {cache_file}")
with open(cache_file, "rb") as f:
dataset = pickle.load(f)
print(f"Loaded dataset size: {len(dataset)} samples")
return dataset
img_ids = coco.getImgIds()[:num_samples]
with ThreadPoolExecutor(max_workers=HYPERPARAMS["num_workers"]) as executor:
dataset = list(tqdm(executor.map(lambda x: process_sample(x, coco), img_ids),
total=num_samples, desc=f"Preprocessing dataset ({num_samples} samples)"))
with open(cache_file, "wb") as f:
pickle.dump(dataset, f)
return dataset


def create_val_dataloader(coco, batch_size=HYPERPARAMS["batch_size"]):
val_dataset = preprocess_and_save(coco, HYPERPARAMS["val_samples"], HYPERPARAMS["val_cache_file"])
val_dataloader = ms.dataset.GeneratorDataset(
val_dataset,
column_names=["input_ids", "attention_mask", "pixel_values"]
).batch(batch_size)
return val_dataloader


class TrainingNet(nn.Cell):
def __init__(self, model):
super().__init__()
self.model = model
self.global_pool = nn.AdaptiveAvgPool2d(1)
self.text_projection = nn.Dense(768, 640)
self.logit_scale = Parameter(Tensor(np.log(1 / 0.07), dtype=ms.float32), requires_grad=True)
self.image_embeds = None
self.text_embeds = None

def construct(self, input_ids, attention_mask, pixel_values):
embedding_output = self.model.vision_model.embeddings(pixel_values)
encoder_outputs = self.model.vision_model.encoder(embedding_output)
last_hidden_state = encoder_outputs[0]
pooled_output = self.global_pool(last_hidden_state)
self.image_embeds = pooled_output.reshape(pooled_output.shape[:2])
text_outputs = self.model.text_model(input_ids=input_ids, attention_mask=attention_mask)
text_embeds = text_outputs[0][:, 0, :]
self.text_embeds = self.text_projection(text_embeds)
logits = ops.matmul(self.image_embeds, self.text_embeds.T) * ops.exp(self.logit_scale)
labels = ops.arange(len(logits), dtype=ms.int32)
loss_i2t = nn.CrossEntropyLoss()(logits, labels)
loss_t2i = nn.CrossEntropyLoss()(logits.T, labels)
return (loss_i2t + loss_t2i) / 2

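# A minimal numpy sketch (illustrative only; never called by this script) of the
# symmetric contrastive loss computed in TrainingNet.construct above: paired
# image/text embeddings are compared by matmul, and cross-entropy is taken in
# both directions against the diagonal labels.
def _symmetric_loss_sketch(batch_size=4, dim=640):
    rng = np.random.default_rng(0)
    image_embeds = rng.normal(size=(batch_size, dim)).astype(np.float32)
    text_embeds = rng.normal(size=(batch_size, dim)).astype(np.float32)
    logits = image_embeds @ text_embeds.T * np.exp(np.log(1 / 0.07))
    labels = np.arange(batch_size)

    def ce(l, y):
        # row-wise softmax cross-entropy
        l = l - l.max(axis=1, keepdims=True)
        logp = l - np.log(np.exp(l).sum(axis=1, keepdims=True))
        return -logp[np.arange(len(y)), y].mean()

    return (ce(logits, labels) + ce(logits.T, labels)) / 2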

def evaluate_model(coco, epoch_to_eval):
processor = AlignProcessor.from_pretrained(HYPERPARAMS["processor_save_path"])
model = AlignModel.from_pretrained(HYPERPARAMS["model_name"], local_files_only=True)
    net = TrainingNet(model)  # wrap AlignModel with TrainingNet
param_dict = ms.load_checkpoint(HYPERPARAMS["model_save_path"].format(epoch=epoch_to_eval))
    ms.load_param_into_net(net, param_dict)  # load the checkpoint into TrainingNet
net.set_train(False)

val_dataloader = create_val_dataloader(coco)
print(f"Val dataloader created with batch_size={HYPERPARAMS['batch_size']}, samples={HYPERPARAMS['val_samples']}")

total_val_loss = 0
val_steps = 0
for batch in tqdm(val_dataloader.create_dict_iterator(), desc=f"Evaluating Epoch {epoch_to_eval}"):
loss = net(batch["input_ids"], batch["attention_mask"], batch["pixel_values"])
total_val_loss += loss.asnumpy()
val_steps += 1
avg_val_loss = total_val_loss / val_steps
print(f"Epoch {epoch_to_eval}, Eval Loss: {avg_val_loss:.4f}")

gc.collect()
return avg_val_loss


if __name__ == "__main__":
print("Starting model evaluation...")
for epoch in range(1, 11):
evaluate_model(coco, epoch)
203 changes: 203 additions & 0 deletions llm/finetune/Align/finetune.py
@@ -0,0 +1,203 @@
import collections
import collections.abc

collections.Iterable = collections.abc.Iterable

import mindspore as ms
from mindnlp.transformers import AlignModel, AlignProcessor
from mindspore import Tensor, nn, ops, Parameter
from PIL import Image
from pycocotools.coco import COCO
import os
from tqdm import tqdm
import pickle
from concurrent.futures import ThreadPoolExecutor
import numpy as np

HYPERPARAMS = {
"model_name": "E:/Code/align_ft_torch/cache/model/kakaobrain/align-base",
"epochs": 10,
"batch_size": 4,
"learning_rate": 1e-4,
"train_samples": 200,
"max_length": 128,
"num_workers": 8,
"data_dir": "MSCOCO",
"data_type": "val2017",
"train_cache_file": "mscoco_preprocessed_train_200.pkl",
"save_dir": "cache/model",
"model_save_path": "cache/model/finetuned_align_model_epoch_{epoch}.ckpt",
"processor_save_path": "cache/model/finetuned_align_processor"
}

ms.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend")
ms.context.reset_auto_parallel_context()

processor = AlignProcessor.from_pretrained(HYPERPARAMS["model_name"], local_files_only=True)
model = AlignModel.from_pretrained(HYPERPARAMS["model_name"], local_files_only=True)
model.set_train(True)

print("Model config:", model.config)
params = model.trainable_params()
print("Number of trainable params:", len(params))


def setup_coco():
dataDir = HYPERPARAMS["data_dir"]
dataType = HYPERPARAMS["data_type"]
os.makedirs(dataDir, exist_ok=True)
os.makedirs(f"{dataDir}/annotations", exist_ok=True)
os.makedirs(f"{dataDir}/{dataType}", exist_ok=True)
ann_file = f"{dataDir}/annotations/captions_{dataType}.json"
if not os.path.exists(ann_file):
ann_zip = f"{dataDir}/annotations_trainval2017.zip"
if not os.path.exists(ann_zip):
raise FileNotFoundError(f"{ann_zip} not found. Please download it manually.")
print("Extracting annotations...")
os.system(f"unzip -o {ann_zip} -d {dataDir}")
return dataDir, dataType


dataDir, dataType = setup_coco()
annFile = f'{dataDir}/annotations/captions_{dataType}.json'
coco = COCO(annFile)


def get_image_and_caption(coco, img_id, cache_dir=f"{HYPERPARAMS['data_dir']}/{HYPERPARAMS['data_type']}"):
ann_ids = coco.getAnnIds(imgIds=img_id)
anns = coco.loadAnns(ann_ids)
caption = anns[0]['caption']
img_info = coco.loadImgs(img_id)[0]
img_path = f"{cache_dir}/{img_info['file_name']}"
image = Image.open(img_path)
if image.mode != "RGB":
image = image.convert("RGB")
return image, caption


def process_sample(img_id, coco):
image, caption = get_image_and_caption(coco, img_id)
inputs = processor(
text=caption,
images=image,
return_tensors="ms",
padding="max_length",
max_length=HYPERPARAMS["max_length"]
)
return (inputs["input_ids"][0], inputs["attention_mask"][0], inputs["pixel_values"][0])


def preprocess_and_save(coco, num_samples, cache_file):
if os.path.exists(cache_file):
print(f"Loading preprocessed data from {cache_file}")
with open(cache_file, "rb") as f:
dataset = pickle.load(f)
print(f"Loaded dataset size: {len(dataset)} samples")
return dataset
img_ids = coco.getImgIds()[:num_samples]
with ThreadPoolExecutor(max_workers=HYPERPARAMS["num_workers"]) as executor:
dataset = list(tqdm(executor.map(lambda x: process_sample(x, coco), img_ids),
total=num_samples, desc=f"Preprocessing dataset ({num_samples} samples)"))
with open(cache_file, "wb") as f:
pickle.dump(dataset, f)
return dataset


def create_train_dataloader(coco, batch_size=HYPERPARAMS["batch_size"]):
train_dataset = preprocess_and_save(coco, HYPERPARAMS["train_samples"], HYPERPARAMS["train_cache_file"])
train_dataloader = ms.dataset.GeneratorDataset(
train_dataset,
column_names=["input_ids", "attention_mask", "pixel_values"]
).batch(batch_size)
return train_dataloader


class TrainingNet(nn.Cell):
def __init__(self, model):
super().__init__()
self.model = model
self.global_pool = nn.AdaptiveAvgPool2d(1)
self.text_projection = nn.Dense(768, 640)
self.logit_scale = Parameter(Tensor(np.log(1 / 0.07), dtype=ms.float32), requires_grad=True)
self.image_embeds = None
self.text_embeds = None

def construct(self, input_ids, attention_mask, pixel_values):
embedding_output = self.model.vision_model.embeddings(pixel_values)
encoder_outputs = self.model.vision_model.encoder(embedding_output)
last_hidden_state = encoder_outputs[0]
pooled_output = self.global_pool(last_hidden_state)
self.image_embeds = pooled_output.reshape(pooled_output.shape[:2])
text_outputs = self.model.text_model(input_ids=input_ids, attention_mask=attention_mask)
text_embeds = text_outputs[0][:, 0, :]
self.text_embeds = self.text_projection(text_embeds)
logits = ops.matmul(self.image_embeds, self.text_embeds.T) * ops.exp(self.logit_scale)
labels = ops.arange(len(logits), dtype=ms.int32)
loss_i2t = nn.CrossEntropyLoss()(logits, labels)
loss_t2i = nn.CrossEntropyLoss()(logits.T, labels)
return (loss_i2t + loss_t2i) / 2


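# Wrap any raw tensors returned by trainable_params() as mindspore.Parameter
# objects so the optimizer below will accept them.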
def convert_to_parameter(params):
converted = []
for i, param in enumerate(params):
if not isinstance(param, Parameter):
            name = getattr(param, 'name', f"param_{i}")
converted.append(Parameter(param.data, name=name, requires_grad=True))
else:
converted.append(param)
return converted


def finetune_model(coco, model, processor,
epochs=HYPERPARAMS["epochs"],
batch_size=HYPERPARAMS["batch_size"],
learning_rate=HYPERPARAMS["learning_rate"]):
train_dataloader = create_train_dataloader(coco, batch_size)
print(f"Train dataloader created with batch_size={batch_size}, samples={HYPERPARAMS['train_samples']}")

params = model.trainable_params()
if not params:
print("No trainable params found, enabling all parameters.")
for param in model.parameters_and_names():
param[1].requires_grad = True
params = model.trainable_params()

params = convert_to_parameter(params)
print(f"Optimizer initialized with {len(params)} parameters")
net = TrainingNet(model)
optimizer = nn.Adam(params + [net.text_projection.weight, net.text_projection.bias, net.logit_scale],
learning_rate=learning_rate)
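    # TrainOneStepCell bundles the forward pass, gradient computation and
    # optimizer update into a single callable that returns the loss.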
train_net = nn.TrainOneStepCell(net, optimizer)

for epoch in range(epochs):
iterator = train_dataloader.create_dict_iterator()
total_train_loss = 0
steps = 0
for batch in tqdm(iterator, desc=f"Epoch {epoch + 1}/{epochs} (Train)"):
loss = train_net(batch["input_ids"], batch["attention_mask"], batch["pixel_values"])
total_train_loss += loss.asnumpy()
steps += 1
if steps == 1:
print(f"Epoch {epoch + 1}, Step 1 - Train Loss: {loss.asnumpy():.4f}")
logits = ops.matmul(net.image_embeds, net.text_embeds.T) * ops.exp(net.logit_scale)
print(f"Logits sample: {logits[:2, :2]}")
avg_train_loss = total_train_loss / steps
print(f"Epoch {epoch + 1}/{epochs}, Average Train Loss: {avg_train_loss:.4f}")

        # Sanity check: verify the projection weights change relative to epoch 1
        param_after = net.text_projection.weight.asnumpy()
        if epoch == 0:
            param_before = param_after.copy()
        else:
            print("Params updated:", not np.array_equal(param_before, param_after))

save_dir = HYPERPARAMS["save_dir"]
os.makedirs(save_dir, exist_ok=True)
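        # The checkpoint holds both the base model and the extra head
        # (text_projection, logit_scale); eval_model.py therefore loads it
        # into TrainingNet rather than into AlignModel directly.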
ms.save_checkpoint(net, HYPERPARAMS["model_save_path"].format(epoch=epoch + 1))

processor.save_pretrained(HYPERPARAMS["processor_save_path"])
return model


print("Starting model finetuning...")
finetuned_model = finetune_model(coco, model, processor)