diff --git a/poetry.lock b/poetry.lock
index 14acc17..ff86e9f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
 
 [[package]]
 name = "aiofiles"
@@ -764,6 +764,17 @@ files = [
     {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
 ]
 
+[[package]]
+name = "distro"
+version = "1.9.0"
+description = "Distro - an OS platform information API"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"},
+    {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
+]
+
 [[package]]
 name = "docker-pycreds"
 version = "0.4.0"
@@ -2075,6 +2086,29 @@ files = [
     {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"},
 ]
 
+[[package]]
+name = "openai"
+version = "1.34.0"
+description = "The official Python library for the openai API"
+optional = false
+python-versions = ">=3.7.1"
+files = [
+    {file = "openai-1.34.0-py3-none-any.whl", hash = "sha256:018623c2f795424044675c6230fa3bfbf98d9e0aab45d8fd116f2efb2cfb6b7e"},
+    {file = "openai-1.34.0.tar.gz", hash = "sha256:95c8e2da4acd6958e626186957d656597613587195abd0fb2527566a93e76770"},
+]
+
+[package.dependencies]
+anyio = ">=3.5.0,<5"
+distro = ">=1.7.0,<2"
+httpx = ">=0.23.0,<1"
+pydantic = ">=1.9.0,<3"
+sniffio = "*"
+tqdm = ">4"
+typing-extensions = ">=4.7,<5"
+
+[package.extras]
+datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
+
 [[package]]
 name = "orjson"
 version = "3.10.1"
@@ -4542,4 +4576,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "f477d409961cf7a3873085d20e3e21b9e53a394faaf0fad4a6a819345e073b02"
+content-hash = "af3df0483ec9af4baa1ef92c0e88357263e6e11b119e8b7ed78ef66622f438fa"
diff --git a/pyproject.toml b/pyproject.toml
index b41141f..3344de7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,3 +59,4 @@ datasets = "^2.18.0"
 wandb = "^0.16.5"
 loguru = "^0.7.2"
 scikit-learn = "^1.4.2"
+openai = "^1.34.0"
diff --git a/scripts/auto_labeling_using_llm.py b/scripts/auto_labeling_using_llm.py
new file mode 100644
index 0000000..9d00f66
--- /dev/null
+++ b/scripts/auto_labeling_using_llm.py
@@ -0,0 +1,286 @@
+"""Script for pre-labeling the dataset.
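Each unlabeled image is sent to an Azure OpenAI chat deployment together with a
single in-context example, and the model's answer is stored as a per-model vote.
The model is expected to reply with a JSON object mapping each concept to a
boolean, e.g. {"leaf": true, "green": true, "stem": false, ...}.

Assumes Azure credentials are available in the environment (loaded via dotenv):
AZURE_OPENAI_API_VERSION and AZURE_OPENAI_CHAT_DEPLOYMENT_NAME are read
explicitly below; the AzureOpenAI client additionally reads
AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY.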

Run with:
```
poetry run python -m scripts.auto_labeling_using_llm
```
"""

import os
import json
import argparse
import base64
from io import BytesIO

import jsonlines
from dotenv import load_dotenv
from huggingface_hub import HfApi
from loguru import logger
from openai import AzureOpenAI

from scripts.constants import (
    ASSETS_FOLDER,
    CLASS_CONCEPTS_VALUES,
    DATASET_NAME,
    HF_TOKEN,
    SPLITS,
    CONCEPTS,
    LABELED_CLASSES,
)

load_dotenv()


def save_metadata(hf_api: HfApi, metadata: dict, split: str, push_to_hub: bool = False):
    metadata_path = f"{ASSETS_FOLDER}/{DATASET_NAME}/data/{split}/metadata.jsonl"
    with jsonlines.open(metadata_path, mode="w") as writer:
        writer.write_all(metadata)

    if push_to_hub:
        hf_api.upload_file(
            path_or_fileobj=metadata_path,
            path_in_repo=f"data/{split}/metadata.jsonl",
            repo_id=DATASET_NAME,
            repo_type="dataset",
        )


def get_votes(hf_api: HfApi):
    hf_api.snapshot_download(
        local_dir=f"{ASSETS_FOLDER}/{DATASET_NAME}",
        repo_id=DATASET_NAME,
        repo_type="dataset",
    )
    metadata = {}
    for split in SPLITS:
        metadata[split] = []
        with jsonlines.open(f"{ASSETS_FOLDER}/{DATASET_NAME}/data/{split}/metadata.jsonl") as reader:
            for row in reader:
                metadata[split].append(row)
    votes = {}
    for filename in os.listdir(f"{ASSETS_FOLDER}/{DATASET_NAME}/votes"):
        with open(f"{ASSETS_FOLDER}/{DATASET_NAME}/votes/{filename}") as f:
            key = filename.split(".")[0]
            votes[key] = json.load(f)
    return metadata, votes


def get_pre_labeled_concepts(item: dict):
    """Class-based heuristic: a concept is active if it belongs to the item's class."""
    active_concepts = CLASS_CONCEPTS_VALUES[item["class"]]
    return {c: c in active_concepts for c in CONCEPTS}


def compute_concepts(votes):
    """Majority vote per concept: True counts as +1, False as -1; a tie yields None."""
    vote_sum = {c: 0 for c in CONCEPTS}
    for vote in votes.values():
        for c in CONCEPTS:
            if c not in vote:
                continue
            vote_sum[c] += 2 * vote[c] - 1
    return {c: vote_sum[c] > 0 if vote_sum[c] != 0 else None for c in CONCEPTS}


class OpenAIRequest:
    def __init__(self, model: str = "gpt-4o"):
        # Endpoint and API key are read from the environment by the client.
        self.client = AzureOpenAI(
            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
            azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
        )
        self.model = model

    def __call__(self, item: dict, icl: dict, **kwargs):
        """Send an annotation request to the OpenAI API."""
        messages = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": """\
You are a helpful assistant that helps annotate images. Answer by giving the list of concepts you can see in the provided image.

Given an image and its class, annotate the concepts' presence in the image using a JSON format.

The labels must be provided according to the following JSON schema:
{concept_schema}
""".format(
                            concept_schema={
                                "properties": {concept: {"type": "boolean"} for concept in CONCEPTS}
                            }
                        ),
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""\
Here is an image and its class:

Class: {icl['class']}
Image:
""",
                    },
                    {
                        # Images are passed as base64-encoded data URLs.
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{icl['image']}"},
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {
                        # Serialize the example answer as JSON so it matches the
                        # format we later parse from the model's reply.
                        "type": "text",
                        "text": f"Concepts: {json.dumps(icl['concepts'])}",
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Now here is another image and its class, provide the concepts:\nClass: {item['class']}\nImage:",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{item['image']}"},
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "text",
                        "text": "Concepts:",
                    }
                ],
            },
        ]

        return self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            **kwargs,
        )


def image2base64(image: BytesIO) -> str:
    """Convert image bytes to a base64 string."""
    return base64.b64encode(image.getvalue()).decode("utf-8")


def get_icl_example_dict() -> dict:
    """Build the in-context-learning example manually."""
    # TODO: update path
    with open("images/00000000.jpg", "rb") as f:
        image = image2base64(BytesIO(f.read()))
    return {
        "class": "lettuce",
        "image": image,
        "concepts": {
            "leaf": True,
            "green": True,
            "stem": False,
            "red": False,
            "black": False,
            "blue": False,
            "ovaloid": False,
            "sphere": False,
            "cylinder": False,
            "cube": False,
            "brown": False,
            "orange": False,
            "yellow": False,
            "white": False,
            "tail": False,
            "seed": False,
            "pulp": False,
            "soil": False,
            "tree": False,
        },
    }


def main(args):
    hf_api = HfApi(token=HF_TOKEN)

    logger.info("Download metadata and votes")
    metadata, votes = get_votes(hf_api)

    # Create the client and the in-context example once, not per item.
    openai_request = OpenAIRequest(model=args.model)
    icl_dict = get_icl_example_dict()

    for split in SPLITS:
        for item in metadata[split]:
            if item["class"] in LABELED_CLASSES:
                continue
            key = item["id"]
            if args.model in votes.get(key, {}):
                # Skip samples this model has already voted on.
                continue

            # Original TODO: fix open image. Assumes the HF imagefolder
            # convention where each metadata row carries a "file_name"
            # relative to its split directory.
            image_path = os.path.join(ASSETS_FOLDER, DATASET_NAME, "data", split, item["file_name"])
            with open(image_path, "rb") as f:
                item_dict = {
                    "class": item["class"],
                    "image": image2base64(BytesIO(f.read())),
                }

            response = openai_request(
                item=item_dict,
                icl=icl_dict,
                max_tokens=200,
                temperature=0,
            )

            pred = response.choices[0].message.content
            # Keep only the JSON object in the completion, including the closing brace.
            pred = pred[pred.rfind("{"): pred.rfind("}") + 1]
            logger.debug(f"Prediction for {key}: {pred}")

            try:
                concepts = json.loads(pred)
            except json.JSONDecodeError:
                logger.warning(f"Unparsable answer for {key}; falling back to class-based labels")
                concepts = get_pre_labeled_concepts(item)
            votes.setdefault(key, {})[args.model] = concepts

    logger.info("Save votes locally")
    for key in votes:
        with open(f"{ASSETS_FOLDER}/{DATASET_NAME}/votes/{key}.json", "w") as f:
            json.dump(votes[key], f)

    if args.push_to_hub:
        logger.info("Upload votes to Hub")
        hf_api.upload_folder(
            folder_path=f"{ASSETS_FOLDER}/{DATASET_NAME}",
            repo_id=DATASET_NAME,
            repo_type="dataset",
            allow_patterns=["votes/*"],
        )

    new_metadata = {}
    for split in ["train", "test"]:
        new_metadata[split] = []
        with jsonlines.open(f"{ASSETS_FOLDER}/{DATASET_NAME}/data/{split}/metadata.jsonl") as reader:
            for row in reader:
                s_id = row["id"]
                if s_id in votes:
                    row.update(compute_concepts(votes[s_id]))
                new_metadata[split].append(row)
        with jsonlines.open(f"{ASSETS_FOLDER}/{DATASET_NAME}/data/{split}/metadata.jsonl", mode="w") as writer:
            writer.write_all(new_metadata[split])

    if args.push_to_hub:
        logger.info("Upload metadata to Hub")
        for split in SPLITS:
            save_metadata(hf_api, new_metadata[split], split, push_to_hub=True)


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser("auto-label-dataset")
    parser.add_argument(
        "--model", type=str, default="gpt-4o", help="Specify the model to use, e.g. 'gpt-4o'")
    parser.add_argument(
        "--push_to_hub", action="store_true", help="Flag to push the results to the Hub")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    main(args)
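

# For reference, a sketch of the vote file this script writes for one sample
# (hypothetical sample id; the concept values depend on the model's answer):
#
#   {ASSETS_FOLDER}/{DATASET_NAME}/votes/<sample_id>.json
#   {"gpt-4o": {"leaf": true, "green": true, "stem": false, ...}}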