From 35e5e78f13447c615355ffff2fc590b2733e6935 Mon Sep 17 00:00:00 2001 From: qidanrui Date: Thu, 30 Nov 2023 07:43:25 +0000 Subject: [PATCH 1/4] remove pygraphviz in poetry --- poetry.lock | 12 +----------- pyproject.toml | 1 - 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6175f79..b5298cd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2388,16 +2388,6 @@ files = [ [package.extras] plugins = ["importlib-metadata"] -[[package]] -name = "pygraphviz" -version = "1.11" -description = "Python interface to Graphviz" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pygraphviz-1.11.zip", hash = "sha256:a97eb5ced266f45053ebb1f2c6c6d29091690503e3a5c14be7f908b37b06f2d4"}, -] - [[package]] name = "pylint" version = "3.0.2" @@ -4257,4 +4247,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "a8ce8b51e3b3b20018b2a4e52eac65479bb04e7b6eb39138832b1c66a9cb22b5" +content-hash = "c604a299b6b9fcb011c963bc8470c4a758baba2bb8e04c17918928d5c7c70043" diff --git a/pyproject.toml b/pyproject.toml index 68c212a..93a6bd4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,6 @@ nvidia-nccl-cu12 = "2.18.1" nvidia-nvtx-cu12 = "12.1.105" triton = "2.1.0" nvidia-nvjitlink-cu12 = "^12.3.52" -pygraphviz = "^1.11" [build-system] requires = ["poetry-core"] From 8359ba8be8c56bc4a074e1bb216eb32e12fab17b Mon Sep 17 00:00:00 2001 From: qidanrui Date: Fri, 1 Dec 2023 14:01:28 +0000 Subject: [PATCH 2/4] update bitsandbytes to 0.41.2 --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index b5298cd..e15138d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -262,13 +262,13 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte [[package]] name = "bitsandbytes" -version = "0.39.0" +version = "0.41.2" description = "k-bit optimizers and matrix multiplication routines." optional = false python-versions = "*" files = [ - {file = "bitsandbytes-0.39.0-py3-none-any.whl", hash = "sha256:69e79cfce501cad5e34e9d785f5caf1768ead64f3e9f93b6bac1753e04dfa39a"}, - {file = "bitsandbytes-0.39.0.tar.gz", hash = "sha256:b453dacf854e3624fe72b1306b1a6916de4a5edfd88fa01bf98ba73b2e3030d4"}, + {file = "bitsandbytes-0.41.2-py3-none-any.whl", hash = "sha256:5a2280761dc11c7a23a1be948cfd6a849c2e718012ee34316b979eb6c5634de2"}, + {file = "bitsandbytes-0.41.2.tar.gz", hash = "sha256:787c14b63cc559e1b344f683497a9353ac2e256a3fe89972f960e7c428d5cce7"}, ] [[package]] @@ -4247,4 +4247,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "c604a299b6b9fcb011c963bc8470c4a758baba2bb8e04c17918928d5c7c70043" +content-hash = "fd8272ceaaac7b87bea6c6c49f11a7a3a791dece4cfb7eac34578a6a561dba6f" diff --git a/pyproject.toml b/pyproject.toml index 93a6bd4..a415cb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,6 @@ python = ">=3.10,<3.13" transformers = "4.33.2" peft = "0.4.0" accelerate = "0.21.0" -bitsandbytes = "0.39.0" einops = "0.6.1" evaluate = "0.4.0" scikit-learn = "1.2.2" @@ -57,6 +56,7 @@ nvidia-nccl-cu12 = "2.18.1" nvidia-nvtx-cu12 = "12.1.105" triton = "2.1.0" nvidia-nvjitlink-cu12 = "^12.3.52" +bitsandbytes = "0.41.2" [build-system] requires = ["poetry-core"] From b35533ee9c771eeed2e685242c129e1525cb8248 Mon Sep 17 00:00:00 2001 From: qidanrui Date: Fri, 1 Dec 2023 14:15:16 +0000 Subject: [PATCH 3/4] handle invalid output from models --- dbgpt_hub/predict/predict.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dbgpt_hub/predict/predict.py b/dbgpt_hub/predict/predict.py index e4653b7..6d67722 100644 --- a/dbgpt_hub/predict/predict.py +++ b/dbgpt_hub/predict/predict.py @@ -49,7 +49,10 @@ def main(): with open(predict_output_dir_name, "w") as f: for p in result: - f.write(p.replace("\n", " ") + "\n") + try: + f.write(p.replace("\n", " ") + "\n") + except: + f.write("Invalid Output!\n") if __name__ == "__main__": From b17e6bd0aeabe944632a3bc861ae86ff1c18670a Mon Sep 17 00:00:00 2001 From: qidanrui Date: Sat, 2 Dec 2023 10:45:08 +0000 Subject: [PATCH 4/4] add API interfaces for train, predict and evaluate --- .gitignore | 2 +- dbgpt_hub/eval/__init__.py | 8 +++++ dbgpt_hub/eval/evaluation.py | 37 +++++++++++++++++++++ dbgpt_hub/eval/evaluation_api.py | 32 +++++++++++++++++++ dbgpt_hub/predict/__init__.py | 8 +++++ dbgpt_hub/predict/predict.py | 55 +++++++++++++++++++------------- dbgpt_hub/predict/predict_api.py | 31 ++++++++++++++++++ dbgpt_hub/train/__init__.py | 8 +++++ dbgpt_hub/train/sft_train_api.py | 47 +++++++++++++++++++++++++++ 9 files changed, 205 insertions(+), 23 deletions(-) create mode 100644 dbgpt_hub/eval/evaluation_api.py create mode 100644 dbgpt_hub/predict/predict_api.py create mode 100644 dbgpt_hub/train/sft_train_api.py diff --git a/.gitignore b/.gitignore index b293180..aa4f887 100644 --- a/.gitignore +++ b/.gitignore @@ -184,4 +184,4 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/ diff --git a/dbgpt_hub/eval/__init__.py b/dbgpt_hub/eval/__init__.py index e69de29..b38ecaf 100644 --- a/dbgpt_hub/eval/__init__.py +++ b/dbgpt_hub/eval/__init__.py @@ -0,0 +1,8 @@ +""" +dbgpt_hub.eval +============== +""" + +from .evaluation_api import start_evaluate + +__all__ = ["start_evaluate"] diff --git a/dbgpt_hub/eval/evaluation.py b/dbgpt_hub/eval/evaluation.py index ecf374f..a96c7f9 100644 --- a/dbgpt_hub/eval/evaluation.py +++ b/dbgpt_hub/eval/evaluation.py @@ -12,6 +12,7 @@ import subprocess import json +from typing import Optional, Dict, Any from process_sql import get_schema, Schema, get_sql from exec_eval import eval_exec_match from func_timeout import func_timeout, FunctionTimedOut @@ -1152,6 +1153,42 @@ def build_foreign_key_map_from_json(table): return tables +def evaluate_api(args: Optional[Dict[str, Any]] = None): + # Prepare output file path by appending "2sql" before ".txt" if --natsql is true + if args["natsql"]: + pred_file_path = ( + args["input"].rsplit(".", 1)[0] + "2sql." + args["input"].rsplit(".", 1)[1] + ) + gold_file_path = args["gold_natsql"] + table_info_path = args["table_natsql"] + else: + pred_file_path = args["input"] + gold_file_path = args["gold"] + table_info_path = args["table"] + + # only evaluating exact match needs this argument + kmaps = None + if args["etype"] in ["all", "match"]: + assert ( + args.table is not None + ), "table argument must be non-None if exact set match is evaluated" + kmaps = build_foreign_key_map_from_json(args["table"]) + + # Print args + print(f"params as fllows \n {args}") + + evaluate( + gold_file_path, + pred_file_path, + args["db"], + args["etype"], + kmaps, + args["plug_value"], + args["keep_distinct"], + args["progress_bar_for_each_datapoint"], + ) + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( diff --git a/dbgpt_hub/eval/evaluation_api.py b/dbgpt_hub/eval/evaluation_api.py new file mode 100644 index 0000000..77ab00f --- /dev/null +++ b/dbgpt_hub/eval/evaluation_api.py @@ -0,0 +1,32 @@ +from typing import Optional, Dict, Any + +from dbgpt_hub.eval import evaluation + + +def start_evaluate( + args: Optional[Dict[str, Any]] = None, +): + # Arguments for evaluation + if args is None: + args = { + "input": "./dbgpt_hub/output/pred/pred_sql_dev_skeleton.sql", + "gold": "./dbgpt_hub/data/eval_data/gold.txt", + "gold_natsql": "./dbgpt_hub/data/eval_data/gold_natsql2sql.txt", + "db": "./dbgpt_hub/data/spider/database", + "table": "./dbgpt_hub/data/eval_data/tables.json", + "table_natsql": "./dbgpt_hub/data/eval_data/tables_for_natsql2sql.json", + "etype": "exec", + "plug_value": True, + "keep_distict": False, + "progress_bar_for_each_datapoint": False, + "natsql": False, + } + else: + args = args + + # Execute evaluation + evaluation.evaluate_api(args) + + +if __name__ == "__main__": + start_evaluate() diff --git a/dbgpt_hub/predict/__init__.py b/dbgpt_hub/predict/__init__.py index e69de29..d9cb30e 100644 --- a/dbgpt_hub/predict/__init__.py +++ b/dbgpt_hub/predict/__init__.py @@ -0,0 +1,8 @@ +""" +dbgpt_hub.predict +============== +""" + +from .predict_api import start_predict + +__all__ = ["start_predict"] diff --git a/dbgpt_hub/predict/predict.py b/dbgpt_hub/predict/predict.py index 6d67722..e21a120 100644 --- a/dbgpt_hub/predict/predict.py +++ b/dbgpt_hub/predict/predict.py @@ -1,23 +1,21 @@ import os import json import sys -from tqdm import tqdm ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(ROOT_PATH) -from typing import List, Dict + +from tqdm import tqdm +from typing import List, Dict, Optional, Any from dbgpt_hub.data_process.data_utils import extract_sql_prompt_dataset from dbgpt_hub.llm_base.chat_model import ChatModel -from dbgpt_hub.configs.config import ( - PREDICTED_DATA_PATH, - OUT_DIR, - PREDICTED_OUT_FILENAME, -) -def prepare_dataset() -> List[Dict]: - with open(PREDICTED_DATA_PATH, "r") as fp: +def prepare_dataset( + predict_file_path: Optional[str] = None, +) -> List[Dict]: + with open(predict_file_path, "r") as fp: data = json.load(fp) predict_data = [extract_sql_prompt_dataset(item) for item in data] return predict_data @@ -33,21 +31,34 @@ def inference(model: ChatModel, predict_data: List[Dict], **input_kwargs): return res -def main(): - predict_data = prepare_dataset() +def predict(args: Optional[Dict[str, Any]] = None): + predict_file_path = "" + if args is None: + predict_file_path = os.path.join( + ROOT_PATH, "dbgpt_hub/data/eval_data/dev_sql.json" + ) + predict_out_dir = os.path.join( + os.path.join(ROOT_PATH, "dbgpt_hub/output/"), "pred" + ) + if not os.path.exists(predict_out_dir): + os.mkdir(predict_out_dir) + predict_output_filename = os.path.join(predict_out_dir, "pred_sql.sql") + print(f"predict_output_filename \t{predict_output_filename}") + else: + predict_file_path = os.path.join(ROOT_PATH, args["predict_file_path"]) + predict_out_dir = os.path.join( + os.path.join(ROOT_PATH, args["predict_out_dir"]), "pred" + ) + if not os.path.exists(predict_out_dir): + os.mkdir(predict_out_dir) + predict_output_filename = os.path.join(predict_out_dir, args["pred_sql.sql"]) + print(f"predict_output_filename \t{predict_output_filename}") + + predict_data = prepare_dataset(predict_file_path=predict_file_path) model = ChatModel() result = inference(model, predict_data) - predict_out_dir = os.path.join(OUT_DIR, "pred") - if not os.path.exists(predict_out_dir): - os.mkdir(predict_out_dir) - - predict_output_dir_name = os.path.join( - predict_out_dir, model.data_args.predicted_out_filename - ) - print(f"predict_output_dir_name \t{predict_output_dir_name}") - - with open(predict_output_dir_name, "w") as f: + with open(predict_output_filename, "w") as f: for p in result: try: f.write(p.replace("\n", " ") + "\n") @@ -56,4 +67,4 @@ def main(): if __name__ == "__main__": - main() + predict() diff --git a/dbgpt_hub/predict/predict_api.py b/dbgpt_hub/predict/predict_api.py new file mode 100644 index 0000000..11d916d --- /dev/null +++ b/dbgpt_hub/predict/predict_api.py @@ -0,0 +1,31 @@ +import os +from dbgpt_hub.predict import predict +from typing import Optional, Dict, Any + + +def start_predict( + args: Optional[Dict[str, Any]] = None, cuda_visible_devices: Optional[str] = "0" +): + # Setting CUDA Device + os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + + # Default Arguments + if args is None: + args = { + "model_name_or_path": "codellama/CodeLlama-13b-Instruct-hf", + "template": "llama2", + "finetuning_type": "lora", + "checkpoint_dir": "dbgpt_hub/output/adapter/CodeLlama-13b-sql-lora", + "predict_file_path": "dbgpt_hub/data/eval_data/dev_sql.json", + "predict_out_dir": "dbgpt_hub/output/", + "predicted_out_filename": "pred_sql.sql", + } + else: + args = args + + # Execute prediction + predict.predict(args) + + +if __name__ == "__main__": + start_predict() diff --git a/dbgpt_hub/train/__init__.py b/dbgpt_hub/train/__init__.py index e69de29..4b57815 100644 --- a/dbgpt_hub/train/__init__.py +++ b/dbgpt_hub/train/__init__.py @@ -0,0 +1,8 @@ +""" +dbgpt_hub.train +============== +""" + +from .sft_train_api import start_sft + +__all__ = ["start_sft"] diff --git a/dbgpt_hub/train/sft_train_api.py b/dbgpt_hub/train/sft_train_api.py new file mode 100644 index 0000000..39e207b --- /dev/null +++ b/dbgpt_hub/train/sft_train_api.py @@ -0,0 +1,47 @@ +import os + +from typing import Optional, Dict, Any +from dbgpt_hub.train import sft_train + + +def start_sft( + args: Optional[Dict[str, Any]] = None, cuda_visible_devices: Optional[str] = "0" +): + # Setting CUDA Device + os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + + # Default Arguments + if args is None: + args = { + "model_name_or_path": "codellama/CodeLlama-13b-Instruct-hf", + "do_train": True, + "dataset": "example_text2sql_train", + "max_source_length": 2048, + "max_target_length": 512, + "finetuning_type": "lora", + "lora_target": "q_proj,v_proj", + "template": "llama2", + "lora_rank": 64, + "lora_alpha": 32, + "output_dir": "dbgpt_hub/output/adapter/CodeLlama-13b-sql-lora", + "overwrite_cache": True, + "overwrite_output_dir": True, + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 16, + "lr_scheduler_type": "cosine_with_restarts", + "logging_steps": 50, + "save_steps": 2000, + "learning_rate": 2e-4, + "num_train_epochs": 8, + "plot_loss": True, + "bf16": True, + } + else: + args = args + + # Run SFT + sft_train.train(args) + + +if __name__ == "__main__": + start_sft()