diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 5dbafee9f..63c09fff1 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -291,6 +291,16 @@ jobs:
           bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-bfloat16"
           echo "::endgroup::"

+          echo "::group::Run inference with quantize file"
+          for DEVICE in cpu; do # cuda
+            # cuda fails because `AttributeError: 'Linear' object has no attribute '_linear_extra_repr'`;
+            # follow up with torchao in a separate PR
+            echo "saving snapshot for device ${DEVICE} and dtype bfloat16, then reloading it from the snapshot"
+            python3 torchchat.py export --device ${DEVICE} --output-snap model.tc --dtype bfloat16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+            python3 torchchat.py generate --device ${DEVICE} --snap model.tc --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          done
+          echo "::endgroup::"
+
   test-gpu-aoti-float32:
     permissions:
       id-token: write
@@ -335,6 +345,11 @@ jobs:
           fi
           echo "::endgroup::"

+          # echo "::group::Run inference with quantize file"
+          # python3 torchchat.py export --output-snap model.tc --dtype float32 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          # python3 torchchat.py generate --snap model.tc --dtype float32 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          # echo "::endgroup::"
+
   test-gpu-aoti-float16:
     permissions:
       id-token: write
@@ -376,10 +391,15 @@ jobs:
           echo "::group::Run inference with quantize file"
           if [ $(uname -s) == Darwin ]; then
             python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
-            python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"~
+            python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
           fi
           echo "::endgroup::"

+          # echo "::group::Run inference with quantize file"
+          # python3 torchchat.py export --output-snap model.tc --dtype float16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          # python3 torchchat.py generate --snap model.tc --dtype float16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          # echo "::endgroup::"
+
   test-gpu-eval-sanity-check:
     permissions:
       id-token: write
@@ -495,10 +515,11 @@ jobs:
           python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

           echo "******************************************"
-          echo "*** --quantize torchchat/quant_config/mobile.json ***"
+          echo "*** can't test --quantize torchchat/quant_config/mobile.json ***"
+          echo "*** testing --quantize torchchat/quant_config/mobile-32.json ***"
           echo "******************************************"
-          # python torchchat.py export --quantize torchchat/quant_config/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
-          # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+          python3 torchchat.py export --quantize torchchat/quant_config/mobile-32.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+          python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte


           echo "******************************************"
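
Note: the snapshot round-trip added in the bfloat16 job above can be reproduced locally with the same flags. This is a sketch built only from the arguments in that hunk; "${REPO_NAME}" stands in for whichever checkpoint directory you have downloaded, and model.tc is just the snapshot filename the workflow happens to use:

    # Export a quantized model snapshot, then reload it for generation
    # (flags taken verbatim from the new CI step; paths are illustrative).
    python3 torchchat.py export --device cpu --dtype bfloat16 \
        --quantize torchchat/quant_config/cuda-32.json \
        --checkpoint "./checkpoints/${REPO_NAME}/model.pth" \
        --output-snap model.tc
    python3 torchchat.py generate --device cpu --dtype bfloat16 \
        --snap model.tc \
        --checkpoint "./checkpoints/${REPO_NAME}/model.pth"

Only cpu is exercised in the loop for now; per the in-line comment, cuda is skipped until the `_linear_extra_repr` AttributeError is resolved with torchao in a follow-up PR.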