Skip to content

Commit

Permalink
Merge main
Browse files Browse the repository at this point in the history
  • Loading branch information
archana-ramalingam committed Nov 28, 2024
2 parents 692a197 + 7e2c050 commit e0b20f0
Show file tree
Hide file tree
Showing 29 changed files with 824 additions and 297 deletions.
18 changes: 4 additions & 14 deletions .github/workflows/ci-llama-large-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
name: Llama Benchmarking Tests

on:
pull_request:
workflow_dispatch:
schedule:
# Weekdays at 4:00 AM UTC = 9:00 PM PST.
Expand Down Expand Up @@ -59,21 +60,10 @@ jobs:
- name: Install pip deps
run: |
python -m pip install --no-compile --upgrade pip
# Note: We install in three steps in order to satisfy requirements
# from non default locations first. Installing the PyTorch CPU
# wheels saves multiple minutes and a lot of bandwidth on runner setup.
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
# Install latest iree-turbine.
pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
# Test with nightly releases, not what iree-turbine uses.
pip install sharktank -f https://github.com/nod-ai/SHARK-Platform/releases/expanded_assets/dev-wheels --upgrade --pre
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime
iree-base-compiler iree-base-runtime --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
- name: Run llama tests
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --html=out/llm/llama/benchmark/index.html
Expand Down
12 changes: 3 additions & 9 deletions .github/workflows/ci-llama-quick-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,12 @@ jobs:
# from non default locations first. Installing the PyTorch CPU
# wheels saves multiple minutes and a lot of bandwidth on runner setup.
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
# Install latest iree-turbine.
pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
# Get latest stable IREE release
pip install --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
# Test with nightly releases, not what iree-turbine uses.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime
- name: Run llama 8b f16 decomposed test
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-quick-llama-test

Expand Down
19 changes: 4 additions & 15 deletions .github/workflows/ci-sglang-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,22 +57,11 @@ jobs:

- name: Install pip deps
run: |
python -m pip install --no-compile --upgrade pip
# Note: We install in three steps in order to satisfy requirements
# from non default locations first. Installing the PyTorch CPU
# wheels saves multiple minutes and a lot of bandwidth on runner setup.
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
pip install sharktank -f https://github.com/nod-ai/SHARK-Platform/releases/expanded_assets/dev-wheels --upgrade --pre
pip install shortfin -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels --upgrade --pre
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler iree-base-runtime --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
# Try with the latest nightly releases, not what iree-turbine pins.
# We could also pin to a known working or stable version.
# This should eventually stabilize. Do the best we can for now.
pip install -f https://iree.dev/pip-release-links.html --upgrade \
iree-base-compiler==3.0.0rc20241118 \
iree-base-runtime==3.0.0rc20241118 \
"numpy<2.0"
- name: Install SGLang
run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
Expand Down
12 changes: 4 additions & 8 deletions .github/workflows/ci-sglang-integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,16 +63,12 @@ jobs:
# from non default locations first. Installing the PyTorch CPU
# wheels saves multiple minutes and a lot of bandwidth on runner setup.
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
# Use newest possible releases to be able to track commits that may
# cause errors.
pip install -f https://iree.dev/pip-release-links.html --upgrade \
iree-base-compiler \
iree-base-runtime \
"numpy<2.0"
# Update to the latest iree packages.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler iree-base-runtime --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
- name: Install SGLang
run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
Expand Down
7 changes: 0 additions & 7 deletions .github/workflows/ci-shark-ai.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,5 @@ jobs:
pip install --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
# Try with the latest IREE nightly releases, not what iree-turbine pins.
# We could also pin to a known working or stable version.
# This should eventually stabilize. Do the best we can for now.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime
- name: Run LLM Integration Tests
run: pytest -v app_tests/integration_tests/llm/shortfin --log-cli-level=INFO
51 changes: 41 additions & 10 deletions .github/workflows/ci-sharktank.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,45 @@ concurrency:
cancel-in-progress: true

jobs:
test_punet:
name: "Integration Tests - punet"
runs-on: nodai-amdgpu-mi250-x86-64
env:
PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
steps:
- name: "Setting up Python"
id: setup_python
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11

- name: "Checkout Code"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Cache Pip Packages
uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
id: cache-pip
with:
path: ${{ env.PIP_CACHE_DIR }}
key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','sharktank/requirements*.txt') }}

- name: Install pip deps
run: |
python -m pip install --no-compile --upgrade pip
# Note: We install in three steps in order to satisfy requirements
# from non default locations first. Installing the PyTorch CPU
# wheels saves multiple minutes and a lot of bandwidth on runner setup.
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
# Update to the latest iree packages.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler iree-base-runtime --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
- name: Run punet tests
run: |
pytest -v sharktank/ -m model_punet
test:
name: "Unit Tests and Type Checking"
strategy:
Expand Down Expand Up @@ -61,9 +100,8 @@ jobs:
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
# Update to the latest iree packages.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler iree-base-runtime --src deps \
# Get latest stable IREE release
pip install --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
- name: Run sharktank tests
Expand Down Expand Up @@ -117,13 +155,6 @@ jobs:
pip install --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
# Try with the latest IREE nightly releases, not what iree-turbine pins.
# We could also pin to a known working or stable version.
# This should eventually stabilize. Do the best we can for now.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime
- name: Run tests
run: |
pytest \
Expand Down
12 changes: 8 additions & 4 deletions .github/workflows/ci_eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ jobs:
- name: Install sharktank deps
run: |
python -m pip install --no-compile --upgrade pip
pip install shark-ai[apps]
python -m pip install sharktank -f https://github.com/nod-ai/SHARK-Platform/releases/expanded_assets/dev-wheels
pip install sharktank -f https://github.com/nod-ai/SHARK-Platform/releases/expanded_assets/dev-wheels --upgrade --pre
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler iree-base-runtime --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
- name: Run perplexity test with IREE
run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --run-nightly-llama-tests --bs=100 --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/perplexity/iree_perplexity/index.html
Expand Down Expand Up @@ -108,8 +110,10 @@ jobs:
- name: Install sharktank deps
run: |
python -m pip install --no-compile --upgrade pip
pip install shark-ai[apps]
python -m pip install sharktank -f https://github.com/nod-ai/SHARK-Platform/releases/expanded_assets/dev-wheels
pip install sharktank -f https://github.com/nod-ai/SHARK-Platform/releases/expanded_assets/dev-wheels --upgrade --pre
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler iree-base-runtime --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
- name: Run perplexity test with Torch
run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/perplexity/torch_perplexity/index.html
Expand Down
41 changes: 29 additions & 12 deletions sharktank/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,16 +153,28 @@ def pytest_addoption(parser):
# --outtype=f32 \
# t5-v1_1-small
parser.addoption(
"--google-t5-v1-1-small-fp32-model-path",
"--google-t5-v1-1-small-f32-model-path",
type=Path,
default="/data/t5/small/google__t5-v1_1-small_fp32.gguf",
help="Google T5 v1.1 small fp32 model path",
default="/data/t5/small/google__t5-v1_1-small_f32.gguf",
help="Google T5 v1.1 small float32 model path",
)
parser.addoption(
"--google-t5-v1-1-xxl-fp32-model-path",
"--google-t5-v1-1-small-bf16-model-path",
type=Path,
default="/data/t5/xxl/google__t5-v1_1-xxl_fp32.gguf",
help="Google T5 v1.1 XXL fp32 model path",
default="/data/t5/small/google__t5-v1_1-small_bf16.gguf",
help="Google T5 v1.1 small bfloat16 model path",
)
parser.addoption(
"--google-t5-v1-1-xxl-f32-model-path",
type=Path,
default="/data/t5/xxl/google__t5-v1_1-xxl_f32.gguf",
help="Google T5 v1.1 XXL float32 model path",
)
parser.addoption(
"--google-t5-v1-1-xxl-bf16-model-path",
type=Path,
default="/data/t5/xxl/google__t5-v1_1-xxl_bf16.gguf",
help="Google T5 v1.1 XXL bfloat16 model path",
)

parser.addoption(
Expand Down Expand Up @@ -288,15 +300,20 @@ def get_model_artifacts(request: FixtureRequest):
model_path["llama3_405b_fp8_model_path"] = set_fixture_from_cli_option(
request, "--llama3-405b-fp8-model-path", "llama3_405b_fp8_model"
)
model_path["google__t5_v1_1_small_fp32_model_path"] = set_fixture_from_cli_option(
model_path["google__t5_v1_1_small_f32_model_path"] = set_fixture_from_cli_option(
request,
"--google-t5-v1-1-small-f32-model-path",
"google__t5_v1_1_small_f32_model",
)
model_path["google__t5_v1_1_small_bf16_model_path"] = set_fixture_from_cli_option(
request,
"--google-t5-v1-1-small-fp32-model-path",
"google__t5_v1_1_small_fp32_model",
"--google-t5-v1-1-small-bf16-model-path",
"google__t5_v1_1_small_bf16_model",
)
model_path["google__t5_v1_1_xxl_fp32_model_path"] = set_fixture_from_cli_option(
model_path["google__t5_v1_1_xxl_f32_model_path"] = set_fixture_from_cli_option(
request,
"--google-t5-v1-1-xxl-fp32-model-path",
"google__t5_v1_1_xxl_fp32_model",
"--google-t5-v1-1-xxl-f32-model-path",
"google__t5_v1_1_xxl_f32_model",
)
return model_path

Expand Down
7 changes: 4 additions & 3 deletions sharktank/integration/models/punet/integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,13 @@ def sdxl_fp16_dataset(sdxl_fp16_base_files, temp_dir):
def sdxl_int8_base_files():
from huggingface_hub import hf_hub_download

REPO_ID = "amd-shark/sdxl-quant-models"
REVISION = "942e771bf0c2657a8b33380103d04747a75dfa4a"
REPO_ID = "amd-shark/sdxl-quant-int8"
SUBFOLDER = "mi300_all_sym_8_step14_fp32"
REVISION = "efda8afb35fd72c1769e02370b320b1011622958"

def download(filename):
return hf_hub_download(
repo_id=REPO_ID, subfolder="unet/int8", filename=filename, revision=REVISION
repo_id=REPO_ID, subfolder=SUBFOLDER, filename=filename, revision=REVISION
)

return {
Expand Down
2 changes: 1 addition & 1 deletion sharktank/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
iree-turbine

# Runtime deps.
gguf==0.6.0
gguf==0.10.0
numpy<2.0

# Needed for newer gguf versions (TODO: remove when gguf package includes this)
Expand Down
17 changes: 15 additions & 2 deletions sharktank/sharktank/layers/configs/llm_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ def from_gguf_properties(properties: dict[str, Any], **kwargs):
== properties["t5.attention.layer_norm_rms_epsilon"]
)

all_kwargs = {"vocab_size": None, "feed_forward_proj": None}

gguf_to_config_names_map = {
"t5.context_length": ["context_length"],
"t5.embedding_length": ["d_model"],
Expand All @@ -236,18 +238,29 @@ def from_gguf_properties(properties: dict[str, Any], **kwargs):
"t5.attention.key_length": ["d_kv"],
"t5.attention.layer_norm_epsilon": ["layer_norm_epsilon"],
"t5.attention.relative_buckets_count": ["relative_attention_num_buckets"],
"t5.decoder_start_token_id": ["decoder_start_token_id"],
"tokenizer.ggml.eos_token_id": ["eos_token_id"],
"tokenizer.ggml.padding_token_id": ["pad_token_id"],
}
all_kwargs = {"vocab_size": None, "feed_forward_proj": None}
all_kwargs.update(
{
config_name: properties[gguf_name]
for gguf_name, config_names in gguf_to_config_names_map.items()
for config_name in config_names
}
)

gguf_to_optional_config_names_map = {
"t5.decoder_start_token_id": ["decoder_start_token_id"],
}
all_kwargs.update(
{
config_name: properties[gguf_name]
for gguf_name, config_names in gguf_to_optional_config_names_map.items()
for config_name in config_names
if gguf_name in properties
}
)

if "tokenizer.ggml.tokens" in properties:
all_kwargs["vocab_size"] = len(properties["tokenizer.ggml.tokens"])
all_kwargs.update(kwargs)
Expand Down
21 changes: 11 additions & 10 deletions sharktank/sharktank/layers/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@ class LinearLayer(ThetaLayer):
x = x * premul_input
matmul(x, weight.T) + bias
fake_quant exists to allow export without adding dequant ops.
when fake_quant is True, the op will run in quant-dequant fashion.
When false, it will keep quantized types.
fake quant only exists in order to allow for q_input to act as qdq.
when fake quant is false, q_input will quantize normally.
```
"""

Expand All @@ -43,7 +42,7 @@ def __init__(
*,
weight_name: str = "weight",
bias_name: str = "bias",
fake_quant: bool = True,
fake_quant: bool = False,
):
super().__init__(theta)
self._simulate_native_quant = True
Expand Down Expand Up @@ -74,21 +73,23 @@ def forward(self, x):
x = q_input.quantize(x)
if self.fake_quant:
x = x.unpack().dequant()
elif qdq_input is not None and self.fake_quant:

elif qdq_input is not None:
x = qdq_input.quantize(x).unpack().dequant()

y = ops.linear(x, weight, bias)

# Unconditionally dequantize.
if isinstance(y, QuantizedTensor) and not self.fake_quant:
if isinstance(y, QuantizedTensor):
y = y.unpack().dequant()
# Note that f8_e4m3fnuz types on AMD GPUs accumulate to fp32.
# We can truncate to fp16 in iree, so we do a cast here
# to account for this in the IR. This may not be the right
# level to do this, but for now it's here.
if not self.fake_quant and y.dtype == torch.float8_e4m3fnuz:
y = ops.to(y, torch.float16)
return y
if qdq_output is not None and self.fake_quant:
if not isinstance(y, QuantizedTensor):
if y.dtype == torch.float8_e4m3fnuz:
y = ops.to(y, torch.float16)
return y
if qdq_output is not None:
y = qdq_output.quantize(y).unpack().dequant()
return y
Loading

0 comments on commit e0b20f0

Please sign in to comment.