Skip to content

Commit

Permalink
Merge branch 'main' into ean-bump-iree-shortfin
Browse files Browse the repository at this point in the history
  • Loading branch information
monorimet authored Dec 18, 2024
2 parents 8d1181c + c4a592a commit 0c29122
Show file tree
Hide file tree
Showing 18 changed files with 71 additions and 38 deletions.
19 changes: 12 additions & 7 deletions .github/workflows/ci-libshortfin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
strategy:
fail-fast: false
matrix:
name: ["Ubuntu (Clang)(full)", "Ubuntu (Clang)(host-only)", "Ubuntu (GCC)", "Windows (MSVC)"]
name: ["Ubuntu (Clang)(full)", "Ubuntu (Clang)(host-only)", "Windows (MSVC)"]
python-version: ["3.10", "3.11", "3.12"]
include:
- name: Ubuntu (Clang)(full)
Expand All @@ -53,16 +53,21 @@ jobs:
cmake-options:
-DCMAKE_C_COMPILER=clang-18 -DCMAKE_CXX_COMPILER=clang++-18 -DCMAKE_LINKER_TYPE=LLD -DSHORTFIN_HAVE_AMDGPU=OFF -DSHORTFIN_BUILD_STATIC=ON -DSHORTFIN_BUILD_DYNAMIC=ON
additional-packages: clang lld
- name: Ubuntu (GCC)
- name: Ubuntu (GCC 13)
runs-on: ubuntu-24.04
# Only test with GCC 13 and Python 3.12
python-version: "3.12"
cmake-options:
-DCMAKE_C_COMPILER=gcc-13 -DCMAKE_CXX_COMPILER=g++-13
- name: Ubuntu (GCC 14)
runs-on: ubuntu-24.04
# Only test with GCC 14 and Python 3.12
python-version: "3.12"
cmake-options:
-DCMAKE_C_COMPILER=gcc-14 -DCMAKE_CXX_COMPILER=g++-14
- name: Windows (MSVC)
runs-on: windows-2022
exclude:
# Only test Python 3.12 with GCC
- name: Ubuntu (GCC)
python-version: "3.10"
- name: Ubuntu (GCC)
python-version: "3.11"
# TODO: Include additional Python versions for Windows once the build is fixed
- name: Windows (MSVC)
python-version: "3.10"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/ci_eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
test_perplexity_iree:
if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
timeout-minutes: 1000
name: "Perplexity-IREE"
name: "IREE Perplexity"
strategy:
matrix:
version: [3.11]
Expand Down Expand Up @@ -83,7 +83,7 @@ jobs:
test_perplexity_torch:
if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
timeout-minutes: 1000
name: "Perplexity-Torch"
name: "Torch Perplexity"
strategy:
matrix:
version: [3.11]
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_eval_short.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ concurrency:

jobs:
test_perplexity_iree:
name: "Llama3.1 8B FP16"
name: "IREE Perplexity"
strategy:
matrix:
version: [3.11]
Expand Down
1 change: 1 addition & 0 deletions app_tests/integration_tests/llm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def export_paged_llm_v1(mlir_path, config_path, model_path, batch_sizes):
"python",
"-m",
"sharktank.examples.export_paged_llm_v1",
"--block-seq-stride=16",
f"--{model_path.suffix.strip('.')}-file={model_path}",
f"--output-mlir={mlir_path}",
f"--output-config={config_path}",
Expand Down
47 changes: 28 additions & 19 deletions sharktank/sharktank/evaluate/perplexity_iree.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,14 @@ def __init__(
kv_cache_type,
tensor_parallelism_size,
attention_kernel,
block_seq_stride,
):
self.torch_device = torch_device
self.iree_device = iree_device
self.iree_hip_target = iree_hip_target
self.iree_hal_target_backends = iree_hal_target_backends
self.kv_cache_type = kv_cache_type
self.block_seq_stride = block_seq_stride
self.activation_dtype = torch.float16
self.attention_dtype = torch.float16
self.tensor_parallelism_size = tensor_parallelism_size
Expand Down Expand Up @@ -136,6 +138,7 @@ def compile_model(self, weight_path_str):
iree_hal_target_backends=self.iree_hal_target_backends,
attention_kernel=self.attention_kernel,
tensor_parallelism_size=self.tensor_parallelism_size,
block_seq_stride=self.block_seq_stride,
)
vmfb_path = export_artifacts.get_artifacts()
return vmfb_path
Expand All @@ -145,7 +148,7 @@ def load_model(self, weight_path, tokenizer, vmfb_path):

self.config = LlamaModelConfig(
hp=configs.LlamaHParams.from_gguf_props(weight_path.properties),
block_seq_stride=16,
block_seq_stride=self.block_seq_stride,
kv_cache_type=self.kv_cache_type,
device=self.torch_device,
activation_dtype=self.activation_dtype,
Expand Down Expand Up @@ -394,6 +397,7 @@ def run_perplexity(
tensor_parallelism_size,
attention_kernel,
num_prompts,
block_seq_stride,
):
start = time.time()
perplexity = Perplexity(
Expand All @@ -404,6 +408,7 @@ def run_perplexity(
kv_cache_type=kv_cache_type,
tensor_parallelism_size=tensor_parallelism_size,
attention_kernel=attention_kernel,
block_seq_stride=block_seq_stride,
)

perplexity.get_prompts(num_prompts=num_prompts)
Expand All @@ -425,8 +430,18 @@ def run_perplexity(

def main(argv):
parser = cli.create_parser()
parser.add_argument("--kv-cache-type", default="paged", help="KV cache type")
parser.add_argument("--torch-device", help="Torch device (or default)")
parser.add_argument(
"--attention-kernel",
type=str,
default="decomposed",
choices=["decomposed", "torch_sdpa"],
)
parser.add_argument(
"--block-seq-stride",
help="Block sequence stride for paged KV cache, must divide evenly into the context length",
type=int,
default=32,
)
parser.add_argument("--iree-device", help="List an IREE device (e.g., 'hip://0')")
parser.add_argument(
"--iree-hip-target",
Expand All @@ -440,48 +455,42 @@ def main(argv):
default="rocm",
help="Specify the iree-hal target backends (e.g., rocm)",
)
parser.add_argument("--kv-cache-type", default="paged", help="KV cache type")
parser.add_argument(
"--attention-kernel",
type=str,
default="decomposed",
choices=["decomposed", "torch_sdpa"],
"--num-prompts",
type=int,
default=100,
help="Number of prompts for perplexity test (1 to 100)",
)
parser.add_argument(
"--tensor-parallelism-size",
type=int,
default=1,
help="Number of devices for tensor parallel sharding",
)
parser.add_argument(
"--num-prompts",
type=int,
default=100,
help="Number of prompts for perplexity test",
)
parser.add_argument("--torch-device", help="Torch device (or default)")

cli.add_tokenizer_options(parser)
cli.add_input_dataset_options(parser)
args = cli.parse(parser, args=argv)

torch_device = torch.device(args.torch_device) if args.torch_device else None
iree_device = args.iree_device
kv_cache_type = args.kv_cache_type
weight_path = cli.get_input_dataset(args)
tokenizer = cli.get_tokenizer(args)
weight_path_str = str(args.irpa_file)

ppl = run_perplexity(
weight_path=weight_path,
weight_path_str=weight_path_str,
weight_path_str=str(args.irpa_file),
tokenizer=tokenizer,
torch_device=torch_device,
iree_device=iree_device,
iree_device=args.iree_device,
iree_hip_target=args.iree_hip_target,
iree_hal_target_backends=args.iree_hal_target_backends,
kv_cache_type=kv_cache_type,
kv_cache_type=args.kv_cache_type,
tensor_parallelism_size=args.tensor_parallelism_size,
attention_kernel=args.attention_kernel,
num_prompts=args.num_prompts,
block_seq_stride=args.block_seq_stride,
)

logger.info(f"\n{json.dumps(ppl, indent=2)}")
Expand Down
2 changes: 1 addition & 1 deletion sharktank/sharktank/examples/export_paged_llm_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def main():
"--block-seq-stride",
help="Block sequence stride for paged KV cache, must divide evenly into the context length",
type=int,
default="16",
default=32,
)
parser.add_argument(
"--verbose",
Expand Down
2 changes: 1 addition & 1 deletion sharktank/sharktank/layers/configs/llm_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ class LlamaModelConfig:

# Block sequence stride for a paged KV cache. This must divide evenly
# into the context length.
block_seq_stride: int = 16
block_seq_stride: int = 32

# Either "paged" or "direct".
kv_cache_type: str = "paged"
Expand Down
5 changes: 2 additions & 3 deletions sharktank/sharktank/utils/export_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def __init__(
iree_hal_target_backends: str,
attention_kernel: str,
tensor_parallelism_size: int,
block_seq_stride: Optional[int] = None,
block_seq_stride: int,
):
self.sharktank_dir = str(
Path(os.path.dirname(os.path.abspath(__file__))).parent.parent.parent
Expand Down Expand Up @@ -180,14 +180,13 @@ def export_to_mlir(
f"--output-mlir={mlir_path}",
f"--output-config={json_path}",
f"--bs={str(self.batch_size)}",
f"--block-seq-stride={self.block_seq_stride}",
]
if skip_decode:
export_args.append("--skip-decode")
if self.attention_kernel in ["decomposed", "torch"]:
export_args.append("--attention-kernel")
export_args.append(self.attention_kernel)
if self.block_seq_stride:
export_args.append(f"--block-seq-stride={self.block_seq_stride}")

cwd = self.sharktank_dir
cmd = subprocess.list2cmdline(export_args)
Expand Down
8 changes: 6 additions & 2 deletions shortfin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ if(NOT WIN32)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif()

# For Unicode support, libfmt on Windows (MSVC) requires compiling with /utf-8.
add_compile_options("$<$<C_COMPILER_ID:MSVC>:/utf-8>")
add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/utf-8>")

# Pins
set(SHORTFIN_IREE_GIT_TAG "iree-3.1.0rc20241217")

Expand Down Expand Up @@ -140,7 +144,7 @@ if(SHORTFIN_BUNDLE_DEPS)
FetchContent_Declare(
fmt
GIT_REPOSITORY https://github.com/fmtlib/fmt.git
GIT_TAG e69e5f977d458f2650bb346dadf2ad30c5320281 # 10.2.1 (sync with spdlog)
GIT_TAG 0c9fce2ffefecfdce794e1859584e25877b7b592 # 11.0.2 (sync with spdlog)
)

## spdlog
Expand All @@ -149,7 +153,7 @@ if(SHORTFIN_BUNDLE_DEPS)
FetchContent_Declare(
spdlog
GIT_REPOSITORY https://github.com/gabime/spdlog.git
GIT_TAG 2d4acf8cc321d7783d8f2e22e17a794c6d0e9450 # v1.14.1
GIT_TAG 8e5613379f5140fefb0b60412fbf1f5406e7c7f8 # v1.15.0
)

## xtl: required for xtensor
Expand Down
9 changes: 7 additions & 2 deletions shortfin/build_tools/build_linux_package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,18 @@ REPO_ROOT="$(cd "$THIS_DIR"/../../ && pwd)"
SCRIPT_NAME="$(basename $0)"
ARCH="$(uname -m)"

# Note: we can switch to https://github.com/nod-ai/base-docker-images as needed for extra deps.
MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-quay.io/pypa/manylinux_2_28_${ARCH}:latest}"
PYTHON_VERSIONS="${OVERRIDE_PYTHON_VERSIONS:-cp311-cp311 cp312-cp312 cp313-cp313}"
OUTPUT_DIR="${OUTPUT_DIR:-${THIS_DIR}/wheelhouse}"
CACHE_DIR="${CACHE_DIR:-}"
SHORTFIN_ENABLE_TRACING="${SHORTFIN_ENABLE_TRACING:-ON}"

if [[ "${ARCH}" == "x86_64" ]]; then
MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-ghcr.io/nod-ai/manylinux_x86_64@sha256:4acf83343706d1e37252d6001ded3c97a73bc38620580f855b4e65e35ddc5681}"
else
# TODO: Publish a multi-platform manylinux image and include more deps on all platforms (Rust, ccache, etc.)
MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-quay.io/pypa/manylinux_2_28_${ARCH}:latest}"
fi

function run_on_host() {
echo "Running on host"
echo "Launching docker image ${MANYLINUX_DOCKER_IMAGE}"
Expand Down
1 change: 1 addition & 0 deletions shortfin/src/shortfin/array/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include "fmt/core.h"
#include "fmt/ranges.h"
#include "fmt/xchar.h"
#include "shortfin/array/xtensor_bridge.h"
#include "shortfin/support/logging.h"

Expand Down
1 change: 1 addition & 0 deletions shortfin/src/shortfin/local/device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <fmt/core.h>
#include <fmt/ranges.h>
#include <fmt/xchar.h>

namespace shortfin::local {

Expand Down
1 change: 1 addition & 0 deletions shortfin/src/shortfin/local/fiber.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <fmt/core.h>
#include <fmt/ranges.h>
#include <fmt/xchar.h>

#include "shortfin/local/system.h"
#include "shortfin/support/logging.h"
Expand Down
1 change: 1 addition & 0 deletions shortfin/src/shortfin/local/program.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "fmt/core.h"
#include "fmt/std.h"
#include "fmt/xchar.h"
#include "iree/io/formats/parser_registry.h"
#include "iree/modules/hal/module.h"
#include "iree/modules/io/parameters/module.h"
Expand Down
1 change: 1 addition & 0 deletions shortfin/src/shortfin/local/system.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "shortfin/local/system.h"

#include <fmt/core.h>
#include <fmt/xchar.h>

#include "iree/hal/utils/allocators.h"
#include "shortfin/local/fiber.h"
Expand Down
2 changes: 2 additions & 0 deletions shortfin/src/shortfin/local/systems/amdgpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include "shortfin/local/systems/amdgpu.h"

#include <fmt/xchar.h>

#include "shortfin/support/logging.h"
#include "shortfin/support/sysconfig.h"

Expand Down
2 changes: 2 additions & 0 deletions shortfin/src/shortfin/local/systems/factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <fmt/xchar.h>

#include "shortfin/local/system.h"
#include "shortfin/support/logging.h"

Expand Down
1 change: 1 addition & 0 deletions shortfin/src/shortfin/support/config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <cstdlib>

#include "fmt/format.h"
#include "fmt/xchar.h"
#include "shortfin/support/logging.h"

namespace shortfin {
Expand Down

0 comments on commit 0c29122

Please sign in to comment.