From d08b0e5dc4ecf4af364121933fbe0aa61031cc66 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 17 Dec 2024 09:39:10 -0800 Subject: [PATCH 1/3] [shortfin] Use custom manylinux dockerfile in build_linux_package.sh. (#709) Package builds started failing last night when the `latest` upstream manylinux dockerfile switched to gcc 14: https://github.com/nod-ai/shark-ai/actions/runs/12371699664/job/34528374484 ``` Running command Building wheel for shortfin (pyproject.toml) -- The C compiler identification is GNU 14.2.1 -- The CXX compiler identification is GNU 14.2.1 ... [325/365] Building CXX object src/shortfin/local/CMakeFiles/shortfin_local.dylib.objects.dir/device.cc.o FAILED: src/shortfin/local/CMakeFiles/shortfin_local.dylib.objects.dir/device.cc.o /opt/rh/gcc-toolset-14/root/usr/bin/c++ -DCPUINFO_SUPPORTED_PLATFORM=1 -DSPDLOG_COMPILED_LIB -DSPDLOG_FMT_EXTERNAL -DSPDLOG_SHARED_LIB -D_SHORTFIN_BUILDING_DYLIB -Dspdlog_EXPORTS -I/home/runner/work/shark-ai/shark-ai/c/shortfin/src -I/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/src -I/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/spdlog-src/include -I/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/fmt-src/include -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-build -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src/runtime/src -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-build/runtime/src -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src/third_party/cpuinfo/include -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-build/runtime/src/iree/base/internal/flatcc -isystem 
/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src/third_party/flatcc/include -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-build/runtime/src/iree/schemas -O3 -DNDEBUG -std=gnu++20 -flto=auto -fno-fat-lto-objects -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Werror -pthread -I/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src/third_party/flatcc/include/ -I/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src/third_party/flatcc/include/flatcc/reflection/ -MD -MT src/shortfin/local/CMakeFiles/shortfin_local.dylib.objects.dir/device.cc.o -MF src/shortfin/local/CMakeFiles/shortfin_local.dylib.objects.dir/device.cc.o.d -o src/shortfin/local/CMakeFiles/shortfin_local.dylib.objects.dir/device.cc.o -c /home/runner/work/shark-ai/shark-ai/c/shortfin/src/shortfin/local/device.cc In file included from /home/runner/work/shark-ai/shark-ai/c/shortfin/src/shortfin/local/device.cc:10: /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/fmt-src/include/fmt/ranges.h:211:49: error: self-comparison always evaluates to true [-Werror=tautological-compare] 211 | integer_sequence) -> std::true_type; | ~~~^~~~~ cc1plus: all warnings being treated as errors ``` This switches to our own downstream dockerfile, defined here: https://github.com/nod-ai/base-docker-images/blob/main/dockerfiles/manylinux_x86_64.Dockerfile, which is pinned to an older version of the base image (and thus gcc). 
Tested successfully here: https://github.com/ScottTodd/shark-ai/actions/runs/12378199850 --- shortfin/build_tools/build_linux_package.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/shortfin/build_tools/build_linux_package.sh b/shortfin/build_tools/build_linux_package.sh index 91b944e51..db2463987 100755 --- a/shortfin/build_tools/build_linux_package.sh +++ b/shortfin/build_tools/build_linux_package.sh @@ -37,13 +37,18 @@ REPO_ROOT="$(cd "$THIS_DIR"/../../ && pwd)" SCRIPT_NAME="$(basename $0)" ARCH="$(uname -m)" -# Note: we can switch to https://github.com/nod-ai/base-docker-images as needed for extra deps. -MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-quay.io/pypa/manylinux_2_28_${ARCH}:latest}" PYTHON_VERSIONS="${OVERRIDE_PYTHON_VERSIONS:-cp311-cp311 cp312-cp312 cp313-cp313}" OUTPUT_DIR="${OUTPUT_DIR:-${THIS_DIR}/wheelhouse}" CACHE_DIR="${CACHE_DIR:-}" SHORTFIN_ENABLE_TRACING="${SHORTFIN_ENABLE_TRACING:-ON}" +if [[ "${ARCH}" == "x86_64" ]]; then + MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-ghcr.io/nod-ai/manylinux_x86_64@sha256:4acf83343706d1e37252d6001ded3c97a73bc38620580f855b4e65e35ddc5681}" +else + # TODO: publish a multi-platform manylinux image and include more deps in all platforms (rust, ccache, etc.) + MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-quay.io/pypa/manylinux_2_28_${ARCH}:latest}" +fi + function run_on_host() { echo "Running on host" echo "Launching docker image ${MANYLINUX_DOCKER_IMAGE}" From aab71618d6d0926720d68d7b9f2c5810fc1e6f86 Mon Sep 17 00:00:00 2001 From: Marius Brehler Date: Tue, 17 Dec 2024 21:03:26 +0100 Subject: [PATCH 2/3] [shortfin] Bump fmt and spdlog and test with GCC 14 (#711) Bumps libfmt to 11.0.2 to mitigate a build error occurring with GCC 14. Bumping spdlog to 1.15.0 (which bundles libfmt 11.0.2) accordingly to keep the libs in sync. Furthermore expands testing to build with GCC 14. 
--- .github/workflows/ci-libshortfin.yml | 19 ++++++++++++------- shortfin/CMakeLists.txt | 8 ++++++-- shortfin/src/shortfin/array/array.cc | 1 + shortfin/src/shortfin/local/device.cc | 1 + shortfin/src/shortfin/local/fiber.cc | 1 + shortfin/src/shortfin/local/program.cc | 1 + shortfin/src/shortfin/local/system.cc | 1 + shortfin/src/shortfin/local/systems/amdgpu.cc | 2 ++ .../src/shortfin/local/systems/factory.cc | 2 ++ shortfin/src/shortfin/support/config.cc | 1 + 10 files changed, 28 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-libshortfin.yml b/.github/workflows/ci-libshortfin.yml index 0e0982803..543a6abe6 100644 --- a/.github/workflows/ci-libshortfin.yml +++ b/.github/workflows/ci-libshortfin.yml @@ -38,7 +38,7 @@ jobs: strategy: fail-fast: false matrix: - name: ["Ubuntu (Clang)(full)", "Ubuntu (Clang)(host-only)", "Ubuntu (GCC)", "Windows (MSVC)"] + name: ["Ubuntu (Clang)(full)", "Ubuntu (Clang)(host-only)", "Windows (MSVC)"] python-version: ["3.10", "3.11", "3.12"] include: - name: Ubuntu (Clang)(full) @@ -53,16 +53,21 @@ jobs: cmake-options: -DCMAKE_C_COMPILER=clang-18 -DCMAKE_CXX_COMPILER=clang++-18 -DCMAKE_LINKER_TYPE=LLD -DSHORTFIN_HAVE_AMDGPU=OFF -DSHORTFIN_BUILD_STATIC=ON -DSHORTFIN_BUILD_DYNAMIC=ON additional-packages: clang lld - - name: Ubuntu (GCC) + - name: Ubuntu (GCC 13) runs-on: ubuntu-24.04 + # Only test with GCC 13 and Python 3.12 + python-version: "3.12" + cmake-options: + -DCMAKE_C_COMPILER=gcc-13 -DCMAKE_CXX_COMPILER=g++-13 + - name: Ubuntu (GCC 14) + runs-on: ubuntu-24.04 + # Only test with GCC 14 and Python 3.12 + python-version: "3.12" + cmake-options: + -DCMAKE_C_COMPILER=gcc-14 -DCMAKE_CXX_COMPILER=g++-14 - name: Windows (MSVC) runs-on: windows-2022 exclude: - # Only test Python 3.12 with GCC - - name: Ubuntu (GCC) - python-version: "3.10" - - name: Ubuntu (GCC) - python-version: "3.11" # TODO: Include additional Python versions for Windows after build got fixed - name: Windows (MSVC) python-version: "3.10" diff 
--git a/shortfin/CMakeLists.txt b/shortfin/CMakeLists.txt index 2c79d5b41..bd46d84f9 100644 --- a/shortfin/CMakeLists.txt +++ b/shortfin/CMakeLists.txt @@ -39,6 +39,10 @@ if(NOT WIN32) set(CMAKE_POSITION_INDEPENDENT_CODE ON) endif() +# For unicode support Windows libfmt requires compiling with /utf-8. +add_compile_options("$<$:/utf-8>") +add_compile_options("$<$:/utf-8>") + # Pins set(SHORTFIN_IREE_GIT_TAG "iree-3.1.0rc20241204") @@ -140,7 +144,7 @@ if(SHORTFIN_BUNDLE_DEPS) FetchContent_Declare( fmt GIT_REPOSITORY https://github.com/fmtlib/fmt.git - GIT_TAG e69e5f977d458f2650bb346dadf2ad30c5320281 # 10.2.1 (sync with spdlog) + GIT_TAG 0c9fce2ffefecfdce794e1859584e25877b7b592 # 11.0.2 (sync with spdlog) ) ## spdlog @@ -149,7 +153,7 @@ if(SHORTFIN_BUNDLE_DEPS) FetchContent_Declare( spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git - GIT_TAG 2d4acf8cc321d7783d8f2e22e17a794c6d0e9450 # v1.14.1 + GIT_TAG 8e5613379f5140fefb0b60412fbf1f5406e7c7f8 # v1.15.0 ) ## xtl: required for xtensor diff --git a/shortfin/src/shortfin/array/array.cc b/shortfin/src/shortfin/array/array.cc index 882e4ef39..c0eca52d0 100644 --- a/shortfin/src/shortfin/array/array.cc +++ b/shortfin/src/shortfin/array/array.cc @@ -10,6 +10,7 @@ #include "fmt/core.h" #include "fmt/ranges.h" +#include "fmt/xchar.h" #include "shortfin/array/xtensor_bridge.h" #include "shortfin/support/logging.h" diff --git a/shortfin/src/shortfin/local/device.cc b/shortfin/src/shortfin/local/device.cc index 3afd2b8ad..1bed3a419 100644 --- a/shortfin/src/shortfin/local/device.cc +++ b/shortfin/src/shortfin/local/device.cc @@ -8,6 +8,7 @@ #include #include +#include namespace shortfin::local { diff --git a/shortfin/src/shortfin/local/fiber.cc b/shortfin/src/shortfin/local/fiber.cc index 8ad9f2960..2c03672fd 100644 --- a/shortfin/src/shortfin/local/fiber.cc +++ b/shortfin/src/shortfin/local/fiber.cc @@ -8,6 +8,7 @@ #include #include +#include #include "shortfin/local/system.h" #include "shortfin/support/logging.h" diff 
--git a/shortfin/src/shortfin/local/program.cc b/shortfin/src/shortfin/local/program.cc index 6ab1f47ae..71452da3e 100644 --- a/shortfin/src/shortfin/local/program.cc +++ b/shortfin/src/shortfin/local/program.cc @@ -8,6 +8,7 @@ #include "fmt/core.h" #include "fmt/std.h" +#include "fmt/xchar.h" #include "iree/io/formats/parser_registry.h" #include "iree/modules/hal/module.h" #include "iree/modules/io/parameters/module.h" diff --git a/shortfin/src/shortfin/local/system.cc b/shortfin/src/shortfin/local/system.cc index ef31bb001..00fcf4c65 100644 --- a/shortfin/src/shortfin/local/system.cc +++ b/shortfin/src/shortfin/local/system.cc @@ -7,6 +7,7 @@ #include "shortfin/local/system.h" #include +#include #include "iree/hal/utils/allocators.h" #include "shortfin/local/fiber.h" diff --git a/shortfin/src/shortfin/local/systems/amdgpu.cc b/shortfin/src/shortfin/local/systems/amdgpu.cc index cecedd1a0..262d2ec62 100644 --- a/shortfin/src/shortfin/local/systems/amdgpu.cc +++ b/shortfin/src/shortfin/local/systems/amdgpu.cc @@ -6,6 +6,8 @@ #include "shortfin/local/systems/amdgpu.h" +#include + #include "shortfin/support/logging.h" #include "shortfin/support/sysconfig.h" diff --git a/shortfin/src/shortfin/local/systems/factory.cc b/shortfin/src/shortfin/local/systems/factory.cc index bf5b788dc..c5ee036cd 100644 --- a/shortfin/src/shortfin/local/systems/factory.cc +++ b/shortfin/src/shortfin/local/systems/factory.cc @@ -4,6 +4,8 @@ // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include + #include "shortfin/local/system.h" #include "shortfin/support/logging.h" diff --git a/shortfin/src/shortfin/support/config.cc b/shortfin/src/shortfin/support/config.cc index 7de820d1c..d188ddb16 100644 --- a/shortfin/src/shortfin/support/config.cc +++ b/shortfin/src/shortfin/support/config.cc @@ -12,6 +12,7 @@ #include #include "fmt/format.h" +#include "fmt/xchar.h" #include "shortfin/support/logging.h" namespace shortfin { From c4a592ac8bcb2202a554ab1a4d311fdf5ddf28eb Mon Sep 17 00:00:00 2001 From: Archana Ramalingam <98564406+archana-ramalingam@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:35:11 -0800 Subject: [PATCH 3/3] [sharktank] Update block_seq_stride for perplexity CI tests (#707) - Update `block_seq_stride` for perplexity CI tests - Update default value of `block_seq_stride` from `16` to `32` in `export_paged_llm_v1.py` --- .github/workflows/ci_eval.yaml | 4 +- .github/workflows/ci_eval_short.yaml | 2 +- app_tests/integration_tests/llm/utils.py | 1 + .../sharktank/evaluate/perplexity_iree.py | 47 +++++++++++-------- .../sharktank/examples/export_paged_llm_v1.py | 2 +- .../sharktank/layers/configs/llm_configs.py | 2 +- sharktank/sharktank/utils/export_artifacts.py | 5 +- 7 files changed, 36 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 3b85cb652..a71698774 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -24,7 +24,7 @@ jobs: test_perplexity_iree: if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }} timeout-minutes: 1000 - name: "Perplexity-IREE" + name: "IREE Perplexity" strategy: matrix: version: [3.11] @@ -83,7 +83,7 @@ jobs: test_perplexity_torch: if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }} timeout-minutes: 1000 - name: "Perplexity-Torch" + name: "Torch Perplexity" strategy: matrix: version: [3.11] diff --git 
a/.github/workflows/ci_eval_short.yaml b/.github/workflows/ci_eval_short.yaml index edaaee966..d5f8f5682 100644 --- a/.github/workflows/ci_eval_short.yaml +++ b/.github/workflows/ci_eval_short.yaml @@ -23,7 +23,7 @@ concurrency: jobs: test_perplexity_iree: - name: "Llama3.1 8B FP16" + name: "IREE Perplexity" strategy: matrix: version: [3.11] diff --git a/app_tests/integration_tests/llm/utils.py b/app_tests/integration_tests/llm/utils.py index 80b5b3c09..dbbdee10d 100644 --- a/app_tests/integration_tests/llm/utils.py +++ b/app_tests/integration_tests/llm/utils.py @@ -90,6 +90,7 @@ def export_paged_llm_v1(mlir_path, config_path, model_path, batch_sizes): "python", "-m", "sharktank.examples.export_paged_llm_v1", + "--block-seq-stride=16", f"--{model_path.suffix.strip('.')}-file={model_path}", f"--output-mlir={mlir_path}", f"--output-config={config_path}", diff --git a/sharktank/sharktank/evaluate/perplexity_iree.py b/sharktank/sharktank/evaluate/perplexity_iree.py index 6060eb91b..c47726f0e 100644 --- a/sharktank/sharktank/evaluate/perplexity_iree.py +++ b/sharktank/sharktank/evaluate/perplexity_iree.py @@ -68,12 +68,14 @@ def __init__( kv_cache_type, tensor_parallelism_size, attention_kernel, + block_seq_stride, ): self.torch_device = torch_device self.iree_device = iree_device self.iree_hip_target = iree_hip_target self.iree_hal_target_backends = iree_hal_target_backends self.kv_cache_type = kv_cache_type + self.block_seq_stride = block_seq_stride self.activation_dtype = torch.float16 self.attention_dtype = torch.float16 self.tensor_parallelism_size = tensor_parallelism_size @@ -136,6 +138,7 @@ def compile_model(self, weight_path_str): iree_hal_target_backends=self.iree_hal_target_backends, attention_kernel=self.attention_kernel, tensor_parallelism_size=self.tensor_parallelism_size, + block_seq_stride=self.block_seq_stride, ) vmfb_path = export_artifacts.get_artifacts() return vmfb_path @@ -145,7 +148,7 @@ def load_model(self, weight_path, tokenizer, vmfb_path): 
self.config = LlamaModelConfig( hp=configs.LlamaHParams.from_gguf_props(weight_path.properties), - block_seq_stride=16, + block_seq_stride=self.block_seq_stride, kv_cache_type=self.kv_cache_type, device=self.torch_device, activation_dtype=self.activation_dtype, @@ -394,6 +397,7 @@ def run_perplexity( tensor_parallelism_size, attention_kernel, num_prompts, + block_seq_stride, ): start = time.time() perplexity = Perplexity( @@ -404,6 +408,7 @@ def run_perplexity( kv_cache_type=kv_cache_type, tensor_parallelism_size=tensor_parallelism_size, attention_kernel=attention_kernel, + block_seq_stride=block_seq_stride, ) perplexity.get_prompts(num_prompts=num_prompts) @@ -425,8 +430,18 @@ def run_perplexity( def main(argv): parser = cli.create_parser() - parser.add_argument("--kv-cache-type", default="paged", help="KV cache type") - parser.add_argument("--torch-device", help="Torch device (or default)") + parser.add_argument( + "--attention-kernel", + type=str, + default="decomposed", + choices=["decomposed", "torch_sdpa"], + ) + parser.add_argument( + "--block-seq-stride", + help="Block sequence stride for paged KV cache, must divide evenly into the context length", + type=int, + default=32, + ) parser.add_argument("--iree-device", help="List an IREE device (e.g., 'hip://0')") parser.add_argument( "--iree-hip-target", @@ -440,11 +455,12 @@ def main(argv): default="rocm", help="Specify the iree-hal target backends (e.g., rocm)", ) + parser.add_argument("--kv-cache-type", default="paged", help="KV cache type") parser.add_argument( - "--attention-kernel", - type=str, - default="decomposed", - choices=["decomposed", "torch_sdpa"], + "--num-prompts", + type=int, + default=100, + help="Number of prompts for perplexity test (1 to 100)", ) parser.add_argument( "--tensor-parallelism-size", @@ -452,36 +468,29 @@ def main(argv): default=1, help="Number of devices for tensor parallel sharding", ) - parser.add_argument( - "--num-prompts", - type=int, - default=100, - help="Number of 
prompts for perplexity test", - ) + parser.add_argument("--torch-device", help="Torch device (or default)") cli.add_tokenizer_options(parser) cli.add_input_dataset_options(parser) args = cli.parse(parser, args=argv) torch_device = torch.device(args.torch_device) if args.torch_device else None - iree_device = args.iree_device - kv_cache_type = args.kv_cache_type weight_path = cli.get_input_dataset(args) tokenizer = cli.get_tokenizer(args) - weight_path_str = str(args.irpa_file) ppl = run_perplexity( weight_path=weight_path, - weight_path_str=weight_path_str, + weight_path_str=str(args.irpa_file), tokenizer=tokenizer, torch_device=torch_device, - iree_device=iree_device, + iree_device=args.iree_device, iree_hip_target=args.iree_hip_target, iree_hal_target_backends=args.iree_hal_target_backends, - kv_cache_type=kv_cache_type, + kv_cache_type=args.kv_cache_type, tensor_parallelism_size=args.tensor_parallelism_size, attention_kernel=args.attention_kernel, num_prompts=args.num_prompts, + block_seq_stride=args.block_seq_stride, ) logger.info(f"\n{json.dumps(ppl, indent=2)}") diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py index ad297bcce..056d8a98e 100644 --- a/sharktank/sharktank/examples/export_paged_llm_v1.py +++ b/sharktank/sharktank/examples/export_paged_llm_v1.py @@ -49,7 +49,7 @@ def main(): "--block-seq-stride", help="Block sequence stride for paged KV cache, must divide evenly into the context length", type=int, - default="16", + default=32, ) parser.add_argument( "--verbose", diff --git a/sharktank/sharktank/layers/configs/llm_configs.py b/sharktank/sharktank/layers/configs/llm_configs.py index 88f5c344c..6cf79402e 100644 --- a/sharktank/sharktank/layers/configs/llm_configs.py +++ b/sharktank/sharktank/layers/configs/llm_configs.py @@ -144,7 +144,7 @@ class LlamaModelConfig: # Block sequence stride for a paged KV cache. This must divide evenly # into the context length. 
- block_seq_stride: int = 16 + block_seq_stride: int = 32 # Either "paged" or "direct". kv_cache_type: str = "paged" diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 0bf252525..75cdbab7a 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -92,7 +92,7 @@ def __init__( iree_hal_target_backends: str, attention_kernel: str, tensor_parallelism_size: int, - block_seq_stride: Optional[int] = None, + block_seq_stride: int, ): self.sharktank_dir = str( Path(os.path.dirname(os.path.abspath(__file__))).parent.parent.parent @@ -180,14 +180,13 @@ def export_to_mlir( f"--output-mlir={mlir_path}", f"--output-config={json_path}", f"--bs={str(self.batch_size)}", + f"--block-seq-stride={self.block_seq_stride}", ] if skip_decode: export_args.append("--skip-decode") if self.attention_kernel in ["decomposed", "torch"]: export_args.append("--attention-kernel") export_args.append(self.attention_kernel) - if self.block_seq_stride: - export_args.append(f"--block-seq-stride={self.block_seq_stride}") cwd = self.sharktank_dir cmd = subprocess.list2cmdline(export_args)